[i386] Remove ix86_frame::outlined_save_offset and machine_function::call_ms2sysv_pad_out
[official-gcc.git] / gcc / config / i386 / i386.c
blob cd20b1be6ec09d2b3973b2684d3a42c8b4ff44a6
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2017 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "backend.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "memmodel.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "cfgloop.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "stringpool.h"
33 #include "expmed.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "cgraph.h"
39 #include "diagnostic.h"
40 #include "cfgbuild.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "attribs.h"
44 #include "calls.h"
45 #include "stor-layout.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "insn-attr.h"
49 #include "flags.h"
50 #include "except.h"
51 #include "explow.h"
52 #include "expr.h"
53 #include "cfgrtl.h"
54 #include "common/common-target.h"
55 #include "langhooks.h"
56 #include "reload.h"
57 #include "gimplify.h"
58 #include "dwarf2.h"
59 #include "tm-constrs.h"
60 #include "params.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "tree-chkp.h"
75 #include "rtl-chkp.h"
76 #include "dbgcnt.h"
77 #include "case-cfn-macros.h"
78 #include "regrename.h"
79 #include "dojump.h"
80 #include "fold-const-call.h"
81 #include "tree-vrp.h"
82 #include "tree-ssanames.h"
83 #include "selftest.h"
84 #include "selftest-rtl.h"
85 #include "print-rtl.h"
86 #include "intl.h"
87 #include "ifcvt.h"
89 /* This file should be included last. */
90 #include "target-def.h"
92 static rtx legitimize_dllimport_symbol (rtx, bool);
93 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
94 static rtx legitimize_pe_coff_symbol (rtx, bool);
95 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
96 static bool ix86_save_reg (unsigned int, bool, bool);
97 static bool ix86_function_naked (const_tree);
99 #ifndef CHECK_STACK_LIMIT
100 #define CHECK_STACK_LIMIT (-1)
101 #endif
103 /* Return index of given mode in mult and division cost tables. */
104 #define MODE_INDEX(mode) \
105 ((mode) == QImode ? 0 \
106 : (mode) == HImode ? 1 \
107 : (mode) == SImode ? 2 \
108 : (mode) == DImode ? 3 \
109 : 4)
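/* Usage sketch (illustrative; the field names mult_init and divide are
   assumptions about the processor_costs layout declared in the i386
   headers): the per-mode cost arrays in the cost tables below are
   indexed through MODE_INDEX, e.g.

     int mul_cost = cost->mult_init[MODE_INDEX (mode)];
     int div_cost = cost->divide[MODE_INDEX (mode)];

   where "cost" points at one of the processor_costs records defined in
   this file.  */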
111 /* Processor costs (relative to an add) */
112 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
113 #define COSTS_N_BYTES(N) ((N) * 2)
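/* Worked example of the scaling noted above: if COSTS_N_INSNS (N) expands
   to (N) * 4, an average one-insn operation costs 4 units, and
   COSTS_N_BYTES (2) == 4 makes a 2-byte addition cost exactly the same.
   The size-tuning table below therefore measures costs roughly in bytes
   of instruction encoding rather than in cycles.  */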
115 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
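/* How the stringop_algs initializers below are read (summary sketched from
   the stringop_algs definition in the i386 headers): the leading algorithm
   handles blocks of unknown (variable) size, and each following
   {max, alg, noalign} triple selects ALG for known sizes up to MAX bytes,
   with MAX == -1 meaning "all larger sizes".  Of each pair of tables,
   element [0] is used for 32-bit targets and element [1] for 64-bit
   targets; DUMMY_STRINGOP_ALGS fills a slot a given processor never
   uses.  */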
117 static stringop_algs ix86_size_memcpy[2] = {
118 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
119 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
120 static stringop_algs ix86_size_memset[2] = {
121 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
122 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
124 const
125 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
126 COSTS_N_BYTES (2), /* cost of an add instruction */
127 COSTS_N_BYTES (3), /* cost of a lea instruction */
128 COSTS_N_BYTES (2), /* variable shift costs */
129 COSTS_N_BYTES (3), /* constant shift costs */
130 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
131 COSTS_N_BYTES (3), /* HI */
132 COSTS_N_BYTES (3), /* SI */
133 COSTS_N_BYTES (3), /* DI */
134 COSTS_N_BYTES (5)}, /* other */
135 0, /* cost of multiply per each bit set */
136 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
137 COSTS_N_BYTES (3), /* HI */
138 COSTS_N_BYTES (3), /* SI */
139 COSTS_N_BYTES (3), /* DI */
140 COSTS_N_BYTES (5)}, /* other */
141 COSTS_N_BYTES (3), /* cost of movsx */
142 COSTS_N_BYTES (3), /* cost of movzx */
143 0, /* "large" insn */
144 2, /* MOVE_RATIO */
145 2, /* cost for loading QImode using movzbl */
146 {2, 2, 2}, /* cost of loading integer registers
147 in QImode, HImode and SImode.
148 Relative to reg-reg move (2). */
149 {2, 2, 2}, /* cost of storing integer registers */
150 2, /* cost of reg,reg fld/fst */
151 {2, 2, 2}, /* cost of loading fp registers
152 in SFmode, DFmode and XFmode */
153 {2, 2, 2}, /* cost of storing fp registers
154 in SFmode, DFmode and XFmode */
155 3, /* cost of moving MMX register */
156 {3, 3}, /* cost of loading MMX registers
157 in SImode and DImode */
158 {3, 3}, /* cost of storing MMX registers
159 in SImode and DImode */
160 3, /* cost of moving SSE register */
161 {3, 3, 3}, /* cost of loading SSE registers
162 in SImode, DImode and TImode */
163 {3, 3, 3}, /* cost of storing SSE registers
164 in SImode, DImode and TImode */
165 3, /* MMX or SSE register to integer */
166 0, /* size of l1 cache */
167 0, /* size of l2 cache */
168 0, /* size of prefetch block */
169 0, /* number of parallel prefetches */
170 2, /* Branch cost */
171 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
172 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
173 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
174 COSTS_N_BYTES (2), /* cost of FABS instruction. */
175 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
176 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
177 ix86_size_memcpy,
178 ix86_size_memset,
179 1, /* scalar_stmt_cost. */
180 1, /* scalar load_cost. */
181 1, /* scalar_store_cost. */
182 1, /* vec_stmt_cost. */
183 1, /* vec_to_scalar_cost. */
184 1, /* scalar_to_vec_cost. */
185 1, /* vec_align_load_cost. */
186 1, /* vec_unalign_load_cost. */
187 1, /* vec_store_cost. */
188 1, /* cond_taken_branch_cost. */
189 1, /* cond_not_taken_branch_cost. */
190 };
192 /* Processor costs (relative to an add) */
193 static stringop_algs i386_memcpy[2] = {
194 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
195 DUMMY_STRINGOP_ALGS};
196 static stringop_algs i386_memset[2] = {
197 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
198 DUMMY_STRINGOP_ALGS};
200 static const
201 struct processor_costs i386_cost = { /* 386 specific costs */
202 COSTS_N_INSNS (1), /* cost of an add instruction */
203 COSTS_N_INSNS (1), /* cost of a lea instruction */
204 COSTS_N_INSNS (3), /* variable shift costs */
205 COSTS_N_INSNS (2), /* constant shift costs */
206 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
207 COSTS_N_INSNS (6), /* HI */
208 COSTS_N_INSNS (6), /* SI */
209 COSTS_N_INSNS (6), /* DI */
210 COSTS_N_INSNS (6)}, /* other */
211 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
212 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
213 COSTS_N_INSNS (23), /* HI */
214 COSTS_N_INSNS (23), /* SI */
215 COSTS_N_INSNS (23), /* DI */
216 COSTS_N_INSNS (23)}, /* other */
217 COSTS_N_INSNS (3), /* cost of movsx */
218 COSTS_N_INSNS (2), /* cost of movzx */
219 15, /* "large" insn */
220 3, /* MOVE_RATIO */
221 4, /* cost for loading QImode using movzbl */
222 {2, 4, 2}, /* cost of loading integer registers
223 in QImode, HImode and SImode.
224 Relative to reg-reg move (2). */
225 {2, 4, 2}, /* cost of storing integer registers */
226 2, /* cost of reg,reg fld/fst */
227 {8, 8, 8}, /* cost of loading fp registers
228 in SFmode, DFmode and XFmode */
229 {8, 8, 8}, /* cost of storing fp registers
230 in SFmode, DFmode and XFmode */
231 2, /* cost of moving MMX register */
232 {4, 8}, /* cost of loading MMX registers
233 in SImode and DImode */
234 {4, 8}, /* cost of storing MMX registers
235 in SImode and DImode */
236 2, /* cost of moving SSE register */
237 {4, 8, 16}, /* cost of loading SSE registers
238 in SImode, DImode and TImode */
239 {4, 8, 16}, /* cost of storing SSE registers
240 in SImode, DImode and TImode */
241 3, /* MMX or SSE register to integer */
242 0, /* size of l1 cache */
243 0, /* size of l2 cache */
244 0, /* size of prefetch block */
245 0, /* number of parallel prefetches */
246 1, /* Branch cost */
247 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
248 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
249 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
250 COSTS_N_INSNS (22), /* cost of FABS instruction. */
251 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
252 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
253 i386_memcpy,
254 i386_memset,
255 1, /* scalar_stmt_cost. */
256 1, /* scalar load_cost. */
257 1, /* scalar_store_cost. */
258 1, /* vec_stmt_cost. */
259 1, /* vec_to_scalar_cost. */
260 1, /* scalar_to_vec_cost. */
261 1, /* vec_align_load_cost. */
262 2, /* vec_unalign_load_cost. */
263 1, /* vec_store_cost. */
264 3, /* cond_taken_branch_cost. */
265 1, /* cond_not_taken_branch_cost. */
266 };
268 static stringop_algs i486_memcpy[2] = {
269 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
270 DUMMY_STRINGOP_ALGS};
271 static stringop_algs i486_memset[2] = {
272 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
273 DUMMY_STRINGOP_ALGS};
275 static const
276 struct processor_costs i486_cost = { /* 486 specific costs */
277 COSTS_N_INSNS (1), /* cost of an add instruction */
278 COSTS_N_INSNS (1), /* cost of a lea instruction */
279 COSTS_N_INSNS (3), /* variable shift costs */
280 COSTS_N_INSNS (2), /* constant shift costs */
281 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
282 COSTS_N_INSNS (12), /* HI */
283 COSTS_N_INSNS (12), /* SI */
284 COSTS_N_INSNS (12), /* DI */
285 COSTS_N_INSNS (12)}, /* other */
286 1, /* cost of multiply per each bit set */
287 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
288 COSTS_N_INSNS (40), /* HI */
289 COSTS_N_INSNS (40), /* SI */
290 COSTS_N_INSNS (40), /* DI */
291 COSTS_N_INSNS (40)}, /* other */
292 COSTS_N_INSNS (3), /* cost of movsx */
293 COSTS_N_INSNS (2), /* cost of movzx */
294 15, /* "large" insn */
295 3, /* MOVE_RATIO */
296 4, /* cost for loading QImode using movzbl */
297 {2, 4, 2}, /* cost of loading integer registers
298 in QImode, HImode and SImode.
299 Relative to reg-reg move (2). */
300 {2, 4, 2}, /* cost of storing integer registers */
301 2, /* cost of reg,reg fld/fst */
302 {8, 8, 8}, /* cost of loading fp registers
303 in SFmode, DFmode and XFmode */
304 {8, 8, 8}, /* cost of storing fp registers
305 in SFmode, DFmode and XFmode */
306 2, /* cost of moving MMX register */
307 {4, 8}, /* cost of loading MMX registers
308 in SImode and DImode */
309 {4, 8}, /* cost of storing MMX registers
310 in SImode and DImode */
311 2, /* cost of moving SSE register */
312 {4, 8, 16}, /* cost of loading SSE registers
313 in SImode, DImode and TImode */
314 {4, 8, 16}, /* cost of storing SSE registers
315 in SImode, DImode and TImode */
316 3, /* MMX or SSE register to integer */
317 4, /* size of l1 cache. 486 has 8kB cache
318 shared for code and data, so 4kB is
319 not really precise. */
320 4, /* size of l2 cache */
321 0, /* size of prefetch block */
322 0, /* number of parallel prefetches */
323 1, /* Branch cost */
324 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
325 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
326 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
327 COSTS_N_INSNS (3), /* cost of FABS instruction. */
328 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
329 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
330 i486_memcpy,
331 i486_memset,
332 1, /* scalar_stmt_cost. */
333 1, /* scalar load_cost. */
334 1, /* scalar_store_cost. */
335 1, /* vec_stmt_cost. */
336 1, /* vec_to_scalar_cost. */
337 1, /* scalar_to_vec_cost. */
338 1, /* vec_align_load_cost. */
339 2, /* vec_unalign_load_cost. */
340 1, /* vec_store_cost. */
341 3, /* cond_taken_branch_cost. */
342 1, /* cond_not_taken_branch_cost. */
343 };
345 static stringop_algs pentium_memcpy[2] = {
346 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
347 DUMMY_STRINGOP_ALGS};
348 static stringop_algs pentium_memset[2] = {
349 {libcall, {{-1, rep_prefix_4_byte, false}}},
350 DUMMY_STRINGOP_ALGS};
352 static const
353 struct processor_costs pentium_cost = {
354 COSTS_N_INSNS (1), /* cost of an add instruction */
355 COSTS_N_INSNS (1), /* cost of a lea instruction */
356 COSTS_N_INSNS (4), /* variable shift costs */
357 COSTS_N_INSNS (1), /* constant shift costs */
358 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
359 COSTS_N_INSNS (11), /* HI */
360 COSTS_N_INSNS (11), /* SI */
361 COSTS_N_INSNS (11), /* DI */
362 COSTS_N_INSNS (11)}, /* other */
363 0, /* cost of multiply per each bit set */
364 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
365 COSTS_N_INSNS (25), /* HI */
366 COSTS_N_INSNS (25), /* SI */
367 COSTS_N_INSNS (25), /* DI */
368 COSTS_N_INSNS (25)}, /* other */
369 COSTS_N_INSNS (3), /* cost of movsx */
370 COSTS_N_INSNS (2), /* cost of movzx */
371 8, /* "large" insn */
372 6, /* MOVE_RATIO */
373 6, /* cost for loading QImode using movzbl */
374 {2, 4, 2}, /* cost of loading integer registers
375 in QImode, HImode and SImode.
376 Relative to reg-reg move (2). */
377 {2, 4, 2}, /* cost of storing integer registers */
378 2, /* cost of reg,reg fld/fst */
379 {2, 2, 6}, /* cost of loading fp registers
380 in SFmode, DFmode and XFmode */
381 {4, 4, 6}, /* cost of storing fp registers
382 in SFmode, DFmode and XFmode */
383 8, /* cost of moving MMX register */
384 {8, 8}, /* cost of loading MMX registers
385 in SImode and DImode */
386 {8, 8}, /* cost of storing MMX registers
387 in SImode and DImode */
388 2, /* cost of moving SSE register */
389 {4, 8, 16}, /* cost of loading SSE registers
390 in SImode, DImode and TImode */
391 {4, 8, 16}, /* cost of storing SSE registers
392 in SImode, DImode and TImode */
393 3, /* MMX or SSE register to integer */
394 8, /* size of l1 cache. */
395 8, /* size of l2 cache */
396 0, /* size of prefetch block */
397 0, /* number of parallel prefetches */
398 2, /* Branch cost */
399 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
400 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
401 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
402 COSTS_N_INSNS (1), /* cost of FABS instruction. */
403 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
404 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
405 pentium_memcpy,
406 pentium_memset,
407 1, /* scalar_stmt_cost. */
408 1, /* scalar load_cost. */
409 1, /* scalar_store_cost. */
410 1, /* vec_stmt_cost. */
411 1, /* vec_to_scalar_cost. */
412 1, /* scalar_to_vec_cost. */
413 1, /* vec_align_load_cost. */
414 2, /* vec_unalign_load_cost. */
415 1, /* vec_store_cost. */
416 3, /* cond_taken_branch_cost. */
417 1, /* cond_not_taken_branch_cost. */
418 };
420 static const
421 struct processor_costs lakemont_cost = {
422 COSTS_N_INSNS (1), /* cost of an add instruction */
423 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
424 COSTS_N_INSNS (1), /* variable shift costs */
425 COSTS_N_INSNS (1), /* constant shift costs */
426 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
427 COSTS_N_INSNS (11), /* HI */
428 COSTS_N_INSNS (11), /* SI */
429 COSTS_N_INSNS (11), /* DI */
430 COSTS_N_INSNS (11)}, /* other */
431 0, /* cost of multiply per each bit set */
432 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
433 COSTS_N_INSNS (25), /* HI */
434 COSTS_N_INSNS (25), /* SI */
435 COSTS_N_INSNS (25), /* DI */
436 COSTS_N_INSNS (25)}, /* other */
437 COSTS_N_INSNS (3), /* cost of movsx */
438 COSTS_N_INSNS (2), /* cost of movzx */
439 8, /* "large" insn */
440 17, /* MOVE_RATIO */
441 6, /* cost for loading QImode using movzbl */
442 {2, 4, 2}, /* cost of loading integer registers
443 in QImode, HImode and SImode.
444 Relative to reg-reg move (2). */
445 {2, 4, 2}, /* cost of storing integer registers */
446 2, /* cost of reg,reg fld/fst */
447 {2, 2, 6}, /* cost of loading fp registers
448 in SFmode, DFmode and XFmode */
449 {4, 4, 6}, /* cost of storing fp registers
450 in SFmode, DFmode and XFmode */
451 8, /* cost of moving MMX register */
452 {8, 8}, /* cost of loading MMX registers
453 in SImode and DImode */
454 {8, 8}, /* cost of storing MMX registers
455 in SImode and DImode */
456 2, /* cost of moving SSE register */
457 {4, 8, 16}, /* cost of loading SSE registers
458 in SImode, DImode and TImode */
459 {4, 8, 16}, /* cost of storing SSE registers
460 in SImode, DImode and TImode */
461 3, /* MMX or SSE register to integer */
462 8, /* size of l1 cache. */
463 8, /* size of l2 cache */
464 0, /* size of prefetch block */
465 0, /* number of parallel prefetches */
466 2, /* Branch cost */
467 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
468 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
469 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
470 COSTS_N_INSNS (1), /* cost of FABS instruction. */
471 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
472 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
473 pentium_memcpy,
474 pentium_memset,
475 1, /* scalar_stmt_cost. */
476 1, /* scalar load_cost. */
477 1, /* scalar_store_cost. */
478 1, /* vec_stmt_cost. */
479 1, /* vec_to_scalar_cost. */
480 1, /* scalar_to_vec_cost. */
481 1, /* vec_align_load_cost. */
482 2, /* vec_unalign_load_cost. */
483 1, /* vec_store_cost. */
484 3, /* cond_taken_branch_cost. */
485 1, /* cond_not_taken_branch_cost. */
486 };
488 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
489 (we ensure the alignment).  For small blocks an inline loop is still a
490 noticeable win; for bigger blocks either rep movsl or rep movsb is the
491 way to go.  Rep movsb apparently has a more expensive startup time in the CPU,
492 but after 4K the difference is down in the noise. */
493 static stringop_algs pentiumpro_memcpy[2] = {
494 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
495 {8192, rep_prefix_4_byte, false},
496 {-1, rep_prefix_1_byte, false}}},
497 DUMMY_STRINGOP_ALGS};
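/* Reading pentiumpro_memcpy under the table layout sketched earlier: on
   32-bit PentiumPro, copies of at most 128 bytes use an inline loop, up to
   1024 bytes an unrolled loop, up to 8192 bytes rep movsl
   (rep_prefix_4_byte), and anything larger falls back to rep movsb
   (rep_prefix_1_byte); variable-sized copies default to rep movsl, and the
   64-bit slot is the unused dummy entry.  */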
498 static stringop_algs pentiumpro_memset[2] = {
499 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
500 {8192, rep_prefix_4_byte, false},
501 {-1, libcall, false}}},
502 DUMMY_STRINGOP_ALGS};
503 static const
504 struct processor_costs pentiumpro_cost = {
505 COSTS_N_INSNS (1), /* cost of an add instruction */
506 COSTS_N_INSNS (1), /* cost of a lea instruction */
507 COSTS_N_INSNS (1), /* variable shift costs */
508 COSTS_N_INSNS (1), /* constant shift costs */
509 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
510 COSTS_N_INSNS (4), /* HI */
511 COSTS_N_INSNS (4), /* SI */
512 COSTS_N_INSNS (4), /* DI */
513 COSTS_N_INSNS (4)}, /* other */
514 0, /* cost of multiply per each bit set */
515 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
516 COSTS_N_INSNS (17), /* HI */
517 COSTS_N_INSNS (17), /* SI */
518 COSTS_N_INSNS (17), /* DI */
519 COSTS_N_INSNS (17)}, /* other */
520 COSTS_N_INSNS (1), /* cost of movsx */
521 COSTS_N_INSNS (1), /* cost of movzx */
522 8, /* "large" insn */
523 6, /* MOVE_RATIO */
524 2, /* cost for loading QImode using movzbl */
525 {4, 4, 4}, /* cost of loading integer registers
526 in QImode, HImode and SImode.
527 Relative to reg-reg move (2). */
528 {2, 2, 2}, /* cost of storing integer registers */
529 2, /* cost of reg,reg fld/fst */
530 {2, 2, 6}, /* cost of loading fp registers
531 in SFmode, DFmode and XFmode */
532 {4, 4, 6}, /* cost of storing fp registers
533 in SFmode, DFmode and XFmode */
534 2, /* cost of moving MMX register */
535 {2, 2}, /* cost of loading MMX registers
536 in SImode and DImode */
537 {2, 2}, /* cost of storing MMX registers
538 in SImode and DImode */
539 2, /* cost of moving SSE register */
540 {2, 2, 8}, /* cost of loading SSE registers
541 in SImode, DImode and TImode */
542 {2, 2, 8}, /* cost of storing SSE registers
543 in SImode, DImode and TImode */
544 3, /* MMX or SSE register to integer */
545 8, /* size of l1 cache. */
546 256, /* size of l2 cache */
547 32, /* size of prefetch block */
548 6, /* number of parallel prefetches */
549 2, /* Branch cost */
550 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
551 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
552 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
553 COSTS_N_INSNS (2), /* cost of FABS instruction. */
554 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
555 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
556 pentiumpro_memcpy,
557 pentiumpro_memset,
558 1, /* scalar_stmt_cost. */
559 1, /* scalar load_cost. */
560 1, /* scalar_store_cost. */
561 1, /* vec_stmt_cost. */
562 1, /* vec_to_scalar_cost. */
563 1, /* scalar_to_vec_cost. */
564 1, /* vec_align_load_cost. */
565 2, /* vec_unalign_load_cost. */
566 1, /* vec_store_cost. */
567 3, /* cond_taken_branch_cost. */
568 1, /* cond_not_taken_branch_cost. */
569 };
571 static stringop_algs geode_memcpy[2] = {
572 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
573 DUMMY_STRINGOP_ALGS};
574 static stringop_algs geode_memset[2] = {
575 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
576 DUMMY_STRINGOP_ALGS};
577 static const
578 struct processor_costs geode_cost = {
579 COSTS_N_INSNS (1), /* cost of an add instruction */
580 COSTS_N_INSNS (1), /* cost of a lea instruction */
581 COSTS_N_INSNS (2), /* variable shift costs */
582 COSTS_N_INSNS (1), /* constant shift costs */
583 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
584 COSTS_N_INSNS (4), /* HI */
585 COSTS_N_INSNS (7), /* SI */
586 COSTS_N_INSNS (7), /* DI */
587 COSTS_N_INSNS (7)}, /* other */
588 0, /* cost of multiply per each bit set */
589 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
590 COSTS_N_INSNS (23), /* HI */
591 COSTS_N_INSNS (39), /* SI */
592 COSTS_N_INSNS (39), /* DI */
593 COSTS_N_INSNS (39)}, /* other */
594 COSTS_N_INSNS (1), /* cost of movsx */
595 COSTS_N_INSNS (1), /* cost of movzx */
596 8, /* "large" insn */
597 4, /* MOVE_RATIO */
598 1, /* cost for loading QImode using movzbl */
599 {1, 1, 1}, /* cost of loading integer registers
600 in QImode, HImode and SImode.
601 Relative to reg-reg move (2). */
602 {1, 1, 1}, /* cost of storing integer registers */
603 1, /* cost of reg,reg fld/fst */
604 {1, 1, 1}, /* cost of loading fp registers
605 in SFmode, DFmode and XFmode */
606 {4, 6, 6}, /* cost of storing fp registers
607 in SFmode, DFmode and XFmode */
609 2, /* cost of moving MMX register */
610 {2, 2}, /* cost of loading MMX registers
611 in SImode and DImode */
612 {2, 2}, /* cost of storing MMX registers
613 in SImode and DImode */
614 2, /* cost of moving SSE register */
615 {2, 2, 8}, /* cost of loading SSE registers
616 in SImode, DImode and TImode */
617 {2, 2, 8}, /* cost of storing SSE registers
618 in SImode, DImode and TImode */
619 3, /* MMX or SSE register to integer */
620 64, /* size of l1 cache. */
621 128, /* size of l2 cache. */
622 32, /* size of prefetch block */
623 1, /* number of parallel prefetches */
624 1, /* Branch cost */
625 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
626 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
627 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
628 COSTS_N_INSNS (1), /* cost of FABS instruction. */
629 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
630 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
631 geode_memcpy,
632 geode_memset,
633 1, /* scalar_stmt_cost. */
634 1, /* scalar load_cost. */
635 1, /* scalar_store_cost. */
636 1, /* vec_stmt_cost. */
637 1, /* vec_to_scalar_cost. */
638 1, /* scalar_to_vec_cost. */
639 1, /* vec_align_load_cost. */
640 2, /* vec_unalign_load_cost. */
641 1, /* vec_store_cost. */
642 3, /* cond_taken_branch_cost. */
643 1, /* cond_not_taken_branch_cost. */
644 };
646 static stringop_algs k6_memcpy[2] = {
647 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
648 DUMMY_STRINGOP_ALGS};
649 static stringop_algs k6_memset[2] = {
650 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
651 DUMMY_STRINGOP_ALGS};
652 static const
653 struct processor_costs k6_cost = {
654 COSTS_N_INSNS (1), /* cost of an add instruction */
655 COSTS_N_INSNS (2), /* cost of a lea instruction */
656 COSTS_N_INSNS (1), /* variable shift costs */
657 COSTS_N_INSNS (1), /* constant shift costs */
658 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
659 COSTS_N_INSNS (3), /* HI */
660 COSTS_N_INSNS (3), /* SI */
661 COSTS_N_INSNS (3), /* DI */
662 COSTS_N_INSNS (3)}, /* other */
663 0, /* cost of multiply per each bit set */
664 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
665 COSTS_N_INSNS (18), /* HI */
666 COSTS_N_INSNS (18), /* SI */
667 COSTS_N_INSNS (18), /* DI */
668 COSTS_N_INSNS (18)}, /* other */
669 COSTS_N_INSNS (2), /* cost of movsx */
670 COSTS_N_INSNS (2), /* cost of movzx */
671 8, /* "large" insn */
672 4, /* MOVE_RATIO */
673 3, /* cost for loading QImode using movzbl */
674 {4, 5, 4}, /* cost of loading integer registers
675 in QImode, HImode and SImode.
676 Relative to reg-reg move (2). */
677 {2, 3, 2}, /* cost of storing integer registers */
678 4, /* cost of reg,reg fld/fst */
679 {6, 6, 6}, /* cost of loading fp registers
680 in SFmode, DFmode and XFmode */
681 {4, 4, 4}, /* cost of storing fp registers
682 in SFmode, DFmode and XFmode */
683 2, /* cost of moving MMX register */
684 {2, 2}, /* cost of loading MMX registers
685 in SImode and DImode */
686 {2, 2}, /* cost of storing MMX registers
687 in SImode and DImode */
688 2, /* cost of moving SSE register */
689 {2, 2, 8}, /* cost of loading SSE registers
690 in SImode, DImode and TImode */
691 {2, 2, 8}, /* cost of storing SSE registers
692 in SImode, DImode and TImode */
693 6, /* MMX or SSE register to integer */
694 32, /* size of l1 cache. */
695 32, /* size of l2 cache. Some models
696 have integrated l2 cache, but
697 optimizing for k6 is not important
698 enough to worry about that. */
699 32, /* size of prefetch block */
700 1, /* number of parallel prefetches */
701 1, /* Branch cost */
702 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
703 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
704 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
705 COSTS_N_INSNS (2), /* cost of FABS instruction. */
706 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
707 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
708 k6_memcpy,
709 k6_memset,
710 1, /* scalar_stmt_cost. */
711 1, /* scalar load_cost. */
712 1, /* scalar_store_cost. */
713 1, /* vec_stmt_cost. */
714 1, /* vec_to_scalar_cost. */
715 1, /* scalar_to_vec_cost. */
716 1, /* vec_align_load_cost. */
717 2, /* vec_unalign_load_cost. */
718 1, /* vec_store_cost. */
719 3, /* cond_taken_branch_cost. */
720 1, /* cond_not_taken_branch_cost. */
721 };
723 /* For some reason, Athlon deals better with the REP prefix (relative to loops)
724 than K8 does.  Alignment becomes important after 8 bytes for memcpy and
725 128 bytes for memset. */
726 static stringop_algs athlon_memcpy[2] = {
727 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
728 DUMMY_STRINGOP_ALGS};
729 static stringop_algs athlon_memset[2] = {
730 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
731 DUMMY_STRINGOP_ALGS};
732 static const
733 struct processor_costs athlon_cost = {
734 COSTS_N_INSNS (1), /* cost of an add instruction */
735 COSTS_N_INSNS (2), /* cost of a lea instruction */
736 COSTS_N_INSNS (1), /* variable shift costs */
737 COSTS_N_INSNS (1), /* constant shift costs */
738 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
739 COSTS_N_INSNS (5), /* HI */
740 COSTS_N_INSNS (5), /* SI */
741 COSTS_N_INSNS (5), /* DI */
742 COSTS_N_INSNS (5)}, /* other */
743 0, /* cost of multiply per each bit set */
744 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
745 COSTS_N_INSNS (26), /* HI */
746 COSTS_N_INSNS (42), /* SI */
747 COSTS_N_INSNS (74), /* DI */
748 COSTS_N_INSNS (74)}, /* other */
749 COSTS_N_INSNS (1), /* cost of movsx */
750 COSTS_N_INSNS (1), /* cost of movzx */
751 8, /* "large" insn */
752 9, /* MOVE_RATIO */
753 4, /* cost for loading QImode using movzbl */
754 {3, 4, 3}, /* cost of loading integer registers
755 in QImode, HImode and SImode.
756 Relative to reg-reg move (2). */
757 {3, 4, 3}, /* cost of storing integer registers */
758 4, /* cost of reg,reg fld/fst */
759 {4, 4, 12}, /* cost of loading fp registers
760 in SFmode, DFmode and XFmode */
761 {6, 6, 8}, /* cost of storing fp registers
762 in SFmode, DFmode and XFmode */
763 2, /* cost of moving MMX register */
764 {4, 4}, /* cost of loading MMX registers
765 in SImode and DImode */
766 {4, 4}, /* cost of storing MMX registers
767 in SImode and DImode */
768 2, /* cost of moving SSE register */
769 {4, 4, 6}, /* cost of loading SSE registers
770 in SImode, DImode and TImode */
771 {4, 4, 5}, /* cost of storing SSE registers
772 in SImode, DImode and TImode */
773 5, /* MMX or SSE register to integer */
774 64, /* size of l1 cache. */
775 256, /* size of l2 cache. */
776 64, /* size of prefetch block */
777 6, /* number of parallel prefetches */
778 5, /* Branch cost */
779 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
780 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
781 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
782 COSTS_N_INSNS (2), /* cost of FABS instruction. */
783 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
784 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
785 athlon_memcpy,
786 athlon_memset,
787 1, /* scalar_stmt_cost. */
788 1, /* scalar load_cost. */
789 1, /* scalar_store_cost. */
790 1, /* vec_stmt_cost. */
791 1, /* vec_to_scalar_cost. */
792 1, /* scalar_to_vec_cost. */
793 1, /* vec_align_load_cost. */
794 2, /* vec_unalign_load_cost. */
795 1, /* vec_store_cost. */
796 3, /* cond_taken_branch_cost. */
797 1, /* cond_not_taken_branch_cost. */
798 };
800 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
801 small blocks it is better to use a loop.  For large blocks, a libcall can
802 do nontemporal accesses and beat inline code considerably. */
803 static stringop_algs k8_memcpy[2] = {
804 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
805 {-1, rep_prefix_4_byte, false}}},
806 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
807 {-1, libcall, false}}}};
808 static stringop_algs k8_memset[2] = {
809 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
810 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
811 {libcall, {{48, unrolled_loop, false},
812 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
813 static const
814 struct processor_costs k8_cost = {
815 COSTS_N_INSNS (1), /* cost of an add instruction */
816 COSTS_N_INSNS (2), /* cost of a lea instruction */
817 COSTS_N_INSNS (1), /* variable shift costs */
818 COSTS_N_INSNS (1), /* constant shift costs */
819 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
820 COSTS_N_INSNS (4), /* HI */
821 COSTS_N_INSNS (3), /* SI */
822 COSTS_N_INSNS (4), /* DI */
823 COSTS_N_INSNS (5)}, /* other */
824 0, /* cost of multiply per each bit set */
825 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
826 COSTS_N_INSNS (26), /* HI */
827 COSTS_N_INSNS (42), /* SI */
828 COSTS_N_INSNS (74), /* DI */
829 COSTS_N_INSNS (74)}, /* other */
830 COSTS_N_INSNS (1), /* cost of movsx */
831 COSTS_N_INSNS (1), /* cost of movzx */
832 8, /* "large" insn */
833 9, /* MOVE_RATIO */
834 4, /* cost for loading QImode using movzbl */
835 {3, 4, 3}, /* cost of loading integer registers
836 in QImode, HImode and SImode.
837 Relative to reg-reg move (2). */
838 {3, 4, 3}, /* cost of storing integer registers */
839 4, /* cost of reg,reg fld/fst */
840 {4, 4, 12}, /* cost of loading fp registers
841 in SFmode, DFmode and XFmode */
842 {6, 6, 8}, /* cost of storing fp registers
843 in SFmode, DFmode and XFmode */
844 2, /* cost of moving MMX register */
845 {3, 3}, /* cost of loading MMX registers
846 in SImode and DImode */
847 {4, 4}, /* cost of storing MMX registers
848 in SImode and DImode */
849 2, /* cost of moving SSE register */
850 {4, 3, 6}, /* cost of loading SSE registers
851 in SImode, DImode and TImode */
852 {4, 4, 5}, /* cost of storing SSE registers
853 in SImode, DImode and TImode */
854 5, /* MMX or SSE register to integer */
855 64, /* size of l1 cache. */
856 512, /* size of l2 cache. */
857 64, /* size of prefetch block */
858 /* New AMD processors never drop prefetches; if they cannot be performed
859 immediately, they are queued. We set number of simultaneous prefetches
860 to a large constant to reflect this (it probably is not a good idea not
861 to limit number of prefetches at all, as their execution also takes some
862 time). */
863 100, /* number of parallel prefetches */
864 3, /* Branch cost */
865 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
866 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
867 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
868 COSTS_N_INSNS (2), /* cost of FABS instruction. */
869 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
870 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
872 k8_memcpy,
873 k8_memset,
874 4, /* scalar_stmt_cost. */
875 2, /* scalar load_cost. */
876 2, /* scalar_store_cost. */
877 5, /* vec_stmt_cost. */
878 0, /* vec_to_scalar_cost. */
879 2, /* scalar_to_vec_cost. */
880 2, /* vec_align_load_cost. */
881 3, /* vec_unalign_load_cost. */
882 3, /* vec_store_cost. */
883 3, /* cond_taken_branch_cost. */
884 2, /* cond_not_taken_branch_cost. */
885 };
887 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
888 very small blocks it is better to use a loop.  For large blocks, a libcall can
889 do nontemporal accesses and beat inline code considerably. */
890 static stringop_algs amdfam10_memcpy[2] = {
891 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
892 {-1, rep_prefix_4_byte, false}}},
893 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
894 {-1, libcall, false}}}};
895 static stringop_algs amdfam10_memset[2] = {
896 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
897 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
898 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
899 {-1, libcall, false}}}};
900 struct processor_costs amdfam10_cost = {
901 COSTS_N_INSNS (1), /* cost of an add instruction */
902 COSTS_N_INSNS (2), /* cost of a lea instruction */
903 COSTS_N_INSNS (1), /* variable shift costs */
904 COSTS_N_INSNS (1), /* constant shift costs */
905 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
906 COSTS_N_INSNS (4), /* HI */
907 COSTS_N_INSNS (3), /* SI */
908 COSTS_N_INSNS (4), /* DI */
909 COSTS_N_INSNS (5)}, /* other */
910 0, /* cost of multiply per each bit set */
911 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
912 COSTS_N_INSNS (35), /* HI */
913 COSTS_N_INSNS (51), /* SI */
914 COSTS_N_INSNS (83), /* DI */
915 COSTS_N_INSNS (83)}, /* other */
916 COSTS_N_INSNS (1), /* cost of movsx */
917 COSTS_N_INSNS (1), /* cost of movzx */
918 8, /* "large" insn */
919 9, /* MOVE_RATIO */
920 4, /* cost for loading QImode using movzbl */
921 {3, 4, 3}, /* cost of loading integer registers
922 in QImode, HImode and SImode.
923 Relative to reg-reg move (2). */
924 {3, 4, 3}, /* cost of storing integer registers */
925 4, /* cost of reg,reg fld/fst */
926 {4, 4, 12}, /* cost of loading fp registers
927 in SFmode, DFmode and XFmode */
928 {6, 6, 8}, /* cost of storing fp registers
929 in SFmode, DFmode and XFmode */
930 2, /* cost of moving MMX register */
931 {3, 3}, /* cost of loading MMX registers
932 in SImode and DImode */
933 {4, 4}, /* cost of storing MMX registers
934 in SImode and DImode */
935 2, /* cost of moving SSE register */
936 {4, 4, 3}, /* cost of loading SSE registers
937 in SImode, DImode and TImode */
938 {4, 4, 5}, /* cost of storing SSE registers
939 in SImode, DImode and TImode */
940 3, /* MMX or SSE register to integer */
941 /* On K8:
942 MOVD reg64, xmmreg Double FSTORE 4
943 MOVD reg32, xmmreg Double FSTORE 4
944 On AMDFAM10:
945 MOVD reg64, xmmreg Double FADD 3
946 1/1 1/1
947 MOVD reg32, xmmreg Double FADD 3
948 1/1 1/1 */
949 64, /* size of l1 cache. */
950 512, /* size of l2 cache. */
951 64, /* size of prefetch block */
952 /* New AMD processors never drop prefetches; if they cannot be performed
953 immediately, they are queued. We set number of simultaneous prefetches
954 to a large constant to reflect this (it probably is not a good idea not
955 to limit number of prefetches at all, as their execution also takes some
956 time). */
957 100, /* number of parallel prefetches */
958 2, /* Branch cost */
959 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
960 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
961 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
962 COSTS_N_INSNS (2), /* cost of FABS instruction. */
963 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
964 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
966 amdfam10_memcpy,
967 amdfam10_memset,
968 4, /* scalar_stmt_cost. */
969 2, /* scalar load_cost. */
970 2, /* scalar_store_cost. */
971 6, /* vec_stmt_cost. */
972 0, /* vec_to_scalar_cost. */
973 2, /* scalar_to_vec_cost. */
974 2, /* vec_align_load_cost. */
975 2, /* vec_unalign_load_cost. */
976 2, /* vec_store_cost. */
977 2, /* cond_taken_branch_cost. */
978 1, /* cond_not_taken_branch_cost. */
979 };
981 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
982 very small blocks it is better to use a loop.  For large blocks, a libcall
983 can do nontemporal accesses and beat inline code considerably. */
984 static stringop_algs bdver1_memcpy[2] = {
985 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
986 {-1, rep_prefix_4_byte, false}}},
987 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
988 {-1, libcall, false}}}};
989 static stringop_algs bdver1_memset[2] = {
990 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
991 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
992 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
993 {-1, libcall, false}}}};
995 const struct processor_costs bdver1_cost = {
996 COSTS_N_INSNS (1), /* cost of an add instruction */
997 COSTS_N_INSNS (1), /* cost of a lea instruction */
998 COSTS_N_INSNS (1), /* variable shift costs */
999 COSTS_N_INSNS (1), /* constant shift costs */
1000 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1001 COSTS_N_INSNS (4), /* HI */
1002 COSTS_N_INSNS (4), /* SI */
1003 COSTS_N_INSNS (6), /* DI */
1004 COSTS_N_INSNS (6)}, /* other */
1005 0, /* cost of multiply per each bit set */
1006 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1007 COSTS_N_INSNS (35), /* HI */
1008 COSTS_N_INSNS (51), /* SI */
1009 COSTS_N_INSNS (83), /* DI */
1010 COSTS_N_INSNS (83)}, /* other */
1011 COSTS_N_INSNS (1), /* cost of movsx */
1012 COSTS_N_INSNS (1), /* cost of movzx */
1013 8, /* "large" insn */
1014 9, /* MOVE_RATIO */
1015 4, /* cost for loading QImode using movzbl */
1016 {5, 5, 4}, /* cost of loading integer registers
1017 in QImode, HImode and SImode.
1018 Relative to reg-reg move (2). */
1019 {4, 4, 4}, /* cost of storing integer registers */
1020 2, /* cost of reg,reg fld/fst */
1021 {5, 5, 12}, /* cost of loading fp registers
1022 in SFmode, DFmode and XFmode */
1023 {4, 4, 8}, /* cost of storing fp registers
1024 in SFmode, DFmode and XFmode */
1025 2, /* cost of moving MMX register */
1026 {4, 4}, /* cost of loading MMX registers
1027 in SImode and DImode */
1028 {4, 4}, /* cost of storing MMX registers
1029 in SImode and DImode */
1030 2, /* cost of moving SSE register */
1031 {4, 4, 4}, /* cost of loading SSE registers
1032 in SImode, DImode and TImode */
1033 {4, 4, 4}, /* cost of storing SSE registers
1034 in SImode, DImode and TImode */
1035 2, /* MMX or SSE register to integer */
1036 /* On K8:
1037 MOVD reg64, xmmreg Double FSTORE 4
1038 MOVD reg32, xmmreg Double FSTORE 4
1039 On AMDFAM10:
1040 MOVD reg64, xmmreg Double FADD 3
1041 1/1 1/1
1042 MOVD reg32, xmmreg Double FADD 3
1043 1/1 1/1 */
1044 16, /* size of l1 cache. */
1045 2048, /* size of l2 cache. */
1046 64, /* size of prefetch block */
1047 /* New AMD processors never drop prefetches; if they cannot be performed
1048 immediately, they are queued. We set number of simultaneous prefetches
1049 to a large constant to reflect this (it probably is not a good idea not
1050 to limit number of prefetches at all, as their execution also takes some
1051 time). */
1052 100, /* number of parallel prefetches */
1053 2, /* Branch cost */
1054 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1055 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1056 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1057 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1058 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1059 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1061 bdver1_memcpy,
1062 bdver1_memset,
1063 6, /* scalar_stmt_cost. */
1064 4, /* scalar load_cost. */
1065 4, /* scalar_store_cost. */
1066 6, /* vec_stmt_cost. */
1067 0, /* vec_to_scalar_cost. */
1068 2, /* scalar_to_vec_cost. */
1069 4, /* vec_align_load_cost. */
1070 4, /* vec_unalign_load_cost. */
1071 4, /* vec_store_cost. */
1072 4, /* cond_taken_branch_cost. */
1073 2, /* cond_not_taken_branch_cost. */
1074 };
1076 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
1077 very small blocks it is better to use a loop.  For large blocks, a libcall
1078 can do nontemporal accesses and beat inline code considerably. */
1080 static stringop_algs bdver2_memcpy[2] = {
1081 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1082 {-1, rep_prefix_4_byte, false}}},
1083 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1084 {-1, libcall, false}}}};
1085 static stringop_algs bdver2_memset[2] = {
1086 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1087 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1088 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1089 {-1, libcall, false}}}};
1091 const struct processor_costs bdver2_cost = {
1092 COSTS_N_INSNS (1), /* cost of an add instruction */
1093 COSTS_N_INSNS (1), /* cost of a lea instruction */
1094 COSTS_N_INSNS (1), /* variable shift costs */
1095 COSTS_N_INSNS (1), /* constant shift costs */
1096 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1097 COSTS_N_INSNS (4), /* HI */
1098 COSTS_N_INSNS (4), /* SI */
1099 COSTS_N_INSNS (6), /* DI */
1100 COSTS_N_INSNS (6)}, /* other */
1101 0, /* cost of multiply per each bit set */
1102 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1103 COSTS_N_INSNS (35), /* HI */
1104 COSTS_N_INSNS (51), /* SI */
1105 COSTS_N_INSNS (83), /* DI */
1106 COSTS_N_INSNS (83)}, /* other */
1107 COSTS_N_INSNS (1), /* cost of movsx */
1108 COSTS_N_INSNS (1), /* cost of movzx */
1109 8, /* "large" insn */
1110 9, /* MOVE_RATIO */
1111 4, /* cost for loading QImode using movzbl */
1112 {5, 5, 4}, /* cost of loading integer registers
1113 in QImode, HImode and SImode.
1114 Relative to reg-reg move (2). */
1115 {4, 4, 4}, /* cost of storing integer registers */
1116 2, /* cost of reg,reg fld/fst */
1117 {5, 5, 12}, /* cost of loading fp registers
1118 in SFmode, DFmode and XFmode */
1119 {4, 4, 8}, /* cost of storing fp registers
1120 in SFmode, DFmode and XFmode */
1121 2, /* cost of moving MMX register */
1122 {4, 4}, /* cost of loading MMX registers
1123 in SImode and DImode */
1124 {4, 4}, /* cost of storing MMX registers
1125 in SImode and DImode */
1126 2, /* cost of moving SSE register */
1127 {4, 4, 4}, /* cost of loading SSE registers
1128 in SImode, DImode and TImode */
1129 {4, 4, 4}, /* cost of storing SSE registers
1130 in SImode, DImode and TImode */
1131 2, /* MMX or SSE register to integer */
1132 /* On K8:
1133 MOVD reg64, xmmreg Double FSTORE 4
1134 MOVD reg32, xmmreg Double FSTORE 4
1135 On AMDFAM10:
1136 MOVD reg64, xmmreg Double FADD 3
1137 1/1 1/1
1138 MOVD reg32, xmmreg Double FADD 3
1139 1/1 1/1 */
1140 16, /* size of l1 cache. */
1141 2048, /* size of l2 cache. */
1142 64, /* size of prefetch block */
1143 /* New AMD processors never drop prefetches; if they cannot be performed
1144 immediately, they are queued. We set number of simultaneous prefetches
1145 to a large constant to reflect this (it probably is not a good idea not
1146 to limit number of prefetches at all, as their execution also takes some
1147 time). */
1148 100, /* number of parallel prefetches */
1149 2, /* Branch cost */
1150 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1151 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1152 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1153 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1154 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1155 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1157 bdver2_memcpy,
1158 bdver2_memset,
1159 6, /* scalar_stmt_cost. */
1160 4, /* scalar load_cost. */
1161 4, /* scalar_store_cost. */
1162 6, /* vec_stmt_cost. */
1163 0, /* vec_to_scalar_cost. */
1164 2, /* scalar_to_vec_cost. */
1165 4, /* vec_align_load_cost. */
1166 4, /* vec_unalign_load_cost. */
1167 4, /* vec_store_cost. */
1168 4, /* cond_taken_branch_cost. */
1169 2, /* cond_not_taken_branch_cost. */
1170 };
1173 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1174 very small blocks it is better to use a loop.  For large blocks, a libcall
1175 can do nontemporal accesses and beat inline code considerably. */
1176 static stringop_algs bdver3_memcpy[2] = {
1177 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1178 {-1, rep_prefix_4_byte, false}}},
1179 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1180 {-1, libcall, false}}}};
1181 static stringop_algs bdver3_memset[2] = {
1182 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1183 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1184 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1185 {-1, libcall, false}}}};
1186 struct processor_costs bdver3_cost = {
1187 COSTS_N_INSNS (1), /* cost of an add instruction */
1188 COSTS_N_INSNS (1), /* cost of a lea instruction */
1189 COSTS_N_INSNS (1), /* variable shift costs */
1190 COSTS_N_INSNS (1), /* constant shift costs */
1191 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1192 COSTS_N_INSNS (4), /* HI */
1193 COSTS_N_INSNS (4), /* SI */
1194 COSTS_N_INSNS (6), /* DI */
1195 COSTS_N_INSNS (6)}, /* other */
1196 0, /* cost of multiply per each bit set */
1197 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1198 COSTS_N_INSNS (35), /* HI */
1199 COSTS_N_INSNS (51), /* SI */
1200 COSTS_N_INSNS (83), /* DI */
1201 COSTS_N_INSNS (83)}, /* other */
1202 COSTS_N_INSNS (1), /* cost of movsx */
1203 COSTS_N_INSNS (1), /* cost of movzx */
1204 8, /* "large" insn */
1205 9, /* MOVE_RATIO */
1206 4, /* cost for loading QImode using movzbl */
1207 {5, 5, 4}, /* cost of loading integer registers
1208 in QImode, HImode and SImode.
1209 Relative to reg-reg move (2). */
1210 {4, 4, 4}, /* cost of storing integer registers */
1211 2, /* cost of reg,reg fld/fst */
1212 {5, 5, 12}, /* cost of loading fp registers
1213 in SFmode, DFmode and XFmode */
1214 {4, 4, 8}, /* cost of storing fp registers
1215 in SFmode, DFmode and XFmode */
1216 2, /* cost of moving MMX register */
1217 {4, 4}, /* cost of loading MMX registers
1218 in SImode and DImode */
1219 {4, 4}, /* cost of storing MMX registers
1220 in SImode and DImode */
1221 2, /* cost of moving SSE register */
1222 {4, 4, 4}, /* cost of loading SSE registers
1223 in SImode, DImode and TImode */
1224 {4, 4, 4}, /* cost of storing SSE registers
1225 in SImode, DImode and TImode */
1226 2, /* MMX or SSE register to integer */
1227 16, /* size of l1 cache. */
1228 2048, /* size of l2 cache. */
1229 64, /* size of prefetch block */
1230 /* New AMD processors never drop prefetches; if they cannot be performed
1231 immediately, they are queued. We set number of simultaneous prefetches
1232 to a large constant to reflect this (it probably is not a good idea not
1233 to limit number of prefetches at all, as their execution also takes some
1234 time). */
1235 100, /* number of parallel prefetches */
1236 2, /* Branch cost */
1237 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1238 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1239 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1240 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1241 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1242 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1244 bdver3_memcpy,
1245 bdver3_memset,
1246 6, /* scalar_stmt_cost. */
1247 4, /* scalar load_cost. */
1248 4, /* scalar_store_cost. */
1249 6, /* vec_stmt_cost. */
1250 0, /* vec_to_scalar_cost. */
1251 2, /* scalar_to_vec_cost. */
1252 4, /* vec_align_load_cost. */
1253 4, /* vec_unalign_load_cost. */
1254 4, /* vec_store_cost. */
1255 4, /* cond_taken_branch_cost. */
1256 2, /* cond_not_taken_branch_cost. */
1257 };
1259 /* BDVER4 has an optimized REP instruction for medium-sized blocks, but for
1260 very small blocks it is better to use a loop.  For large blocks, a libcall
1261 can do nontemporal accesses and beat inline code considerably. */
1262 static stringop_algs bdver4_memcpy[2] = {
1263 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1264 {-1, rep_prefix_4_byte, false}}},
1265 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1266 {-1, libcall, false}}}};
1267 static stringop_algs bdver4_memset[2] = {
1268 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1269 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1270 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1271 {-1, libcall, false}}}};
1272 struct processor_costs bdver4_cost = {
1273 COSTS_N_INSNS (1), /* cost of an add instruction */
1274 COSTS_N_INSNS (1), /* cost of a lea instruction */
1275 COSTS_N_INSNS (1), /* variable shift costs */
1276 COSTS_N_INSNS (1), /* constant shift costs */
1277 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1278 COSTS_N_INSNS (4), /* HI */
1279 COSTS_N_INSNS (4), /* SI */
1280 COSTS_N_INSNS (6), /* DI */
1281 COSTS_N_INSNS (6)}, /* other */
1282 0, /* cost of multiply per each bit set */
1283 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1284 COSTS_N_INSNS (35), /* HI */
1285 COSTS_N_INSNS (51), /* SI */
1286 COSTS_N_INSNS (83), /* DI */
1287 COSTS_N_INSNS (83)}, /* other */
1288 COSTS_N_INSNS (1), /* cost of movsx */
1289 COSTS_N_INSNS (1), /* cost of movzx */
1290 8, /* "large" insn */
1291 9, /* MOVE_RATIO */
1292 4, /* cost for loading QImode using movzbl */
1293 {5, 5, 4}, /* cost of loading integer registers
1294 in QImode, HImode and SImode.
1295 Relative to reg-reg move (2). */
1296 {4, 4, 4}, /* cost of storing integer registers */
1297 2, /* cost of reg,reg fld/fst */
1298 {5, 5, 12}, /* cost of loading fp registers
1299 in SFmode, DFmode and XFmode */
1300 {4, 4, 8}, /* cost of storing fp registers
1301 in SFmode, DFmode and XFmode */
1302 2, /* cost of moving MMX register */
1303 {4, 4}, /* cost of loading MMX registers
1304 in SImode and DImode */
1305 {4, 4}, /* cost of storing MMX registers
1306 in SImode and DImode */
1307 2, /* cost of moving SSE register */
1308 {4, 4, 4}, /* cost of loading SSE registers
1309 in SImode, DImode and TImode */
1310 {4, 4, 4}, /* cost of storing SSE registers
1311 in SImode, DImode and TImode */
1312 2, /* MMX or SSE register to integer */
1313 16, /* size of l1 cache. */
1314 2048, /* size of l2 cache. */
1315 64, /* size of prefetch block */
1316 /* New AMD processors never drop prefetches; if they cannot be performed
1317 immediately, they are queued. We set number of simultaneous prefetches
1318 to a large constant to reflect this (it probably is not a good idea not
1319 to limit number of prefetches at all, as their execution also takes some
1320 time). */
1321 100, /* number of parallel prefetches */
1322 2, /* Branch cost */
1323 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1324 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1325 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1326 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1327 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1328 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1330 bdver4_memcpy,
1331 bdver4_memset,
1332 6, /* scalar_stmt_cost. */
1333 4, /* scalar load_cost. */
1334 4, /* scalar_store_cost. */
1335 6, /* vec_stmt_cost. */
1336 0, /* vec_to_scalar_cost. */
1337 2, /* scalar_to_vec_cost. */
1338 4, /* vec_align_load_cost. */
1339 4, /* vec_unalign_load_cost. */
1340 4, /* vec_store_cost. */
1341 4, /* cond_taken_branch_cost. */
1342 2, /* cond_not_taken_branch_cost. */
1343 };
1346 /* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
1347 very small blocks it is better to use a loop.  For large blocks, a libcall
1348 can do nontemporal accesses and beat inline code considerably. */
1349 static stringop_algs znver1_memcpy[2] = {
1350 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1351 {-1, rep_prefix_4_byte, false}}},
1352 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1353 {-1, libcall, false}}}};
1354 static stringop_algs znver1_memset[2] = {
1355 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1356 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1357 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1358 {-1, libcall, false}}}};
1359 struct processor_costs znver1_cost = {
1360 COSTS_N_INSNS (1), /* cost of an add instruction. */
1361 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1362 COSTS_N_INSNS (1), /* variable shift costs. */
1363 COSTS_N_INSNS (1), /* constant shift costs. */
1364 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1365 COSTS_N_INSNS (3), /* HI. */
1366 COSTS_N_INSNS (3), /* SI. */
1367 COSTS_N_INSNS (4), /* DI. */
1368 COSTS_N_INSNS (4)}, /* other. */
1369 0, /* cost of multiply per each bit
1370 set. */
1371 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI. */
1372 COSTS_N_INSNS (35), /* HI. */
1373 COSTS_N_INSNS (51), /* SI. */
1374 COSTS_N_INSNS (83), /* DI. */
1375 COSTS_N_INSNS (83)}, /* other. */
1376 COSTS_N_INSNS (1), /* cost of movsx. */
1377 COSTS_N_INSNS (1), /* cost of movzx. */
1378 8, /* "large" insn. */
1379 9, /* MOVE_RATIO. */
1380 4, /* cost for loading QImode using
1381 movzbl. */
1382 {5, 5, 4}, /* cost of loading integer registers
1383 in QImode, HImode and SImode.
1384 Relative to reg-reg move (2). */
1385 {4, 4, 4}, /* cost of storing integer
1386 registers. */
1387 2, /* cost of reg,reg fld/fst. */
1388 {5, 5, 12}, /* cost of loading fp registers
1389 in SFmode, DFmode and XFmode. */
1390 {4, 4, 8}, /* cost of storing fp registers
1391 in SFmode, DFmode and XFmode. */
1392 2, /* cost of moving MMX register. */
1393 {4, 4}, /* cost of loading MMX registers
1394 in SImode and DImode. */
1395 {4, 4}, /* cost of storing MMX registers
1396 in SImode and DImode. */
1397 2, /* cost of moving SSE register. */
1398 {4, 4, 4}, /* cost of loading SSE registers
1399 in SImode, DImode and TImode. */
1400 {4, 4, 4}, /* cost of storing SSE registers
1401 in SImode, DImode and TImode. */
1402 2, /* MMX or SSE register to integer. */
1403 32, /* size of l1 cache. */
1404 512, /* size of l2 cache. */
1405 64, /* size of prefetch block. */
1406 /* New AMD processors never drop prefetches; if they cannot be performed
1407 immediately, they are queued. We set number of simultaneous prefetches
1408 to a large constant to reflect this (it probably is not a good idea not
1409 to limit number of prefetches at all, as their execution also takes some
1410 time). */
1411 100, /* number of parallel prefetches. */
1412 2, /* Branch cost. */
1413 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1414 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1415 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1416 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1417 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1418 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1420 znver1_memcpy,
1421 znver1_memset,
1422 6, /* scalar_stmt_cost. */
1423 4, /* scalar load_cost. */
1424 4, /* scalar_store_cost. */
1425 6, /* vec_stmt_cost. */
1426 0, /* vec_to_scalar_cost. */
1427 2, /* scalar_to_vec_cost. */
1428 4, /* vec_align_load_cost. */
1429 4, /* vec_unalign_load_cost. */
1430 4, /* vec_store_cost. */
1431 4, /* cond_taken_branch_cost. */
1432 2, /* cond_not_taken_branch_cost. */
1435 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1436 very small blocks it is better to use a loop. For large blocks, a libcall can
1437 do non-temporal accesses and beat inline code considerably. */
1438 static stringop_algs btver1_memcpy[2] = {
1439 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1440 {-1, rep_prefix_4_byte, false}}},
1441 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1442 {-1, libcall, false}}}};
1443 static stringop_algs btver1_memset[2] = {
1444 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1445 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1446 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1447 {-1, libcall, false}}}};
1448 const struct processor_costs btver1_cost = {
1449 COSTS_N_INSNS (1), /* cost of an add instruction */
1450 COSTS_N_INSNS (2), /* cost of a lea instruction */
1451 COSTS_N_INSNS (1), /* variable shift costs */
1452 COSTS_N_INSNS (1), /* constant shift costs */
1453 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1454 COSTS_N_INSNS (4), /* HI */
1455 COSTS_N_INSNS (3), /* SI */
1456 COSTS_N_INSNS (4), /* DI */
1457 COSTS_N_INSNS (5)}, /* other */
1458 0, /* cost of multiply per each bit set */
1459 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1460 COSTS_N_INSNS (35), /* HI */
1461 COSTS_N_INSNS (51), /* SI */
1462 COSTS_N_INSNS (83), /* DI */
1463 COSTS_N_INSNS (83)}, /* other */
1464 COSTS_N_INSNS (1), /* cost of movsx */
1465 COSTS_N_INSNS (1), /* cost of movzx */
1466 8, /* "large" insn */
1467 9, /* MOVE_RATIO */
1468 4, /* cost for loading QImode using movzbl */
1469 {3, 4, 3}, /* cost of loading integer registers
1470 in QImode, HImode and SImode.
1471 Relative to reg-reg move (2). */
1472 {3, 4, 3}, /* cost of storing integer registers */
1473 4, /* cost of reg,reg fld/fst */
1474 {4, 4, 12}, /* cost of loading fp registers
1475 in SFmode, DFmode and XFmode */
1476 {6, 6, 8}, /* cost of storing fp registers
1477 in SFmode, DFmode and XFmode */
1478 2, /* cost of moving MMX register */
1479 {3, 3}, /* cost of loading MMX registers
1480 in SImode and DImode */
1481 {4, 4}, /* cost of storing MMX registers
1482 in SImode and DImode */
1483 2, /* cost of moving SSE register */
1484 {4, 4, 3}, /* cost of loading SSE registers
1485 in SImode, DImode and TImode */
1486 {4, 4, 5}, /* cost of storing SSE registers
1487 in SImode, DImode and TImode */
1488 3, /* MMX or SSE register to integer */
1489 /* On K8:
1490 MOVD reg64, xmmreg Double FSTORE 4
1491 MOVD reg32, xmmreg Double FSTORE 4
1492 On AMDFAM10:
1493 MOVD reg64, xmmreg Double FADD 3
1494 1/1 1/1
1495 MOVD reg32, xmmreg Double FADD 3
1496 1/1 1/1 */
1497 32, /* size of l1 cache. */
1498 512, /* size of l2 cache. */
1499 64, /* size of prefetch block */
1500 100, /* number of parallel prefetches */
1501 2, /* Branch cost */
1502 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1503 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1504 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1505 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1506 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1507 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1509 btver1_memcpy,
1510 btver1_memset,
1511 4, /* scalar_stmt_cost. */
1512 2, /* scalar load_cost. */
1513 2, /* scalar_store_cost. */
1514 6, /* vec_stmt_cost. */
1515 0, /* vec_to_scalar_cost. */
1516 2, /* scalar_to_vec_cost. */
1517 2, /* vec_align_load_cost. */
1518 2, /* vec_unalign_load_cost. */
1519 2, /* vec_store_cost. */
1520 2, /* cond_taken_branch_cost. */
1521 1, /* cond_not_taken_branch_cost. */
1524 static stringop_algs btver2_memcpy[2] = {
1525 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1526 {-1, rep_prefix_4_byte, false}}},
1527 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1528 {-1, libcall, false}}}};
1529 static stringop_algs btver2_memset[2] = {
1530 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1531 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1532 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1533 {-1, libcall, false}}}};
1534 const struct processor_costs btver2_cost = {
1535 COSTS_N_INSNS (1), /* cost of an add instruction */
1536 COSTS_N_INSNS (2), /* cost of a lea instruction */
1537 COSTS_N_INSNS (1), /* variable shift costs */
1538 COSTS_N_INSNS (1), /* constant shift costs */
1539 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1540 COSTS_N_INSNS (4), /* HI */
1541 COSTS_N_INSNS (3), /* SI */
1542 COSTS_N_INSNS (4), /* DI */
1543 COSTS_N_INSNS (5)}, /* other */
1544 0, /* cost of multiply per each bit set */
1545 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1546 COSTS_N_INSNS (35), /* HI */
1547 COSTS_N_INSNS (51), /* SI */
1548 COSTS_N_INSNS (83), /* DI */
1549 COSTS_N_INSNS (83)}, /* other */
1550 COSTS_N_INSNS (1), /* cost of movsx */
1551 COSTS_N_INSNS (1), /* cost of movzx */
1552 8, /* "large" insn */
1553 9, /* MOVE_RATIO */
1554 4, /* cost for loading QImode using movzbl */
1555 {3, 4, 3}, /* cost of loading integer registers
1556 in QImode, HImode and SImode.
1557 Relative to reg-reg move (2). */
1558 {3, 4, 3}, /* cost of storing integer registers */
1559 4, /* cost of reg,reg fld/fst */
1560 {4, 4, 12}, /* cost of loading fp registers
1561 in SFmode, DFmode and XFmode */
1562 {6, 6, 8}, /* cost of storing fp registers
1563 in SFmode, DFmode and XFmode */
1564 2, /* cost of moving MMX register */
1565 {3, 3}, /* cost of loading MMX registers
1566 in SImode and DImode */
1567 {4, 4}, /* cost of storing MMX registers
1568 in SImode and DImode */
1569 2, /* cost of moving SSE register */
1570 {4, 4, 3}, /* cost of loading SSE registers
1571 in SImode, DImode and TImode */
1572 {4, 4, 5}, /* cost of storing SSE registers
1573 in SImode, DImode and TImode */
1574 3, /* MMX or SSE register to integer */
1575 /* On K8:
1576 MOVD reg64, xmmreg Double FSTORE 4
1577 MOVD reg32, xmmreg Double FSTORE 4
1578 On AMDFAM10:
1579 MOVD reg64, xmmreg Double FADD 3
1580 1/1 1/1
1581 MOVD reg32, xmmreg Double FADD 3
1582 1/1 1/1 */
1583 32, /* size of l1 cache. */
1584 2048, /* size of l2 cache. */
1585 64, /* size of prefetch block */
1586 100, /* number of parallel prefetches */
1587 2, /* Branch cost */
1588 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1589 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1590 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1591 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1592 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1593 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1594 btver2_memcpy,
1595 btver2_memset,
1596 4, /* scalar_stmt_cost. */
1597 2, /* scalar load_cost. */
1598 2, /* scalar_store_cost. */
1599 6, /* vec_stmt_cost. */
1600 0, /* vec_to_scalar_cost. */
1601 2, /* scalar_to_vec_cost. */
1602 2, /* vec_align_load_cost. */
1603 2, /* vec_unalign_load_cost. */
1604 2, /* vec_store_cost. */
1605 2, /* cond_taken_branch_cost. */
1606 1, /* cond_not_taken_branch_cost. */
1609 static stringop_algs pentium4_memcpy[2] = {
1610 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1611 DUMMY_STRINGOP_ALGS};
1612 static stringop_algs pentium4_memset[2] = {
1613 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1614 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1615 DUMMY_STRINGOP_ALGS};
1617 static const
1618 struct processor_costs pentium4_cost = {
1619 COSTS_N_INSNS (1), /* cost of an add instruction */
1620 COSTS_N_INSNS (3), /* cost of a lea instruction */
1621 COSTS_N_INSNS (4), /* variable shift costs */
1622 COSTS_N_INSNS (4), /* constant shift costs */
1623 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1624 COSTS_N_INSNS (15), /* HI */
1625 COSTS_N_INSNS (15), /* SI */
1626 COSTS_N_INSNS (15), /* DI */
1627 COSTS_N_INSNS (15)}, /* other */
1628 0, /* cost of multiply per each bit set */
1629 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1630 COSTS_N_INSNS (56), /* HI */
1631 COSTS_N_INSNS (56), /* SI */
1632 COSTS_N_INSNS (56), /* DI */
1633 COSTS_N_INSNS (56)}, /* other */
1634 COSTS_N_INSNS (1), /* cost of movsx */
1635 COSTS_N_INSNS (1), /* cost of movzx */
1636 16, /* "large" insn */
1637 6, /* MOVE_RATIO */
1638 2, /* cost for loading QImode using movzbl */
1639 {4, 5, 4}, /* cost of loading integer registers
1640 in QImode, HImode and SImode.
1641 Relative to reg-reg move (2). */
1642 {2, 3, 2}, /* cost of storing integer registers */
1643 2, /* cost of reg,reg fld/fst */
1644 {2, 2, 6}, /* cost of loading fp registers
1645 in SFmode, DFmode and XFmode */
1646 {4, 4, 6}, /* cost of storing fp registers
1647 in SFmode, DFmode and XFmode */
1648 2, /* cost of moving MMX register */
1649 {2, 2}, /* cost of loading MMX registers
1650 in SImode and DImode */
1651 {2, 2}, /* cost of storing MMX registers
1652 in SImode and DImode */
1653 12, /* cost of moving SSE register */
1654 {12, 12, 12}, /* cost of loading SSE registers
1655 in SImode, DImode and TImode */
1656 {2, 2, 8}, /* cost of storing SSE registers
1657 in SImode, DImode and TImode */
1658 10, /* MMX or SSE register to integer */
1659 8, /* size of l1 cache. */
1660 256, /* size of l2 cache. */
1661 64, /* size of prefetch block */
1662 6, /* number of parallel prefetches */
1663 2, /* Branch cost */
1664 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1665 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1666 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1667 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1668 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1669 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1670 pentium4_memcpy,
1671 pentium4_memset,
1672 1, /* scalar_stmt_cost. */
1673 1, /* scalar load_cost. */
1674 1, /* scalar_store_cost. */
1675 1, /* vec_stmt_cost. */
1676 1, /* vec_to_scalar_cost. */
1677 1, /* scalar_to_vec_cost. */
1678 1, /* vec_align_load_cost. */
1679 2, /* vec_unalign_load_cost. */
1680 1, /* vec_store_cost. */
1681 3, /* cond_taken_branch_cost. */
1682 1, /* cond_not_taken_branch_cost. */
1685 static stringop_algs nocona_memcpy[2] = {
1686 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1687 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1688 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1690 static stringop_algs nocona_memset[2] = {
1691 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1692 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1693 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1694 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1696 static const
1697 struct processor_costs nocona_cost = {
1698 COSTS_N_INSNS (1), /* cost of an add instruction */
1699 COSTS_N_INSNS (1), /* cost of a lea instruction */
1700 COSTS_N_INSNS (1), /* variable shift costs */
1701 COSTS_N_INSNS (1), /* constant shift costs */
1702 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1703 COSTS_N_INSNS (10), /* HI */
1704 COSTS_N_INSNS (10), /* SI */
1705 COSTS_N_INSNS (10), /* DI */
1706 COSTS_N_INSNS (10)}, /* other */
1707 0, /* cost of multiply per each bit set */
1708 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1709 COSTS_N_INSNS (66), /* HI */
1710 COSTS_N_INSNS (66), /* SI */
1711 COSTS_N_INSNS (66), /* DI */
1712 COSTS_N_INSNS (66)}, /* other */
1713 COSTS_N_INSNS (1), /* cost of movsx */
1714 COSTS_N_INSNS (1), /* cost of movzx */
1715 16, /* "large" insn */
1716 17, /* MOVE_RATIO */
1717 4, /* cost for loading QImode using movzbl */
1718 {4, 4, 4}, /* cost of loading integer registers
1719 in QImode, HImode and SImode.
1720 Relative to reg-reg move (2). */
1721 {4, 4, 4}, /* cost of storing integer registers */
1722 3, /* cost of reg,reg fld/fst */
1723 {12, 12, 12}, /* cost of loading fp registers
1724 in SFmode, DFmode and XFmode */
1725 {4, 4, 4}, /* cost of storing fp registers
1726 in SFmode, DFmode and XFmode */
1727 6, /* cost of moving MMX register */
1728 {12, 12}, /* cost of loading MMX registers
1729 in SImode and DImode */
1730 {12, 12}, /* cost of storing MMX registers
1731 in SImode and DImode */
1732 6, /* cost of moving SSE register */
1733 {12, 12, 12}, /* cost of loading SSE registers
1734 in SImode, DImode and TImode */
1735 {12, 12, 12}, /* cost of storing SSE registers
1736 in SImode, DImode and TImode */
1737 8, /* MMX or SSE register to integer */
1738 8, /* size of l1 cache. */
1739 1024, /* size of l2 cache. */
1740 64, /* size of prefetch block */
1741 8, /* number of parallel prefetches */
1742 1, /* Branch cost */
1743 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1744 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1745 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1746 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1747 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1748 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1749 nocona_memcpy,
1750 nocona_memset,
1751 1, /* scalar_stmt_cost. */
1752 1, /* scalar load_cost. */
1753 1, /* scalar_store_cost. */
1754 1, /* vec_stmt_cost. */
1755 1, /* vec_to_scalar_cost. */
1756 1, /* scalar_to_vec_cost. */
1757 1, /* vec_align_load_cost. */
1758 2, /* vec_unalign_load_cost. */
1759 1, /* vec_store_cost. */
1760 3, /* cond_taken_branch_cost. */
1761 1, /* cond_not_taken_branch_cost. */
1764 static stringop_algs atom_memcpy[2] = {
1765 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1766 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1767 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1768 static stringop_algs atom_memset[2] = {
1769 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1770 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1771 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1772 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1773 static const
1774 struct processor_costs atom_cost = {
1775 COSTS_N_INSNS (1), /* cost of an add instruction */
1776 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1777 COSTS_N_INSNS (1), /* variable shift costs */
1778 COSTS_N_INSNS (1), /* constant shift costs */
1779 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1780 COSTS_N_INSNS (4), /* HI */
1781 COSTS_N_INSNS (3), /* SI */
1782 COSTS_N_INSNS (4), /* DI */
1783 COSTS_N_INSNS (2)}, /* other */
1784 0, /* cost of multiply per each bit set */
1785 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1786 COSTS_N_INSNS (26), /* HI */
1787 COSTS_N_INSNS (42), /* SI */
1788 COSTS_N_INSNS (74), /* DI */
1789 COSTS_N_INSNS (74)}, /* other */
1790 COSTS_N_INSNS (1), /* cost of movsx */
1791 COSTS_N_INSNS (1), /* cost of movzx */
1792 8, /* "large" insn */
1793 17, /* MOVE_RATIO */
1794 4, /* cost for loading QImode using movzbl */
1795 {4, 4, 4}, /* cost of loading integer registers
1796 in QImode, HImode and SImode.
1797 Relative to reg-reg move (2). */
1798 {4, 4, 4}, /* cost of storing integer registers */
1799 4, /* cost of reg,reg fld/fst */
1800 {12, 12, 12}, /* cost of loading fp registers
1801 in SFmode, DFmode and XFmode */
1802 {6, 6, 8}, /* cost of storing fp registers
1803 in SFmode, DFmode and XFmode */
1804 2, /* cost of moving MMX register */
1805 {8, 8}, /* cost of loading MMX registers
1806 in SImode and DImode */
1807 {8, 8}, /* cost of storing MMX registers
1808 in SImode and DImode */
1809 2, /* cost of moving SSE register */
1810 {8, 8, 8}, /* cost of loading SSE registers
1811 in SImode, DImode and TImode */
1812 {8, 8, 8}, /* cost of storing SSE registers
1813 in SImode, DImode and TImode */
1814 5, /* MMX or SSE register to integer */
1815 32, /* size of l1 cache. */
1816 256, /* size of l2 cache. */
1817 64, /* size of prefetch block */
1818 6, /* number of parallel prefetches */
1819 3, /* Branch cost */
1820 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1821 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1822 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1823 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1824 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1825 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1826 atom_memcpy,
1827 atom_memset,
1828 1, /* scalar_stmt_cost. */
1829 1, /* scalar load_cost. */
1830 1, /* scalar_store_cost. */
1831 1, /* vec_stmt_cost. */
1832 1, /* vec_to_scalar_cost. */
1833 1, /* scalar_to_vec_cost. */
1834 1, /* vec_align_load_cost. */
1835 2, /* vec_unalign_load_cost. */
1836 1, /* vec_store_cost. */
1837 3, /* cond_taken_branch_cost. */
1838 1, /* cond_not_taken_branch_cost. */
1841 static stringop_algs slm_memcpy[2] = {
1842 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1843 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1844 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1845 static stringop_algs slm_memset[2] = {
1846 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1847 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1848 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1849 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1850 static const
1851 struct processor_costs slm_cost = {
1852 COSTS_N_INSNS (1), /* cost of an add instruction */
1853 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1854 COSTS_N_INSNS (1), /* variable shift costs */
1855 COSTS_N_INSNS (1), /* constant shift costs */
1856 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1857 COSTS_N_INSNS (3), /* HI */
1858 COSTS_N_INSNS (3), /* SI */
1859 COSTS_N_INSNS (4), /* DI */
1860 COSTS_N_INSNS (2)}, /* other */
1861 0, /* cost of multiply per each bit set */
1862 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1863 COSTS_N_INSNS (26), /* HI */
1864 COSTS_N_INSNS (42), /* SI */
1865 COSTS_N_INSNS (74), /* DI */
1866 COSTS_N_INSNS (74)}, /* other */
1867 COSTS_N_INSNS (1), /* cost of movsx */
1868 COSTS_N_INSNS (1), /* cost of movzx */
1869 8, /* "large" insn */
1870 17, /* MOVE_RATIO */
1871 4, /* cost for loading QImode using movzbl */
1872 {4, 4, 4}, /* cost of loading integer registers
1873 in QImode, HImode and SImode.
1874 Relative to reg-reg move (2). */
1875 {4, 4, 4}, /* cost of storing integer registers */
1876 4, /* cost of reg,reg fld/fst */
1877 {12, 12, 12}, /* cost of loading fp registers
1878 in SFmode, DFmode and XFmode */
1879 {6, 6, 8}, /* cost of storing fp registers
1880 in SFmode, DFmode and XFmode */
1881 2, /* cost of moving MMX register */
1882 {8, 8}, /* cost of loading MMX registers
1883 in SImode and DImode */
1884 {8, 8}, /* cost of storing MMX registers
1885 in SImode and DImode */
1886 2, /* cost of moving SSE register */
1887 {8, 8, 8}, /* cost of loading SSE registers
1888 in SImode, DImode and TImode */
1889 {8, 8, 8}, /* cost of storing SSE registers
1890 in SImode, DImode and TImode */
1891 5, /* MMX or SSE register to integer */
1892 32, /* size of l1 cache. */
1893 256, /* size of l2 cache. */
1894 64, /* size of prefetch block */
1895 6, /* number of parallel prefetches */
1896 3, /* Branch cost */
1897 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1898 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1899 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1900 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1901 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1902 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1903 slm_memcpy,
1904 slm_memset,
1905 1, /* scalar_stmt_cost. */
1906 1, /* scalar load_cost. */
1907 1, /* scalar_store_cost. */
1908 1, /* vec_stmt_cost. */
1909 4, /* vec_to_scalar_cost. */
1910 1, /* scalar_to_vec_cost. */
1911 1, /* vec_align_load_cost. */
1912 2, /* vec_unalign_load_cost. */
1913 1, /* vec_store_cost. */
1914 3, /* cond_taken_branch_cost. */
1915 1, /* cond_not_taken_branch_cost. */
1918 static stringop_algs intel_memcpy[2] = {
1919 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1920 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1921 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1922 static stringop_algs intel_memset[2] = {
1923 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1924 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1925 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1926 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1927 static const
1928 struct processor_costs intel_cost = {
1929 COSTS_N_INSNS (1), /* cost of an add instruction */
1930 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1931 COSTS_N_INSNS (1), /* variable shift costs */
1932 COSTS_N_INSNS (1), /* constant shift costs */
1933 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1934 COSTS_N_INSNS (3), /* HI */
1935 COSTS_N_INSNS (3), /* SI */
1936 COSTS_N_INSNS (4), /* DI */
1937 COSTS_N_INSNS (2)}, /* other */
1938 0, /* cost of multiply per each bit set */
1939 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1940 COSTS_N_INSNS (26), /* HI */
1941 COSTS_N_INSNS (42), /* SI */
1942 COSTS_N_INSNS (74), /* DI */
1943 COSTS_N_INSNS (74)}, /* other */
1944 COSTS_N_INSNS (1), /* cost of movsx */
1945 COSTS_N_INSNS (1), /* cost of movzx */
1946 8, /* "large" insn */
1947 17, /* MOVE_RATIO */
1948 4, /* cost for loading QImode using movzbl */
1949 {4, 4, 4}, /* cost of loading integer registers
1950 in QImode, HImode and SImode.
1951 Relative to reg-reg move (2). */
1952 {4, 4, 4}, /* cost of storing integer registers */
1953 4, /* cost of reg,reg fld/fst */
1954 {12, 12, 12}, /* cost of loading fp registers
1955 in SFmode, DFmode and XFmode */
1956 {6, 6, 8}, /* cost of storing fp registers
1957 in SFmode, DFmode and XFmode */
1958 2, /* cost of moving MMX register */
1959 {8, 8}, /* cost of loading MMX registers
1960 in SImode and DImode */
1961 {8, 8}, /* cost of storing MMX registers
1962 in SImode and DImode */
1963 2, /* cost of moving SSE register */
1964 {8, 8, 8}, /* cost of loading SSE registers
1965 in SImode, DImode and TImode */
1966 {8, 8, 8}, /* cost of storing SSE registers
1967 in SImode, DImode and TImode */
1968 5, /* MMX or SSE register to integer */
1969 32, /* size of l1 cache. */
1970 256, /* size of l2 cache. */
1971 64, /* size of prefetch block */
1972 6, /* number of parallel prefetches */
1973 3, /* Branch cost */
1974 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1975 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1976 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1977 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1978 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1979 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1980 intel_memcpy,
1981 intel_memset,
1982 1, /* scalar_stmt_cost. */
1983 1, /* scalar load_cost. */
1984 1, /* scalar_store_cost. */
1985 1, /* vec_stmt_cost. */
1986 4, /* vec_to_scalar_cost. */
1987 1, /* scalar_to_vec_cost. */
1988 1, /* vec_align_load_cost. */
1989 2, /* vec_unalign_load_cost. */
1990 1, /* vec_store_cost. */
1991 3, /* cond_taken_branch_cost. */
1992 1, /* cond_not_taken_branch_cost. */
1995 /* Generic should produce code tuned for Core-i7 (and newer chips)
1996 and btver1 (and newer chips). */
1998 static stringop_algs generic_memcpy[2] = {
1999 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2000 {-1, libcall, false}}},
2001 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2002 {-1, libcall, false}}}};
2003 static stringop_algs generic_memset[2] = {
2004 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2005 {-1, libcall, false}}},
2006 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2007 {-1, libcall, false}}}};
2008 static const
2009 struct processor_costs generic_cost = {
2010 COSTS_N_INSNS (1), /* cost of an add instruction */
2011 /* On all chips taken into consideration, lea takes 2 cycles or more. With
2012 this cost, however, our current implementation of synth_mult results in
2013 the use of unnecessary temporary registers, causing regressions on several
2014 SPECfp benchmarks. */
2015 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2016 COSTS_N_INSNS (1), /* variable shift costs */
2017 COSTS_N_INSNS (1), /* constant shift costs */
2018 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2019 COSTS_N_INSNS (4), /* HI */
2020 COSTS_N_INSNS (3), /* SI */
2021 COSTS_N_INSNS (4), /* DI */
2022 COSTS_N_INSNS (2)}, /* other */
2023 0, /* cost of multiply per each bit set */
2024 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2025 COSTS_N_INSNS (26), /* HI */
2026 COSTS_N_INSNS (42), /* SI */
2027 COSTS_N_INSNS (74), /* DI */
2028 COSTS_N_INSNS (74)}, /* other */
2029 COSTS_N_INSNS (1), /* cost of movsx */
2030 COSTS_N_INSNS (1), /* cost of movzx */
2031 8, /* "large" insn */
2032 17, /* MOVE_RATIO */
2033 4, /* cost for loading QImode using movzbl */
2034 {4, 4, 4}, /* cost of loading integer registers
2035 in QImode, HImode and SImode.
2036 Relative to reg-reg move (2). */
2037 {4, 4, 4}, /* cost of storing integer registers */
2038 4, /* cost of reg,reg fld/fst */
2039 {12, 12, 12}, /* cost of loading fp registers
2040 in SFmode, DFmode and XFmode */
2041 {6, 6, 8}, /* cost of storing fp registers
2042 in SFmode, DFmode and XFmode */
2043 2, /* cost of moving MMX register */
2044 {8, 8}, /* cost of loading MMX registers
2045 in SImode and DImode */
2046 {8, 8}, /* cost of storing MMX registers
2047 in SImode and DImode */
2048 2, /* cost of moving SSE register */
2049 {8, 8, 8}, /* cost of loading SSE registers
2050 in SImode, DImode and TImode */
2051 {8, 8, 8}, /* cost of storing SSE registers
2052 in SImode, DImode and TImode */
2053 5, /* MMX or SSE register to integer */
2054 32, /* size of l1 cache. */
2055 512, /* size of l2 cache. */
2056 64, /* size of prefetch block */
2057 6, /* number of parallel prefetches */
2058 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
2059 value is increased to the perhaps more appropriate value of 5. */
2060 3, /* Branch cost */
2061 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2062 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2063 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2064 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2065 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2066 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2067 generic_memcpy,
2068 generic_memset,
2069 1, /* scalar_stmt_cost. */
2070 1, /* scalar load_cost. */
2071 1, /* scalar_store_cost. */
2072 1, /* vec_stmt_cost. */
2073 1, /* vec_to_scalar_cost. */
2074 1, /* scalar_to_vec_cost. */
2075 1, /* vec_align_load_cost. */
2076 2, /* vec_unalign_load_cost. */
2077 1, /* vec_store_cost. */
2078 3, /* cond_taken_branch_cost. */
2079 1, /* cond_not_taken_branch_cost. */
2082 /* core_cost should produce code tuned for the Core family of CPUs. */
2083 static stringop_algs core_memcpy[2] = {
2084 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2085 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2086 {-1, libcall, false}}}};
2087 static stringop_algs core_memset[2] = {
2088 {libcall, {{6, loop_1_byte, true},
2089 {24, loop, true},
2090 {8192, rep_prefix_4_byte, true},
2091 {-1, libcall, false}}},
2092 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2093 {-1, libcall, false}}}};
2095 static const
2096 struct processor_costs core_cost = {
2097 COSTS_N_INSNS (1), /* cost of an add instruction */
2098 /* On all chips taken into consideration, lea takes 2 cycles or more. With
2099 this cost, however, our current implementation of synth_mult results in
2100 the use of unnecessary temporary registers, causing regressions on several
2101 SPECfp benchmarks. */
2102 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2103 COSTS_N_INSNS (1), /* variable shift costs */
2104 COSTS_N_INSNS (1), /* constant shift costs */
2105 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2106 COSTS_N_INSNS (4), /* HI */
2107 COSTS_N_INSNS (3), /* SI */
2108 COSTS_N_INSNS (4), /* DI */
2109 COSTS_N_INSNS (2)}, /* other */
2110 0, /* cost of multiply per each bit set */
2111 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2112 COSTS_N_INSNS (26), /* HI */
2113 COSTS_N_INSNS (42), /* SI */
2114 COSTS_N_INSNS (74), /* DI */
2115 COSTS_N_INSNS (74)}, /* other */
2116 COSTS_N_INSNS (1), /* cost of movsx */
2117 COSTS_N_INSNS (1), /* cost of movzx */
2118 8, /* "large" insn */
2119 17, /* MOVE_RATIO */
2120 4, /* cost for loading QImode using movzbl */
2121 {4, 4, 4}, /* cost of loading integer registers
2122 in QImode, HImode and SImode.
2123 Relative to reg-reg move (2). */
2124 {4, 4, 4}, /* cost of storing integer registers */
2125 4, /* cost of reg,reg fld/fst */
2126 {12, 12, 12}, /* cost of loading fp registers
2127 in SFmode, DFmode and XFmode */
2128 {6, 6, 8}, /* cost of storing fp registers
2129 in SFmode, DFmode and XFmode */
2130 2, /* cost of moving MMX register */
2131 {8, 8}, /* cost of loading MMX registers
2132 in SImode and DImode */
2133 {8, 8}, /* cost of storing MMX registers
2134 in SImode and DImode */
2135 2, /* cost of moving SSE register */
2136 {8, 8, 8}, /* cost of loading SSE registers
2137 in SImode, DImode and TImode */
2138 {8, 8, 8}, /* cost of storing SSE registers
2139 in SImode, DImode and TImode */
2140 5, /* MMX or SSE register to integer */
2141 64, /* size of l1 cache. */
2142 512, /* size of l2 cache. */
2143 64, /* size of prefetch block */
2144 6, /* number of parallel prefetches */
2145 /* FIXME: perhaps a more appropriate value is 5. */
2146 3, /* Branch cost */
2147 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2148 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2149 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2150 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2151 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2152 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2153 core_memcpy,
2154 core_memset,
2155 1, /* scalar_stmt_cost. */
2156 1, /* scalar load_cost. */
2157 1, /* scalar_store_cost. */
2158 1, /* vec_stmt_cost. */
2159 1, /* vec_to_scalar_cost. */
2160 1, /* scalar_to_vec_cost. */
2161 1, /* vec_align_load_cost. */
2162 2, /* vec_unalign_load_cost. */
2163 1, /* vec_store_cost. */
2164 3, /* cond_taken_branch_cost. */
2165 1, /* cond_not_taken_branch_cost. */
2169 /* Set by -mtune. */
2170 const struct processor_costs *ix86_tune_cost = &pentium_cost;
2172 /* Set by -mtune or -Os. */
2173 const struct processor_costs *ix86_cost = &pentium_cost;
2175 /* Processor feature/optimization bitmasks. */
2176 #define m_386 (1U<<PROCESSOR_I386)
2177 #define m_486 (1U<<PROCESSOR_I486)
2178 #define m_PENT (1U<<PROCESSOR_PENTIUM)
2179 #define m_LAKEMONT (1U<<PROCESSOR_LAKEMONT)
2180 #define m_PPRO (1U<<PROCESSOR_PENTIUMPRO)
2181 #define m_PENT4 (1U<<PROCESSOR_PENTIUM4)
2182 #define m_NOCONA (1U<<PROCESSOR_NOCONA)
2183 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
2184 #define m_CORE2 (1U<<PROCESSOR_CORE2)
2185 #define m_NEHALEM (1U<<PROCESSOR_NEHALEM)
2186 #define m_SANDYBRIDGE (1U<<PROCESSOR_SANDYBRIDGE)
2187 #define m_HASWELL (1U<<PROCESSOR_HASWELL)
2188 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
2189 #define m_BONNELL (1U<<PROCESSOR_BONNELL)
2190 #define m_SILVERMONT (1U<<PROCESSOR_SILVERMONT)
2191 #define m_KNL (1U<<PROCESSOR_KNL)
2192 #define m_SKYLAKE_AVX512 (1U<<PROCESSOR_SKYLAKE_AVX512)
2193 #define m_INTEL (1U<<PROCESSOR_INTEL)
2195 #define m_GEODE (1U<<PROCESSOR_GEODE)
2196 #define m_K6 (1U<<PROCESSOR_K6)
2197 #define m_K6_GEODE (m_K6 | m_GEODE)
2198 #define m_K8 (1U<<PROCESSOR_K8)
2199 #define m_ATHLON (1U<<PROCESSOR_ATHLON)
2200 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
2201 #define m_AMDFAM10 (1U<<PROCESSOR_AMDFAM10)
2202 #define m_BDVER1 (1U<<PROCESSOR_BDVER1)
2203 #define m_BDVER2 (1U<<PROCESSOR_BDVER2)
2204 #define m_BDVER3 (1U<<PROCESSOR_BDVER3)
2205 #define m_BDVER4 (1U<<PROCESSOR_BDVER4)
2206 #define m_ZNVER1 (1U<<PROCESSOR_ZNVER1)
2207 #define m_BTVER1 (1U<<PROCESSOR_BTVER1)
2208 #define m_BTVER2 (1U<<PROCESSOR_BTVER2)
2209 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
2210 #define m_BTVER (m_BTVER1 | m_BTVER2)
2211 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
2212 | m_ZNVER1)
2214 #define m_GENERIC (1U<<PROCESSOR_GENERIC)
2216 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
2217 #undef DEF_TUNE
2218 #define DEF_TUNE(tune, name, selector) name,
2219 #include "x86-tune.def"
2220 #undef DEF_TUNE
2223 /* Feature tests against the various tunings. */
2224 unsigned char ix86_tune_features[X86_TUNE_LAST];
2226 /* Feature tests against the various tunings used to create ix86_tune_features
2227 based on the processor mask. */
2228 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
2229 #undef DEF_TUNE
2230 #define DEF_TUNE(tune, name, selector) selector,
2231 #include "x86-tune.def"
2232 #undef DEF_TUNE
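/* For readers unfamiliar with the X-macro idiom used above: each DEF_TUNE
   entry in x86-tune.def supplies a tune identifier, a name string and a
   processor-mask selector, and the same list is expanded twice, once into
   ix86_tune_feature_names and once into initial_ix86_tune_features.  The
   entry below is hypothetical (it is not an actual line of x86-tune.def)
   and only illustrates the expansion:

     DEF_TUNE (X86_TUNE_EXAMPLE, "example", m_CORE_ALL | m_GENERIC)

   expands to "example", in the names array and to
   m_CORE_ALL | m_GENERIC, in the selectors array.  At option-override time
   each feature is then enabled roughly as

     ix86_tune_features[i]
       = !!(initial_ix86_tune_features[i] & (1U << ix86_tune));  */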
2235 /* Feature tests against the various architecture variations. */
2236 unsigned char ix86_arch_features[X86_ARCH_LAST];
2238 /* Feature tests against the various architecture variations, used to create
2239 ix86_arch_features based on the processor mask. */
2240 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2241 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2242 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
2244 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2245 ~m_386,
2247 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2248 ~(m_386 | m_486),
2250 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2251 ~m_386,
2253 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2254 ~m_386,
2257 /* If the average insn count for a single function invocation is
2258 lower than this constant, emit fast (but longer) prologue and
2259 epilogue code. */
2260 #define FAST_PROLOGUE_INSN_COUNT 20
2262 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
2263 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2264 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2265 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2267 /* Array of the smallest class containing reg number REGNO, indexed by
2268 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2270 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2272 /* ax, dx, cx, bx */
2273 AREG, DREG, CREG, BREG,
2274 /* si, di, bp, sp */
2275 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2276 /* FP registers */
2277 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2278 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2279 /* arg pointer */
2280 NON_Q_REGS,
2281 /* flags, fpsr, fpcr, frame */
2282 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2283 /* SSE registers */
2284 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2285 SSE_REGS, SSE_REGS,
2286 /* MMX registers */
2287 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2288 MMX_REGS, MMX_REGS,
2289 /* REX registers */
2290 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2291 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2292 /* SSE REX registers */
2293 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2294 SSE_REGS, SSE_REGS,
2295 /* AVX-512 SSE registers */
2296 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2297 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2298 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2299 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2300 /* Mask registers. */
2301 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2302 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2303 /* MPX bound registers */
2304 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
2307 /* The "default" register map used in 32bit mode. */
2309 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2311 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2312 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2313 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2314 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2315 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2316 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2317 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2318 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2319 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2320 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2321 101, 102, 103, 104, /* bound registers */
2324 /* The "default" register map used in 64bit mode. */
2326 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2328 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2329 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2330 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2331 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2332 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2333 8,9,10,11,12,13,14,15, /* extended integer registers */
2334 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2335 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
2336 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
2337 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
2338 126, 127, 128, 129, /* bound registers */
2341 /* Define the register numbers to be used in Dwarf debugging information.
2342 The SVR4 reference port C compiler uses the following register numbers
2343 in its Dwarf output code:
2344 0 for %eax (gcc regno = 0)
2345 1 for %ecx (gcc regno = 2)
2346 2 for %edx (gcc regno = 1)
2347 3 for %ebx (gcc regno = 3)
2348 4 for %esp (gcc regno = 7)
2349 5 for %ebp (gcc regno = 6)
2350 6 for %esi (gcc regno = 4)
2351 7 for %edi (gcc regno = 5)
2352 The following three DWARF register numbers are never generated by
2353 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2354 believes these numbers have these meanings.
2355 8 for %eip (no gcc equivalent)
2356 9 for %eflags (gcc regno = 17)
2357 10 for %trapno (no gcc equivalent)
2358 It is not at all clear how we should number the FP stack registers
2359 for the x86 architecture. If the version of SDB on x86/svr4 were
2360 a bit less brain dead with respect to floating-point then we would
2361 have a precedent to follow with respect to DWARF register numbers
2362 for x86 FP registers, but the SDB on x86/svr4 is so completely
2363 broken with respect to FP registers that it is hardly worth thinking
2364 of it as something to strive for compatibility with.
2365 The version of x86/svr4 SDB I have at the moment does (partially)
2366 seem to believe that DWARF register number 11 is associated with
2367 the x86 register %st(0), but that's about all. Higher DWARF
2368 register numbers don't seem to be associated with anything in
2369 particular, and even for DWARF regno 11, SDB only seems to under-
2370 stand that it should say that a variable lives in %st(0) (when
2371 asked via an `=' command) if we said it was in DWARF regno 11,
2372 but SDB still prints garbage when asked for the value of the
2373 variable in question (via a `/' command).
2374 (Also note that the labels SDB prints for various FP stack regs
2375 when doing an `x' command are all wrong.)
2376 Note that these problems generally don't affect the native SVR4
2377 C compiler because it doesn't allow the use of -O with -g and
2378 because when it is *not* optimizing, it allocates a memory
2379 location for each floating-point variable, and the memory
2380 location is what gets described in the DWARF AT_location
2381 attribute for the variable in question.
2382 Regardless of the severe mental illness of the x86/svr4 SDB, we
2383 do something sensible here and we use the following DWARF
2384 register numbers. Note that these are all stack-top-relative
2385 numbers.
2386 11 for %st(0) (gcc regno = 8)
2387 12 for %st(1) (gcc regno = 9)
2388 13 for %st(2) (gcc regno = 10)
2389 14 for %st(3) (gcc regno = 11)
2390 15 for %st(4) (gcc regno = 12)
2391 16 for %st(5) (gcc regno = 13)
2392 17 for %st(6) (gcc regno = 14)
2393 18 for %st(7) (gcc regno = 15)
2395 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2397 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2398 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2399 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2400 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2401 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2402 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2403 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2404 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2405 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2406 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2407 101, 102, 103, 104, /* bound registers */
2410 /* Define parameter passing and return registers. */
2412 static int const x86_64_int_parameter_registers[6] =
2414 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2417 static int const x86_64_ms_abi_int_parameter_registers[4] =
2419 CX_REG, DX_REG, R8_REG, R9_REG
2422 static int const x86_64_int_return_registers[4] =
2424 AX_REG, DX_REG, DI_REG, SI_REG
2427 /* Additional registers that SysV calls clobber beyond those clobbered by MS ABI calls. */
2429 #define NUM_X86_64_MS_CLOBBERED_REGS 12
2430 static int const x86_64_ms_sysv_extra_clobbered_registers
2431 [NUM_X86_64_MS_CLOBBERED_REGS] =
2433 SI_REG, DI_REG,
2434 XMM6_REG, XMM7_REG,
2435 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2436 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
2439 enum xlogue_stub {
2440 XLOGUE_STUB_SAVE,
2441 XLOGUE_STUB_RESTORE,
2442 XLOGUE_STUB_RESTORE_TAIL,
2443 XLOGUE_STUB_SAVE_HFP,
2444 XLOGUE_STUB_RESTORE_HFP,
2445 XLOGUE_STUB_RESTORE_HFP_TAIL,
2447 XLOGUE_STUB_COUNT
2450 enum xlogue_stub_sets {
2451 XLOGUE_SET_ALIGNED,
2452 XLOGUE_SET_ALIGNED_PLUS_8,
2453 XLOGUE_SET_HFP_ALIGNED_OR_REALIGN,
2454 XLOGUE_SET_HFP_ALIGNED_PLUS_8,
2456 XLOGUE_SET_COUNT
2459 /* Register save/restore layout used by out-of-line stubs. */
2460 class xlogue_layout {
2461 public:
2462 struct reginfo
2464 unsigned regno;
2465 HOST_WIDE_INT offset; /* Offset from the stub base pointer (rax or
2466 rsi) to where each register is stored. */
2469 unsigned get_nregs () const {return m_nregs;}
2470 HOST_WIDE_INT get_stack_align_off_in () const {return m_stack_align_off_in;}
2472 const reginfo &get_reginfo (unsigned reg) const
2474 gcc_assert (reg < m_nregs);
2475 return m_regs[reg];
2478 static const char *get_stub_name (enum xlogue_stub stub,
2479 unsigned n_extra_args);
2481 /* Returns an rtx for the stub's symbol based upon
2482 1.) the specified stub (save, restore or restore_ret),
2483 2.) the value of cfun->machine->call_ms2sysv_extra_regs and
2484 3.) whether or not stack alignment is being performed. */
2485 static rtx get_stub_rtx (enum xlogue_stub stub);
2487 /* Returns the amount of stack space (including padding) that the stub
2488 needs to store registers based upon data in the machine_function. */
2489 HOST_WIDE_INT get_stack_space_used () const
2491 const struct machine_function *m = cfun->machine;
2492 unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1;
2494 gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS);
2495 return m_regs[last_reg].offset + STUB_INDEX_OFFSET;
2498 /* Returns the offset for the base pointer used by the stub. */
2499 HOST_WIDE_INT get_stub_ptr_offset () const
2501 return STUB_INDEX_OFFSET + m_stack_align_off_in;
2504 static const struct xlogue_layout &get_instance ();
2505 static unsigned count_stub_managed_regs ();
2506 static bool is_stub_managed_reg (unsigned regno, unsigned count);
2508 static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70;
2509 static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS;
2510 static const unsigned MAX_REGS = 18;
2511 static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS;
2512 static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1;
2513 static const unsigned STUB_NAME_MAX_LEN = 16;
2514 static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT];
2515 static const unsigned REG_ORDER[MAX_REGS];
2516 static const unsigned REG_ORDER_REALIGN[MAX_REGS];
2518 private:
2519 xlogue_layout ();
2520 xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp);
2521 xlogue_layout (const xlogue_layout &);
2523 /* True if hard frame pointer is used. */
2524 bool m_hfp;
2526 /* Maximum number of registers this layout manages. */
2527 unsigned m_nregs;
2529 /* Incoming offset from 16-byte alignment. */
2530 HOST_WIDE_INT m_stack_align_off_in;
2532 /* Register order and offsets. */
2533 struct reginfo m_regs[MAX_REGS];
2535 /* Lazy-inited cache of symbol names for stubs. */
2536 static char s_stub_names[XLOGUE_STUB_COUNT][VARIANT_COUNT]
2537 [STUB_NAME_MAX_LEN];
2539 static const xlogue_layout s_instances[XLOGUE_SET_COUNT];
2542 const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
2543 "savms64",
2544 "resms64",
2545 "resms64x",
2546 "savms64f",
2547 "resms64f",
2548 "resms64fx"
2551 const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
2552 /* The offset values below are where each register is stored, relative to the
2553 incoming stack pointer, for each layout. The value of each m_regs[].offset
2554 is relative to the incoming base pointer (rax or rsi) used by the stub.
2556 s_instances: 0 1 2 3
2557 Offset: realigned or aligned + 8
2558 Register aligned aligned + 8 aligned w/HFP w/HFP */
2559 XMM15_REG, /* 0x10 0x18 0x10 0x18 */
2560 XMM14_REG, /* 0x20 0x28 0x20 0x28 */
2561 XMM13_REG, /* 0x30 0x38 0x30 0x38 */
2562 XMM12_REG, /* 0x40 0x48 0x40 0x48 */
2563 XMM11_REG, /* 0x50 0x58 0x50 0x58 */
2564 XMM10_REG, /* 0x60 0x68 0x60 0x68 */
2565 XMM9_REG, /* 0x70 0x78 0x70 0x78 */
2566 XMM8_REG, /* 0x80 0x88 0x80 0x88 */
2567 XMM7_REG, /* 0x90 0x98 0x90 0x98 */
2568 XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */
2569 SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */
2570 DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */
2571 BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */
2572 BP_REG, /* 0xc0 0xc8 N/A N/A */
2573 R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */
2574 R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */
2575 R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */
2576 R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */
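/* A worked example of how these offsets fit together, assuming the plain
   aligned layout (stack_align_off_in == 0, no hard frame pointer); the
   arithmetic simply retraces get_stack_space_used above and the constructor
   below: the ten SSE registers take 16 bytes each, so XMM15 ends at 0x10 and
   XMM6 at 0xa0; the GPRs that follow take 8 bytes each, so SI_REG ends at
   0xa8 and DI_REG at 0xb0, matching the first column above.  Each
   m_regs[].offset is that value minus STUB_INDEX_OFFSET (0x70), so DI_REG is
   stored at offset 0x40 from the stub base pointer.  With no extra registers
   (only the MIN_REGS == 12 clobbered ones), the stub therefore needs
   m_regs[11].offset + STUB_INDEX_OFFSET = 0x40 + 0x70 = 0xb0 (176) bytes of
   stack.  */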
2579 /* Instantiate static const values. */
2580 const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
2581 const unsigned xlogue_layout::MIN_REGS;
2582 const unsigned xlogue_layout::MAX_REGS;
2583 const unsigned xlogue_layout::MAX_EXTRA_REGS;
2584 const unsigned xlogue_layout::VARIANT_COUNT;
2585 const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
2587 /* Initialize xlogue_layout::s_stub_names to zero. */
2588 char xlogue_layout::s_stub_names[XLOGUE_STUB_COUNT][VARIANT_COUNT]
2589 [STUB_NAME_MAX_LEN];
2591 /* Instantiates all xlogue_layout instances. */
2592 const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
2593 xlogue_layout (0, false),
2594 xlogue_layout (8, false),
2595 xlogue_layout (0, true),
2596 xlogue_layout (8, true)
2599 /* Return an appropriate const instance of xlogue_layout based upon values
2600 in cfun->machine and crtl. */
2601 const struct xlogue_layout &
2602 xlogue_layout::get_instance ()
2604 enum xlogue_stub_sets stub_set;
2605 bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
2607 if (stack_realign_fp)
2608 stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
2609 else if (frame_pointer_needed)
2610 stub_set = aligned_plus_8
2611 ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
2612 : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
2613 else
2614 stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
2616 return s_instances[stub_set];
2619 /* Determine how many clobbered registers can be saved by the stub.
2620 Returns the count of registers the stub will save and restore. */
2621 unsigned
2622 xlogue_layout::count_stub_managed_regs ()
2624 bool hfp = frame_pointer_needed || stack_realign_fp;
2625 unsigned i, count;
2626 unsigned regno;
2628 for (count = i = MIN_REGS; i < MAX_REGS; ++i)
2630 regno = REG_ORDER[i];
2631 if (regno == BP_REG && hfp)
2632 continue;
2633 if (!ix86_save_reg (regno, false, false))
2634 break;
2635 ++count;
2637 return count;
2640 /* Determine if register REGNO is a stub managed register given the
2641 total COUNT of stub managed registers. */
2642 bool
2643 xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
2645 bool hfp = frame_pointer_needed || stack_realign_fp;
2646 unsigned i;
2648 for (i = 0; i < count; ++i)
2650 gcc_assert (i < MAX_REGS);
2651 if (REG_ORDER[i] == BP_REG && hfp)
2652 ++count;
2653 else if (REG_ORDER[i] == regno)
2654 return true;
2656 return false;
2659 /* Constructor for xlogue_layout. */
2660 xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
2661 : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
2662 m_stack_align_off_in (stack_align_off_in)
2664 HOST_WIDE_INT offset = stack_align_off_in;
2665 unsigned i, j;
2667 for (i = j = 0; i < MAX_REGS; ++i)
2669 unsigned regno = REG_ORDER[i];
2671 if (regno == BP_REG && hfp)
2672 continue;
2673 if (SSE_REGNO_P (regno))
2675 offset += 16;
2676 /* Verify that SSE regs are always aligned. */
2677 gcc_assert (!((stack_align_off_in + offset) & 15));
2679 else
2680 offset += 8;
2682 m_regs[j].regno = regno;
2683 m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
2685 gcc_assert (j == m_nregs);
2688 const char *
2689 xlogue_layout::get_stub_name (enum xlogue_stub stub,
2690 unsigned n_extra_regs)
2692 char *name = s_stub_names[stub][n_extra_regs];
2694 /* Lazy init */
2695 if (!*name)
2697 int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%u",
2698 STUB_BASE_NAMES[stub], MIN_REGS + n_extra_regs);
2699 gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
2702 return name;
2705 /* Return rtx of a symbol ref for the entry point (based upon
2706 cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
2708 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
2710 const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
2711 gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
2712 gcc_assert (stub < XLOGUE_STUB_COUNT);
2713 gcc_assert (crtl->stack_realign_finalized);
2715 return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
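/* For example (an illustration following the snprintf format in
   get_stub_name): with call_ms2sysv_extra_regs == 0 the save stub symbol for
   the aligned case is "__savms64_12", and with all MAX_EXTRA_REGS == 6 extra
   registers the tail-call restore stub is "__resms64x_18"; these are the
   out-of-line stubs the prologue and epilogue call instead of emitting the
   individual register saves and restores inline.  */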
2718 /* Define the structure for the machine field in struct function. */
2720 struct GTY(()) stack_local_entry {
2721 unsigned short mode;
2722 unsigned short n;
2723 rtx rtl;
2724 struct stack_local_entry *next;
2727 /* Which CPU we are scheduling for. */
2728 enum attr_cpu ix86_schedule;
2730 /* Which CPU we are optimizing for. */
2731 enum processor_type ix86_tune;
2733 /* Which instruction set architecture to use. */
2734 enum processor_type ix86_arch;
2736 /* True if processor has SSE prefetch instruction. */
2737 unsigned char x86_prefetch_sse;
2739 /* -mstackrealign option */
2740 static const char ix86_force_align_arg_pointer_string[]
2741 = "force_align_arg_pointer";
2743 static rtx (*ix86_gen_leave) (void);
2744 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2745 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2746 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2747 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2748 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2749 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
2750 static rtx (*ix86_gen_clzero) (rtx);
2751 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2752 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2753 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2754 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2755 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2756 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2758 /* Preferred alignment for stack boundary in bits. */
2759 unsigned int ix86_preferred_stack_boundary;
2761 /* Alignment for incoming stack boundary in bits specified at
2762 command line. */
2763 static unsigned int ix86_user_incoming_stack_boundary;
2765 /* Default alignment for incoming stack boundary in bits. */
2766 static unsigned int ix86_default_incoming_stack_boundary;
2768 /* Alignment for incoming stack boundary in bits. */
2769 unsigned int ix86_incoming_stack_boundary;
2771 /* Calling-ABI-specific va_list type nodes. */
2772 static GTY(()) tree sysv_va_list_type_node;
2773 static GTY(()) tree ms_va_list_type_node;
2775 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2776 char internal_label_prefix[16];
2777 int internal_label_prefix_len;
2779 /* Fence to use after loop using movnt. */
2780 tree x86_mfence;
2782 /* Register class used for passing a given 64-bit part of the argument.
2783 These represent classes as documented by the psABI, with the exception of
2784 the SSESF and SSEDF classes, which are basically the SSE class; GCC just
2785 uses an SFmode or DFmode move instead of DImode to avoid reformatting penalties.
2787 Similarly, we play games with INTEGERSI_CLASS to use cheaper SImode moves
2788 whenever possible (the upper half contains only padding). */
2789 enum x86_64_reg_class
2791 X86_64_NO_CLASS,
2792 X86_64_INTEGER_CLASS,
2793 X86_64_INTEGERSI_CLASS,
2794 X86_64_SSE_CLASS,
2795 X86_64_SSESF_CLASS,
2796 X86_64_SSEDF_CLASS,
2797 X86_64_SSEUP_CLASS,
2798 X86_64_X87_CLASS,
2799 X86_64_X87UP_CLASS,
2800 X86_64_COMPLEX_X87_CLASS,
2801 X86_64_MEMORY_CLASS
2804 #define MAX_CLASSES 8
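/* Illustrative examples of how arguments map onto these classes (a rough
   sketch of the psABI rules, not an exhaustive description of the
   classifier): a lone int is X86_64_INTEGERSI_CLASS, a long is
   X86_64_INTEGER_CLASS, a float is X86_64_SSESF_CLASS, a double is
   X86_64_SSEDF_CLASS, a long double occupies X86_64_X87_CLASS plus
   X86_64_X87UP_CLASS, and an aggregate larger than two eightbytes (for
   example a 32-byte struct of ints) is X86_64_MEMORY_CLASS and is passed
   on the stack.  */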
2806 /* Table of constants used by fldpi, fldln2, etc.... */
2807 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2808 static bool ext_80387_constants_init;
2811 static struct machine_function * ix86_init_machine_status (void);
2812 static rtx ix86_function_value (const_tree, const_tree, bool);
2813 static bool ix86_function_value_regno_p (const unsigned int);
2814 static unsigned int ix86_function_arg_boundary (machine_mode,
2815 const_tree);
2816 static rtx ix86_static_chain (const_tree, bool);
2817 static int ix86_function_regparm (const_tree, const_tree);
2818 static void ix86_compute_frame_layout (void);
2819 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
2820 rtx, rtx, int);
2821 static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
2822 static tree ix86_canonical_va_list_type (tree);
2823 static void predict_jump (int);
2824 static unsigned int split_stack_prologue_scratch_regno (void);
2825 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2827 enum ix86_function_specific_strings
2829 IX86_FUNCTION_SPECIFIC_ARCH,
2830 IX86_FUNCTION_SPECIFIC_TUNE,
2831 IX86_FUNCTION_SPECIFIC_MAX
2834 static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
2835 const char *, const char *, enum fpmath_unit,
2836 bool);
2837 static void ix86_function_specific_save (struct cl_target_option *,
2838 struct gcc_options *opts);
2839 static void ix86_function_specific_restore (struct gcc_options *opts,
2840 struct cl_target_option *);
2841 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
2842 static void ix86_function_specific_print (FILE *, int,
2843 struct cl_target_option *);
2844 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2845 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2846 struct gcc_options *,
2847 struct gcc_options *,
2848 struct gcc_options *);
2849 static bool ix86_can_inline_p (tree, tree);
2850 static void ix86_set_current_function (tree);
2851 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2853 static enum calling_abi ix86_function_abi (const_tree);
2856 #ifndef SUBTARGET32_DEFAULT_CPU
2857 #define SUBTARGET32_DEFAULT_CPU "i386"
2858 #endif
2860 /* Whether -mtune= or -march= was specified. */
2861 static int ix86_tune_defaulted;
2862 static int ix86_arch_specified;
2864 /* Vectorization library interface and handlers. */
2865 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
2867 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
2868 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
2870 /* Processor target table, indexed by processor number */
2871 struct ptt
2873 const char *const name; /* processor name */
2874 const struct processor_costs *cost; /* Processor costs */
2875 const int align_loop; /* Default alignments. */
2876 const int align_loop_max_skip;
2877 const int align_jump;
2878 const int align_jump_max_skip;
2879 const int align_func;
2882 /* This table must be in sync with enum processor_type in i386.h. */
2883 static const struct ptt processor_target_table[PROCESSOR_max] =
2885 {"generic", &generic_cost, 16, 10, 16, 10, 16},
2886 {"i386", &i386_cost, 4, 3, 4, 3, 4},
2887 {"i486", &i486_cost, 16, 15, 16, 15, 16},
2888 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
2889 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
2890 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
2891 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
2892 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
2893 {"core2", &core_cost, 16, 10, 16, 10, 16},
2894 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
2895 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
2896 {"haswell", &core_cost, 16, 10, 16, 10, 16},
2897 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
2898 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
2899 {"knl", &slm_cost, 16, 15, 16, 7, 16},
2900 {"skylake-avx512", &core_cost, 16, 10, 16, 10, 16},
2901 {"intel", &intel_cost, 16, 15, 16, 7, 16},
2902 {"geode", &geode_cost, 0, 0, 0, 0, 0},
2903 {"k6", &k6_cost, 32, 7, 32, 7, 32},
2904 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
2905 {"k8", &k8_cost, 16, 7, 16, 7, 16},
2906 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
2907 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
2908 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
2909 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
2910 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
2911 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
2912 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
2913 {"znver1", &znver1_cost, 16, 15, 16, 15, 16}
2916 static unsigned int
2917 rest_of_handle_insert_vzeroupper (void)
2919 int i;
2921 /* vzeroupper instructions are inserted immediately after reload to
2922 account for possible spills from 256-bit registers. The pass
2923 reuses the mode switching infrastructure by re-running the mode
2924 insertion pass, so disable entities that have already been processed. */
2925 for (i = 0; i < MAX_386_ENTITIES; i++)
2926 ix86_optimize_mode_switching[i] = 0;
2928 ix86_optimize_mode_switching[AVX_U128] = 1;
2930 /* Call optimize_mode_switching. */
2931 g->get_passes ()->execute_pass_mode_switching ();
2932 return 0;
2935 /* Return true if INSN uses or defines a hard register.
2936 Hard register uses in a memory address are ignored.
2937 Clobbers and flags definitions are ignored. */
2939 static bool
2940 has_non_address_hard_reg (rtx_insn *insn)
2942 df_ref ref;
2943 FOR_EACH_INSN_DEF (ref, insn)
2944 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
2945 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
2946 && DF_REF_REGNO (ref) != FLAGS_REG)
2947 return true;
2949 FOR_EACH_INSN_USE (ref, insn)
2950 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
2951 return true;
2953 return false;
2956 /* Check if comparison INSN may be transformed into a vector
2957 comparison. Currently we only transform zero checks, which
2958 look like:
2960 (set (reg:CCZ 17 flags)
2961 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
2962 (subreg:SI (reg:DI x) 0))
2963 (const_int 0 [0]))) */
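/* For example, on a 32-bit target a test such as

     long long x;
     if (x == 0) ...

   is typically expanded into an IOR of the two SImode halves of X
   compared against zero, which matches the shape above.  */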
2965 static bool
2966 convertible_comparison_p (rtx_insn *insn)
2968 if (!TARGET_SSE4_1)
2969 return false;
2971 rtx def_set = single_set (insn);
2973 gcc_assert (def_set);
2975 rtx src = SET_SRC (def_set);
2976 rtx dst = SET_DEST (def_set);
2978 gcc_assert (GET_CODE (src) == COMPARE);
2980 if (GET_CODE (dst) != REG
2981 || REGNO (dst) != FLAGS_REG
2982 || GET_MODE (dst) != CCZmode)
2983 return false;
2985 rtx op1 = XEXP (src, 0);
2986 rtx op2 = XEXP (src, 1);
2988 if (op2 != CONST0_RTX (GET_MODE (op2)))
2989 return false;
2991 if (GET_CODE (op1) != IOR)
2992 return false;
2994 op2 = XEXP (op1, 1);
2995 op1 = XEXP (op1, 0);
2997 if (!SUBREG_P (op1)
2998 || !SUBREG_P (op2)
2999 || GET_MODE (op1) != SImode
3000 || GET_MODE (op2) != SImode
3001 || ((SUBREG_BYTE (op1) != 0
3002 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
3003 && (SUBREG_BYTE (op2) != 0
3004 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
3005 return false;
3007 op1 = SUBREG_REG (op1);
3008 op2 = SUBREG_REG (op2);
3010 if (op1 != op2
3011 || !REG_P (op1)
3012 || GET_MODE (op1) != DImode)
3013 return false;
3015 return true;
3018 /* The DImode version of scalar_to_vector_candidate_p. */
3020 static bool
3021 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
3023 rtx def_set = single_set (insn);
3025 if (!def_set)
3026 return false;
3028 if (has_non_address_hard_reg (insn))
3029 return false;
3031 rtx src = SET_SRC (def_set);
3032 rtx dst = SET_DEST (def_set);
3034 if (GET_CODE (src) == COMPARE)
3035 return convertible_comparison_p (insn);
3037 /* We are interested in DImode promotion only. */
3038 if ((GET_MODE (src) != DImode
3039 && !CONST_INT_P (src))
3040 || GET_MODE (dst) != DImode)
3041 return false;
3043 if (!REG_P (dst) && !MEM_P (dst))
3044 return false;
3046 switch (GET_CODE (src))
3048 case ASHIFTRT:
3049 if (!TARGET_AVX512VL)
3050 return false;
3051 /* FALLTHRU */
3053 case ASHIFT:
3054 case LSHIFTRT:
3055 if (!REG_P (XEXP (src, 1))
3056 && (!SUBREG_P (XEXP (src, 1))
3057 || SUBREG_BYTE (XEXP (src, 1)) != 0
3058 || !REG_P (SUBREG_REG (XEXP (src, 1))))
3059 && (!CONST_INT_P (XEXP (src, 1))
3060 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)))
3061 return false;
3063 if (GET_MODE (XEXP (src, 1)) != QImode
3064 && !CONST_INT_P (XEXP (src, 1)))
3065 return false;
3066 break;
3068 case PLUS:
3069 case MINUS:
3070 case IOR:
3071 case XOR:
3072 case AND:
3073 if (!REG_P (XEXP (src, 1))
3074 && !MEM_P (XEXP (src, 1))
3075 && !CONST_INT_P (XEXP (src, 1)))
3076 return false;
3078 if (GET_MODE (XEXP (src, 1)) != DImode
3079 && !CONST_INT_P (XEXP (src, 1)))
3080 return false;
3081 break;
3083 case NEG:
3084 case NOT:
3085 break;
3087 case REG:
3088 return true;
3090 case MEM:
3091 case CONST_INT:
3092 return REG_P (dst);
3094 default:
3095 return false;
3098 if (!REG_P (XEXP (src, 0))
3099 && !MEM_P (XEXP (src, 0))
3100 && !CONST_INT_P (XEXP (src, 0))
3101 /* Check for andnot case. */
3102 && (GET_CODE (src) != AND
3103 || GET_CODE (XEXP (src, 0)) != NOT
3104 || !REG_P (XEXP (XEXP (src, 0), 0))))
3105 return false;
3107 if (GET_MODE (XEXP (src, 0)) != DImode
3108 && !CONST_INT_P (XEXP (src, 0)))
3109 return false;
3111 return true;
3114 /* The TImode version of scalar_to_vector_candidate_p. */
3116 static bool
3117 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
3119 rtx def_set = single_set (insn);
3121 if (!def_set)
3122 return false;
3124 if (has_non_address_hard_reg (insn))
3125 return false;
3127 rtx src = SET_SRC (def_set);
3128 rtx dst = SET_DEST (def_set);
3130 /* Only TImode loads and stores are allowed. */
3131 if (GET_MODE (dst) != TImode)
3132 return false;
3134 if (MEM_P (dst))
3136 /* Check for a store. Memory must be aligned, or an unaligned store
3137 must be optimal. Only support a store from a register, a standard
3138 SSE constant, or a CONST_WIDE_INT generated from a piecewise store.
3140 ??? Verify performance impact before enabling CONST_INT for
3141 __int128 stores. */
3142 if (misaligned_operand (dst, TImode)
3143 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
3144 return false;
3146 switch (GET_CODE (src))
3148 default:
3149 return false;
3151 case REG:
3152 case CONST_WIDE_INT:
3153 return true;
3155 case CONST_INT:
3156 return standard_sse_constant_p (src, TImode);
3159 else if (MEM_P (src))
3161 /* Check for a load. Memory must be aligned, or an unaligned load
3162 must be optimal. */
3163 return (REG_P (dst)
3164 && (!misaligned_operand (src, TImode)
3165 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
3168 return false;
3171 /* Return true if INSN may be converted into a vector
3172 instruction. */
3174 static bool
3175 scalar_to_vector_candidate_p (rtx_insn *insn)
3177 if (TARGET_64BIT)
3178 return timode_scalar_to_vector_candidate_p (insn);
3179 else
3180 return dimode_scalar_to_vector_candidate_p (insn);
3183 /* The DImode version of remove_non_convertible_regs. */
3185 static void
3186 dimode_remove_non_convertible_regs (bitmap candidates)
3188 bitmap_iterator bi;
3189 unsigned id;
3190 bitmap regs = BITMAP_ALLOC (NULL);
3192 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
3194 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
3195 rtx reg = SET_DEST (def_set);
3197 if (!REG_P (reg)
3198 || bitmap_bit_p (regs, REGNO (reg))
3199 || HARD_REGISTER_P (reg))
3200 continue;
3202 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
3203 def;
3204 def = DF_REF_NEXT_REG (def))
3206 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3208 if (dump_file)
3209 fprintf (dump_file,
3210 "r%d has non convertible definition in insn %d\n",
3211 REGNO (reg), DF_REF_INSN_UID (def));
3213 bitmap_set_bit (regs, REGNO (reg));
3214 break;
3219 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
3221 for (df_ref def = DF_REG_DEF_CHAIN (id);
3222 def;
3223 def = DF_REF_NEXT_REG (def))
3224 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3226 if (dump_file)
3227 fprintf (dump_file, "Removing insn %d from candidates list\n",
3228 DF_REF_INSN_UID (def));
3230 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
3234 BITMAP_FREE (regs);
3237 /* For a register REGNO, scan instructions for its defs and uses.
3238 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
3240 static void
3241 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
3242 unsigned int regno)
3244 for (df_ref def = DF_REG_DEF_CHAIN (regno);
3245 def;
3246 def = DF_REF_NEXT_REG (def))
3248 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3250 if (dump_file)
3251 fprintf (dump_file,
3252 "r%d has non convertible def in insn %d\n",
3253 regno, DF_REF_INSN_UID (def));
3255 bitmap_set_bit (regs, regno);
3256 break;
3260 for (df_ref ref = DF_REG_USE_CHAIN (regno);
3261 ref;
3262 ref = DF_REF_NEXT_REG (ref))
3264 /* Debug instructions are skipped. */
3265 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
3266 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3268 if (dump_file)
3269 fprintf (dump_file,
3270 "r%d has non convertible use in insn %d\n",
3271 regno, DF_REF_INSN_UID (ref));
3273 bitmap_set_bit (regs, regno);
3274 break;
3279 /* The TImode version of remove_non_convertible_regs. */
3281 static void
3282 timode_remove_non_convertible_regs (bitmap candidates)
3284 bitmap_iterator bi;
3285 unsigned id;
3286 bitmap regs = BITMAP_ALLOC (NULL);
3288 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
3290 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
3291 rtx dest = SET_DEST (def_set);
3292 rtx src = SET_SRC (def_set);
3294 if ((!REG_P (dest)
3295 || bitmap_bit_p (regs, REGNO (dest))
3296 || HARD_REGISTER_P (dest))
3297 && (!REG_P (src)
3298 || bitmap_bit_p (regs, REGNO (src))
3299 || HARD_REGISTER_P (src)))
3300 continue;
3302 if (REG_P (dest))
3303 timode_check_non_convertible_regs (candidates, regs,
3304 REGNO (dest));
3306 if (REG_P (src))
3307 timode_check_non_convertible_regs (candidates, regs,
3308 REGNO (src));
3311 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
3313 for (df_ref def = DF_REG_DEF_CHAIN (id);
3314 def;
3315 def = DF_REF_NEXT_REG (def))
3316 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3318 if (dump_file)
3319 fprintf (dump_file, "Removing insn %d from candidates list\n",
3320 DF_REF_INSN_UID (def));
3322 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
3325 for (df_ref ref = DF_REG_USE_CHAIN (id);
3326 ref;
3327 ref = DF_REF_NEXT_REG (ref))
3328 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3330 if (dump_file)
3331 fprintf (dump_file, "Removing insn %d from candidates list\n",
3332 DF_REF_INSN_UID (ref));
3334 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
3338 BITMAP_FREE (regs);
3341 /* For a given bitmap of insn UIDs, scan all instructions and
3342 remove an insn from CANDIDATES if it has both convertible
3343 and non-convertible definitions.
3345 All insns in the bitmap are conversion candidates according to
3346 scalar_to_vector_candidate_p. Currently this implies all insns
3347 are single_set. */
3349 static void
3350 remove_non_convertible_regs (bitmap candidates)
3352 if (TARGET_64BIT)
3353 timode_remove_non_convertible_regs (candidates);
3354 else
3355 dimode_remove_non_convertible_regs (candidates);
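/* A scalar chain groups together the connected instructions that must be
   converted as a unit.  The driver below (convert_scalars_to_vector) uses
   a chain roughly as follows:

     scalar_chain *chain = new dimode_scalar_chain; /* or timode_scalar_chain */
     chain->build (candidates, uid);
     if (chain->compute_convert_gain () > 0)
       chain->convert ();
     delete chain;  */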
3358 class scalar_chain
3360 public:
3361 scalar_chain ();
3362 virtual ~scalar_chain ();
3364 static unsigned max_id;
3366 /* ID of a chain. */
3367 unsigned int chain_id;
3368 /* A queue of instructions to be included into a chain. */
3369 bitmap queue;
3370 /* Instructions included into a chain. */
3371 bitmap insns;
3372 /* All registers defined by a chain. */
3373 bitmap defs;
3374 /* Registers used in both vector and scalar modes. */
3375 bitmap defs_conv;
3377 void build (bitmap candidates, unsigned insn_uid);
3378 virtual int compute_convert_gain () = 0;
3379 int convert ();
3381 protected:
3382 void add_to_queue (unsigned insn_uid);
3383 void emit_conversion_insns (rtx insns, rtx_insn *pos);
3385 private:
3386 void add_insn (bitmap candidates, unsigned insn_uid);
3387 void analyze_register_chain (bitmap candidates, df_ref ref);
3388 virtual void mark_dual_mode_def (df_ref def) = 0;
3389 virtual void convert_insn (rtx_insn *insn) = 0;
3390 virtual void convert_registers () = 0;
3393 class dimode_scalar_chain : public scalar_chain
3395 public:
3396 int compute_convert_gain ();
3397 private:
3398 void mark_dual_mode_def (df_ref def);
3399 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
3400 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
3401 void convert_insn (rtx_insn *insn);
3402 void convert_op (rtx *op, rtx_insn *insn);
3403 void convert_reg (unsigned regno);
3404 void make_vector_copies (unsigned regno);
3405 void convert_registers ();
3406 int vector_const_cost (rtx exp);
3409 class timode_scalar_chain : public scalar_chain
3411 public:
3412 /* Converting from TImode to V1TImode is always faster. */
3413 int compute_convert_gain () { return 1; }
3415 private:
3416 void mark_dual_mode_def (df_ref def);
3417 void fix_debug_reg_uses (rtx reg);
3418 void convert_insn (rtx_insn *insn);
3419 /* We don't convert registers to a different size. */
3420 void convert_registers () {}
3423 unsigned scalar_chain::max_id = 0;
3425 /* Initialize new chain. */
3427 scalar_chain::scalar_chain ()
3429 chain_id = ++max_id;
3431 if (dump_file)
3432 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
3434 bitmap_obstack_initialize (NULL);
3435 insns = BITMAP_ALLOC (NULL);
3436 defs = BITMAP_ALLOC (NULL);
3437 defs_conv = BITMAP_ALLOC (NULL);
3438 queue = NULL;
3441 /* Free chain's data. */
3443 scalar_chain::~scalar_chain ()
3445 BITMAP_FREE (insns);
3446 BITMAP_FREE (defs);
3447 BITMAP_FREE (defs_conv);
3448 bitmap_obstack_release (NULL);
3451 /* Add an instruction into the chain's queue. */
3453 void
3454 scalar_chain::add_to_queue (unsigned insn_uid)
3456 if (bitmap_bit_p (insns, insn_uid)
3457 || bitmap_bit_p (queue, insn_uid))
3458 return;
3460 if (dump_file)
3461 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
3462 insn_uid, chain_id);
3463 bitmap_set_bit (queue, insn_uid);
3466 /* For DImode conversion, mark register defined by DEF as requiring
3467 conversion. */
3469 void
3470 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
3472 gcc_assert (DF_REF_REG_DEF_P (def));
3474 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
3475 return;
3477 if (dump_file)
3478 fprintf (dump_file,
3479 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
3480 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
3482 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
3485 /* For TImode conversion, it is unused. */
3487 void
3488 timode_scalar_chain::mark_dual_mode_def (df_ref)
3490 gcc_unreachable ();
3493 /* Check REF's chain to add new insns into a queue
3494 and find registers requiring conversion. */
3496 void
3497 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
3499 df_link *chain;
3501 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
3502 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
3503 add_to_queue (DF_REF_INSN_UID (ref));
3505 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
3507 unsigned uid = DF_REF_INSN_UID (chain->ref);
3509 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
3510 continue;
3512 if (!DF_REF_REG_MEM_P (chain->ref))
3514 if (bitmap_bit_p (insns, uid))
3515 continue;
3517 if (bitmap_bit_p (candidates, uid))
3519 add_to_queue (uid);
3520 continue;
3524 if (DF_REF_REG_DEF_P (chain->ref))
3526 if (dump_file)
3527 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
3528 DF_REF_REGNO (chain->ref), uid);
3529 mark_dual_mode_def (chain->ref);
3531 else
3533 if (dump_file)
3534 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
3535 DF_REF_REGNO (chain->ref), uid);
3536 mark_dual_mode_def (ref);
3541 /* Add instruction into a chain. */
3543 void
3544 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
3546 if (bitmap_bit_p (insns, insn_uid))
3547 return;
3549 if (dump_file)
3550 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
3552 bitmap_set_bit (insns, insn_uid);
3554 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3555 rtx def_set = single_set (insn);
3556 if (def_set && REG_P (SET_DEST (def_set))
3557 && !HARD_REGISTER_P (SET_DEST (def_set)))
3558 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
3560 df_ref ref;
3561 df_ref def;
3562 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3563 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
3564 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
3565 def;
3566 def = DF_REF_NEXT_REG (def))
3567 analyze_register_chain (candidates, def);
3568 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3569 if (!DF_REF_REG_MEM_P (ref))
3570 analyze_register_chain (candidates, ref);
3573 /* Build a new chain starting from insn INSN_UID, recursively
3574 adding all dependent uses and definitions. */
3576 void
3577 scalar_chain::build (bitmap candidates, unsigned insn_uid)
3579 queue = BITMAP_ALLOC (NULL);
3580 bitmap_set_bit (queue, insn_uid);
3582 if (dump_file)
3583 fprintf (dump_file, "Building chain #%d...\n", chain_id);
3585 while (!bitmap_empty_p (queue))
3587 insn_uid = bitmap_first_set_bit (queue);
3588 bitmap_clear_bit (queue, insn_uid);
3589 bitmap_clear_bit (candidates, insn_uid);
3590 add_insn (candidates, insn_uid);
3593 if (dump_file)
3595 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
3596 fprintf (dump_file, " insns: ");
3597 dump_bitmap (dump_file, insns);
3598 if (!bitmap_empty_p (defs_conv))
3600 bitmap_iterator bi;
3601 unsigned id;
3602 const char *comma = "";
3603 fprintf (dump_file, " defs to convert: ");
3604 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
3606 fprintf (dump_file, "%sr%d", comma, id);
3607 comma = ", ";
3609 fprintf (dump_file, "\n");
3613 BITMAP_FREE (queue);
3616 /* Return the cost of building a vector constant
3617 instead of using a scalar one. */
3620 dimode_scalar_chain::vector_const_cost (rtx exp)
3622 gcc_assert (CONST_INT_P (exp));
3624 if (standard_sse_constant_p (exp, V2DImode))
3625 return COSTS_N_INSNS (1);
3626 return ix86_cost->sse_load[1];
3629 /* Compute a gain for chain conversion. */
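/* The estimate below sums the per-insn savings of using SSE instructions
   and then subtracts, for every register live in both modes (defs_conv),
   the cost of moving it between the integer and SSE units
   (mmxsse_to_integer), counted once per definition.  */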
3632 dimode_scalar_chain::compute_convert_gain ()
3634 bitmap_iterator bi;
3635 unsigned insn_uid;
3636 int gain = 0;
3637 int cost = 0;
3639 if (dump_file)
3640 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
3642 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
3644 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3645 rtx def_set = single_set (insn);
3646 rtx src = SET_SRC (def_set);
3647 rtx dst = SET_DEST (def_set);
3649 if (REG_P (src) && REG_P (dst))
3650 gain += COSTS_N_INSNS (2) - ix86_cost->sse_move;
3651 else if (REG_P (src) && MEM_P (dst))
3652 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3653 else if (MEM_P (src) && REG_P (dst))
3654 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
3655 else if (GET_CODE (src) == ASHIFT
3656 || GET_CODE (src) == ASHIFTRT
3657 || GET_CODE (src) == LSHIFTRT)
3659 if (CONST_INT_P (XEXP (src, 0)))
3660 gain -= vector_const_cost (XEXP (src, 0));
3661 if (CONST_INT_P (XEXP (src, 1)))
3663 gain += ix86_cost->shift_const;
3664 if (INTVAL (XEXP (src, 1)) >= 32)
3665 gain -= COSTS_N_INSNS (1);
3667 else
3668 /* Additional gain for omitting two CMOVs. */
3669 gain += ix86_cost->shift_var + COSTS_N_INSNS (2);
3671 else if (GET_CODE (src) == PLUS
3672 || GET_CODE (src) == MINUS
3673 || GET_CODE (src) == IOR
3674 || GET_CODE (src) == XOR
3675 || GET_CODE (src) == AND)
3677 gain += ix86_cost->add;
3678 /* Additional gain for andnot for targets without BMI. */
3679 if (GET_CODE (XEXP (src, 0)) == NOT
3680 && !TARGET_BMI)
3681 gain += 2 * ix86_cost->add;
3683 if (CONST_INT_P (XEXP (src, 0)))
3684 gain -= vector_const_cost (XEXP (src, 0));
3685 if (CONST_INT_P (XEXP (src, 1)))
3686 gain -= vector_const_cost (XEXP (src, 1));
3688 else if (GET_CODE (src) == NEG
3689 || GET_CODE (src) == NOT)
3690 gain += ix86_cost->add - COSTS_N_INSNS (1);
3691 else if (GET_CODE (src) == COMPARE)
3693 /* Assume comparison cost is the same. */
3695 else if (CONST_INT_P (src))
3697 if (REG_P (dst))
3698 gain += COSTS_N_INSNS (2);
3699 else if (MEM_P (dst))
3700 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3701 gain -= vector_const_cost (src);
3703 else
3704 gcc_unreachable ();
3707 if (dump_file)
3708 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
3710 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
3711 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
3713 if (dump_file)
3714 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
3716 gain -= cost;
3718 if (dump_file)
3719 fprintf (dump_file, " Total gain: %d\n", gain);
3721 return gain;
3724 /* Replace REG in X with a V2DI subreg of NEW_REG. */
3727 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
3729 if (x == reg)
3730 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
3732 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
3733 int i, j;
3734 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
3736 if (fmt[i] == 'e')
3737 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
3738 else if (fmt[i] == 'E')
3739 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
3740 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
3741 reg, new_reg);
3744 return x;
3747 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
3749 void
3750 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
3751 rtx reg, rtx new_reg)
3753 replace_with_subreg (single_set (insn), reg, new_reg);
3756 /* Insert generated conversion instruction sequence INSNS
3757 after instruction AFTER. A new BB may be required in case
3758 the instruction has an EH region attached. */
3760 void
3761 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
3763 if (!control_flow_insn_p (after))
3765 emit_insn_after (insns, after);
3766 return;
3769 basic_block bb = BLOCK_FOR_INSN (after);
3770 edge e = find_fallthru_edge (bb->succs);
3771 gcc_assert (e);
3773 basic_block new_bb = split_edge (e);
3774 emit_insn_after (insns, BB_HEAD (new_bb));
3777 /* Make vector copies for all definitions of register REGNO
3778 and replace its uses in the chain. */
3780 void
3781 dimode_scalar_chain::make_vector_copies (unsigned regno)
3783 rtx reg = regno_reg_rtx[regno];
3784 rtx vreg = gen_reg_rtx (DImode);
3785 bool count_reg = false;
3786 df_ref ref;
3788 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3789 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3791 df_ref use;
3793 /* Detect the count register of a shift instruction. */
3794 for (use = DF_REG_USE_CHAIN (regno); use; use = DF_REF_NEXT_REG (use))
3795 if (bitmap_bit_p (insns, DF_REF_INSN_UID (use)))
3797 rtx_insn *insn = DF_REF_INSN (use);
3798 rtx def_set = single_set (insn);
3800 gcc_assert (def_set);
3802 rtx src = SET_SRC (def_set);
3804 if ((GET_CODE (src) == ASHIFT
3805 || GET_CODE (src) == ASHIFTRT
3806 || GET_CODE (src) == LSHIFTRT)
3807 && !CONST_INT_P (XEXP (src, 1))
3808 && reg_or_subregno (XEXP (src, 1)) == regno)
3809 count_reg = true;
3812 start_sequence ();
3813 if (count_reg)
3815 rtx qreg = gen_lowpart (QImode, reg);
3816 rtx tmp = gen_reg_rtx (SImode);
3818 if (TARGET_ZERO_EXTEND_WITH_AND
3819 && optimize_function_for_speed_p (cfun))
3821 emit_move_insn (tmp, const0_rtx);
3822 emit_insn (gen_movstrictqi
3823 (gen_lowpart (QImode, tmp), qreg));
3825 else
3826 emit_insn (gen_rtx_SET
3827 (tmp, gen_rtx_ZERO_EXTEND (SImode, qreg)));
3829 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
3831 rtx slot = assign_386_stack_local (SImode, SLOT_STV_TEMP);
3832 emit_move_insn (slot, tmp);
3833 tmp = copy_rtx (slot);
3836 emit_insn (gen_zero_extendsidi2 (vreg, tmp));
3838 else if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
3840 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
3841 emit_move_insn (adjust_address (tmp, SImode, 0),
3842 gen_rtx_SUBREG (SImode, reg, 0));
3843 emit_move_insn (adjust_address (tmp, SImode, 4),
3844 gen_rtx_SUBREG (SImode, reg, 4));
3845 emit_move_insn (vreg, tmp);
3847 else if (TARGET_SSE4_1)
3849 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3850 CONST0_RTX (V4SImode),
3851 gen_rtx_SUBREG (SImode, reg, 0)));
3852 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
3853 gen_rtx_SUBREG (V4SImode, vreg, 0),
3854 gen_rtx_SUBREG (SImode, reg, 4),
3855 GEN_INT (2)));
3857 else
3859 rtx tmp = gen_reg_rtx (DImode);
3860 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3861 CONST0_RTX (V4SImode),
3862 gen_rtx_SUBREG (SImode, reg, 0)));
3863 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
3864 CONST0_RTX (V4SImode),
3865 gen_rtx_SUBREG (SImode, reg, 4)));
3866 emit_insn (gen_vec_interleave_lowv4si
3867 (gen_rtx_SUBREG (V4SImode, vreg, 0),
3868 gen_rtx_SUBREG (V4SImode, vreg, 0),
3869 gen_rtx_SUBREG (V4SImode, tmp, 0)));
3871 rtx_insn *seq = get_insns ();
3872 end_sequence ();
3873 rtx_insn *insn = DF_REF_INSN (ref);
3874 emit_conversion_insns (seq, insn);
3876 if (dump_file)
3877 fprintf (dump_file,
3878 " Copied r%d to a vector register r%d for insn %d\n",
3879 regno, REGNO (vreg), INSN_UID (insn));
3882 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3883 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3885 rtx_insn *insn = DF_REF_INSN (ref);
3886 if (count_reg)
3888 rtx def_set = single_set (insn);
3889 gcc_assert (def_set);
3891 rtx src = SET_SRC (def_set);
3893 if ((GET_CODE (src) == ASHIFT
3894 || GET_CODE (src) == ASHIFTRT
3895 || GET_CODE (src) == LSHIFTRT)
3896 && !CONST_INT_P (XEXP (src, 1))
3897 && reg_or_subregno (XEXP (src, 1)) == regno)
3898 XEXP (src, 1) = vreg;
3900 else
3901 replace_with_subreg_in_insn (insn, reg, vreg);
3903 if (dump_file)
3904 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
3905 regno, REGNO (vreg), INSN_UID (insn));
3909 /* Convert all definitions of register REGNO
3910 and fix its uses. Scalar copies may be created
3911 in case the register is used in a non-convertible insn. */
3913 void
3914 dimode_scalar_chain::convert_reg (unsigned regno)
3916 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
3917 rtx reg = regno_reg_rtx[regno];
3918 rtx scopy = NULL_RTX;
3919 df_ref ref;
3920 bitmap conv;
3922 conv = BITMAP_ALLOC (NULL);
3923 bitmap_copy (conv, insns);
3925 if (scalar_copy)
3926 scopy = gen_reg_rtx (DImode);
3928 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3930 rtx_insn *insn = DF_REF_INSN (ref);
3931 rtx def_set = single_set (insn);
3932 rtx src = SET_SRC (def_set);
3933 rtx reg = DF_REF_REG (ref);
3935 if (!MEM_P (src))
3937 replace_with_subreg_in_insn (insn, reg, reg);
3938 bitmap_clear_bit (conv, INSN_UID (insn));
3941 if (scalar_copy)
3943 start_sequence ();
3944 if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
3946 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
3947 emit_move_insn (tmp, reg);
3948 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3949 adjust_address (tmp, SImode, 0));
3950 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3951 adjust_address (tmp, SImode, 4));
3953 else if (TARGET_SSE4_1)
3955 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
3956 emit_insn
3957 (gen_rtx_SET
3958 (gen_rtx_SUBREG (SImode, scopy, 0),
3959 gen_rtx_VEC_SELECT (SImode,
3960 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
3962 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
3963 emit_insn
3964 (gen_rtx_SET
3965 (gen_rtx_SUBREG (SImode, scopy, 4),
3966 gen_rtx_VEC_SELECT (SImode,
3967 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
3969 else
3971 rtx vcopy = gen_reg_rtx (V2DImode);
3972 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
3973 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3974 gen_rtx_SUBREG (SImode, vcopy, 0));
3975 emit_move_insn (vcopy,
3976 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
3977 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3978 gen_rtx_SUBREG (SImode, vcopy, 0));
3980 rtx_insn *seq = get_insns ();
3981 end_sequence ();
3982 emit_conversion_insns (seq, insn);
3984 if (dump_file)
3985 fprintf (dump_file,
3986 " Copied r%d to a scalar register r%d for insn %d\n",
3987 regno, REGNO (scopy), INSN_UID (insn));
3991 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3992 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3994 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
3996 rtx_insn *insn = DF_REF_INSN (ref);
3998 rtx def_set = single_set (insn);
3999 gcc_assert (def_set);
4001 rtx src = SET_SRC (def_set);
4002 rtx dst = SET_DEST (def_set);
4004 if ((GET_CODE (src) == ASHIFT
4005 || GET_CODE (src) == ASHIFTRT
4006 || GET_CODE (src) == LSHIFTRT)
4007 && !CONST_INT_P (XEXP (src, 1))
4008 && reg_or_subregno (XEXP (src, 1)) == regno)
4010 rtx tmp2 = gen_reg_rtx (V2DImode);
4012 start_sequence ();
4014 if (TARGET_SSE4_1)
4015 emit_insn (gen_sse4_1_zero_extendv2qiv2di2
4016 (tmp2, gen_rtx_SUBREG (V16QImode, reg, 0)));
4017 else
4019 rtx vec_cst
4020 = gen_rtx_CONST_VECTOR (V2DImode,
4021 gen_rtvec (2, GEN_INT (0xff),
4022 const0_rtx));
4023 vec_cst
4024 = validize_mem (force_const_mem (V2DImode, vec_cst));
4026 emit_insn (gen_rtx_SET
4027 (tmp2,
4028 gen_rtx_AND (V2DImode,
4029 gen_rtx_SUBREG (V2DImode, reg, 0),
4030 vec_cst)));
4032 rtx_insn *seq = get_insns ();
4033 end_sequence ();
4035 emit_insn_before (seq, insn);
4037 XEXP (src, 1) = gen_rtx_SUBREG (DImode, tmp2, 0);
4039 else if (!MEM_P (dst) || !REG_P (src))
4040 replace_with_subreg_in_insn (insn, reg, reg);
4042 bitmap_clear_bit (conv, INSN_UID (insn));
4045 /* Skip debug insns and uninitialized uses. */
4046 else if (DF_REF_CHAIN (ref)
4047 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
4049 gcc_assert (scopy);
4050 replace_rtx (DF_REF_INSN (ref), reg, scopy);
4051 df_insn_rescan (DF_REF_INSN (ref));
4054 BITMAP_FREE (conv);
4057 /* Convert operand OP in INSN. We should handle
4058 memory operands and uninitialized registers.
4059 All other register uses are converted during
4060 register conversion. */
4062 void
4063 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
4065 *op = copy_rtx_if_shared (*op);
4067 if (GET_CODE (*op) == NOT)
4069 convert_op (&XEXP (*op, 0), insn);
4070 PUT_MODE (*op, V2DImode);
4072 else if (MEM_P (*op))
4074 rtx tmp = gen_reg_rtx (DImode);
4076 emit_insn_before (gen_move_insn (tmp, *op), insn);
4077 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
4079 if (dump_file)
4080 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
4081 INSN_UID (insn), REGNO (tmp));
4083 else if (REG_P (*op))
4085 /* We may not have converted the register use in case
4086 this register has no definition. Otherwise it
4087 should have been converted in convert_reg. */
4088 df_ref ref;
4089 FOR_EACH_INSN_USE (ref, insn)
4090 if (DF_REF_REGNO (ref) == REGNO (*op))
4092 gcc_assert (!DF_REF_CHAIN (ref));
4093 break;
4095 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
4097 else if (CONST_INT_P (*op))
4099 rtx vec_cst;
4100 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
4102 /* Prefer an all-ones vector in case of -1. */
4103 if (constm1_operand (*op, GET_MODE (*op)))
4104 vec_cst = CONSTM1_RTX (V2DImode);
4105 else
4106 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
4107 gen_rtvec (2, *op, const0_rtx));
4109 if (!standard_sse_constant_p (vec_cst, V2DImode))
4111 start_sequence ();
4112 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
4113 rtx_insn *seq = get_insns ();
4114 end_sequence ();
4115 emit_insn_before (seq, insn);
4118 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
4119 *op = tmp;
4121 else
4123 gcc_assert (SUBREG_P (*op));
4124 gcc_assert (GET_MODE (*op) == V2DImode);
4128 /* Convert INSN to vector mode. */
4130 void
4131 dimode_scalar_chain::convert_insn (rtx_insn *insn)
4133 rtx def_set = single_set (insn);
4134 rtx src = SET_SRC (def_set);
4135 rtx dst = SET_DEST (def_set);
4136 rtx subreg;
4138 if (MEM_P (dst) && !REG_P (src))
4140 /* There are no scalar integer instructions and therefore
4141 temporary register usage is required. */
4142 rtx tmp = gen_reg_rtx (DImode);
4143 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
4144 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
4147 switch (GET_CODE (src))
4149 case ASHIFT:
4150 case ASHIFTRT:
4151 case LSHIFTRT:
4152 convert_op (&XEXP (src, 0), insn);
4153 PUT_MODE (src, V2DImode);
4154 break;
4156 case PLUS:
4157 case MINUS:
4158 case IOR:
4159 case XOR:
4160 case AND:
4161 convert_op (&XEXP (src, 0), insn);
4162 convert_op (&XEXP (src, 1), insn);
4163 PUT_MODE (src, V2DImode);
4164 break;
4166 case NEG:
4167 src = XEXP (src, 0);
4168 convert_op (&src, insn);
4169 subreg = gen_reg_rtx (V2DImode);
4170 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
4171 src = gen_rtx_MINUS (V2DImode, subreg, src);
4172 break;
4174 case NOT:
4175 src = XEXP (src, 0);
4176 convert_op (&src, insn);
4177 subreg = gen_reg_rtx (V2DImode);
4178 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
4179 src = gen_rtx_XOR (V2DImode, src, subreg);
4180 break;
4182 case MEM:
4183 if (!REG_P (dst))
4184 convert_op (&src, insn);
4185 break;
4187 case REG:
4188 if (!MEM_P (dst))
4189 convert_op (&src, insn);
4190 break;
4192 case SUBREG:
4193 gcc_assert (GET_MODE (src) == V2DImode);
4194 break;
4196 case COMPARE:
4197 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
4199 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
4200 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
4202 if (REG_P (src))
4203 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
4204 else
4205 subreg = copy_rtx_if_shared (src);
4206 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
4207 copy_rtx_if_shared (subreg),
4208 copy_rtx_if_shared (subreg)),
4209 insn);
4210 dst = gen_rtx_REG (CCmode, FLAGS_REG);
4211 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
4212 copy_rtx_if_shared (src)),
4213 UNSPEC_PTEST);
4214 break;
4216 case CONST_INT:
4217 convert_op (&src, insn);
4218 break;
4220 default:
4221 gcc_unreachable ();
4224 SET_SRC (def_set) = src;
4225 SET_DEST (def_set) = dst;
4227 /* Drop possible dead definitions. */
4228 PATTERN (insn) = def_set;
4230 INSN_CODE (insn) = -1;
4231 recog_memoized (insn);
4232 df_insn_rescan (insn);
4235 /* Fix uses of converted REG in debug insns. */
4237 void
4238 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
4240 if (!flag_var_tracking)
4241 return;
4243 df_ref ref, next;
4244 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
4246 rtx_insn *insn = DF_REF_INSN (ref);
4247 /* Make sure the next ref is for a different instruction,
4248 so that we're not affected by the rescan. */
4249 next = DF_REF_NEXT_REG (ref);
4250 while (next && DF_REF_INSN (next) == insn)
4251 next = DF_REF_NEXT_REG (next);
4253 if (DEBUG_INSN_P (insn))
4255 /* It may be a debug insn with a TImode variable in
4256 register. */
4257 bool changed = false;
4258 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
4260 rtx *loc = DF_REF_LOC (ref);
4261 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
4263 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
4264 changed = true;
4267 if (changed)
4268 df_insn_rescan (insn);
4273 /* Convert INSN from TImode to V1TImode. */
4275 void
4276 timode_scalar_chain::convert_insn (rtx_insn *insn)
4278 rtx def_set = single_set (insn);
4279 rtx src = SET_SRC (def_set);
4280 rtx dst = SET_DEST (def_set);
4282 switch (GET_CODE (dst))
4284 case REG:
4286 rtx tmp = find_reg_equal_equiv_note (insn);
4287 if (tmp)
4288 PUT_MODE (XEXP (tmp, 0), V1TImode);
4289 PUT_MODE (dst, V1TImode);
4290 fix_debug_reg_uses (dst);
4292 break;
4293 case MEM:
4294 PUT_MODE (dst, V1TImode);
4295 break;
4297 default:
4298 gcc_unreachable ();
4301 switch (GET_CODE (src))
4303 case REG:
4304 PUT_MODE (src, V1TImode);
4305 /* Call fix_debug_reg_uses only if SRC is never defined. */
4306 if (!DF_REG_DEF_CHAIN (REGNO (src)))
4307 fix_debug_reg_uses (src);
4308 break;
4310 case MEM:
4311 PUT_MODE (src, V1TImode);
4312 break;
4314 case CONST_WIDE_INT:
4315 if (NONDEBUG_INSN_P (insn))
4317 /* Since there are no instructions to store a 128-bit constant,
4318 temporary register usage is required. */
4319 rtx tmp = gen_reg_rtx (V1TImode);
4320 start_sequence ();
4321 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
4322 src = validize_mem (force_const_mem (V1TImode, src));
4323 rtx_insn *seq = get_insns ();
4324 end_sequence ();
4325 if (seq)
4326 emit_insn_before (seq, insn);
4327 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
4328 dst = tmp;
4330 break;
4332 case CONST_INT:
4333 switch (standard_sse_constant_p (src, TImode))
4335 case 1:
4336 src = CONST0_RTX (GET_MODE (dst));
4337 break;
4338 case 2:
4339 src = CONSTM1_RTX (GET_MODE (dst));
4340 break;
4341 default:
4342 gcc_unreachable ();
4344 if (NONDEBUG_INSN_P (insn))
4346 rtx tmp = gen_reg_rtx (V1TImode);
4347 /* Since there are no instructions to store a standard SSE
4348 constant, temporary register usage is required. */
4349 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
4350 dst = tmp;
4352 break;
4354 default:
4355 gcc_unreachable ();
4358 SET_SRC (def_set) = src;
4359 SET_DEST (def_set) = dst;
4361 /* Drop possible dead definitions. */
4362 PATTERN (insn) = def_set;
4364 INSN_CODE (insn) = -1;
4365 recog_memoized (insn);
4366 df_insn_rescan (insn);
4369 void
4370 dimode_scalar_chain::convert_registers ()
4372 bitmap_iterator bi;
4373 unsigned id;
4375 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
4376 convert_reg (id);
4378 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
4379 make_vector_copies (id);
4382 /* Convert the whole chain, creating the required register
4383 conversions and copies. */
4386 scalar_chain::convert ()
4388 bitmap_iterator bi;
4389 unsigned id;
4390 int converted_insns = 0;
4392 if (!dbg_cnt (stv_conversion))
4393 return 0;
4395 if (dump_file)
4396 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
4398 convert_registers ();
4400 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
4402 convert_insn (DF_INSN_UID_GET (id)->insn);
4403 converted_insns++;
4406 return converted_insns;
4409 /* Main STV pass function. Find and convert scalar
4410 instructions into vector mode when profitable. */
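/* The pass proceeds in four steps: collect single_set insns accepted by
   scalar_to_vector_candidate_p, prune registers that have both convertible
   and non-convertible definitions, grow a chain from each remaining
   candidate, and convert a chain only when its estimated gain is
   positive.  */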
4412 static unsigned int
4413 convert_scalars_to_vector ()
4415 basic_block bb;
4416 bitmap candidates;
4417 int converted_insns = 0;
4419 bitmap_obstack_initialize (NULL);
4420 candidates = BITMAP_ALLOC (NULL);
4422 calculate_dominance_info (CDI_DOMINATORS);
4423 df_set_flags (DF_DEFER_INSN_RESCAN);
4424 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
4425 df_md_add_problem ();
4426 df_analyze ();
4428 /* Find all instructions we want to convert into vector mode. */
4429 if (dump_file)
4430 fprintf (dump_file, "Searching for mode conversion candidates...\n");
4432 FOR_EACH_BB_FN (bb, cfun)
4434 rtx_insn *insn;
4435 FOR_BB_INSNS (bb, insn)
4436 if (scalar_to_vector_candidate_p (insn))
4438 if (dump_file)
4439 fprintf (dump_file, " insn %d is marked as a candidate\n",
4440 INSN_UID (insn));
4442 bitmap_set_bit (candidates, INSN_UID (insn));
4446 remove_non_convertible_regs (candidates);
4448 if (bitmap_empty_p (candidates))
4449 if (dump_file)
4450 fprintf (dump_file, "There are no candidates for optimization.\n");
4452 while (!bitmap_empty_p (candidates))
4454 unsigned uid = bitmap_first_set_bit (candidates);
4455 scalar_chain *chain;
4457 if (TARGET_64BIT)
4458 chain = new timode_scalar_chain;
4459 else
4460 chain = new dimode_scalar_chain;
4462 /* Find instructions chain we want to convert to vector mode.
4463 Check all uses and definitions to estimate all required
4464 conversions. */
4465 chain->build (candidates, uid);
4467 if (chain->compute_convert_gain () > 0)
4468 converted_insns += chain->convert ();
4469 else
4470 if (dump_file)
4471 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
4472 chain->chain_id);
4474 delete chain;
4477 if (dump_file)
4478 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
4480 BITMAP_FREE (candidates);
4481 bitmap_obstack_release (NULL);
4482 df_process_deferred_rescans ();
4484 /* Conversion means we may have 128-bit register spills/fills
4485 which require an aligned stack. */
4486 if (converted_insns)
4488 if (crtl->stack_alignment_needed < 128)
4489 crtl->stack_alignment_needed = 128;
4490 if (crtl->stack_alignment_estimated < 128)
4491 crtl->stack_alignment_estimated = 128;
4492 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
4493 if (TARGET_64BIT)
4494 for (tree parm = DECL_ARGUMENTS (current_function_decl);
4495 parm; parm = DECL_CHAIN (parm))
4497 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
4498 continue;
4499 if (DECL_RTL_SET_P (parm)
4500 && GET_MODE (DECL_RTL (parm)) == V1TImode)
4502 rtx r = DECL_RTL (parm);
4503 if (REG_P (r))
4504 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
4506 if (DECL_INCOMING_RTL (parm)
4507 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
4509 rtx r = DECL_INCOMING_RTL (parm);
4510 if (REG_P (r))
4511 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
4516 return 0;
4519 namespace {
4521 const pass_data pass_data_insert_vzeroupper =
4523 RTL_PASS, /* type */
4524 "vzeroupper", /* name */
4525 OPTGROUP_NONE, /* optinfo_flags */
4526 TV_MACH_DEP, /* tv_id */
4527 0, /* properties_required */
4528 0, /* properties_provided */
4529 0, /* properties_destroyed */
4530 0, /* todo_flags_start */
4531 TODO_df_finish, /* todo_flags_finish */
4534 class pass_insert_vzeroupper : public rtl_opt_pass
4536 public:
4537 pass_insert_vzeroupper(gcc::context *ctxt)
4538 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
4541 /* opt_pass methods: */
4542 virtual bool gate (function *)
4544 return TARGET_AVX && !TARGET_AVX512F
4545 && TARGET_VZEROUPPER && flag_expensive_optimizations
4546 && !optimize_size;
4549 virtual unsigned int execute (function *)
4551 return rest_of_handle_insert_vzeroupper ();
4554 }; // class pass_insert_vzeroupper
4556 const pass_data pass_data_stv =
4558 RTL_PASS, /* type */
4559 "stv", /* name */
4560 OPTGROUP_NONE, /* optinfo_flags */
4561 TV_MACH_DEP, /* tv_id */
4562 0, /* properties_required */
4563 0, /* properties_provided */
4564 0, /* properties_destroyed */
4565 0, /* todo_flags_start */
4566 TODO_df_finish, /* todo_flags_finish */
4569 class pass_stv : public rtl_opt_pass
4571 public:
4572 pass_stv (gcc::context *ctxt)
4573 : rtl_opt_pass (pass_data_stv, ctxt),
4574 timode_p (false)
4577 /* opt_pass methods: */
4578 virtual bool gate (function *)
4580 return (timode_p == !!TARGET_64BIT
4581 && TARGET_STV && TARGET_SSE2 && optimize > 1);
4584 virtual unsigned int execute (function *)
4586 return convert_scalars_to_vector ();
4589 opt_pass *clone ()
4591 return new pass_stv (m_ctxt);
4594 void set_pass_param (unsigned int n, bool param)
4596 gcc_assert (n == 0);
4597 timode_p = param;
4600 private:
4601 bool timode_p;
4602 }; // class pass_stv
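/* The STV pass can be instantiated more than once via clone ();
   set_pass_param selects the TImode (64-bit) or DImode (32-bit) variant,
   and the gate only fires when the selected variant matches TARGET_64BIT
   and STV/SSE2 are enabled.  */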
4604 } // anon namespace
4606 rtl_opt_pass *
4607 make_pass_insert_vzeroupper (gcc::context *ctxt)
4609 return new pass_insert_vzeroupper (ctxt);
4612 rtl_opt_pass *
4613 make_pass_stv (gcc::context *ctxt)
4615 return new pass_stv (ctxt);
4618 /* Return true if a red-zone is in use. */
4620 bool
4621 ix86_using_red_zone (void)
4623 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
4626 /* Return a string that documents the current -m options. The caller is
4627 responsible for freeing the string. */
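/* For instance (illustrative values only), a 64-bit configuration built
   with -march=haswell -mtune=generic -mavx2 and SSE math might produce a
   string of the form
   "-march=haswell -mtune=generic -m64 -mavx2 -mfpmath=sse".  */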
4629 static char *
4630 ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
4631 int flags, int flags2,
4632 const char *arch, const char *tune,
4633 enum fpmath_unit fpmath, bool add_nl_p)
4635 struct ix86_target_opts
4637 const char *option; /* option string */
4638 HOST_WIDE_INT mask; /* isa mask options */
4641 /* This table is ordered so that options like -msse4.2 that imply other
4642 ISAs come first. The target string will be displayed in the same order. */
4643 static struct ix86_target_opts isa2_opts[] =
4645 { "-mrdpid", OPTION_MASK_ISA_RDPID },
4646 { "-msgx", OPTION_MASK_ISA_SGX },
4647 { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
4648 { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
4649 { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ }
4651 static struct ix86_target_opts isa_opts[] =
4653 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
4654 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
4655 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
4656 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
4657 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
4658 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
4659 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
4660 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
4661 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
4662 { "-mavx2", OPTION_MASK_ISA_AVX2 },
4663 { "-mfma", OPTION_MASK_ISA_FMA },
4664 { "-mxop", OPTION_MASK_ISA_XOP },
4665 { "-mfma4", OPTION_MASK_ISA_FMA4 },
4666 { "-mf16c", OPTION_MASK_ISA_F16C },
4667 { "-mavx", OPTION_MASK_ISA_AVX },
4668 /* { "-msse4" OPTION_MASK_ISA_SSE4 }, */
4669 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
4670 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
4671 { "-msse4a", OPTION_MASK_ISA_SSE4A },
4672 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
4673 { "-msse3", OPTION_MASK_ISA_SSE3 },
4674 { "-maes", OPTION_MASK_ISA_AES },
4675 { "-msha", OPTION_MASK_ISA_SHA },
4676 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
4677 { "-msse2", OPTION_MASK_ISA_SSE2 },
4678 { "-msse", OPTION_MASK_ISA_SSE },
4679 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
4680 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
4681 { "-mmmx", OPTION_MASK_ISA_MMX },
4682 { "-mrtm", OPTION_MASK_ISA_RTM },
4683 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
4684 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
4685 { "-madx", OPTION_MASK_ISA_ADX },
4686 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
4687 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
4688 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
4689 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
4690 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
4691 { "-mxsave", OPTION_MASK_ISA_XSAVE },
4692 { "-mabm", OPTION_MASK_ISA_ABM },
4693 { "-mbmi", OPTION_MASK_ISA_BMI },
4694 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
4695 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
4696 { "-mtbm", OPTION_MASK_ISA_TBM },
4697 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
4698 { "-mcx16", OPTION_MASK_ISA_CX16 },
4699 { "-msahf", OPTION_MASK_ISA_SAHF },
4700 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
4701 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
4702 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
4703 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
4704 { "-mmwaitx", OPTION_MASK_ISA_MWAITX },
4705 { "-mclzero", OPTION_MASK_ISA_CLZERO },
4706 { "-mpku", OPTION_MASK_ISA_PKU },
4707 { "-mlwp", OPTION_MASK_ISA_LWP },
4708 { "-mhle", OPTION_MASK_ISA_HLE },
4709 { "-mfxsr", OPTION_MASK_ISA_FXSR },
4710 { "-mmpx", OPTION_MASK_ISA_MPX },
4711 { "-mclwb", OPTION_MASK_ISA_CLWB }
4714 /* Flag options. */
4715 static struct ix86_target_opts flag_opts[] =
4717 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
4718 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
4719 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
4720 { "-m80387", MASK_80387 },
4721 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
4722 { "-malign-double", MASK_ALIGN_DOUBLE },
4723 { "-mcld", MASK_CLD },
4724 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
4725 { "-mieee-fp", MASK_IEEE_FP },
4726 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
4727 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
4728 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
4729 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
4730 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
4731 { "-mno-push-args", MASK_NO_PUSH_ARGS },
4732 { "-mno-red-zone", MASK_NO_RED_ZONE },
4733 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
4734 { "-mrecip", MASK_RECIP },
4735 { "-mrtd", MASK_RTD },
4736 { "-msseregparm", MASK_SSEREGPARM },
4737 { "-mstack-arg-probe", MASK_STACK_PROBE },
4738 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
4739 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
4740 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
4741 { "-mvzeroupper", MASK_VZEROUPPER },
4742 { "-mstv", MASK_STV },
4743 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
4744 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE },
4745 { "-mprefer-avx128", MASK_PREFER_AVX128 },
4746 { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES }
4749 /* Additional flag options. */
4750 static struct ix86_target_opts flag2_opts[] =
4752 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY },
4755 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts)
4756 + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2];
4758 char isa_other[40];
4759 char isa2_other[40];
4760 char flags_other[40];
4761 char flags2_other[40];
4762 unsigned num = 0;
4763 unsigned i, j;
4764 char *ret;
4765 char *ptr;
4766 size_t len;
4767 size_t line_len;
4768 size_t sep_len;
4769 const char *abi;
4771 memset (opts, '\0', sizeof (opts));
4773 /* Add -march= option. */
4774 if (arch)
4776 opts[num][0] = "-march=";
4777 opts[num++][1] = arch;
4780 /* Add -mtune= option. */
4781 if (tune)
4783 opts[num][0] = "-mtune=";
4784 opts[num++][1] = tune;
4787 /* Add -m32/-m64/-mx32. */
4788 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
4790 if ((isa & OPTION_MASK_ABI_64) != 0)
4791 abi = "-m64";
4792 else
4793 abi = "-mx32";
4794 isa &= ~ (OPTION_MASK_ISA_64BIT
4795 | OPTION_MASK_ABI_64
4796 | OPTION_MASK_ABI_X32);
4798 else
4799 abi = "-m32";
4800 opts[num++][0] = abi;
4802 /* Pick out the options in isa2 options. */
4803 for (i = 0; i < ARRAY_SIZE (isa2_opts); i++)
4805 if ((isa2 & isa2_opts[i].mask) != 0)
4807 opts[num++][0] = isa2_opts[i].option;
4808 isa2 &= ~ isa2_opts[i].mask;
4812 if (isa2 && add_nl_p)
4814 opts[num++][0] = isa2_other;
4815 sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2);
4818 /* Pick out the options in isa options. */
4819 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
4821 if ((isa & isa_opts[i].mask) != 0)
4823 opts[num++][0] = isa_opts[i].option;
4824 isa &= ~ isa_opts[i].mask;
4828 if (isa && add_nl_p)
4830 opts[num++][0] = isa_other;
4831 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa);
4834 /* Add flag options. */
4835 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
4837 if ((flags & flag_opts[i].mask) != 0)
4839 opts[num++][0] = flag_opts[i].option;
4840 flags &= ~ flag_opts[i].mask;
4844 if (flags && add_nl_p)
4846 opts[num++][0] = flags_other;
4847 sprintf (flags_other, "(other flags: %#x)", flags);
4850 /* Add additional flag options. */
4851 for (i = 0; i < ARRAY_SIZE (flag2_opts); i++)
4853 if ((flags2 & flag2_opts[i].mask) != 0)
4855 opts[num++][0] = flag2_opts[i].option;
4856 flags2 &= ~ flag2_opts[i].mask;
4860 if (flags2 && add_nl_p)
4862 opts[num++][0] = flags2_other;
4863 sprintf (flags2_other, "(other flags2: %#x)", flags2);
4866 /* Add -mfpmath= option. */
4867 if (fpmath)
4869 opts[num][0] = "-mfpmath=";
4870 switch ((int) fpmath)
4872 case FPMATH_387:
4873 opts[num++][1] = "387";
4874 break;
4876 case FPMATH_SSE:
4877 opts[num++][1] = "sse";
4878 break;
4880 case FPMATH_387 | FPMATH_SSE:
4881 opts[num++][1] = "sse+387";
4882 break;
4884 default:
4885 gcc_unreachable ();
4889 /* Any options? */
4890 if (num == 0)
4891 return NULL;
4893 gcc_assert (num < ARRAY_SIZE (opts));
4895 /* Size the string. */
4896 len = 0;
4897 sep_len = (add_nl_p) ? 3 : 1;
4898 for (i = 0; i < num; i++)
4900 len += sep_len;
4901 for (j = 0; j < 2; j++)
4902 if (opts[i][j])
4903 len += strlen (opts[i][j]);
4906 /* Build the string. */
4907 ret = ptr = (char *) xmalloc (len);
4908 line_len = 0;
4910 for (i = 0; i < num; i++)
4912 size_t len2[2];
4914 for (j = 0; j < 2; j++)
4915 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
4917 if (i != 0)
4919 *ptr++ = ' ';
4920 line_len++;
4922 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
4924 *ptr++ = '\\';
4925 *ptr++ = '\n';
4926 line_len = 0;
4930 for (j = 0; j < 2; j++)
4931 if (opts[i][j])
4933 memcpy (ptr, opts[i][j], len2[j]);
4934 ptr += len2[j];
4935 line_len += len2[j];
4939 *ptr = '\0';
4940 gcc_assert (ret + len >= ptr);
4942 return ret;
4945 /* Return true if profiling code should be emitted before the
4946 prologue; otherwise return false.
4947 Note: for x86 with "hotfix" a sorry () is issued. */
4948 static bool
4949 ix86_profile_before_prologue (void)
4951 return flag_fentry != 0;
4954 /* Function that is callable from the debugger to print the current
4955 options. */
4956 void ATTRIBUTE_UNUSED
4957 ix86_debug_options (void)
4959 char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
4960 target_flags, ix86_target_flags,
4961 ix86_arch_string,ix86_tune_string,
4962 ix86_fpmath, true);
4964 if (opts)
4966 fprintf (stderr, "%s\n\n", opts);
4967 free (opts);
4969 else
4970 fputs ("<no options>\n\n", stderr);
4972 return;
4975 /* Return true if T is one of the bytes we should avoid with
4976 -fmitigate-rop. */
4978 static bool
4979 ix86_rop_should_change_byte_p (int t)
4981 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
4984 static const char *stringop_alg_names[] = {
4985 #define DEF_ENUM
4986 #define DEF_ALG(alg, name) #name,
4987 #include "stringop.def"
4988 #undef DEF_ENUM
4989 #undef DEF_ALG
4992 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
4993 The string is of the following form (or comma separated list of it):
4995 strategy_alg:max_size:[align|noalign]
4997 where the full size range for the strategy is either [0, max_size] or
4998 [min_size, max_size], in which min_size is the max_size + 1 of the
4999 preceding range. The last size range must have max_size == -1.
5001 Examples:
5004 -mmemcpy-strategy=libcall:-1:noalign
5006 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
5010 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
5012 This is to tell the compiler to use the following strategy for memset
5013 1) when the expected size is between [1, 16], use rep_8byte strategy;
5014 2) when the size is between [17, 2048], use vector_loop;
5015 3) when the size is > 2048, use libcall. */
5017 struct stringop_size_range
5019 int max;
5020 stringop_alg alg;
5021 bool noalign;
5024 static void
5025 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
5027 const struct stringop_algs *default_algs;
5028 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
5029 char *curr_range_str, *next_range_str;
5030 const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=";
5031 int i = 0, n = 0;
5033 if (is_memset)
5034 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
5035 else
5036 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
5038 curr_range_str = strategy_str;
5042 int maxs;
5043 char alg_name[128];
5044 char align[16];
5045 next_range_str = strchr (curr_range_str, ',');
5046 if (next_range_str)
5047 *next_range_str++ = '\0';
5049 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
5050 alg_name, &maxs, align))
5052 error ("wrong argument %qs to option %qs", curr_range_str, opt);
5053 return;
5056 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
5058 error ("size ranges of option %qs should be increasing", opt);
5059 return;
5062 for (i = 0; i < last_alg; i++)
5063 if (!strcmp (alg_name, stringop_alg_names[i]))
5064 break;
5066 if (i == last_alg)
5068 error ("wrong strategy name %qs specified for option %qs",
5069 alg_name, opt);
5071 auto_vec <const char *> candidates;
5072 for (i = 0; i < last_alg; i++)
5073 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
5074 candidates.safe_push (stringop_alg_names[i]);
5076 char *s;
5077 const char *hint
5078 = candidates_list_and_hint (alg_name, s, candidates);
5079 if (hint)
5080 inform (input_location,
5081 "valid arguments to %qs are: %s; did you mean %qs?",
5082 opt, s, hint);
5083 else
5084 inform (input_location, "valid arguments to %qs are: %s",
5085 opt, s);
5086 XDELETEVEC (s);
5087 return;
5090 if ((stringop_alg) i == rep_prefix_8_byte
5091 && !TARGET_64BIT)
5093 /* rep; movq isn't available in 32-bit code. */
5094 error ("strategy name %qs specified for option %qs "
5095 "not supported for 32-bit code", alg_name, opt);
5096 return;
5099 input_ranges[n].max = maxs;
5100 input_ranges[n].alg = (stringop_alg) i;
5101 if (!strcmp (align, "align"))
5102 input_ranges[n].noalign = false;
5103 else if (!strcmp (align, "noalign"))
5104 input_ranges[n].noalign = true;
5105 else
5107 error ("unknown alignment %qs specified for option %qs", align, opt);
5108 return;
5110 n++;
5111 curr_range_str = next_range_str;
5113 while (curr_range_str);
5115 if (input_ranges[n - 1].max != -1)
5117 error ("the max value for the last size range should be -1"
5118 " for option %qs", opt);
5119 return;
5122 if (n > MAX_STRINGOP_ALGS)
5124 error ("too many size ranges specified in option %qs", opt);
5125 return;
5128 /* Now override the default algs array. */
5129 for (i = 0; i < n; i++)
5131 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
5132 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
5133 = input_ranges[i].alg;
5134 *const_cast<int *>(&default_algs->size[i].noalign)
5135 = input_ranges[i].noalign;
5140 /* Parse the -mtune-ctrl= option. When DUMP is true,
5141 print the features that are explicitly set. */
5143 static void
5144 parse_mtune_ctrl_str (bool dump)
5146 if (!ix86_tune_ctrl_string)
5147 return;
5149 char *next_feature_string = NULL;
5150 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
5151 char *orig = curr_feature_string;
5152 int i;
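  /* The option value is a comma-separated list of feature names; a name
     prefixed with '^' clears the feature instead of setting it.  */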
5155 bool clear = false;
5157 next_feature_string = strchr (curr_feature_string, ',');
5158 if (next_feature_string)
5159 *next_feature_string++ = '\0';
5160 if (*curr_feature_string == '^')
5162 curr_feature_string++;
5163 clear = true;
5165 for (i = 0; i < X86_TUNE_LAST; i++)
5167 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
5169 ix86_tune_features[i] = !clear;
5170 if (dump)
5171 fprintf (stderr, "Explicitly %s feature %s\n",
5172 clear ? "clear" : "set", ix86_tune_feature_names[i]);
5173 break;
5176 if (i == X86_TUNE_LAST)
5177 error ("Unknown parameter to option -mtune-ctrl: %s",
5178 clear ? curr_feature_string - 1 : curr_feature_string);
5179 curr_feature_string = next_feature_string;
5181 while (curr_feature_string);
5182 free (orig);
5185 /* Helper function to set ix86_tune_features. IX86_TUNE is the
5186 processor type. */
5188 static void
5189 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
5191 unsigned int ix86_tune_mask = 1u << ix86_tune;
5192 int i;
5194 for (i = 0; i < X86_TUNE_LAST; ++i)
5196 if (ix86_tune_no_default)
5197 ix86_tune_features[i] = 0;
5198 else
5199 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
5202 if (dump)
5204 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
5205 for (i = 0; i < X86_TUNE_LAST; i++)
5206 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
5207 ix86_tune_features[i] ? "on" : "off");
5210 parse_mtune_ctrl_str (dump);
5214 /* Default align_* from the processor table. */
5216 static void
5217 ix86_default_align (struct gcc_options *opts)
5219 if (opts->x_align_loops == 0)
5221 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
5222 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
5224 if (opts->x_align_jumps == 0)
5226 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
5227 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
5229 if (opts->x_align_functions == 0)
5231 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
5235 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
5237 static void
5238 ix86_override_options_after_change (void)
5240 ix86_default_align (&global_options);
5243 /* Override various settings based on options. If MAIN_ARGS_P, the
5244 options are from the command line, otherwise they are from
5245 attributes. Return true if there's an error related to march
5246 option. */
5248 static bool
5249 ix86_option_override_internal (bool main_args_p,
5250 struct gcc_options *opts,
5251 struct gcc_options *opts_set)
5253 int i;
5254 unsigned int ix86_arch_mask;
5255 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
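  /* Feature bits used in the flags field of processor_alias_table below;
     each PTA_* bit records an ISA extension or property implied by
     selecting that CPU with -march=.  */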
5257 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
5258 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
5259 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
5260 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
5261 #define PTA_AES (HOST_WIDE_INT_1 << 4)
5262 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
5263 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
5264 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
5265 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
5266 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
5267 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
5268 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
5269 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
5270 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
5271 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
5272 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
5273 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
5274 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
5275 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
5276 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
5277 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
5278 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
5279 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
5280 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
5281 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
5282 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
5283 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
5284 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
5285 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
5286 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
5287 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
5288 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
5289 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
5290 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
5291 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
5292 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
5293 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
5294 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
5295 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
5296 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
5297 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
5298 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
5299 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
5300 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
5301 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
5302 #define PTA_SHA (HOST_WIDE_INT_1 << 45)
5303 #define PTA_PREFETCHWT1 (HOST_WIDE_INT_1 << 46)
5304 #define PTA_CLFLUSHOPT (HOST_WIDE_INT_1 << 47)
5305 #define PTA_XSAVEC (HOST_WIDE_INT_1 << 48)
5306 #define PTA_XSAVES (HOST_WIDE_INT_1 << 49)
5307 #define PTA_AVX512DQ (HOST_WIDE_INT_1 << 50)
5308 #define PTA_AVX512BW (HOST_WIDE_INT_1 << 51)
5309 #define PTA_AVX512VL (HOST_WIDE_INT_1 << 52)
5310 #define PTA_AVX512IFMA (HOST_WIDE_INT_1 << 53)
5311 #define PTA_AVX512VBMI (HOST_WIDE_INT_1 << 54)
5312 #define PTA_CLWB (HOST_WIDE_INT_1 << 55)
5313 #define PTA_MWAITX (HOST_WIDE_INT_1 << 56)
5314 #define PTA_CLZERO (HOST_WIDE_INT_1 << 57)
5315 #define PTA_NO_80387 (HOST_WIDE_INT_1 << 58)
5316 #define PTA_PKU (HOST_WIDE_INT_1 << 59)
5317 #define PTA_AVX5124VNNIW (HOST_WIDE_INT_1 << 60)
5318 #define PTA_AVX5124FMAPS (HOST_WIDE_INT_1 << 61)
5319 #define PTA_AVX512VPOPCNTDQ (HOST_WIDE_INT_1 << 62)
5320 #define PTA_SGX (HOST_WIDE_INT_1 << 63)
5322 #define PTA_CORE2 \
5323 (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \
5324 | PTA_CX16 | PTA_FXSR)
5325 #define PTA_NEHALEM \
5326 (PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_POPCNT)
5327 #define PTA_WESTMERE \
5328 (PTA_NEHALEM | PTA_AES | PTA_PCLMUL)
5329 #define PTA_SANDYBRIDGE \
5330 (PTA_WESTMERE | PTA_AVX | PTA_XSAVE | PTA_XSAVEOPT)
5331 #define PTA_IVYBRIDGE \
5332 (PTA_SANDYBRIDGE | PTA_FSGSBASE | PTA_RDRND | PTA_F16C)
5333 #define PTA_HASWELL \
5334 (PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_LZCNT \
5335 | PTA_FMA | PTA_MOVBE | PTA_HLE)
5336 #define PTA_BROADWELL \
5337 (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
5338 #define PTA_SKYLAKE \
5339 (PTA_BROADWELL | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES)
5340 #define PTA_SKYLAKE_AVX512 \
5341 (PTA_SKYLAKE | PTA_AVX512F | PTA_AVX512CD | PTA_AVX512VL \
5342 | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU)
5343 #define PTA_KNL \
5344 (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
5345 #define PTA_BONNELL \
5346 (PTA_CORE2 | PTA_MOVBE)
5347 #define PTA_SILVERMONT \
5348 (PTA_WESTMERE | PTA_MOVBE)
5350 /* If this reaches 64, the flags field in struct pta below needs to be widened.  */
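  /* Map each -march=/-mtune= name to a processor type, a scheduling model
     and the set of PTA_* flags it implies.  */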
5352 static struct pta
5354 const char *const name; /* processor name or nickname. */
5355 const enum processor_type processor;
5356 const enum attr_cpu schedule;
5357 const unsigned HOST_WIDE_INT flags;
5359 const processor_alias_table[] =
5361 {"i386", PROCESSOR_I386, CPU_NONE, 0},
5362 {"i486", PROCESSOR_I486, CPU_NONE, 0},
5363 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
5364 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
5365 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
5366 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
5367 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
5368 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5369 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5370 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5371 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5372 PTA_MMX | PTA_SSE | PTA_FXSR},
5373 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5374 PTA_MMX | PTA_SSE | PTA_FXSR},
5375 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5376 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5377 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5378 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5379 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
5380 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
5381 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
5382 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5383 PTA_MMX | PTA_SSE | PTA_FXSR},
5384 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5385 PTA_MMX | PTA_SSE | PTA_FXSR},
5386 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5387 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
5388 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
5389 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
5390 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
5391 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
5392 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
5393 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5394 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
5395 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5396 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
5397 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
5398 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
5399 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
5400 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
5401 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5402 PTA_SANDYBRIDGE},
5403 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5404 PTA_SANDYBRIDGE},
5405 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5406 PTA_IVYBRIDGE},
5407 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5408 PTA_IVYBRIDGE},
5409 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
5410 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
5411 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
5412 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
5413 {"skylake-avx512", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE_AVX512},
5414 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
5415 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
5416 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
5417 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
5418 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
5419 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
5420 {"geode", PROCESSOR_GEODE, CPU_GEODE,
5421 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5422 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
5423 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
5424 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
5425 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
5426 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5427 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
5428 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5429 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
5430 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5431 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
5432 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5433 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
5434 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5435 {"x86-64", PROCESSOR_K8, CPU_K8,
5436 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5437 {"eden-x2", PROCESSOR_K8, CPU_K8,
5438 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5439 {"nano", PROCESSOR_K8, CPU_K8,
5440 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5441 | PTA_SSSE3 | PTA_FXSR},
5442 {"nano-1000", PROCESSOR_K8, CPU_K8,
5443 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5444 | PTA_SSSE3 | PTA_FXSR},
5445 {"nano-2000", PROCESSOR_K8, CPU_K8,
5446 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5447 | PTA_SSSE3 | PTA_FXSR},
5448 {"nano-3000", PROCESSOR_K8, CPU_K8,
5449 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5450 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5451 {"nano-x2", PROCESSOR_K8, CPU_K8,
5452 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5453 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5454 {"eden-x4", PROCESSOR_K8, CPU_K8,
5455 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5456 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5457 {"nano-x4", PROCESSOR_K8, CPU_K8,
5458 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5459 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5460 {"k8", PROCESSOR_K8, CPU_K8,
5461 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5462 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5463 {"k8-sse3", PROCESSOR_K8, CPU_K8,
5464 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5465 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5466 {"opteron", PROCESSOR_K8, CPU_K8,
5467 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5468 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5469 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
5470 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5471 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5472 {"athlon64", PROCESSOR_K8, CPU_K8,
5473 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5474 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5475 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
5476 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5477 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5478 {"athlon-fx", PROCESSOR_K8, CPU_K8,
5479 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5480 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5481 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
5482 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
5483 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
5484 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
5485 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
5486 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
5487 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
5488 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5489 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5490 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5491 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
5492 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
5493 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5494 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5495 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5496 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
5497 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
5498 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
5499 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5500 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5501 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5502 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
5503 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
5504 | PTA_XSAVEOPT | PTA_FSGSBASE},
5505 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
5506 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5507 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5508 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5509 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
5510 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
5511 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
5512 | PTA_MOVBE | PTA_MWAITX},
5513 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
5514 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5515 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5516 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5517 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
5518 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
5519 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
5520 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
5521 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
5522 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
5523 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5524 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
5525 | PTA_FXSR | PTA_XSAVE},
5526 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
5527 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5528 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
5529 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
5530 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
5531 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
5533 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
5534 PTA_64BIT
5535 | PTA_HLE /* flags are only used for -march switch. */ },
5538 /* -mrecip options. */
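  /* Names accepted by -mrecip= and the RECIP_MASK_* bits they control;
     the option string itself is parsed further below.  */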
5539 static struct
5541 const char *string; /* option name */
5542 unsigned int mask; /* mask bits to set */
5544 const recip_options[] =
5546 { "all", RECIP_MASK_ALL },
5547 { "none", RECIP_MASK_NONE },
5548 { "div", RECIP_MASK_DIV },
5549 { "sqrt", RECIP_MASK_SQRT },
5550 { "vec-div", RECIP_MASK_VEC_DIV },
5551 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
5554 int const pta_size = ARRAY_SIZE (processor_alias_table);
5556 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
5557 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
5558 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5559 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
5560 #ifdef TARGET_BI_ARCH
5561 else
5563 #if TARGET_BI_ARCH == 1
5564 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
5565 is on and OPTION_MASK_ABI_X32 is off. We turn off
5566 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
5567 -mx32. */
5568 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5569 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5570 #else
5571 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
5572 on and OPTION_MASK_ABI_64 is off. We turn off
5573 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
5574 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
5575 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
5576 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
5577 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5578 #endif
5579 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5580 && TARGET_IAMCU_P (opts->x_target_flags))
5581 sorry ("Intel MCU psABI isn%'t supported in %s mode",
5582 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
5584 #endif
5586 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5588 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5589 OPTION_MASK_ABI_64 for TARGET_X32. */
5590 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5591 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5593 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
5594 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
5595 | OPTION_MASK_ABI_X32
5596 | OPTION_MASK_ABI_64);
5597 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
5599 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5600 OPTION_MASK_ABI_X32 for TARGET_LP64. */
5601 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5602 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5605 #ifdef SUBTARGET_OVERRIDE_OPTIONS
5606 SUBTARGET_OVERRIDE_OPTIONS;
5607 #endif
5609 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
5610 SUBSUBTARGET_OVERRIDE_OPTIONS;
5611 #endif
5613 /* -fPIC is the default for x86_64; on Darwin it must be set here explicitly. */
5614 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
5615 opts->x_flag_pic = 2;
5617 /* Need to check -mtune=generic first. */
5618 if (opts->x_ix86_tune_string)
5620 /* As special support for cross compilers we read -mtune=native
5621 as -mtune=generic. With native compilers we won't see the
5622 -mtune=native, as it was changed by the driver. */
5623 if (!strcmp (opts->x_ix86_tune_string, "native"))
5625 opts->x_ix86_tune_string = "generic";
5627 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5628 warning (OPT_Wdeprecated,
5629 main_args_p
5630 ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
5631 "or %<-mtune=generic%> instead as appropriate")
5632 : G_("%<target(\"tune=x86-64\")%> is deprecated; use "
5633 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%>"
5634 " instead as appropriate"));
5636 else
5638 if (opts->x_ix86_arch_string)
5639 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
5640 if (!opts->x_ix86_tune_string)
5642 opts->x_ix86_tune_string
5643 = processor_target_table[TARGET_CPU_DEFAULT].name;
5644 ix86_tune_defaulted = 1;
5647 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
5648 or defaulted. We need to use a sensible tune option. */
5649 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5651 opts->x_ix86_tune_string = "generic";
5655 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
5656 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5658 /* rep; movq isn't available in 32-bit code. */
5659 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
5660 opts->x_ix86_stringop_alg = no_stringop;
5663 if (!opts->x_ix86_arch_string)
5664 opts->x_ix86_arch_string
5665 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
5666 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
5667 else
5668 ix86_arch_specified = 1;
5670 if (opts_set->x_ix86_pmode)
5672 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
5673 && opts->x_ix86_pmode == PMODE_SI)
5674 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5675 && opts->x_ix86_pmode == PMODE_DI))
5676 error ("address mode %qs not supported in the %s bit mode",
5677 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
5678 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
5680 else
5681 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
5682 ? PMODE_DI : PMODE_SI;
5684 if (!opts_set->x_ix86_abi)
5685 opts->x_ix86_abi = DEFAULT_ABI;
5687 /* For targets using the MS ABI, enable ms-extensions unless it was
5688 explicitly turned off. For non-MS ABI targets we turn this
5689 option off. */
5690 if (!opts_set->x_flag_ms_extensions)
5691 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
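  /* Validate an explicitly requested code model against the selected
     ABI/word size; otherwise choose a default code model below.  */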
5693 if (opts_set->x_ix86_cmodel)
5695 switch (opts->x_ix86_cmodel)
5697 case CM_SMALL:
5698 case CM_SMALL_PIC:
5699 if (opts->x_flag_pic)
5700 opts->x_ix86_cmodel = CM_SMALL_PIC;
5701 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5702 error ("code model %qs not supported in the %s bit mode",
5703 "small", "32");
5704 break;
5706 case CM_MEDIUM:
5707 case CM_MEDIUM_PIC:
5708 if (opts->x_flag_pic)
5709 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
5710 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5711 error ("code model %qs not supported in the %s bit mode",
5712 "medium", "32");
5713 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5714 error ("code model %qs not supported in x32 mode",
5715 "medium");
5716 break;
5718 case CM_LARGE:
5719 case CM_LARGE_PIC:
5720 if (opts->x_flag_pic)
5721 opts->x_ix86_cmodel = CM_LARGE_PIC;
5722 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5723 error ("code model %qs not supported in the %s bit mode",
5724 "large", "32");
5725 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5726 error ("code model %qs not supported in x32 mode",
5727 "large");
5728 break;
5730 case CM_32:
5731 if (opts->x_flag_pic)
5732 error ("code model %s does not support PIC mode", "32");
5733 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5734 error ("code model %qs not supported in the %s bit mode",
5735 "32", "64");
5736 break;
5738 case CM_KERNEL:
5739 if (opts->x_flag_pic)
5741 error ("code model %s does not support PIC mode", "kernel");
5742 opts->x_ix86_cmodel = CM_32;
5744 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5745 error ("code model %qs not supported in the %s bit mode",
5746 "kernel", "32");
5747 break;
5749 default:
5750 gcc_unreachable ();
5753 else
5755 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
5756 use of rip-relative addressing. This eliminates fixups that
5757 would otherwise be needed if this object is to be placed in a
5758 DLL, and is essentially just as efficient as direct addressing. */
5759 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5760 && (TARGET_RDOS || TARGET_PECOFF))
5761 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
5762 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5763 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
5764 else
5765 opts->x_ix86_cmodel = CM_32;
5767 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
5769 error ("-masm=intel not supported in this configuration");
5770 opts->x_ix86_asm_dialect = ASM_ATT;
5772 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
5773 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
5774 sorry ("%i-bit mode not compiled in",
5775 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
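  /* Look up -march= in processor_alias_table and turn on every ISA flag
     the selected CPU implies, unless the user already set that flag
     explicitly on the command line.  */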
5777 for (i = 0; i < pta_size; i++)
5778 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
5780 if (!strcmp (opts->x_ix86_arch_string, "generic"))
5782 error (main_args_p
5783 ? G_("%<generic%> CPU can be used only for %<-mtune=%> "
5784 "switch")
5785 : G_("%<generic%> CPU can be used only for "
5786 "%<target(\"tune=\")%> attribute"));
5787 return false;
5789 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
5791 error (main_args_p
5792 ? G_("%<intel%> CPU can be used only for %<-mtune=%> "
5793 "switch")
5794 : G_("%<intel%> CPU can be used only for "
5795 "%<target(\"tune=\")%> attribute"));
5796 return false;
5799 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5800 && !(processor_alias_table[i].flags & PTA_64BIT))
5802 error ("CPU you selected does not support x86-64 "
5803 "instruction set");
5804 return false;
5807 ix86_schedule = processor_alias_table[i].schedule;
5808 ix86_arch = processor_alias_table[i].processor;
5809 /* Default cpu tuning to the architecture. */
5810 ix86_tune = ix86_arch;
5812 if (processor_alias_table[i].flags & PTA_MMX
5813 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
5814 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
5815 if (processor_alias_table[i].flags & PTA_3DNOW
5816 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
5817 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
5818 if (processor_alias_table[i].flags & PTA_3DNOW_A
5819 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
5820 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
5821 if (processor_alias_table[i].flags & PTA_SSE
5822 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
5823 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
5824 if (processor_alias_table[i].flags & PTA_SSE2
5825 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
5826 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
5827 if (processor_alias_table[i].flags & PTA_SSE3
5828 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
5829 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
5830 if (processor_alias_table[i].flags & PTA_SSSE3
5831 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
5832 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
5833 if (processor_alias_table[i].flags & PTA_SSE4_1
5834 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
5835 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
5836 if (processor_alias_table[i].flags & PTA_SSE4_2
5837 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
5838 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
5839 if (processor_alias_table[i].flags & PTA_AVX
5840 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
5841 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
5842 if (processor_alias_table[i].flags & PTA_AVX2
5843 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
5844 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
5845 if (processor_alias_table[i].flags & PTA_FMA
5846 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
5847 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
5848 if (processor_alias_table[i].flags & PTA_SSE4A
5849 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
5850 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
5851 if (processor_alias_table[i].flags & PTA_FMA4
5852 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
5853 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
5854 if (processor_alias_table[i].flags & PTA_XOP
5855 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
5856 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
5857 if (processor_alias_table[i].flags & PTA_LWP
5858 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
5859 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
5860 if (processor_alias_table[i].flags & PTA_ABM
5861 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
5862 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
5863 if (processor_alias_table[i].flags & PTA_BMI
5864 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
5865 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
5866 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
5867 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
5868 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
5869 if (processor_alias_table[i].flags & PTA_TBM
5870 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
5871 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
5872 if (processor_alias_table[i].flags & PTA_BMI2
5873 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
5874 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
5875 if (processor_alias_table[i].flags & PTA_CX16
5876 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
5877 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
5878 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
5879 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
5880 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
5881 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
5882 && (processor_alias_table[i].flags & PTA_NO_SAHF))
5883 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
5884 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
5885 if (processor_alias_table[i].flags & PTA_MOVBE
5886 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
5887 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
5888 if (processor_alias_table[i].flags & PTA_AES
5889 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
5890 ix86_isa_flags |= OPTION_MASK_ISA_AES;
5891 if (processor_alias_table[i].flags & PTA_SHA
5892 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
5893 ix86_isa_flags |= OPTION_MASK_ISA_SHA;
5894 if (processor_alias_table[i].flags & PTA_PCLMUL
5895 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
5896 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
5897 if (processor_alias_table[i].flags & PTA_FSGSBASE
5898 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
5899 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
5900 if (processor_alias_table[i].flags & PTA_RDRND
5901 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
5902 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
5903 if (processor_alias_table[i].flags & PTA_F16C
5904 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
5905 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
5906 if (processor_alias_table[i].flags & PTA_RTM
5907 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
5908 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
5909 if (processor_alias_table[i].flags & PTA_HLE
5910 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
5911 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
5912 if (processor_alias_table[i].flags & PTA_PRFCHW
5913 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
5914 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
5915 if (processor_alias_table[i].flags & PTA_RDSEED
5916 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
5917 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
5918 if (processor_alias_table[i].flags & PTA_ADX
5919 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
5920 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
5921 if (processor_alias_table[i].flags & PTA_FXSR
5922 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
5923 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
5924 if (processor_alias_table[i].flags & PTA_XSAVE
5925 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
5926 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
5927 if (processor_alias_table[i].flags & PTA_XSAVEOPT
5928 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
5929 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
5930 if (processor_alias_table[i].flags & PTA_AVX512F
5931 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
5932 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
5933 if (processor_alias_table[i].flags & PTA_AVX512ER
5934 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
5935 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
5936 if (processor_alias_table[i].flags & PTA_AVX512PF
5937 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
5938 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
5939 if (processor_alias_table[i].flags & PTA_AVX512CD
5940 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
5941 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
5942 if (processor_alias_table[i].flags & PTA_PREFETCHWT1
5943 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
5944 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
5945 if (processor_alias_table[i].flags & PTA_CLWB
5946 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
5947 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
5948 if (processor_alias_table[i].flags & PTA_CLFLUSHOPT
5949 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
5950 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
5951 if (processor_alias_table[i].flags & PTA_CLZERO
5952 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLZERO))
5953 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLZERO;
5954 if (processor_alias_table[i].flags & PTA_XSAVEC
5955 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
5956 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
5957 if (processor_alias_table[i].flags & PTA_XSAVES
5958 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
5959 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
5960 if (processor_alias_table[i].flags & PTA_AVX512DQ
5961 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
5962 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
5963 if (processor_alias_table[i].flags & PTA_AVX512BW
5964 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
5965 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
5966 if (processor_alias_table[i].flags & PTA_AVX512VL
5967 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
5968 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
5969 if (processor_alias_table[i].flags & PTA_MPX
5970 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MPX))
5971 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MPX;
5972 if (processor_alias_table[i].flags & PTA_AVX512VBMI
5973 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
5974 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
5975 if (processor_alias_table[i].flags & PTA_AVX512IFMA
5976 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
5977 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
5979 if (processor_alias_table[i].flags & PTA_AVX5124VNNIW
5980 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124VNNIW))
5981 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
5982 if (processor_alias_table[i].flags & PTA_AVX5124FMAPS
5983 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124FMAPS))
5984 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
5985 if (processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ
5986 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
5987 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
5988 if (processor_alias_table[i].flags & PTA_SGX
5989 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
5990 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
5992 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
5993 x86_prefetch_sse = true;
5994 if (processor_alias_table[i].flags & PTA_MWAITX
5995 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MWAITX))
5996 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MWAITX;
5997 if (processor_alias_table[i].flags & PTA_PKU
5998 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
5999 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
6001 /* Don't enable x87 instructions if only
6002 general registers are allowed. */
6003 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
6004 && !(opts_set->x_target_flags & MASK_80387))
6006 if (processor_alias_table[i].flags & PTA_NO_80387)
6007 opts->x_target_flags &= ~MASK_80387;
6008 else
6009 opts->x_target_flags |= MASK_80387;
6011 break;
6014 if (TARGET_X32 && (opts->x_ix86_isa_flags & OPTION_MASK_ISA_MPX))
6015 error ("Intel MPX does not support x32");
6020 if (i == pta_size)
6022 error (main_args_p
6023 ? G_("bad value (%qs) for %<-march=%> switch")
6024 : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"),
6025 opts->x_ix86_arch_string);
6027 auto_vec <const char *> candidates;
6028 for (i = 0; i < pta_size; i++)
6029 if (strcmp (processor_alias_table[i].name, "generic")
6030 && strcmp (processor_alias_table[i].name, "intel")
6031 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
6032 || (processor_alias_table[i].flags & PTA_64BIT)))
6033 candidates.safe_push (processor_alias_table[i].name);
6035 char *s;
6036 const char *hint
6037 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
6038 if (hint)
6039 inform (input_location,
6040 main_args_p
6041 ? G_("valid arguments to %<-march=%> switch are: "
6042 "%s; did you mean %qs?")
6043 : G_("valid arguments to %<target(\"arch=\")%> attribute are: "
6044 "%s; did you mean %qs?"), s, hint);
6045 else
6046 inform (input_location,
6047 main_args_p
6048 ? G_("valid arguments to %<-march=%> switch are: %s")
6049 : G_("valid arguments to %<target(\"arch=\")%> attribute "
6050 "are: %s"), s);
6051 XDELETEVEC (s);
6054 ix86_arch_mask = 1u << ix86_arch;
6055 for (i = 0; i < X86_ARCH_LAST; ++i)
6056 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
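  /* Now look up -mtune= in the same table to select the scheduling model
     and tuning target; unlike -march=, this does not enable ISA flags.  */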
6058 for (i = 0; i < pta_size; i++)
6059 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
6061 ix86_schedule = processor_alias_table[i].schedule;
6062 ix86_tune = processor_alias_table[i].processor;
6063 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6065 if (!(processor_alias_table[i].flags & PTA_64BIT))
6067 if (ix86_tune_defaulted)
6069 opts->x_ix86_tune_string = "x86-64";
6070 for (i = 0; i < pta_size; i++)
6071 if (! strcmp (opts->x_ix86_tune_string,
6072 processor_alias_table[i].name))
6073 break;
6074 ix86_schedule = processor_alias_table[i].schedule;
6075 ix86_tune = processor_alias_table[i].processor;
6077 else
6078 error ("CPU you selected does not support x86-64 "
6079 "instruction set");
6082 /* Intel CPUs have always interpreted SSE prefetch instructions as
6083 NOPs; so, we can enable SSE prefetch instructions even when
6084 -mtune (rather than -march) points us to a processor that has them.
6085 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
6086 higher processors. */
6087 if (TARGET_CMOV
6088 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
6089 x86_prefetch_sse = true;
6090 break;
6093 if (ix86_tune_specified && i == pta_size)
6095 error (main_args_p
6096 ? G_("bad value (%qs) for %<-mtune=%> switch")
6097 : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"),
6098 opts->x_ix86_tune_string);
6100 auto_vec <const char *> candidates;
6101 for (i = 0; i < pta_size; i++)
6102 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
6103 || (processor_alias_table[i].flags & PTA_64BIT))
6104 candidates.safe_push (processor_alias_table[i].name);
6106 char *s;
6107 const char *hint
6108 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
6109 if (hint)
6110 inform (input_location,
6111 main_args_p
6112 ? G_("valid arguments to %<-mtune=%> switch are: "
6113 "%s; did you mean %qs?")
6114 : G_("valid arguments to %<target(\"tune=\")%> attribute are: "
6115 "%s; did you mean %qs?"), s, hint);
6116 else
6117 inform (input_location,
6118 main_args_p
6119 ? G_("valid arguments to %<-mtune=%> switch are: %s")
6120 : G_("valid arguments to %<target(\"tune=\")%> attribute "
6121 "are: %s"), s);
6122 XDELETEVEC (s);
6125 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
6127 #ifndef USE_IX86_FRAME_POINTER
6128 #define USE_IX86_FRAME_POINTER 0
6129 #endif
6131 #ifndef USE_X86_64_FRAME_POINTER
6132 #define USE_X86_64_FRAME_POINTER 0
6133 #endif
6135 /* Set the default values for switches whose default depends on TARGET_64BIT
6136 in case they weren't overwritten by command line options. */
6137 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6139 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
6140 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
6141 if (opts->x_flag_asynchronous_unwind_tables
6142 && !opts_set->x_flag_unwind_tables
6143 && TARGET_64BIT_MS_ABI)
6144 opts->x_flag_unwind_tables = 1;
6145 if (opts->x_flag_asynchronous_unwind_tables == 2)
6146 opts->x_flag_unwind_tables
6147 = opts->x_flag_asynchronous_unwind_tables = 1;
6148 if (opts->x_flag_pcc_struct_return == 2)
6149 opts->x_flag_pcc_struct_return = 0;
6151 else
6153 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
6154 opts->x_flag_omit_frame_pointer
6155 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
6156 if (opts->x_flag_asynchronous_unwind_tables == 2)
6157 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
6158 if (opts->x_flag_pcc_struct_return == 2)
6160 /* Intel MCU psABI specifies that -freg-struct-return should
6161 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
6162 we check -miamcu so that -freg-struct-return is always
6163 turned on if -miamcu is used. */
6164 if (TARGET_IAMCU_P (opts->x_target_flags))
6165 opts->x_flag_pcc_struct_return = 0;
6166 else
6167 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
6171 ix86_tune_cost = processor_target_table[ix86_tune].cost;
6172 /* TODO: ix86_cost should be chosen at instruction or function granularity
6173 so that for cold code we use size_cost even in !optimize_size compilation. */
6174 if (opts->x_optimize_size)
6175 ix86_cost = &ix86_size_cost;
6176 else
6177 ix86_cost = ix86_tune_cost;
6179 /* Arrange to set up i386_stack_locals for all functions. */
6180 init_machine_status = ix86_init_machine_status;
6182 /* Validate -mregparm= value. */
6183 if (opts_set->x_ix86_regparm)
6185 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6186 warning (0, "-mregparm is ignored in 64-bit mode");
6187 else if (TARGET_IAMCU_P (opts->x_target_flags))
6188 warning (0, "-mregparm is ignored for Intel MCU psABI");
6189 if (opts->x_ix86_regparm > REGPARM_MAX)
6191 error ("-mregparm=%d is not between 0 and %d",
6192 opts->x_ix86_regparm, REGPARM_MAX);
6193 opts->x_ix86_regparm = 0;
6196 if (TARGET_IAMCU_P (opts->x_target_flags)
6197 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
6198 opts->x_ix86_regparm = REGPARM_MAX;
6200 /* Default align_* from the processor table. */
6201 ix86_default_align (opts);
6203 /* Provide default for -mbranch-cost= value. */
6204 if (!opts_set->x_ix86_branch_cost)
6205 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
6207 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6209 opts->x_target_flags
6210 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
6212 /* Enable by default the SSE and MMX builtins. Do allow the user to
6213 explicitly disable any of these. In particular, disabling SSE and
6214 MMX for kernel code is extremely useful. */
6215 if (!ix86_arch_specified)
6216 opts->x_ix86_isa_flags
6217 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
6218 | TARGET_SUBTARGET64_ISA_DEFAULT)
6219 & ~opts->x_ix86_isa_flags_explicit);
6221 if (TARGET_RTD_P (opts->x_target_flags))
6222 warning (0,
6223 main_args_p
6224 ? G_("%<-mrtd%> is ignored in 64bit mode")
6225 : G_("%<target(\"rtd\")%> is ignored in 64bit mode"));
6227 else
6229 opts->x_target_flags
6230 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
6232 if (!ix86_arch_specified)
6233 opts->x_ix86_isa_flags
6234 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
6236 /* The i386 ABI does not specify a red zone. It still makes sense to use one
6237 when the programmer takes care to keep the stack from being destroyed. */
6238 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
6239 opts->x_target_flags |= MASK_NO_RED_ZONE;
6242 /* Keep nonleaf frame pointers. */
6243 if (opts->x_flag_omit_frame_pointer)
6244 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
6245 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
6246 opts->x_flag_omit_frame_pointer = 1;
6248 /* If we're doing fast math, we don't care about comparison order
6249 wrt NaNs. This lets us use a shorter comparison sequence. */
6250 if (opts->x_flag_finite_math_only)
6251 opts->x_target_flags &= ~MASK_IEEE_FP;
6253 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
6254 since the insns won't need emulation. */
6255 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
6256 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
6258 /* Likewise, if the target doesn't have a 387, or we've specified
6259 software floating point, don't use 387 inline intrinsics. */
6260 if (!TARGET_80387_P (opts->x_target_flags))
6261 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
6263 /* Turn on MMX builtins for -msse. */
6264 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
6265 opts->x_ix86_isa_flags
6266 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
6268 /* Enable SSE prefetch. */
6269 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
6270 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
6271 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
6272 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
6273 x86_prefetch_sse = true;
6275 /* Enable popcnt instruction for -msse4.2 or -mabm. */
6276 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
6277 || TARGET_ABM_P (opts->x_ix86_isa_flags))
6278 opts->x_ix86_isa_flags
6279 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
6281 /* Enable lzcnt instruction for -mabm. */
6282 if (TARGET_ABM_P (opts->x_ix86_isa_flags))
6283 opts->x_ix86_isa_flags
6284 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
6286 /* Disable BMI, BMI2 and TBM instructions for -m16. */
6287 if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
6288 opts->x_ix86_isa_flags
6289 &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM)
6290 & ~opts->x_ix86_isa_flags_explicit);
6292 /* Validate -mpreferred-stack-boundary= value or default it to
6293 PREFERRED_STACK_BOUNDARY_DEFAULT. */
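  /* The option argument is log2 of the boundary in bytes, so e.g.
     -mpreferred-stack-boundary=4 requests 2^4 = 16-byte (128-bit)
     stack alignment.  */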
6294 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
6295 if (opts_set->x_ix86_preferred_stack_boundary_arg)
6297 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
6298 int max = TARGET_SEH ? 4 : 12;
6300 if (opts->x_ix86_preferred_stack_boundary_arg < min
6301 || opts->x_ix86_preferred_stack_boundary_arg > max)
6303 if (min == max)
6304 error ("-mpreferred-stack-boundary is not supported "
6305 "for this target");
6306 else
6307 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
6308 opts->x_ix86_preferred_stack_boundary_arg, min, max);
6310 else
6311 ix86_preferred_stack_boundary
6312 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
6315 /* Set the default value for -mstackrealign. */
6316 if (opts->x_ix86_force_align_arg_pointer == -1)
6317 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
6319 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
6321 /* Validate -mincoming-stack-boundary= value or default it to
6322 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
6323 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
6324 if (opts_set->x_ix86_incoming_stack_boundary_arg)
6326 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
6328 if (opts->x_ix86_incoming_stack_boundary_arg < min
6329 || opts->x_ix86_incoming_stack_boundary_arg > 12)
6330 error ("-mincoming-stack-boundary=%d is not between %d and 12",
6331 opts->x_ix86_incoming_stack_boundary_arg, min);
6332 else
6334 ix86_user_incoming_stack_boundary
6335 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
6336 ix86_incoming_stack_boundary
6337 = ix86_user_incoming_stack_boundary;
6341 #ifndef NO_PROFILE_COUNTERS
6342 if (flag_nop_mcount)
6343 error ("-mnop-mcount is not compatible with this target");
6344 #endif
6345 if (flag_nop_mcount && flag_pic)
6346 error ("-mnop-mcount is not implemented for -fPIC");
6348 /* Accept -msseregparm only if at least SSE support is enabled. */
6349 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
6350 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
6351 error (main_args_p
6352 ? G_("%<-msseregparm%> used without SSE enabled")
6353 : G_("%<target(\"sseregparm\")%> used without SSE enabled"));
6355 if (opts_set->x_ix86_fpmath)
6357 if (opts->x_ix86_fpmath & FPMATH_SSE)
6359 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
6361 if (TARGET_80387_P (opts->x_target_flags))
6363 warning (0, "SSE instruction set disabled, using 387 arithmetics");
6364 opts->x_ix86_fpmath = FPMATH_387;
6367 else if ((opts->x_ix86_fpmath & FPMATH_387)
6368 && !TARGET_80387_P (opts->x_target_flags))
6370 warning (0, "387 instruction set disabled, using SSE arithmetics");
6371 opts->x_ix86_fpmath = FPMATH_SSE;
6375 /* For all chips supporting SSE2, -mfpmath=sse performs better than
6376 -mfpmath=387. The latter is nevertheless the default on many targets, since the
6377 extra 80-bit precision of temporaries is considered to be part of the ABI.
6378 Overwrite the default at least for -ffast-math.
6379 TODO: -mfpmath=both seems to produce code of the same performance with slightly
6380 smaller binaries. It is however not clear whether register allocation is
6381 ready for this setting.
6382 Also, -mfpmath=387 codegen is overall a lot more compact (about 4-5%) than SSE
6383 codegen. We may switch to 387 with -ffast-math for size-optimized
6384 functions. */
6385 else if (fast_math_flags_set_p (&global_options)
6386 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
6387 opts->x_ix86_fpmath = FPMATH_SSE;
6388 else
6389 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
6391 /* Use external vectorized library in vectorizing intrinsics. */
6392 if (opts_set->x_ix86_veclibabi_type)
6393 switch (opts->x_ix86_veclibabi_type)
6395 case ix86_veclibabi_type_svml:
6396 ix86_veclib_handler = ix86_veclibabi_svml;
6397 break;
6399 case ix86_veclibabi_type_acml:
6400 ix86_veclib_handler = ix86_veclibabi_acml;
6401 break;
6403 default:
6404 gcc_unreachable ();
6407 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
6408 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6409 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6411 /* If stack probes are required, the space used for large function
6412 arguments on the stack must also be probed, so enable
6413 -maccumulate-outgoing-args so this happens in the prologue. */
6414 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
6415 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6417 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
6418 warning (0,
6419 main_args_p
6420 ? G_("stack probing requires %<-maccumulate-outgoing-args%> "
6421 "for correctness")
6422 : G_("stack probing requires "
6423 "%<target(\"accumulate-outgoing-args\")%> for "
6424 "correctness"));
6425 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6428 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
6429 so enable -maccumulate-outgoing-args when %ebp is fixed. */
6430 if (fixed_regs[BP_REG]
6431 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6433 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
6434 warning (0,
6435 main_args_p
6436 ? G_("fixed ebp register requires "
6437 "%<-maccumulate-outgoing-args%>")
6438 : G_("fixed ebp register requires "
6439 "%<target(\"accumulate-outgoing-args\")%>"));
6440 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6443 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
6445 char *p;
6446 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
6447 p = strchr (internal_label_prefix, 'X');
6448 internal_label_prefix_len = p - internal_label_prefix;
6449 *p = '\0';
6452 /* When scheduling description is not available, disable scheduler pass
6453 so it won't slow down the compilation and make x87 code slower. */
6454 if (!TARGET_SCHEDULE)
6455 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
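  /* Seed the prefetch and cache-size params from the tuned CPU's cost
     table unless the user already set them explicitly.  */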
6457 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
6458 ix86_tune_cost->simultaneous_prefetches,
6459 opts->x_param_values,
6460 opts_set->x_param_values);
6461 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
6462 ix86_tune_cost->prefetch_block,
6463 opts->x_param_values,
6464 opts_set->x_param_values);
6465 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
6466 ix86_tune_cost->l1_cache_size,
6467 opts->x_param_values,
6468 opts_set->x_param_values);
6469 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
6470 ix86_tune_cost->l2_cache_size,
6471 opts->x_param_values,
6472 opts_set->x_param_values);
6474 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
6475 if (opts->x_flag_prefetch_loop_arrays < 0
6476 && HAVE_prefetch
6477 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
6478 && !opts->x_optimize_size
6479 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
6480 opts->x_flag_prefetch_loop_arrays = 1;
6482 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
6483 can be optimized to ap = __builtin_next_arg (0). */
6484 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
6485 targetm.expand_builtin_va_start = NULL;
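  /* Select the DImode or SImode variants of the helper expanders used by
     prologue/epilogue generation and various builtins, depending on the
     target word size and Pmode.  */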
6487 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6489 ix86_gen_leave = gen_leave_rex64;
6490 if (Pmode == DImode)
6492 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
6493 ix86_gen_tls_local_dynamic_base_64
6494 = gen_tls_local_dynamic_base_64_di;
6496 else
6498 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
6499 ix86_gen_tls_local_dynamic_base_64
6500 = gen_tls_local_dynamic_base_64_si;
6503 else
6504 ix86_gen_leave = gen_leave;
6506 if (Pmode == DImode)
6508 ix86_gen_add3 = gen_adddi3;
6509 ix86_gen_sub3 = gen_subdi3;
6510 ix86_gen_sub3_carry = gen_subdi3_carry;
6511 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
6512 ix86_gen_andsp = gen_anddi3;
6513 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
6514 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
6515 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
6516 ix86_gen_monitor = gen_sse3_monitor_di;
6517 ix86_gen_monitorx = gen_monitorx_di;
6518 ix86_gen_clzero = gen_clzero_di;
6520 else
6522 ix86_gen_add3 = gen_addsi3;
6523 ix86_gen_sub3 = gen_subsi3;
6524 ix86_gen_sub3_carry = gen_subsi3_carry;
6525 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
6526 ix86_gen_andsp = gen_andsi3;
6527 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
6528 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
6529 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
6530 ix86_gen_monitor = gen_sse3_monitor_si;
6531 ix86_gen_monitorx = gen_monitorx_si;
6532 ix86_gen_clzero = gen_clzero_si;
6535 #ifdef USE_IX86_CLD
6536 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
6537 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
6538 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
6539 #endif
6541 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic)
6543 if (opts->x_flag_fentry > 0)
6544 sorry ("-mfentry isn%'t supported for 32-bit in combination "
6545 "with -fpic");
6546 opts->x_flag_fentry = 0;
6548 else if (TARGET_SEH)
6550 if (opts->x_flag_fentry == 0)
6551 sorry ("-mno-fentry isn%'t compatible with SEH");
6552 opts->x_flag_fentry = 1;
6554 else if (opts->x_flag_fentry < 0)
6556 #if defined(PROFILE_BEFORE_PROLOGUE)
6557 opts->x_flag_fentry = 1;
6558 #else
6559 opts->x_flag_fentry = 0;
6560 #endif
6563 if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES)
6564 sorry ("-mcall-ms2sysv-xlogues isn%'t currently supported with SEH");
6566 if (!(opts_set->x_target_flags & MASK_VZEROUPPER))
6567 opts->x_target_flags |= MASK_VZEROUPPER;
6568 if (!(opts_set->x_target_flags & MASK_STV))
6569 opts->x_target_flags |= MASK_STV;
6570 /* Disable STV if -mpreferred-stack-boundary={2,3} or
6571 -mincoming-stack-boundary={2,3} or -mstackrealign - the needed
6572 stack realignment would be an extra cost the pass doesn't take into
6573 account, and the pass can't realign the stack. */
6574 if (ix86_preferred_stack_boundary < 128
6575 || ix86_incoming_stack_boundary < 128
6576 || opts->x_ix86_force_align_arg_pointer)
6577 opts->x_target_flags &= ~MASK_STV;
6578 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
6579 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
6580 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
6581 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
6582 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
6583 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
6584 /* Enable 128-bit AVX instruction generation
6585 for the auto-vectorizer. */
6586 if (TARGET_AVX128_OPTIMAL
6587 && !(opts_set->x_target_flags & MASK_PREFER_AVX128))
6588 opts->x_target_flags |= MASK_PREFER_AVX128;
6590 if (opts->x_ix86_recip_name)
6592 char *p = ASTRDUP (opts->x_ix86_recip_name);
6593 char *q;
6594 unsigned int mask, i;
6595 bool invert;
6597 while ((q = strtok (p, ",")) != NULL)
6599 p = NULL;
6600 if (*q == '!')
6602 invert = true;
6603 q++;
6605 else
6606 invert = false;
6608 if (!strcmp (q, "default"))
6609 mask = RECIP_MASK_ALL;
6610 else
6612 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
6613 if (!strcmp (q, recip_options[i].string))
6615 mask = recip_options[i].mask;
6616 break;
6619 if (i == ARRAY_SIZE (recip_options))
6621 error ("unknown option for -mrecip=%s", q);
6622 invert = false;
6623 mask = RECIP_MASK_NONE;
6627 opts->x_recip_mask_explicit |= mask;
6628 if (invert)
6629 opts->x_recip_mask &= ~mask;
6630 else
6631 opts->x_recip_mask |= mask;
6635 if (TARGET_RECIP_P (opts->x_target_flags))
6636 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
6637 else if (opts_set->x_target_flags & MASK_RECIP)
6638 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
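/* For illustration (assuming the -mrecip= syntax documented for this option,
   with tokens such as "all", "div", "sqrt", "vec-div" and "vec-sqrt"), a
   command line like

	gcc -O3 -ffast-math -mrecip=all,!sqrt foo.c

   is consumed by the strtok loop above one token at a time: "all" ORs
   RECIP_MASK_ALL into opts->x_recip_mask, and the '!'-prefixed "sqrt" token
   then clears the scalar square-root bit again, so reciprocal approximations
   are emitted for everything except scalar sqrt.  */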
6640 /* Default long double to 64-bit for 32-bit Bionic and to __float128
6641 for 64-bit Bionic. Also default long double to 64-bit for Intel
6642 MCU psABI. */
6643 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
6644 && !(opts_set->x_target_flags
6645 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
6646 opts->x_target_flags |= (TARGET_64BIT
6647 ? MASK_LONG_DOUBLE_128
6648 : MASK_LONG_DOUBLE_64);
6650 /* Only one of them can be active. */
6651 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
6652 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
6654 /* Save the initial options in case the user uses function-specific
6655 options. */
6656 if (main_args_p)
6657 target_option_default_node = target_option_current_node
6658 = build_target_option_node (opts);
6660 /* Handle stack protector */
6661 if (!opts_set->x_ix86_stack_protector_guard)
6662 opts->x_ix86_stack_protector_guard
6663 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
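/* Sketch of the resulting default, assuming the -mstack-protector-guard=
   option values "tls" and "global": most targets end up with SSP_TLS (a
   canary read through the thread pointer, e.g. %gs:0x14 on 32-bit or
   %fs:0x28 on 64-bit GNU/Linux), Bionic ends up with the global
   __stack_chk_guard symbol, and either choice can be overridden explicitly
   with -mstack-protector-guard=global or =tls.  */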
6665 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
6666 if (opts->x_ix86_tune_memcpy_strategy)
6668 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
6669 ix86_parse_stringop_strategy_string (str, false);
6670 free (str);
6673 if (opts->x_ix86_tune_memset_strategy)
6675 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
6676 ix86_parse_stringop_strategy_string (str, true);
6677 free (str);
6680 return true;
6683 /* Implement the TARGET_OPTION_OVERRIDE hook. */
6685 static void
6686 ix86_option_override (void)
6688 ix86_option_override_internal (true, &global_options, &global_options_set);
6691 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
6692 static char *
6693 ix86_offload_options (void)
6695 if (TARGET_LP64)
6696 return xstrdup ("-foffload-abi=lp64");
6697 return xstrdup ("-foffload-abi=ilp32");
6700 /* Update register usage after having seen the compiler flags. */
6702 static void
6703 ix86_conditional_register_usage (void)
6705 int i, c_mask;
6707 /* If there are no caller-saved registers, preserve all registers
6708 except fixed_regs and registers used for the function return value,
6709 since aggregate_value_p checks call_used_regs[regno] on the return
6710 value. */
6711 if (cfun && cfun->machine->no_caller_saved_registers)
6712 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6713 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
6714 call_used_regs[i] = 0;
6716 /* For 32-bit targets, squash the REX registers. */
6717 if (! TARGET_64BIT)
6719 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
6720 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6721 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
6722 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6723 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6724 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6727 /* See the definition of CALL_USED_REGISTERS in i386.h. */
6728 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
6730 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
6732 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6734 /* Set/reset conditionally defined registers from
6735 CALL_USED_REGISTERS initializer. */
6736 if (call_used_regs[i] > 1)
6737 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
6739 /* Build the CLOBBERED_REGS register set as the call-used
6740 registers from the GENERAL_REGS register set. */
6741 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
6742 && call_used_regs[i])
6743 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
6746 /* If MMX is disabled, squash the registers. */
6747 if (! TARGET_MMX)
6748 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6749 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
6750 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6752 /* If SSE is disabled, squash the registers. */
6753 if (! TARGET_SSE)
6754 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6755 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
6756 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6758 /* If the FPU is disabled, squash the registers. */
6759 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
6760 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6761 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
6762 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6764 /* If AVX512F is disabled, squash the registers. */
6765 if (! TARGET_AVX512F)
6767 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6768 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6770 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
6771 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6774 /* If MPX is disabled, squash the registers. */
6775 if (! TARGET_MPX)
6776 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
6777 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6781 /* Save the current options */
6783 static void
6784 ix86_function_specific_save (struct cl_target_option *ptr,
6785 struct gcc_options *opts)
6787 ptr->arch = ix86_arch;
6788 ptr->schedule = ix86_schedule;
6789 ptr->prefetch_sse = x86_prefetch_sse;
6790 ptr->tune = ix86_tune;
6791 ptr->branch_cost = ix86_branch_cost;
6792 ptr->tune_defaulted = ix86_tune_defaulted;
6793 ptr->arch_specified = ix86_arch_specified;
6794 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
6795 ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
6796 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
6797 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
6798 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
6799 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
6800 ptr->x_ix86_abi = opts->x_ix86_abi;
6801 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
6802 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
6803 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
6804 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
6805 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
6806 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
6807 ptr->x_ix86_pmode = opts->x_ix86_pmode;
6808 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
6809 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
6810 ptr->x_ix86_regparm = opts->x_ix86_regparm;
6811 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
6812 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
6813 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
6814 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
6815 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
6816 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
6817 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
6818 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
6819 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
6820 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
6822 /* The fields are char but the variables are not; make sure the
6823 values fit in the fields. */
6824 gcc_assert (ptr->arch == ix86_arch);
6825 gcc_assert (ptr->schedule == ix86_schedule);
6826 gcc_assert (ptr->tune == ix86_tune);
6827 gcc_assert (ptr->branch_cost == ix86_branch_cost);
6830 /* Restore the current options */
6832 static void
6833 ix86_function_specific_restore (struct gcc_options *opts,
6834 struct cl_target_option *ptr)
6836 enum processor_type old_tune = ix86_tune;
6837 enum processor_type old_arch = ix86_arch;
6838 unsigned int ix86_arch_mask;
6839 int i;
6841 /* We don't change -fPIC. */
6842 opts->x_flag_pic = flag_pic;
6844 ix86_arch = (enum processor_type) ptr->arch;
6845 ix86_schedule = (enum attr_cpu) ptr->schedule;
6846 ix86_tune = (enum processor_type) ptr->tune;
6847 x86_prefetch_sse = ptr->prefetch_sse;
6848 opts->x_ix86_branch_cost = ptr->branch_cost;
6849 ix86_tune_defaulted = ptr->tune_defaulted;
6850 ix86_arch_specified = ptr->arch_specified;
6851 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
6852 opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
6853 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
6854 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
6855 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
6856 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
6857 opts->x_ix86_abi = ptr->x_ix86_abi;
6858 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
6859 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
6860 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
6861 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
6862 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
6863 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
6864 opts->x_ix86_pmode = ptr->x_ix86_pmode;
6865 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
6866 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
6867 opts->x_ix86_regparm = ptr->x_ix86_regparm;
6868 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
6869 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
6870 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
6871 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
6872 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
6873 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
6874 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
6875 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
6876 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
6877 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
6878 ix86_tune_cost = processor_target_table[ix86_tune].cost;
6879 /* TODO: ix86_cost should be chosen at instruction or function granularity
6880 so that for cold code we use size_cost even in !optimize_size compilation. */
6881 if (opts->x_optimize_size)
6882 ix86_cost = &ix86_size_cost;
6883 else
6884 ix86_cost = ix86_tune_cost;
6886 /* Recreate the arch feature tests if the arch changed */
6887 if (old_arch != ix86_arch)
6889 ix86_arch_mask = 1u << ix86_arch;
6890 for (i = 0; i < X86_ARCH_LAST; ++i)
6891 ix86_arch_features[i]
6892 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
6895 /* Recreate the tune optimization tests */
6896 if (old_tune != ix86_tune)
6897 set_ix86_tune_features (ix86_tune, false);
6900 /* Adjust target options after streaming them in. This is mainly about
6901 reconciling them with global options. */
6903 static void
6904 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
6906 /* flag_pic is a global option, but ix86_cmodel is a target-saved option
6907 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
6908 for PIC, or error out. */
6909 if (flag_pic)
6910 switch (ptr->x_ix86_cmodel)
6912 case CM_SMALL:
6913 ptr->x_ix86_cmodel = CM_SMALL_PIC;
6914 break;
6916 case CM_MEDIUM:
6917 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
6918 break;
6920 case CM_LARGE:
6921 ptr->x_ix86_cmodel = CM_LARGE_PIC;
6922 break;
6924 case CM_KERNEL:
6925 error ("code model %s does not support PIC mode", "kernel");
6926 break;
6928 default:
6929 break;
6931 else
6932 switch (ptr->x_ix86_cmodel)
6934 case CM_SMALL_PIC:
6935 ptr->x_ix86_cmodel = CM_SMALL;
6936 break;
6938 case CM_MEDIUM_PIC:
6939 ptr->x_ix86_cmodel = CM_MEDIUM;
6940 break;
6942 case CM_LARGE_PIC:
6943 ptr->x_ix86_cmodel = CM_LARGE;
6944 break;
6946 default:
6947 break;
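/* For instance, a function whose streamed-in options were produced with
   -mcmodel=medium and then LTO-linked with -fpic is adjusted here from
   CM_MEDIUM to CM_MEDIUM_PIC; the reverse adjustment happens for a non-PIC
   link, and CM_KERNEL, which has no PIC variant, is diagnosed instead.  */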
6951 /* Print the current options */
6953 static void
6954 ix86_function_specific_print (FILE *file, int indent,
6955 struct cl_target_option *ptr)
6957 char *target_string
6958 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
6959 ptr->x_target_flags, ptr->x_ix86_target_flags,
6960 NULL, NULL, ptr->x_ix86_fpmath, false);
6962 gcc_assert (ptr->arch < PROCESSOR_max);
6963 fprintf (file, "%*sarch = %d (%s)\n",
6964 indent, "",
6965 ptr->arch, processor_target_table[ptr->arch].name);
6967 gcc_assert (ptr->tune < PROCESSOR_max);
6968 fprintf (file, "%*stune = %d (%s)\n",
6969 indent, "",
6970 ptr->tune, processor_target_table[ptr->tune].name);
6972 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
6974 if (target_string)
6976 fprintf (file, "%*s%s\n", indent, "", target_string);
6977 free (target_string);
6982 /* Inner function to process the attribute((target(...))): take an argument
6983 and set the current options from it. If we have a list, recursively go
6984 over the list. */
6986 static bool
6987 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
6988 struct gcc_options *opts,
6989 struct gcc_options *opts_set,
6990 struct gcc_options *enum_opts_set)
6992 char *next_optstr;
6993 bool ret = true;
6995 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
6996 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
6997 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
6998 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
6999 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
7001 enum ix86_opt_type
7003 ix86_opt_unknown,
7004 ix86_opt_yes,
7005 ix86_opt_no,
7006 ix86_opt_str,
7007 ix86_opt_enum,
7008 ix86_opt_isa
7011 static const struct
7013 const char *string;
7014 size_t len;
7015 enum ix86_opt_type type;
7016 int opt;
7017 int mask;
7018 } attrs[] = {
7019 /* isa options */
7020 IX86_ATTR_ISA ("sgx", OPT_msgx),
7021 IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
7022 IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
7023 IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq),
7025 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
7026 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
7027 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
7028 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
7029 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
7030 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
7031 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
7032 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
7033 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
7034 IX86_ATTR_ISA ("avx2", OPT_mavx2),
7035 IX86_ATTR_ISA ("fma", OPT_mfma),
7036 IX86_ATTR_ISA ("xop", OPT_mxop),
7037 IX86_ATTR_ISA ("fma4", OPT_mfma4),
7038 IX86_ATTR_ISA ("f16c", OPT_mf16c),
7039 IX86_ATTR_ISA ("avx", OPT_mavx),
7040 IX86_ATTR_ISA ("sse4", OPT_msse4),
7041 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
7042 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
7043 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
7044 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
7045 IX86_ATTR_ISA ("sse3", OPT_msse3),
7046 IX86_ATTR_ISA ("aes", OPT_maes),
7047 IX86_ATTR_ISA ("sha", OPT_msha),
7048 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
7049 IX86_ATTR_ISA ("sse2", OPT_msse2),
7050 IX86_ATTR_ISA ("sse", OPT_msse),
7051 IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa),
7052 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
7053 IX86_ATTR_ISA ("mmx", OPT_mmmx),
7054 IX86_ATTR_ISA ("rtm", OPT_mrtm),
7055 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
7056 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
7057 IX86_ATTR_ISA ("adx", OPT_madx),
7058 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
7059 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
7060 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
7061 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
7062 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
7063 IX86_ATTR_ISA ("xsave", OPT_mxsave),
7064 IX86_ATTR_ISA ("abm", OPT_mabm),
7065 IX86_ATTR_ISA ("bmi", OPT_mbmi),
7066 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
7067 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
7068 IX86_ATTR_ISA ("tbm", OPT_mtbm),
7069 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
7070 IX86_ATTR_ISA ("cx16", OPT_mcx16),
7071 IX86_ATTR_ISA ("sahf", OPT_msahf),
7072 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
7073 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
7074 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
7075 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
7076 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
7077 IX86_ATTR_ISA ("clzero", OPT_mclzero),
7078 IX86_ATTR_ISA ("pku", OPT_mpku),
7079 IX86_ATTR_ISA ("lwp", OPT_mlwp),
7080 IX86_ATTR_ISA ("hle", OPT_mhle),
7081 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
7082 IX86_ATTR_ISA ("mpx", OPT_mmpx),
7083 IX86_ATTR_ISA ("clwb", OPT_mclwb),
7084 IX86_ATTR_ISA ("rdpid", OPT_mrdpid),
7086 /* enum options */
7087 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
7089 /* string options */
7090 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
7091 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
7093 /* flag options */
7094 IX86_ATTR_YES ("cld",
7095 OPT_mcld,
7096 MASK_CLD),
7098 IX86_ATTR_NO ("fancy-math-387",
7099 OPT_mfancy_math_387,
7100 MASK_NO_FANCY_MATH_387),
7102 IX86_ATTR_YES ("ieee-fp",
7103 OPT_mieee_fp,
7104 MASK_IEEE_FP),
7106 IX86_ATTR_YES ("inline-all-stringops",
7107 OPT_minline_all_stringops,
7108 MASK_INLINE_ALL_STRINGOPS),
7110 IX86_ATTR_YES ("inline-stringops-dynamically",
7111 OPT_minline_stringops_dynamically,
7112 MASK_INLINE_STRINGOPS_DYNAMICALLY),
7114 IX86_ATTR_NO ("align-stringops",
7115 OPT_mno_align_stringops,
7116 MASK_NO_ALIGN_STRINGOPS),
7118 IX86_ATTR_YES ("recip",
7119 OPT_mrecip,
7120 MASK_RECIP),
7124 /* If this is a list, recurse to get the options. */
7125 if (TREE_CODE (args) == TREE_LIST)
7127 bool ret = true;
7129 for (; args; args = TREE_CHAIN (args))
7130 if (TREE_VALUE (args)
7131 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
7132 p_strings, opts, opts_set,
7133 enum_opts_set))
7134 ret = false;
7136 return ret;
7139 else if (TREE_CODE (args) != STRING_CST)
7141 error ("attribute %<target%> argument not a string");
7142 return false;
7145 /* Handle multiple arguments separated by commas. */
7146 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
7148 while (next_optstr && *next_optstr != '\0')
7150 char *p = next_optstr;
7151 char *orig_p = p;
7152 char *comma = strchr (next_optstr, ',');
7153 const char *opt_string;
7154 size_t len, opt_len;
7155 int opt;
7156 bool opt_set_p;
7157 char ch;
7158 unsigned i;
7159 enum ix86_opt_type type = ix86_opt_unknown;
7160 int mask = 0;
7162 if (comma)
7164 *comma = '\0';
7165 len = comma - next_optstr;
7166 next_optstr = comma + 1;
7168 else
7170 len = strlen (p);
7171 next_optstr = NULL;
7174 /* Recognize no-xxx. */
7175 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
7177 opt_set_p = false;
7178 p += 3;
7179 len -= 3;
7181 else
7182 opt_set_p = true;
7184 /* Find the option. */
7185 ch = *p;
7186 opt = N_OPTS;
7187 for (i = 0; i < ARRAY_SIZE (attrs); i++)
7189 type = attrs[i].type;
7190 opt_len = attrs[i].len;
7191 if (ch == attrs[i].string[0]
7192 && ((type != ix86_opt_str && type != ix86_opt_enum)
7193 ? len == opt_len
7194 : len > opt_len)
7195 && memcmp (p, attrs[i].string, opt_len) == 0)
7197 opt = attrs[i].opt;
7198 mask = attrs[i].mask;
7199 opt_string = attrs[i].string;
7200 break;
7204 /* Process the option. */
7205 if (opt == N_OPTS)
7207 error ("attribute(target(\"%s\")) is unknown", orig_p);
7208 ret = false;
7211 else if (type == ix86_opt_isa)
7213 struct cl_decoded_option decoded;
7215 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
7216 ix86_handle_option (opts, opts_set,
7217 &decoded, input_location);
7220 else if (type == ix86_opt_yes || type == ix86_opt_no)
7222 if (type == ix86_opt_no)
7223 opt_set_p = !opt_set_p;
7225 if (opt_set_p)
7226 opts->x_target_flags |= mask;
7227 else
7228 opts->x_target_flags &= ~mask;
7231 else if (type == ix86_opt_str)
7233 if (p_strings[opt])
7235 error ("option(\"%s\") was already specified", opt_string);
7236 ret = false;
7238 else
7239 p_strings[opt] = xstrdup (p + opt_len);
7242 else if (type == ix86_opt_enum)
7244 bool arg_ok;
7245 int value;
7247 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
7248 if (arg_ok)
7249 set_option (opts, enum_opts_set, opt, value,
7250 p + opt_len, DK_UNSPECIFIED, input_location,
7251 global_dc);
7252 else
7254 error ("attribute(target(\"%s\")) is unknown", orig_p);
7255 ret = false;
7259 else
7260 gcc_unreachable ();
7263 return ret;
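/* For illustration, assuming the attrs[] table above, an attribute such as

	__attribute__ ((target ("no-avx2,arch=haswell,fpmath=sse")))

   is split on commas by the loop above: "no-avx2" is recognized through the
   "no-" prefix and handled as an inverted ix86_opt_isa option, "arch=haswell"
   is an ix86_opt_str entry whose value is stashed in p_strings[], and
   "fpmath=sse" is an ix86_opt_enum entry routed through
   opt_enum_arg_to_value and set_option.  */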
7266 /* Release allocated strings. */
7267 static void
7268 release_options_strings (char **option_strings)
7270 /* Free up memory allocated to hold the strings */
7271 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
7272 free (option_strings[i]);
7275 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
7277 tree
7278 ix86_valid_target_attribute_tree (tree args,
7279 struct gcc_options *opts,
7280 struct gcc_options *opts_set)
7282 const char *orig_arch_string = opts->x_ix86_arch_string;
7283 const char *orig_tune_string = opts->x_ix86_tune_string;
7284 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
7285 int orig_tune_defaulted = ix86_tune_defaulted;
7286 int orig_arch_specified = ix86_arch_specified;
7287 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
7288 tree t = NULL_TREE;
7289 struct cl_target_option *def
7290 = TREE_TARGET_OPTION (target_option_default_node);
7291 struct gcc_options enum_opts_set;
7293 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
7295 /* Process each of the options on the chain. */
7296 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
7297 opts_set, &enum_opts_set))
7298 return error_mark_node;
7300 /* If the changed options are different from the default, rerun
7301 ix86_option_override_internal, and then save the options away.
7302 The string options are attribute options, and will be undone
7303 when we copy the save structure. */
7304 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
7305 || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
7306 || opts->x_target_flags != def->x_target_flags
7307 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
7308 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
7309 || enum_opts_set.x_ix86_fpmath)
7311 /* If we are using the default tune= or arch=, undo the string assigned,
7312 and use the default. */
7313 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
7315 opts->x_ix86_arch_string
7316 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
7318 /* If arch= is set, clear all bits in x_ix86_isa_flags,
7319 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
7320 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
7321 | OPTION_MASK_ABI_64
7322 | OPTION_MASK_ABI_X32
7323 | OPTION_MASK_CODE16);
7324 opts->x_ix86_isa_flags2 = 0;
7326 else if (!orig_arch_specified)
7327 opts->x_ix86_arch_string = NULL;
7329 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
7330 opts->x_ix86_tune_string
7331 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
7332 else if (orig_tune_defaulted)
7333 opts->x_ix86_tune_string = NULL;
7335 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
7336 if (enum_opts_set.x_ix86_fpmath)
7337 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
7338 else if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
7339 && TARGET_SSE_P (opts->x_ix86_isa_flags))
7341 if (TARGET_80387_P (opts->x_target_flags))
7342 opts->x_ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE
7343 | FPMATH_387);
7344 else
7345 opts->x_ix86_fpmath = (enum fpmath_unit) FPMATH_SSE;
7346 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
7349 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
7350 bool r = ix86_option_override_internal (false, opts, opts_set);
7351 if (!r)
7353 release_options_strings (option_strings);
7354 return error_mark_node;
7357 /* Add any builtin functions with the new isa if any. */
7358 ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
7360 /* Save the current options unless we are validating options for
7361 #pragma. */
7362 t = build_target_option_node (opts);
7364 opts->x_ix86_arch_string = orig_arch_string;
7365 opts->x_ix86_tune_string = orig_tune_string;
7366 opts_set->x_ix86_fpmath = orig_fpmath_set;
7368 release_options_strings (option_strings);
7371 return t;
7374 /* Hook to validate attribute((target("string"))). */
7376 static bool
7377 ix86_valid_target_attribute_p (tree fndecl,
7378 tree ARG_UNUSED (name),
7379 tree args,
7380 int ARG_UNUSED (flags))
7382 struct gcc_options func_options;
7383 tree new_target, new_optimize;
7384 bool ret = true;
7386 /* attribute((target("default"))) does nothing, beyond
7387 affecting multi-versioning. */
7388 if (TREE_VALUE (args)
7389 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
7390 && TREE_CHAIN (args) == NULL_TREE
7391 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
7392 return true;
7394 tree old_optimize = build_optimization_node (&global_options);
7396 /* Get the optimization options of the current function. */
7397 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
7399 if (!func_optimize)
7400 func_optimize = old_optimize;
7402 /* Init func_options. */
7403 memset (&func_options, 0, sizeof (func_options));
7404 init_options_struct (&func_options, NULL);
7405 lang_hooks.init_options_struct (&func_options);
7407 cl_optimization_restore (&func_options,
7408 TREE_OPTIMIZATION (func_optimize));
7410 /* Initialize func_options to the default before its target options can
7411 be set. */
7412 cl_target_option_restore (&func_options,
7413 TREE_TARGET_OPTION (target_option_default_node));
7415 new_target = ix86_valid_target_attribute_tree (args, &func_options,
7416 &global_options_set);
7418 new_optimize = build_optimization_node (&func_options);
7420 if (new_target == error_mark_node)
7421 ret = false;
7423 else if (fndecl && new_target)
7425 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
7427 if (old_optimize != new_optimize)
7428 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
7431 finalize_options_struct (&func_options);
7433 return ret;
7437 /* Hook to determine if one function can safely inline another. */
7439 static bool
7440 ix86_can_inline_p (tree caller, tree callee)
7442 bool ret = false;
7443 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
7444 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
7446 /* If callee has no option attributes, then it is ok to inline. */
7447 if (!callee_tree)
7448 ret = true;
7450 /* If caller has no option attributes, but callee does then it is not ok to
7451 inline. */
7452 else if (!caller_tree)
7453 ret = false;
7455 else
7457 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
7458 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
7460 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4
7461 function can inline an SSE2 function but an SSE2 function can't inline
7462 an SSE4 function. */
7463 if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
7464 != callee_opts->x_ix86_isa_flags)
7465 || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
7466 != callee_opts->x_ix86_isa_flags2))
7467 ret = false;
7469 /* See if we have the same non-isa options. */
7470 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
7471 ret = false;
7473 /* See if arch, tune, etc. are the same. */
7474 else if (caller_opts->arch != callee_opts->arch)
7475 ret = false;
7477 else if (caller_opts->tune != callee_opts->tune)
7478 ret = false;
7480 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
7481 ret = false;
7483 else if (caller_opts->branch_cost != callee_opts->branch_cost)
7484 ret = false;
7486 else
7487 ret = true;
7490 return ret;
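/* For example, under the rules above a caller declared with
   __attribute__ ((target ("avx2"))) may inline a callee declared with
   __attribute__ ((target ("sse4.2"))), since the callee's ISA flags are a
   subset of the caller's, while the reverse direction is rejected, as are
   mismatches in arch=, tune=, fpmath or branch cost.  */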
7494 /* Remember the last target of ix86_set_current_function. */
7495 static GTY(()) tree ix86_previous_fndecl;
7497 /* Set targets globals to the default (or current #pragma GCC target
7498 if active). Invalidate ix86_previous_fndecl cache. */
7500 void
7501 ix86_reset_previous_fndecl (void)
7503 tree new_tree = target_option_current_node;
7504 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7505 if (TREE_TARGET_GLOBALS (new_tree))
7506 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7507 else if (new_tree == target_option_default_node)
7508 restore_target_globals (&default_target_globals);
7509 else
7510 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7511 ix86_previous_fndecl = NULL_TREE;
7514 /* Set the func_type field from the function FNDECL. */
7516 static void
7517 ix86_set_func_type (tree fndecl)
7519 if (cfun->machine->func_type == TYPE_UNKNOWN)
7521 if (lookup_attribute ("interrupt",
7522 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7524 if (ix86_function_naked (fndecl))
7525 error_at (DECL_SOURCE_LOCATION (fndecl),
7526 "interrupt and naked attributes are not compatible");
7528 int nargs = 0;
7529 for (tree arg = DECL_ARGUMENTS (fndecl);
7530 arg;
7531 arg = TREE_CHAIN (arg))
7532 nargs++;
7533 cfun->machine->no_caller_saved_registers = true;
7534 cfun->machine->func_type
7535 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
7537 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
7539 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
7540 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
7541 sorry ("Only DWARF debug format is supported for interrupt "
7542 "service routine.");
7544 else
7546 cfun->machine->func_type = TYPE_NORMAL;
7547 if (lookup_attribute ("no_caller_saved_registers",
7548 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7549 cfun->machine->no_caller_saved_registers = true;
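/* Illustrative user-level usage of the attributes classified above (the
   prototypes follow the documented interrupt-handler convention; the extra
   error-code argument is what selects TYPE_EXCEPTION):

	struct interrupt_frame;
	__attribute__ ((interrupt))
	void my_isr (struct interrupt_frame *frame);
	__attribute__ ((interrupt))
	void my_fault (struct interrupt_frame *frame, uword_t error_code);
	__attribute__ ((no_caller_saved_registers))
	void my_helper (void);  */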
7554 /* Establish appropriate back-end context for processing the function
7555 FNDECL. The argument might be NULL to indicate processing at top
7556 level, outside of any function scope. */
7557 static void
7558 ix86_set_current_function (tree fndecl)
7560 /* Only change the context if the function changes. This hook is called
7561 several times in the course of compiling a function, and we don't want to
7562 slow things down too much or call target_reinit when it isn't safe. */
7563 if (fndecl == ix86_previous_fndecl)
7565 /* There may be 2 function bodies for the same function FNDECL,
7566 one is extern inline and one isn't. Call ix86_set_func_type
7567 to set the func_type field. */
7568 if (fndecl != NULL_TREE)
7569 ix86_set_func_type (fndecl);
7570 return;
7573 tree old_tree;
7574 if (ix86_previous_fndecl == NULL_TREE)
7575 old_tree = target_option_current_node;
7576 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
7577 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
7578 else
7579 old_tree = target_option_default_node;
7581 if (fndecl == NULL_TREE)
7583 if (old_tree != target_option_current_node)
7584 ix86_reset_previous_fndecl ();
7585 return;
7588 ix86_set_func_type (fndecl);
7590 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
7591 if (new_tree == NULL_TREE)
7592 new_tree = target_option_default_node;
7594 if (old_tree != new_tree)
7596 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7597 if (TREE_TARGET_GLOBALS (new_tree))
7598 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7599 else if (new_tree == target_option_default_node)
7600 restore_target_globals (&default_target_globals);
7601 else
7602 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7604 ix86_previous_fndecl = fndecl;
7606 static bool prev_no_caller_saved_registers;
7608 /* 64-bit MS and SYSV ABIs have different sets of call-used registers.
7609 Avoid expensive re-initialization of init_regs each time we switch
7610 function context. */
7611 if (TARGET_64BIT
7612 && (call_used_regs[SI_REG]
7613 == (cfun->machine->call_abi == MS_ABI)))
7614 reinit_regs ();
7615 /* Need to re-initialize init_regs if caller-saved registers are
7616 changed. */
7617 else if (prev_no_caller_saved_registers
7618 != cfun->machine->no_caller_saved_registers)
7619 reinit_regs ();
7621 if (cfun->machine->func_type != TYPE_NORMAL
7622 || cfun->machine->no_caller_saved_registers)
7624 /* Don't allow MPX, SSE, MMX or x87 instructions since they
7625 may change the processor state. */
7626 const char *isa;
7627 if (TARGET_MPX)
7628 isa = "MPX";
7629 else if (TARGET_SSE)
7630 isa = "SSE";
7631 else if (TARGET_MMX)
7632 isa = "MMX/3Dnow";
7633 else if (TARGET_80387)
7634 isa = "80387";
7635 else
7636 isa = NULL;
7637 if (isa != NULL)
7639 if (cfun->machine->func_type != TYPE_NORMAL)
7640 sorry ("%s instructions aren't allowed in %s service routine",
7641 isa, (cfun->machine->func_type == TYPE_EXCEPTION
7642 ? "exception" : "interrupt"));
7643 else
7644 sorry ("%s instructions aren't allowed in function with "
7645 "no_caller_saved_registers attribute", isa);
7646 /* Don't issue the same error twice. */
7647 cfun->machine->func_type = TYPE_NORMAL;
7648 cfun->machine->no_caller_saved_registers = false;
7652 prev_no_caller_saved_registers
7653 = cfun->machine->no_caller_saved_registers;
7657 /* Return true if this goes in large data/bss. */
7659 static bool
7660 ix86_in_large_data_p (tree exp)
7662 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
7663 return false;
7665 if (exp == NULL_TREE)
7666 return false;
7668 /* Functions are never large data. */
7669 if (TREE_CODE (exp) == FUNCTION_DECL)
7670 return false;
7672 /* Automatic variables are never large data. */
7673 if (VAR_P (exp) && !is_global_var (exp))
7674 return false;
7676 if (VAR_P (exp) && DECL_SECTION_NAME (exp))
7678 const char *section = DECL_SECTION_NAME (exp);
7679 if (strcmp (section, ".ldata") == 0
7680 || strcmp (section, ".lbss") == 0)
7681 return true;
7682 return false;
7684 else
7686 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
7688 /* If this is an incomplete type with size 0, then we can't put it
7689 in data because it might be too big when completed. Also,
7690 int_size_in_bytes returns -1 if the size can vary or is larger than
7691 an integer, in which case it is also safer to assume that it goes in
7692 large data. */
7693 if (size <= 0 || size > ix86_section_threshold)
7694 return true;
7697 return false;
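/* For example, with -mcmodel=medium and the default
   -mlarge-data-threshold=65536, a definition like
   "static char big_buf[1 << 20];" is treated as large data and ends up in
   .lbss, a small scalar stays in the ordinary .data/.bss sections, and an
   explicit __attribute__ ((section (".ldata"))) always counts as large.  */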
7700 /* i386-specific section flag to mark large sections. */
7701 #define SECTION_LARGE SECTION_MACH_DEP
7703 /* Switch to the appropriate section for output of DECL.
7704 DECL is either a `VAR_DECL' node or a constant of some sort.
7705 RELOC indicates whether forming the initial value of DECL requires
7706 link-time relocations. */
7708 ATTRIBUTE_UNUSED static section *
7709 x86_64_elf_select_section (tree decl, int reloc,
7710 unsigned HOST_WIDE_INT align)
7712 if (ix86_in_large_data_p (decl))
7714 const char *sname = NULL;
7715 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
7716 switch (categorize_decl_for_section (decl, reloc))
7718 case SECCAT_DATA:
7719 sname = ".ldata";
7720 break;
7721 case SECCAT_DATA_REL:
7722 sname = ".ldata.rel";
7723 break;
7724 case SECCAT_DATA_REL_LOCAL:
7725 sname = ".ldata.rel.local";
7726 break;
7727 case SECCAT_DATA_REL_RO:
7728 sname = ".ldata.rel.ro";
7729 break;
7730 case SECCAT_DATA_REL_RO_LOCAL:
7731 sname = ".ldata.rel.ro.local";
7732 break;
7733 case SECCAT_BSS:
7734 sname = ".lbss";
7735 flags |= SECTION_BSS;
7736 break;
7737 case SECCAT_RODATA:
7738 case SECCAT_RODATA_MERGE_STR:
7739 case SECCAT_RODATA_MERGE_STR_INIT:
7740 case SECCAT_RODATA_MERGE_CONST:
7741 sname = ".lrodata";
7742 flags &= ~SECTION_WRITE;
7743 break;
7744 case SECCAT_SRODATA:
7745 case SECCAT_SDATA:
7746 case SECCAT_SBSS:
7747 gcc_unreachable ();
7748 case SECCAT_TEXT:
7749 case SECCAT_TDATA:
7750 case SECCAT_TBSS:
7751 /* We don't split these for the medium model. Place them into
7752 default sections and hope for the best. */
7753 break;
7755 if (sname)
7757 /* We might get called with string constants, but get_named_section
7758 doesn't like them as they are not DECLs. Also, we need to set
7759 flags in that case. */
7760 if (!DECL_P (decl))
7761 return get_section (sname, flags, NULL);
7762 return get_named_section (decl, sname, reloc);
7765 return default_elf_select_section (decl, reloc, align);
7768 /* Select a set of attributes for section NAME based on the properties
7769 of DECL and whether or not RELOC indicates that DECL's initializer
7770 might contain runtime relocations. */
7772 static unsigned int ATTRIBUTE_UNUSED
7773 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
7775 unsigned int flags = default_section_type_flags (decl, name, reloc);
7777 if (ix86_in_large_data_p (decl))
7778 flags |= SECTION_LARGE;
7780 if (decl == NULL_TREE
7781 && (strcmp (name, ".ldata.rel.ro") == 0
7782 || strcmp (name, ".ldata.rel.ro.local") == 0))
7783 flags |= SECTION_RELRO;
7785 if (strcmp (name, ".lbss") == 0
7786 || strncmp (name, ".lbss.", 6) == 0
7787 || strncmp (name, ".gnu.linkonce.lb.", 17) == 0)
7788 flags |= SECTION_BSS;
7790 return flags;
7793 /* Build up a unique section name, expressed as a
7794 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
7795 RELOC indicates whether the initial value of EXP requires
7796 link-time relocations. */
7798 static void ATTRIBUTE_UNUSED
7799 x86_64_elf_unique_section (tree decl, int reloc)
7801 if (ix86_in_large_data_p (decl))
7803 const char *prefix = NULL;
7804 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
7805 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
7807 switch (categorize_decl_for_section (decl, reloc))
7809 case SECCAT_DATA:
7810 case SECCAT_DATA_REL:
7811 case SECCAT_DATA_REL_LOCAL:
7812 case SECCAT_DATA_REL_RO:
7813 case SECCAT_DATA_REL_RO_LOCAL:
7814 prefix = one_only ? ".ld" : ".ldata";
7815 break;
7816 case SECCAT_BSS:
7817 prefix = one_only ? ".lb" : ".lbss";
7818 break;
7819 case SECCAT_RODATA:
7820 case SECCAT_RODATA_MERGE_STR:
7821 case SECCAT_RODATA_MERGE_STR_INIT:
7822 case SECCAT_RODATA_MERGE_CONST:
7823 prefix = one_only ? ".lr" : ".lrodata";
7824 break;
7825 case SECCAT_SRODATA:
7826 case SECCAT_SDATA:
7827 case SECCAT_SBSS:
7828 gcc_unreachable ();
7829 case SECCAT_TEXT:
7830 case SECCAT_TDATA:
7831 case SECCAT_TBSS:
7832 /* We don't split these for the medium model. Place them into
7833 default sections and hope for the best. */
7834 break;
7836 if (prefix)
7838 const char *name, *linkonce;
7839 char *string;
7841 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
7842 name = targetm.strip_name_encoding (name);
7844 /* If we're using one_only, then there needs to be a .gnu.linkonce
7845 prefix to the section name. */
7846 linkonce = one_only ? ".gnu.linkonce" : "";
7848 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
7850 set_decl_section_name (decl, string);
7851 return;
7854 default_unique_section (decl, reloc);
7857 #ifdef COMMON_ASM_OP
7859 #ifndef LARGECOMM_SECTION_ASM_OP
7860 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
7861 #endif
7863 /* This says how to output assembler code to declare an
7864 uninitialized external linkage data object.
7866 For medium model x86-64 we need to use the LARGECOMM_SECTION_ASM_OP
7867 directive for large objects. */
7868 void
7869 x86_elf_aligned_decl_common (FILE *file, tree decl,
7870 const char *name, unsigned HOST_WIDE_INT size,
7871 int align)
7873 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7874 && size > (unsigned int)ix86_section_threshold)
7876 switch_to_section (get_named_section (decl, ".lbss", 0));
7877 fputs (LARGECOMM_SECTION_ASM_OP, file);
7879 else
7880 fputs (COMMON_ASM_OP, file);
7881 assemble_name (file, name);
7882 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
7883 size, align / BITS_PER_UNIT);
7885 #endif
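/* As a rough illustration of the two paths above, a 1 MiB common object
   under -mcmodel=medium would be announced with something like

	.largecomm	big_buf,1048576,32

   after switching to .lbss, while a small object keeps the ordinary

	.comm	small_var,8,8

   directive (the exact alignment operands depend on the declaration).  */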
7887 /* Utility function for targets to use in implementing
7888 ASM_OUTPUT_ALIGNED_BSS. */
7890 void
7891 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
7892 unsigned HOST_WIDE_INT size, int align)
7894 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7895 && size > (unsigned int)ix86_section_threshold)
7896 switch_to_section (get_named_section (decl, ".lbss", 0));
7897 else
7898 switch_to_section (bss_section);
7899 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
7900 #ifdef ASM_DECLARE_OBJECT_NAME
7901 last_assemble_variable_decl = decl;
7902 ASM_DECLARE_OBJECT_NAME (file, name, decl);
7903 #else
7904 /* The standard thing is to just output a label for the object. */
7905 ASM_OUTPUT_LABEL (file, name);
7906 #endif /* ASM_DECLARE_OBJECT_NAME */
7907 ASM_OUTPUT_SKIP (file, size ? size : 1);
7910 /* Decide whether we must probe the stack before any space allocation
7911 on this target. It's essentially TARGET_STACK_PROBE except when
7912 -fstack-check causes the stack to be already probed differently. */
7914 bool
7915 ix86_target_stack_probe (void)
7917 /* Do not probe the stack twice if static stack checking is enabled. */
7918 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7919 return false;
7921 return TARGET_STACK_PROBE;
7924 /* Decide whether we can make a sibling call to a function. DECL is the
7925 declaration of the function being targeted by the call and EXP is the
7926 CALL_EXPR representing the call. */
7928 static bool
7929 ix86_function_ok_for_sibcall (tree decl, tree exp)
7931 tree type, decl_or_type;
7932 rtx a, b;
7933 bool bind_global = decl && !targetm.binds_local_p (decl);
7935 if (ix86_function_naked (current_function_decl))
7936 return false;
7938 /* Sibling call isn't OK if there are no caller-saved registers
7939 since all registers must be preserved before return. */
7940 if (cfun->machine->no_caller_saved_registers)
7941 return false;
7943 /* If we are generating position-independent code, we cannot sibcall
7944 optimize direct calls to global functions, as the PLT requires
7945 %ebx be live. (Darwin does not have a PLT.) */
7946 if (!TARGET_MACHO
7947 && !TARGET_64BIT
7948 && flag_pic
7949 && flag_plt
7950 && bind_global)
7951 return false;
7953 /* If we need to align the outgoing stack, then sibcalling would
7954 unalign the stack, which may break the called function. */
7955 if (ix86_minimum_incoming_stack_boundary (true)
7956 < PREFERRED_STACK_BOUNDARY)
7957 return false;
7959 if (decl)
7961 decl_or_type = decl;
7962 type = TREE_TYPE (decl);
7964 else
7966 /* We're looking at the CALL_EXPR, we need the type of the function. */
7967 type = CALL_EXPR_FN (exp); /* pointer expression */
7968 type = TREE_TYPE (type); /* pointer type */
7969 type = TREE_TYPE (type); /* function type */
7970 decl_or_type = type;
7973 /* Check that the return value locations are the same. For example,
7974 if we are returning floats on the 80387 register stack, we cannot
7975 make a sibcall from a function that doesn't return a float to a
7976 function that does or, conversely, from a function that does return
7977 a float to a function that doesn't; the necessary stack adjustment
7978 would not be executed. This is also the place we notice
7979 differences in the return value ABI. Note that it is ok for one
7980 of the functions to have void return type as long as the return
7981 value of the other is passed in a register. */
7982 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
7983 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
7984 cfun->decl, false);
7985 if (STACK_REG_P (a) || STACK_REG_P (b))
7987 if (!rtx_equal_p (a, b))
7988 return false;
7990 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
7992 else if (!rtx_equal_p (a, b))
7993 return false;
7995 if (TARGET_64BIT)
7997 /* The SYSV ABI has more call-clobbered registers;
7998 disallow sibcalls from MS to SYSV. */
7999 if (cfun->machine->call_abi == MS_ABI
8000 && ix86_function_type_abi (type) == SYSV_ABI)
8001 return false;
8003 else
8005 /* If this call is indirect, we'll need to be able to use a
8006 call-clobbered register for the address of the target function.
8007 Make sure that all such registers are not used for passing
8008 parameters. Note that DLLIMPORT functions and calls to global
8009 functions via the GOT slot are indirect.
8010 if (!decl
8011 || (bind_global && flag_pic && !flag_plt)
8012 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
8014 /* Check if regparm >= 3 since arg_reg_available is set to
8015 false if regparm == 0. If regparm is 1 or 2, there is
8016 always a call-clobbered register available.
8018 ??? The symbol indirect call doesn't need a call-clobbered
8019 register. But we don't know if this is a symbol indirect
8020 call or not here. */
8021 if (ix86_function_regparm (type, NULL) >= 3
8022 && !cfun->machine->arg_reg_available)
8023 return false;
8027 /* Otherwise okay. That also includes certain types of indirect calls. */
8028 return true;
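/* For example, on 32-bit x86 with -fpic and -fplt a tail call such as

	extern int helper (int);
	int wrapper (int x) { return helper (x); }

   is not converted into a sibcall when helper binds globally, because the
   PLT calling sequence needs %ebx live; the same wrapper built without
   -fpic, calling a local function, or built for x86-64 can use the
   jmp-based tail call.  */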
8031 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
8032 and "sseregparm" calling convention attributes;
8033 arguments as in struct attribute_spec.handler. */
8035 static tree
8036 ix86_handle_cconv_attribute (tree *node, tree name,
8037 tree args,
8038 int,
8039 bool *no_add_attrs)
8041 if (TREE_CODE (*node) != FUNCTION_TYPE
8042 && TREE_CODE (*node) != METHOD_TYPE
8043 && TREE_CODE (*node) != FIELD_DECL
8044 && TREE_CODE (*node) != TYPE_DECL)
8046 warning (OPT_Wattributes, "%qE attribute only applies to functions",
8047 name);
8048 *no_add_attrs = true;
8049 return NULL_TREE;
8052 /* Can combine regparm with all attributes except fastcall and thiscall. */
8053 if (is_attribute_p ("regparm", name))
8055 tree cst;
8057 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8059 error ("fastcall and regparm attributes are not compatible");
8062 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8064 error ("regparam and thiscall attributes are not compatible");
8067 cst = TREE_VALUE (args);
8068 if (TREE_CODE (cst) != INTEGER_CST)
8070 warning (OPT_Wattributes,
8071 "%qE attribute requires an integer constant argument",
8072 name);
8073 *no_add_attrs = true;
8075 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
8077 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
8078 name, REGPARM_MAX);
8079 *no_add_attrs = true;
8082 return NULL_TREE;
8085 if (TARGET_64BIT)
8087 /* Do not warn when emulating the MS ABI. */
8088 if ((TREE_CODE (*node) != FUNCTION_TYPE
8089 && TREE_CODE (*node) != METHOD_TYPE)
8090 || ix86_function_type_abi (*node) != MS_ABI)
8091 warning (OPT_Wattributes, "%qE attribute ignored",
8092 name);
8093 *no_add_attrs = true;
8094 return NULL_TREE;
8097 /* fastcall can be combined with sseregparm; it conflicts with cdecl, stdcall, regparm and thiscall. */
8098 if (is_attribute_p ("fastcall", name))
8100 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
8102 error ("fastcall and cdecl attributes are not compatible");
8104 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
8106 error ("fastcall and stdcall attributes are not compatible");
8108 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
8110 error ("fastcall and regparm attributes are not compatible");
8112 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8114 error ("fastcall and thiscall attributes are not compatible");
8118 /* stdcall can be combined with regparm and sseregparm; it conflicts
8119 with cdecl, fastcall and thiscall. */
8120 else if (is_attribute_p ("stdcall", name))
8122 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
8124 error ("stdcall and cdecl attributes are not compatible");
8126 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8128 error ("stdcall and fastcall attributes are not compatible");
8130 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8132 error ("stdcall and thiscall attributes are not compatible");
8136 /* Can combine cdecl with regparm and sseregparm. */
8137 else if (is_attribute_p ("cdecl", name))
8139 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
8141 error ("stdcall and cdecl attributes are not compatible");
8143 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8145 error ("fastcall and cdecl attributes are not compatible");
8147 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8149 error ("cdecl and thiscall attributes are not compatible");
8152 else if (is_attribute_p ("thiscall", name))
8154 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
8155 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
8156 name);
8157 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
8159 error ("stdcall and thiscall attributes are not compatible");
8161 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8163 error ("fastcall and thiscall attributes are not compatible");
8165 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
8167 error ("cdecl and thiscall attributes are not compatible");
8171 /* Can combine sseregparm with all attributes. */
8173 return NULL_TREE;
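/* Illustrative diagnostics from the checks above (32-bit only):

	void __attribute__ ((fastcall, regparm (2))) f (int);	-> error
	void __attribute__ ((stdcall, fastcall)) g (int);	-> error
	void __attribute__ ((regparm (8))) h (int);		-> warning, above REGPARM_MAX

   whereas sseregparm composes with any of the other conventions.  */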
8176 /* The transactional memory builtins are implicitly regparm or fastcall
8177 depending on the ABI. Override the generic do-nothing attribute that
8178 these builtins were declared with, and replace it with one of the two
8179 attributes that we expect elsewhere. */
8181 static tree
8182 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
8183 int flags, bool *no_add_attrs)
8185 tree alt;
8187 /* In no case do we want to add the placeholder attribute. */
8188 *no_add_attrs = true;
8190 /* The 64-bit ABI is unchanged for transactional memory. */
8191 if (TARGET_64BIT)
8192 return NULL_TREE;
8194 /* ??? Is there a better way to validate 32-bit windows? We have
8195 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
8196 if (CHECK_STACK_LIMIT > 0)
8197 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
8198 else
8200 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
8201 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
8203 decl_attributes (node, alt, flags);
8205 return NULL_TREE;
8208 /* This function determines from TYPE the calling-convention. */
8210 unsigned int
8211 ix86_get_callcvt (const_tree type)
8213 unsigned int ret = 0;
8214 bool is_stdarg;
8215 tree attrs;
8217 if (TARGET_64BIT)
8218 return IX86_CALLCVT_CDECL;
8220 attrs = TYPE_ATTRIBUTES (type);
8221 if (attrs != NULL_TREE)
8223 if (lookup_attribute ("cdecl", attrs))
8224 ret |= IX86_CALLCVT_CDECL;
8225 else if (lookup_attribute ("stdcall", attrs))
8226 ret |= IX86_CALLCVT_STDCALL;
8227 else if (lookup_attribute ("fastcall", attrs))
8228 ret |= IX86_CALLCVT_FASTCALL;
8229 else if (lookup_attribute ("thiscall", attrs))
8230 ret |= IX86_CALLCVT_THISCALL;
8232 /* Regparm isn't allowed for thiscall and fastcall. */
8233 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
8235 if (lookup_attribute ("regparm", attrs))
8236 ret |= IX86_CALLCVT_REGPARM;
8237 if (lookup_attribute ("sseregparm", attrs))
8238 ret |= IX86_CALLCVT_SSEREGPARM;
8241 if (IX86_BASE_CALLCVT(ret) != 0)
8242 return ret;
8245 is_stdarg = stdarg_p (type);
8246 if (TARGET_RTD && !is_stdarg)
8247 return IX86_CALLCVT_STDCALL | ret;
8249 if (ret != 0
8250 || is_stdarg
8251 || TREE_CODE (type) != METHOD_TYPE
8252 || ix86_function_type_abi (type) != MS_ABI)
8253 return IX86_CALLCVT_CDECL | ret;
8255 return IX86_CALLCVT_THISCALL;
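/* For example, on a 32-bit target a plain "int f (int)" prototype resolves
   to IX86_CALLCVT_CDECL (or IX86_CALLCVT_STDCALL under -mrtd, since it is
   not stdarg), an unattributed C++ member function type under the MS ABI
   falls through to IX86_CALLCVT_THISCALL, and on 64-bit targets everything
   is reported as IX86_CALLCVT_CDECL.  */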
8258 /* Return 0 if the attributes for two types are incompatible, 1 if they
8259 are compatible, and 2 if they are nearly compatible (which causes a
8260 warning to be generated). */
8262 static int
8263 ix86_comp_type_attributes (const_tree type1, const_tree type2)
8265 unsigned int ccvt1, ccvt2;
8267 if (TREE_CODE (type1) != FUNCTION_TYPE
8268 && TREE_CODE (type1) != METHOD_TYPE)
8269 return 1;
8271 ccvt1 = ix86_get_callcvt (type1);
8272 ccvt2 = ix86_get_callcvt (type2);
8273 if (ccvt1 != ccvt2)
8274 return 0;
8275 if (ix86_function_regparm (type1, NULL)
8276 != ix86_function_regparm (type2, NULL))
8277 return 0;
8279 return 1;
8282 /* Return the regparm value for a function with the indicated TYPE and DECL.
8283 DECL may be NULL when calling function indirectly
8284 or considering a libcall. */
8286 static int
8287 ix86_function_regparm (const_tree type, const_tree decl)
8289 tree attr;
8290 int regparm;
8291 unsigned int ccvt;
8293 if (TARGET_64BIT)
8294 return (ix86_function_type_abi (type) == SYSV_ABI
8295 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
8296 ccvt = ix86_get_callcvt (type);
8297 regparm = ix86_regparm;
8299 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
8301 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
8302 if (attr)
8304 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
8305 return regparm;
8308 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
8309 return 2;
8310 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
8311 return 1;
8313 /* Use register calling convention for local functions when possible. */
8314 if (decl
8315 && TREE_CODE (decl) == FUNCTION_DECL)
8317 cgraph_node *target = cgraph_node::get (decl);
8318 if (target)
8319 target = target->function_symbol ();
8321 /* Caller and callee must agree on the calling convention, so
8322 checking just `optimize' here would mean that with
8323 __attribute__((optimize (...))) the caller could use the regparm convention
8324 and the callee not, or vice versa. Instead look at whether the callee
8325 is optimized or not. */
8326 if (target && opt_for_fn (target->decl, optimize)
8327 && !(profile_flag && !flag_fentry))
8329 cgraph_local_info *i = &target->local;
8330 if (i && i->local && i->can_change_signature)
8332 int local_regparm, globals = 0, regno;
8334 /* Make sure no regparm register is taken by a
8335 fixed register variable. */
8336 for (local_regparm = 0; local_regparm < REGPARM_MAX;
8337 local_regparm++)
8338 if (fixed_regs[local_regparm])
8339 break;
8341 /* We don't want to use regparm(3) for nested functions as
8342 these use a static chain pointer in the third argument. */
8343 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
8344 local_regparm = 2;
8346 /* Save a register for the split stack. */
8347 if (flag_split_stack)
8349 if (local_regparm == 3)
8350 local_regparm = 2;
8351 else if (local_regparm == 2
8352 && DECL_STATIC_CHAIN (target->decl))
8353 local_regparm = 1;
8356 /* Each fixed register usage increases register pressure,
8357 so fewer registers should be used for argument passing.
8358 This functionality can be overridden by an explicit
8359 regparm value. */
8360 for (regno = AX_REG; regno <= DI_REG; regno++)
8361 if (fixed_regs[regno])
8362 globals++;
8364 local_regparm
8365 = globals < local_regparm ? local_regparm - globals : 0;
8367 if (local_regparm > regparm)
8368 regparm = local_regparm;
8373 return regparm;
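/* Illustration: for a hypothetical declaration
	__attribute__((regparm (3))) int f (int a, int b, int c);
   A, B and C are passed in %eax, %edx and %ecx; further integral
   arguments would go on the stack.  The local-function heuristics
   above can likewise raise REGPARM for a static callee when enough of
   those registers are not fixed.  */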
8376 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
8377 DFmode (2) arguments in SSE registers for a function with the
8378 indicated TYPE and DECL. DECL may be NULL when calling function
8379 indirectly or considering a libcall. Return -1 if any FP parameter
8380 should be rejected by error. This is used in situations where we imply
8381 the SSE calling convention but the function is called from another
8382 function with SSE disabled. Otherwise return 0. */
8384 static int
8385 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
8387 gcc_assert (!TARGET_64BIT);
8389 /* Use SSE registers to pass SFmode and DFmode arguments if requested
8390 by the sseregparm attribute. */
8391 if (TARGET_SSEREGPARM
8392 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
8394 if (!TARGET_SSE)
8396 if (warn)
8398 if (decl)
8399 error ("calling %qD with attribute sseregparm without "
8400 "SSE/SSE2 enabled", decl);
8401 else
8402 error ("calling %qT with attribute sseregparm without "
8403 "SSE/SSE2 enabled", type);
8405 return 0;
8408 return 2;
8411 if (!decl)
8412 return 0;
8414 cgraph_node *target = cgraph_node::get (decl);
8415 if (target)
8416 target = target->function_symbol ();
8418 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
8419 (and DFmode for SSE2) arguments in SSE registers. */
8420 if (target
8421 /* TARGET_SSE_MATH */
8422 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
8423 && opt_for_fn (target->decl, optimize)
8424 && !(profile_flag && !flag_fentry))
8426 cgraph_local_info *i = &target->local;
8427 if (i && i->local && i->can_change_signature)
8429 /* Refuse to produce wrong code when local function with SSE enabled
8430 is called from SSE disabled function.
8431 FIXME: We need a way to detect these cases cross-ltrans partition
8432 and avoid using SSE calling conventions on local functions called
8433 from function with SSE disabled. For now at least delay the
8434 warning until we know we are going to produce wrong code.
8435 See PR66047 */
8436 if (!TARGET_SSE && warn)
8437 return -1;
8438 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
8439 ->x_ix86_isa_flags) ? 2 : 1;
8443 return 0;
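/* Illustration: on ia32 with SSE2 enabled, a hypothetical
	__attribute__((sseregparm)) double f (double x);
   passes X in %xmm0 rather than on the stack, which is what the
   return value 2 above requests.  */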
8446 /* Return true if EAX is live at the start of the function. Used by
8447 ix86_expand_prologue to determine if we need special help before
8448 calling allocate_stack_worker. */
8450 static bool
8451 ix86_eax_live_at_start_p (void)
8453 /* Cheat. Don't bother working forward from ix86_function_regparm
8454 to the function type to whether an actual argument is located in
8455 eax. Instead just look at cfg info, which is still close enough
8456 to correct at this point. This gives false positives for broken
8457 functions that might use uninitialized data that happens to be
8458 allocated in eax, but who cares? */
8459 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
8462 static bool
8463 ix86_keep_aggregate_return_pointer (tree fntype)
8465 tree attr;
8467 if (!TARGET_64BIT)
8469 attr = lookup_attribute ("callee_pop_aggregate_return",
8470 TYPE_ATTRIBUTES (fntype));
8471 if (attr)
8472 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
8474 /* For 32-bit MS-ABI the default is to keep aggregate
8475 return pointer. */
8476 if (ix86_function_type_abi (fntype) == MS_ABI)
8477 return true;
8479 return KEEP_AGGREGATE_RETURN_POINTER != 0;
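/* Illustration: a hypothetical declaration
	__attribute__((callee_pop_aggregate_return (0))) struct big f (void);
   (STRUCT BIG being any aggregate returned in memory) leaves the
   hidden return-pointer slot for the caller to pop, while a value of 1
   makes the callee pop it.  */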
8482 /* Value is the number of bytes of arguments automatically
8483 popped when returning from a subroutine call.
8484 FUNDECL is the declaration node of the function (as a tree),
8485 FUNTYPE is the data type of the function (as a tree),
8486 or for a library call it is an identifier node for the subroutine name.
8487 SIZE is the number of bytes of arguments passed on the stack.
8489 On the 80386, the RTD insn may be used to pop them if the number
8490 of args is fixed, but if the number is variable then the caller
8491 must pop them all. RTD can't be used for library calls now
8492 because the library is compiled with the Unix compiler.
8493 Use of RTD is a selectable option, since it is incompatible with
8494 standard Unix calling sequences. If the option is not selected,
8495 the caller must always pop the args.
8497 The attribute stdcall is equivalent to RTD on a per module basis. */
8499 static int
8500 ix86_return_pops_args (tree fundecl, tree funtype, int size)
8502 unsigned int ccvt;
8504 /* None of the 64-bit ABIs pop arguments. */
8505 if (TARGET_64BIT)
8506 return 0;
8508 ccvt = ix86_get_callcvt (funtype);
8510 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
8511 | IX86_CALLCVT_THISCALL)) != 0
8512 && ! stdarg_p (funtype))
8513 return size;
8515 /* Lose any fake structure return argument if it is passed on the stack. */
8516 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
8517 && !ix86_keep_aggregate_return_pointer (funtype))
8519 int nregs = ix86_function_regparm (funtype, fundecl);
8520 if (nregs == 0)
8521 return GET_MODE_SIZE (Pmode);
8524 return 0;
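/* Illustration: a hypothetical
	__attribute__((stdcall)) void f (int a, int b);
   returns with `ret $8', popping both stack arguments, whereas the
   default cdecl convention uses a plain `ret' and leaves the cleanup
   to the caller.  */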
8527 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
8529 static bool
8530 ix86_legitimate_combined_insn (rtx_insn *insn)
8532 int i;
8534 /* Check operand constraints in case hard registers were propagated
8535 into the insn pattern. This check prevents the combine pass from
8536 generating insn patterns with invalid hard register operands.
8537 These invalid insns can eventually confuse reload into erroring out
8538 with a spill failure. See also PRs 46829 and 46843. */
8540 gcc_assert (INSN_CODE (insn) >= 0);
8542 extract_insn (insn);
8543 preprocess_constraints (insn);
8545 int n_operands = recog_data.n_operands;
8546 int n_alternatives = recog_data.n_alternatives;
8547 for (i = 0; i < n_operands; i++)
8549 rtx op = recog_data.operand[i];
8550 machine_mode mode = GET_MODE (op);
8551 const operand_alternative *op_alt;
8552 int offset = 0;
8553 bool win;
8554 int j;
8556 /* A unary operator may be accepted by the predicate, but it
8557 is irrelevant for matching constraints. */
8558 if (UNARY_P (op))
8559 op = XEXP (op, 0);
8561 if (SUBREG_P (op))
8563 if (REG_P (SUBREG_REG (op))
8564 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
8565 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
8566 GET_MODE (SUBREG_REG (op)),
8567 SUBREG_BYTE (op),
8568 GET_MODE (op));
8569 op = SUBREG_REG (op);
8572 if (!(REG_P (op) && HARD_REGISTER_P (op)))
8573 continue;
8575 op_alt = recog_op_alt;
8577 /* Operand has no constraints, anything is OK. */
8578 win = !n_alternatives;
8580 alternative_mask preferred = get_preferred_alternatives (insn);
8581 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
8583 if (!TEST_BIT (preferred, j))
8584 continue;
8585 if (op_alt[i].anything_ok
8586 || (op_alt[i].matches != -1
8587 && operands_match_p
8588 (recog_data.operand[i],
8589 recog_data.operand[op_alt[i].matches]))
8590 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
8592 win = true;
8593 break;
8597 if (!win)
8598 return false;
8601 return true;
8604 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
8606 static unsigned HOST_WIDE_INT
8607 ix86_asan_shadow_offset (void)
8609 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
8610 : HOST_WIDE_INT_C (0x7fff8000))
8611 : (HOST_WIDE_INT_1 << 29);
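/* Sketch of the resulting mapping (standard ASan layout, shadow scale
   of 3): on 64-bit Linux the shadow byte for ADDR is found at
	(ADDR >> 3) + 0x7fff8000
   while 32-bit targets use the 1 << 29 offset returned above.  */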
8614 /* Argument support functions. */
8616 /* Return true when register may be used to pass function parameters. */
8617 bool
8618 ix86_function_arg_regno_p (int regno)
8620 int i;
8621 enum calling_abi call_abi;
8622 const int *parm_regs;
8624 if (TARGET_MPX && BND_REGNO_P (regno))
8625 return true;
8627 if (!TARGET_64BIT)
8629 if (TARGET_MACHO)
8630 return (regno < REGPARM_MAX
8631 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
8632 else
8633 return (regno < REGPARM_MAX
8634 || (TARGET_MMX && MMX_REGNO_P (regno)
8635 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
8636 || (TARGET_SSE && SSE_REGNO_P (regno)
8637 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
8640 if (TARGET_SSE && SSE_REGNO_P (regno)
8641 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
8642 return true;
8644 /* TODO: The function should depend on current function ABI but
8645 builtins.c would need updating then. Therefore we use the
8646 default ABI. */
8647 call_abi = ix86_cfun_abi ();
8649 /* RAX is used as hidden argument to va_arg functions. */
8650 if (call_abi == SYSV_ABI && regno == AX_REG)
8651 return true;
8653 if (call_abi == MS_ABI)
8654 parm_regs = x86_64_ms_abi_int_parameter_registers;
8655 else
8656 parm_regs = x86_64_int_parameter_registers;
8658 for (i = 0; i < (call_abi == MS_ABI
8659 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
8660 if (regno == parm_regs[i])
8661 return true;
8662 return false;
8665 /* Return true if we do not know how to pass TYPE solely in registers. */
8667 static bool
8668 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
8670 if (must_pass_in_stack_var_size_or_pad (mode, type))
8671 return true;
8673 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
8674 The layout_type routine is crafty and tries to trick us into passing
8675 currently unsupported vector types on the stack by using TImode. */
8676 return (!TARGET_64BIT && mode == TImode
8677 && type && TREE_CODE (type) != VECTOR_TYPE);
8680 /* Return the size, in bytes, of the area reserved for arguments passed
8681 in registers for the function represented by FNDECL, depending on the
8682 ABI used. */
8683 int
8684 ix86_reg_parm_stack_space (const_tree fndecl)
8686 enum calling_abi call_abi = SYSV_ABI;
8687 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
8688 call_abi = ix86_function_abi (fndecl);
8689 else
8690 call_abi = ix86_function_type_abi (fndecl);
8691 if (TARGET_64BIT && call_abi == MS_ABI)
8692 return 32;
8693 return 0;
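/* Illustrative note: the 32 bytes returned for the 64-bit MS ABI are
   the "home area" a caller must always reserve for the four register
   arguments (%rcx, %rdx, %r8, %r9), even for a hypothetical call
	f (1, 2);
   that uses fewer than four of them.  */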
8696 /* We add this as a workaround in order to use libc_has_function
8697 hook in i386.md. */
8698 bool
8699 ix86_libc_has_function (enum function_class fn_class)
8701 return targetm.libc_has_function (fn_class);
8704 /* Returns value SYSV_ABI, MS_ABI dependent on fntype,
8705 specifying the call abi used. */
8706 enum calling_abi
8707 ix86_function_type_abi (const_tree fntype)
8709 enum calling_abi abi = ix86_abi;
8711 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
8712 return abi;
8714 if (abi == SYSV_ABI
8715 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
8717 if (TARGET_X32)
8718 error ("X32 does not support ms_abi attribute");
8720 abi = MS_ABI;
8722 else if (abi == MS_ABI
8723 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
8724 abi = SYSV_ABI;
8726 return abi;
8729 static enum calling_abi
8730 ix86_function_abi (const_tree fndecl)
8732 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
8735 /* Returns value SYSV_ABI, MS_ABI dependent on cfun,
8736 specifying the call abi used. */
8737 enum calling_abi
8738 ix86_cfun_abi (void)
8740 return cfun ? cfun->machine->call_abi : ix86_abi;
8743 static bool
8744 ix86_function_ms_hook_prologue (const_tree fn)
8746 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
8748 if (decl_function_context (fn) != NULL_TREE)
8749 error_at (DECL_SOURCE_LOCATION (fn),
8750 "ms_hook_prologue is not compatible with nested function");
8751 else
8752 return true;
8754 return false;
8757 static bool
8758 ix86_function_naked (const_tree fn)
8760 if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn)))
8761 return true;
8763 return false;
8766 /* Write the extra assembler code needed to declare a function properly. */
8768 void
8769 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
8770 tree decl)
8772 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
8774 if (is_ms_hook)
8776 int i, filler_count = (TARGET_64BIT ? 32 : 16);
8777 unsigned int filler_cc = 0xcccccccc;
8779 for (i = 0; i < filler_count; i += 4)
8780 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
8783 #ifdef SUBTARGET_ASM_UNWIND_INIT
8784 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
8785 #endif
8787 ASM_OUTPUT_LABEL (asm_out_file, fname);
8789 /* Output magic byte marker, if hot-patch attribute is set. */
8790 if (is_ms_hook)
8792 if (TARGET_64BIT)
8794 /* leaq [%rsp + 0], %rsp */
8795 fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n",
8796 asm_out_file);
8798 else
8800 /* movl.s %edi, %edi
8801 push %ebp
8802 movl.s %esp, %ebp */
8803 fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file);
8808 /* Implementation of call abi switching target hook. Specific to FNDECL
8809 the specific call register sets are set. See also
8810 ix86_conditional_register_usage for more details. */
8811 void
8812 ix86_call_abi_override (const_tree fndecl)
8814 cfun->machine->call_abi = ix86_function_abi (fndecl);
8817 /* Return true if a pseudo register should be created and used to hold
8818 the GOT address for PIC code. */
8819 bool
8820 ix86_use_pseudo_pic_reg (void)
8822 if ((TARGET_64BIT
8823 && (ix86_cmodel == CM_SMALL_PIC
8824 || TARGET_PECOFF))
8825 || !flag_pic)
8826 return false;
8827 return true;
8830 /* Initialize large model PIC register. */
8832 static void
8833 ix86_init_large_pic_reg (unsigned int tmp_regno)
8835 rtx_code_label *label;
8836 rtx tmp_reg;
8838 gcc_assert (Pmode == DImode);
8839 label = gen_label_rtx ();
8840 emit_label (label);
8841 LABEL_PRESERVE_P (label) = 1;
8842 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
8843 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
8844 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
8845 label));
8846 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
8847 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
8848 pic_offset_table_rtx, tmp_reg));
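/* The sequence emitted above is roughly (sketch only, register names
   shown for concreteness; the PIC register is still a pseudo here):
	.L1:	lea	.L1(%rip), %rbx
		movabs	$_GLOBAL_OFFSET_TABLE_-.L1, %r11
		add	%r11, %rbx
   leaving the GOT address in the PIC register.  */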
8851 /* Create and initialize PIC register if required. */
8852 static void
8853 ix86_init_pic_reg (void)
8855 edge entry_edge;
8856 rtx_insn *seq;
8858 if (!ix86_use_pseudo_pic_reg ())
8859 return;
8861 start_sequence ();
8863 if (TARGET_64BIT)
8865 if (ix86_cmodel == CM_LARGE_PIC)
8866 ix86_init_large_pic_reg (R11_REG);
8867 else
8868 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
8870 else
8872 /* If there is a future mcount call in the function, it is more profitable
8873 to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM. */
8874 rtx reg = crtl->profile
8875 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
8876 : pic_offset_table_rtx;
8877 rtx_insn *insn = emit_insn (gen_set_got (reg));
8878 RTX_FRAME_RELATED_P (insn) = 1;
8879 if (crtl->profile)
8880 emit_move_insn (pic_offset_table_rtx, reg);
8881 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
8884 seq = get_insns ();
8885 end_sequence ();
8887 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8888 insert_insn_on_edge (seq, entry_edge);
8889 commit_one_edge_insertion (entry_edge);
8892 /* Initialize a variable CUM of type CUMULATIVE_ARGS
8893 for a call to a function whose data type is FNTYPE.
8894 For a library call, FNTYPE is 0. */
8896 void
8897 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
8898 tree fntype, /* tree ptr for function decl */
8899 rtx libname, /* SYMBOL_REF of library name or 0 */
8900 tree fndecl,
8901 int caller)
8903 struct cgraph_local_info *i = NULL;
8904 struct cgraph_node *target = NULL;
8906 memset (cum, 0, sizeof (*cum));
8908 if (fndecl)
8910 target = cgraph_node::get (fndecl);
8911 if (target)
8913 target = target->function_symbol ();
8914 i = cgraph_node::local_info (target->decl);
8915 cum->call_abi = ix86_function_abi (target->decl);
8917 else
8918 cum->call_abi = ix86_function_abi (fndecl);
8920 else
8921 cum->call_abi = ix86_function_type_abi (fntype);
8923 cum->caller = caller;
8925 /* Set up the number of registers to use for passing arguments. */
8926 cum->nregs = ix86_regparm;
8927 if (TARGET_64BIT)
8929 cum->nregs = (cum->call_abi == SYSV_ABI
8930 ? X86_64_REGPARM_MAX
8931 : X86_64_MS_REGPARM_MAX);
8933 if (TARGET_SSE)
8935 cum->sse_nregs = SSE_REGPARM_MAX;
8936 if (TARGET_64BIT)
8938 cum->sse_nregs = (cum->call_abi == SYSV_ABI
8939 ? X86_64_SSE_REGPARM_MAX
8940 : X86_64_MS_SSE_REGPARM_MAX);
8943 if (TARGET_MMX)
8944 cum->mmx_nregs = MMX_REGPARM_MAX;
8945 cum->warn_avx512f = true;
8946 cum->warn_avx = true;
8947 cum->warn_sse = true;
8948 cum->warn_mmx = true;
8950 /* Because types might mismatch between caller and callee, we need to
8951 use the actual type of the function for local calls.
8952 FIXME: cgraph_analyze can be told to actually record if a function uses
8953 va_start, so for local functions maybe_vaarg can be made more aggressive,
8954 helping K&R code.
8955 FIXME: once the type system is fixed, we won't need this code anymore. */
8956 if (i && i->local && i->can_change_signature)
8957 fntype = TREE_TYPE (target->decl);
8958 cum->stdarg = stdarg_p (fntype);
8959 cum->maybe_vaarg = (fntype
8960 ? (!prototype_p (fntype) || stdarg_p (fntype))
8961 : !libname);
8963 cum->bnd_regno = FIRST_BND_REG;
8964 cum->bnds_in_bt = 0;
8965 cum->force_bnd_pass = 0;
8966 cum->decl = fndecl;
8968 if (!TARGET_64BIT)
8970 /* If there are variable arguments, then we won't pass anything
8971 in registers in 32-bit mode. */
8972 if (stdarg_p (fntype))
8974 cum->nregs = 0;
8975 /* Since in 32-bit mode variable arguments are always passed on
8976 the stack, there is a scratch register available for an indirect
8977 sibcall. */
8978 cfun->machine->arg_reg_available = true;
8979 cum->sse_nregs = 0;
8980 cum->mmx_nregs = 0;
8981 cum->warn_avx512f = false;
8982 cum->warn_avx = false;
8983 cum->warn_sse = false;
8984 cum->warn_mmx = false;
8985 return;
8988 /* Use ecx and edx registers if function has fastcall attribute,
8989 else look for regparm information. */
8990 if (fntype)
8992 unsigned int ccvt = ix86_get_callcvt (fntype);
8993 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
8995 cum->nregs = 1;
8996 cum->fastcall = 1; /* Same first register as in fastcall. */
8998 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
9000 cum->nregs = 2;
9001 cum->fastcall = 1;
9003 else
9004 cum->nregs = ix86_function_regparm (fntype, fndecl);
9007 /* Set up the number of SSE registers used for passing SFmode
9008 and DFmode arguments. Warn for mismatching ABI. */
9009 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
9012 cfun->machine->arg_reg_available = (cum->nregs > 0);
9015 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
9016 But in the case of vector types, it is some vector mode.
9018 When we have only some of our vector isa extensions enabled, then there
9019 are some modes for which vector_mode_supported_p is false. For these
9020 modes, the generic vector support in gcc will choose some non-vector mode
9021 in order to implement the type. By computing the natural mode, we'll
9022 select the proper ABI location for the operand and not depend on whatever
9023 the middle-end decides to do with these vector types.
9025 The middle-end can't deal with vector types > 16 bytes. In this
9026 case, we return the original mode and warn about the ABI change if
9027 CUM isn't NULL.
9029 If IN_RETURN is true, warn about the ABI change if the vector mode
9030 isn't available for the function return value. */
9032 static machine_mode
9033 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
9034 bool in_return)
9036 machine_mode mode = TYPE_MODE (type);
9038 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
9040 HOST_WIDE_INT size = int_size_in_bytes (type);
9041 if ((size == 8 || size == 16 || size == 32 || size == 64)
9042 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
9043 && TYPE_VECTOR_SUBPARTS (type) > 1)
9045 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
9047 /* There are no XFmode vector modes. */
9048 if (innermode == XFmode)
9049 return mode;
9051 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
9052 mode = MIN_MODE_VECTOR_FLOAT;
9053 else
9054 mode = MIN_MODE_VECTOR_INT;
9056 /* Get the mode which has this inner mode and number of units. */
9057 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
9058 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
9059 && GET_MODE_INNER (mode) == innermode)
9061 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
9063 static bool warnedavx512f;
9064 static bool warnedavx512f_ret;
9066 if (cum && cum->warn_avx512f && !warnedavx512f)
9068 if (warning (OPT_Wpsabi, "AVX512F vector argument "
9069 "without AVX512F enabled changes the ABI"))
9070 warnedavx512f = true;
9072 else if (in_return && !warnedavx512f_ret)
9074 if (warning (OPT_Wpsabi, "AVX512F vector return "
9075 "without AVX512F enabled changes the ABI"))
9076 warnedavx512f_ret = true;
9079 return TYPE_MODE (type);
9081 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
9083 static bool warnedavx;
9084 static bool warnedavx_ret;
9086 if (cum && cum->warn_avx && !warnedavx)
9088 if (warning (OPT_Wpsabi, "AVX vector argument "
9089 "without AVX enabled changes the ABI"))
9090 warnedavx = true;
9092 else if (in_return && !warnedavx_ret)
9094 if (warning (OPT_Wpsabi, "AVX vector return "
9095 "without AVX enabled changes the ABI"))
9096 warnedavx_ret = true;
9099 return TYPE_MODE (type);
9101 else if (((size == 8 && TARGET_64BIT) || size == 16)
9102 && !TARGET_SSE
9103 && !TARGET_IAMCU)
9105 static bool warnedsse;
9106 static bool warnedsse_ret;
9108 if (cum && cum->warn_sse && !warnedsse)
9110 if (warning (OPT_Wpsabi, "SSE vector argument "
9111 "without SSE enabled changes the ABI"))
9112 warnedsse = true;
9114 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
9116 if (warning (OPT_Wpsabi, "SSE vector return "
9117 "without SSE enabled changes the ABI"))
9118 warnedsse_ret = true;
9121 else if ((size == 8 && !TARGET_64BIT)
9122 && (!cfun
9123 || cfun->machine->func_type == TYPE_NORMAL)
9124 && !TARGET_MMX
9125 && !TARGET_IAMCU)
9127 static bool warnedmmx;
9128 static bool warnedmmx_ret;
9130 if (cum && cum->warn_mmx && !warnedmmx)
9132 if (warning (OPT_Wpsabi, "MMX vector argument "
9133 "without MMX enabled changes the ABI"))
9134 warnedmmx = true;
9136 else if (in_return && !warnedmmx_ret)
9138 if (warning (OPT_Wpsabi, "MMX vector return "
9139 "without MMX enabled changes the ABI"))
9140 warnedmmx_ret = true;
9143 return mode;
9146 gcc_unreachable ();
9150 return mode;
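/* Illustration: compiling a hypothetical
	typedef int v8si __attribute__((vector_size (32)));
	void f (v8si x);
   without -mavx takes the size == 32 path above, warns that an "AVX
   vector argument without AVX enabled changes the ABI" and falls back
   to TYPE_MODE for the argument.  */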
9153 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
9154 this may not agree with the mode that the type system has chosen for the
9155 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
9156 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
9158 static rtx
9159 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
9160 unsigned int regno)
9162 rtx tmp;
9164 if (orig_mode != BLKmode)
9165 tmp = gen_rtx_REG (orig_mode, regno);
9166 else
9168 tmp = gen_rtx_REG (mode, regno);
9169 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
9170 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
9173 return tmp;
9176 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
9177 of this code is to classify each 8bytes of incoming argument by the register
9178 class and assign registers accordingly. */
9180 /* Return the union class of CLASS1 and CLASS2.
9181 See the x86-64 PS ABI for details. */
9183 static enum x86_64_reg_class
9184 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
9186 /* Rule #1: If both classes are equal, this is the resulting class. */
9187 if (class1 == class2)
9188 return class1;
9190 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
9191 the other class. */
9192 if (class1 == X86_64_NO_CLASS)
9193 return class2;
9194 if (class2 == X86_64_NO_CLASS)
9195 return class1;
9197 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
9198 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
9199 return X86_64_MEMORY_CLASS;
9201 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
9202 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
9203 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
9204 return X86_64_INTEGERSI_CLASS;
9205 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
9206 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
9207 return X86_64_INTEGER_CLASS;
9209 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
9210 MEMORY is used. */
9211 if (class1 == X86_64_X87_CLASS
9212 || class1 == X86_64_X87UP_CLASS
9213 || class1 == X86_64_COMPLEX_X87_CLASS
9214 || class2 == X86_64_X87_CLASS
9215 || class2 == X86_64_X87UP_CLASS
9216 || class2 == X86_64_COMPLEX_X87_CLASS)
9217 return X86_64_MEMORY_CLASS;
9219 /* Rule #6: Otherwise class SSE is used. */
9220 return X86_64_SSE_CLASS;
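/* Worked example: for a hypothetical
	union u { double d; long l; };
   the single eightbyte is SSEDF for the double and INTEGER for the
   long; rule #4 above merges them to INTEGER, so the union is passed
   in a general-purpose register.  */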
9223 /* Classify the argument of type TYPE and mode MODE.
9224 CLASSES will be filled by the register class used to pass each word
9225 of the operand. The number of words is returned. In case the parameter
9226 should be passed in memory, 0 is returned. As a special case for zero
9227 sized containers, classes[0] will be NO_CLASS and 1 is returned.
9229 BIT_OFFSET is used internally for handling records and specifies the
9230 offset into the record in bits, taken modulo 512, to avoid overflow cases.
9232 See the x86-64 PS ABI for details.
9235 static int
9236 classify_argument (machine_mode mode, const_tree type,
9237 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
9239 HOST_WIDE_INT bytes =
9240 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
9241 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
9243 /* Variable sized entities are always passed/returned in memory. */
9244 if (bytes < 0)
9245 return 0;
9247 if (mode != VOIDmode
9248 && targetm.calls.must_pass_in_stack (mode, type))
9249 return 0;
9251 if (type && AGGREGATE_TYPE_P (type))
9253 int i;
9254 tree field;
9255 enum x86_64_reg_class subclasses[MAX_CLASSES];
9257 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
9258 if (bytes > 64)
9259 return 0;
9261 for (i = 0; i < words; i++)
9262 classes[i] = X86_64_NO_CLASS;
9264 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
9265 signal the memory class, so handle this as a special case. */
9266 if (!words)
9268 classes[0] = X86_64_NO_CLASS;
9269 return 1;
9272 /* Classify each field of record and merge classes. */
9273 switch (TREE_CODE (type))
9275 case RECORD_TYPE:
9276 /* And now merge the fields of structure. */
9277 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
9279 if (TREE_CODE (field) == FIELD_DECL)
9281 int num;
9283 if (TREE_TYPE (field) == error_mark_node)
9284 continue;
9286 /* Bitfields are always classified as integer. Handle them
9287 early, since later code would consider them to be
9288 misaligned integers. */
9289 if (DECL_BIT_FIELD (field))
9291 for (i = (int_bit_position (field)
9292 + (bit_offset % 64)) / 8 / 8;
9293 i < ((int_bit_position (field) + (bit_offset % 64))
9294 + tree_to_shwi (DECL_SIZE (field))
9295 + 63) / 8 / 8; i++)
9296 classes[i] =
9297 merge_classes (X86_64_INTEGER_CLASS,
9298 classes[i]);
9300 else
9302 int pos;
9304 type = TREE_TYPE (field);
9306 /* Flexible array member is ignored. */
9307 if (TYPE_MODE (type) == BLKmode
9308 && TREE_CODE (type) == ARRAY_TYPE
9309 && TYPE_SIZE (type) == NULL_TREE
9310 && TYPE_DOMAIN (type) != NULL_TREE
9311 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
9312 == NULL_TREE))
9314 static bool warned;
9316 if (!warned && warn_psabi)
9318 warned = true;
9319 inform (input_location,
9320 "the ABI of passing struct with"
9321 " a flexible array member has"
9322 " changed in GCC 4.4");
9324 continue;
9326 num = classify_argument (TYPE_MODE (type), type,
9327 subclasses,
9328 (int_bit_position (field)
9329 + bit_offset) % 512);
9330 if (!num)
9331 return 0;
9332 pos = (int_bit_position (field)
9333 + (bit_offset % 64)) / 8 / 8;
9334 for (i = 0; i < num && (i + pos) < words; i++)
9335 classes[i + pos] =
9336 merge_classes (subclasses[i], classes[i + pos]);
9340 break;
9342 case ARRAY_TYPE:
9343 /* Arrays are handled as small records. */
9345 int num;
9346 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
9347 TREE_TYPE (type), subclasses, bit_offset);
9348 if (!num)
9349 return 0;
9351 /* The partial classes are now full classes. */
9352 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
9353 subclasses[0] = X86_64_SSE_CLASS;
9354 if (subclasses[0] == X86_64_INTEGERSI_CLASS
9355 && !((bit_offset % 64) == 0 && bytes == 4))
9356 subclasses[0] = X86_64_INTEGER_CLASS;
9358 for (i = 0; i < words; i++)
9359 classes[i] = subclasses[i % num];
9361 break;
9363 case UNION_TYPE:
9364 case QUAL_UNION_TYPE:
9365 /* Unions are similar to RECORD_TYPE but offset is always 0. */
9367 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
9369 if (TREE_CODE (field) == FIELD_DECL)
9371 int num;
9373 if (TREE_TYPE (field) == error_mark_node)
9374 continue;
9376 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
9377 TREE_TYPE (field), subclasses,
9378 bit_offset);
9379 if (!num)
9380 return 0;
9381 for (i = 0; i < num && i < words; i++)
9382 classes[i] = merge_classes (subclasses[i], classes[i]);
9385 break;
9387 default:
9388 gcc_unreachable ();
9391 if (words > 2)
9393 /* When size > 16 bytes, if the first class isn't
9394 X86_64_SSE_CLASS or any of the others isn't
9395 X86_64_SSEUP_CLASS, everything should be passed in
9396 memory. */
9397 if (classes[0] != X86_64_SSE_CLASS)
9398 return 0;
9400 for (i = 1; i < words; i++)
9401 if (classes[i] != X86_64_SSEUP_CLASS)
9402 return 0;
9405 /* Final merger cleanup. */
9406 for (i = 0; i < words; i++)
9408 /* If one class is MEMORY, everything should be passed in
9409 memory. */
9410 if (classes[i] == X86_64_MEMORY_CLASS)
9411 return 0;
9413 /* The X86_64_SSEUP_CLASS should be always preceded by
9414 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
9415 if (classes[i] == X86_64_SSEUP_CLASS
9416 && classes[i - 1] != X86_64_SSE_CLASS
9417 && classes[i - 1] != X86_64_SSEUP_CLASS)
9419 /* The first one should never be X86_64_SSEUP_CLASS. */
9420 gcc_assert (i != 0);
9421 classes[i] = X86_64_SSE_CLASS;
9424 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
9425 everything should be passed in memory. */
9426 if (classes[i] == X86_64_X87UP_CLASS
9427 && (classes[i - 1] != X86_64_X87_CLASS))
9429 static bool warned;
9431 /* The first one should never be X86_64_X87UP_CLASS. */
9432 gcc_assert (i != 0);
9433 if (!warned && warn_psabi)
9435 warned = true;
9436 inform (input_location,
9437 "the ABI of passing union with long double"
9438 " has changed in GCC 4.4");
9440 return 0;
9443 return words;
9446 /* Compute the alignment needed. We align all types to natural boundaries
9447 with the exception of XFmode, which is aligned to 64 bits. */
9448 if (mode != VOIDmode && mode != BLKmode)
9450 int mode_alignment = GET_MODE_BITSIZE (mode);
9452 if (mode == XFmode)
9453 mode_alignment = 128;
9454 else if (mode == XCmode)
9455 mode_alignment = 256;
9456 if (COMPLEX_MODE_P (mode))
9457 mode_alignment /= 2;
9458 /* Misaligned fields are always returned in memory. */
9459 if (bit_offset % mode_alignment)
9460 return 0;
9463 /* for V1xx modes, just use the base mode */
9464 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
9465 && GET_MODE_UNIT_SIZE (mode) == bytes)
9466 mode = GET_MODE_INNER (mode);
9468 /* Classification of atomic types. */
9469 switch (mode)
9471 case SDmode:
9472 case DDmode:
9473 classes[0] = X86_64_SSE_CLASS;
9474 return 1;
9475 case TDmode:
9476 classes[0] = X86_64_SSE_CLASS;
9477 classes[1] = X86_64_SSEUP_CLASS;
9478 return 2;
9479 case DImode:
9480 case SImode:
9481 case HImode:
9482 case QImode:
9483 case CSImode:
9484 case CHImode:
9485 case CQImode:
9487 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
9489 /* Analyze last 128 bits only. */
9490 size = (size - 1) & 0x7f;
9492 if (size < 32)
9494 classes[0] = X86_64_INTEGERSI_CLASS;
9495 return 1;
9497 else if (size < 64)
9499 classes[0] = X86_64_INTEGER_CLASS;
9500 return 1;
9502 else if (size < 64+32)
9504 classes[0] = X86_64_INTEGER_CLASS;
9505 classes[1] = X86_64_INTEGERSI_CLASS;
9506 return 2;
9508 else if (size < 64+64)
9510 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
9511 return 2;
9513 else
9514 gcc_unreachable ();
9516 case CDImode:
9517 case TImode:
9518 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
9519 return 2;
9520 case COImode:
9521 case OImode:
9522 /* OImode shouldn't be used directly. */
9523 gcc_unreachable ();
9524 case CTImode:
9525 return 0;
9526 case SFmode:
9527 if (!(bit_offset % 64))
9528 classes[0] = X86_64_SSESF_CLASS;
9529 else
9530 classes[0] = X86_64_SSE_CLASS;
9531 return 1;
9532 case DFmode:
9533 classes[0] = X86_64_SSEDF_CLASS;
9534 return 1;
9535 case XFmode:
9536 classes[0] = X86_64_X87_CLASS;
9537 classes[1] = X86_64_X87UP_CLASS;
9538 return 2;
9539 case TFmode:
9540 classes[0] = X86_64_SSE_CLASS;
9541 classes[1] = X86_64_SSEUP_CLASS;
9542 return 2;
9543 case SCmode:
9544 classes[0] = X86_64_SSE_CLASS;
9545 if (!(bit_offset % 64))
9546 return 1;
9547 else
9549 static bool warned;
9551 if (!warned && warn_psabi)
9553 warned = true;
9554 inform (input_location,
9555 "the ABI of passing structure with complex float"
9556 " member has changed in GCC 4.4");
9558 classes[1] = X86_64_SSESF_CLASS;
9559 return 2;
9561 case DCmode:
9562 classes[0] = X86_64_SSEDF_CLASS;
9563 classes[1] = X86_64_SSEDF_CLASS;
9564 return 2;
9565 case XCmode:
9566 classes[0] = X86_64_COMPLEX_X87_CLASS;
9567 return 1;
9568 case TCmode:
9569 /* This mode is larger than 16 bytes. */
9570 return 0;
9571 case V8SFmode:
9572 case V8SImode:
9573 case V32QImode:
9574 case V16HImode:
9575 case V4DFmode:
9576 case V4DImode:
9577 classes[0] = X86_64_SSE_CLASS;
9578 classes[1] = X86_64_SSEUP_CLASS;
9579 classes[2] = X86_64_SSEUP_CLASS;
9580 classes[3] = X86_64_SSEUP_CLASS;
9581 return 4;
9582 case V8DFmode:
9583 case V16SFmode:
9584 case V8DImode:
9585 case V16SImode:
9586 case V32HImode:
9587 case V64QImode:
9588 classes[0] = X86_64_SSE_CLASS;
9589 classes[1] = X86_64_SSEUP_CLASS;
9590 classes[2] = X86_64_SSEUP_CLASS;
9591 classes[3] = X86_64_SSEUP_CLASS;
9592 classes[4] = X86_64_SSEUP_CLASS;
9593 classes[5] = X86_64_SSEUP_CLASS;
9594 classes[6] = X86_64_SSEUP_CLASS;
9595 classes[7] = X86_64_SSEUP_CLASS;
9596 return 8;
9597 case V4SFmode:
9598 case V4SImode:
9599 case V16QImode:
9600 case V8HImode:
9601 case V2DFmode:
9602 case V2DImode:
9603 classes[0] = X86_64_SSE_CLASS;
9604 classes[1] = X86_64_SSEUP_CLASS;
9605 return 2;
9606 case V1TImode:
9607 case V1DImode:
9608 case V2SFmode:
9609 case V2SImode:
9610 case V4HImode:
9611 case V8QImode:
9612 classes[0] = X86_64_SSE_CLASS;
9613 return 1;
9614 case BLKmode:
9615 case VOIDmode:
9616 return 0;
9617 default:
9618 gcc_assert (VECTOR_MODE_P (mode));
9620 if (bytes > 16)
9621 return 0;
9623 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
9625 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
9626 classes[0] = X86_64_INTEGERSI_CLASS;
9627 else
9628 classes[0] = X86_64_INTEGER_CLASS;
9629 classes[1] = X86_64_INTEGER_CLASS;
9630 return 1 + (bytes > 8);
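/* Worked example: for a hypothetical
	struct s { double x; long long y; };
   the first eightbyte is classified SSEDF and the second INTEGER, so
   on the SysV x86-64 ABI the struct is passed in one SSE and one
   general-purpose register (e.g. %xmm0 and %rdi).  */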
9634 /* Examine the argument and return set number of register required in each
9635 class. Return true iff parameter should be passed in memory. */
9637 static bool
9638 examine_argument (machine_mode mode, const_tree type, int in_return,
9639 int *int_nregs, int *sse_nregs)
9641 enum x86_64_reg_class regclass[MAX_CLASSES];
9642 int n = classify_argument (mode, type, regclass, 0);
9644 *int_nregs = 0;
9645 *sse_nregs = 0;
9647 if (!n)
9648 return true;
9649 for (n--; n >= 0; n--)
9650 switch (regclass[n])
9652 case X86_64_INTEGER_CLASS:
9653 case X86_64_INTEGERSI_CLASS:
9654 (*int_nregs)++;
9655 break;
9656 case X86_64_SSE_CLASS:
9657 case X86_64_SSESF_CLASS:
9658 case X86_64_SSEDF_CLASS:
9659 (*sse_nregs)++;
9660 break;
9661 case X86_64_NO_CLASS:
9662 case X86_64_SSEUP_CLASS:
9663 break;
9664 case X86_64_X87_CLASS:
9665 case X86_64_X87UP_CLASS:
9666 case X86_64_COMPLEX_X87_CLASS:
9667 if (!in_return)
9668 return true;
9669 break;
9670 case X86_64_MEMORY_CLASS:
9671 gcc_unreachable ();
9674 return false;
9677 /* Construct container for the argument used by GCC interface. See
9678 FUNCTION_ARG for the detailed description. */
9680 static rtx
9681 construct_container (machine_mode mode, machine_mode orig_mode,
9682 const_tree type, int in_return, int nintregs, int nsseregs,
9683 const int *intreg, int sse_regno)
9685 /* The following variables hold the static issued_error state. */
9686 static bool issued_sse_arg_error;
9687 static bool issued_sse_ret_error;
9688 static bool issued_x87_ret_error;
9690 machine_mode tmpmode;
9691 int bytes =
9692 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
9693 enum x86_64_reg_class regclass[MAX_CLASSES];
9694 int n;
9695 int i;
9696 int nexps = 0;
9697 int needed_sseregs, needed_intregs;
9698 rtx exp[MAX_CLASSES];
9699 rtx ret;
9701 n = classify_argument (mode, type, regclass, 0);
9702 if (!n)
9703 return NULL;
9704 if (examine_argument (mode, type, in_return, &needed_intregs,
9705 &needed_sseregs))
9706 return NULL;
9707 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
9708 return NULL;
9710 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
9711 some less clueful developer tries to use floating-point anyway. */
9712 if (needed_sseregs && !TARGET_SSE)
9714 if (in_return)
9716 if (!issued_sse_ret_error)
9718 error ("SSE register return with SSE disabled");
9719 issued_sse_ret_error = true;
9722 else if (!issued_sse_arg_error)
9724 error ("SSE register argument with SSE disabled");
9725 issued_sse_arg_error = true;
9727 return NULL;
9730 /* Likewise, error if the ABI requires us to return values in the
9731 x87 registers and the user specified -mno-80387. */
9732 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
9733 for (i = 0; i < n; i++)
9734 if (regclass[i] == X86_64_X87_CLASS
9735 || regclass[i] == X86_64_X87UP_CLASS
9736 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
9738 if (!issued_x87_ret_error)
9740 error ("x87 register return with x87 disabled");
9741 issued_x87_ret_error = true;
9743 return NULL;
9746 /* First construct simple cases. Avoid SCmode, since we want to use
9747 a single register to pass this type. */
9748 if (n == 1 && mode != SCmode)
9749 switch (regclass[0])
9751 case X86_64_INTEGER_CLASS:
9752 case X86_64_INTEGERSI_CLASS:
9753 return gen_rtx_REG (mode, intreg[0]);
9754 case X86_64_SSE_CLASS:
9755 case X86_64_SSESF_CLASS:
9756 case X86_64_SSEDF_CLASS:
9757 if (mode != BLKmode)
9758 return gen_reg_or_parallel (mode, orig_mode,
9759 SSE_REGNO (sse_regno));
9760 break;
9761 case X86_64_X87_CLASS:
9762 case X86_64_COMPLEX_X87_CLASS:
9763 return gen_rtx_REG (mode, FIRST_STACK_REG);
9764 case X86_64_NO_CLASS:
9765 /* Zero sized array, struct or class. */
9766 return NULL;
9767 default:
9768 gcc_unreachable ();
9770 if (n == 2
9771 && regclass[0] == X86_64_SSE_CLASS
9772 && regclass[1] == X86_64_SSEUP_CLASS
9773 && mode != BLKmode)
9774 return gen_reg_or_parallel (mode, orig_mode,
9775 SSE_REGNO (sse_regno));
9776 if (n == 4
9777 && regclass[0] == X86_64_SSE_CLASS
9778 && regclass[1] == X86_64_SSEUP_CLASS
9779 && regclass[2] == X86_64_SSEUP_CLASS
9780 && regclass[3] == X86_64_SSEUP_CLASS
9781 && mode != BLKmode)
9782 return gen_reg_or_parallel (mode, orig_mode,
9783 SSE_REGNO (sse_regno));
9784 if (n == 8
9785 && regclass[0] == X86_64_SSE_CLASS
9786 && regclass[1] == X86_64_SSEUP_CLASS
9787 && regclass[2] == X86_64_SSEUP_CLASS
9788 && regclass[3] == X86_64_SSEUP_CLASS
9789 && regclass[4] == X86_64_SSEUP_CLASS
9790 && regclass[5] == X86_64_SSEUP_CLASS
9791 && regclass[6] == X86_64_SSEUP_CLASS
9792 && regclass[7] == X86_64_SSEUP_CLASS
9793 && mode != BLKmode)
9794 return gen_reg_or_parallel (mode, orig_mode,
9795 SSE_REGNO (sse_regno));
9796 if (n == 2
9797 && regclass[0] == X86_64_X87_CLASS
9798 && regclass[1] == X86_64_X87UP_CLASS)
9799 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
9801 if (n == 2
9802 && regclass[0] == X86_64_INTEGER_CLASS
9803 && regclass[1] == X86_64_INTEGER_CLASS
9804 && (mode == CDImode || mode == TImode)
9805 && intreg[0] + 1 == intreg[1])
9806 return gen_rtx_REG (mode, intreg[0]);
9808 /* Otherwise figure out the entries of the PARALLEL. */
9809 for (i = 0; i < n; i++)
9811 int pos;
9813 switch (regclass[i])
9815 case X86_64_NO_CLASS:
9816 break;
9817 case X86_64_INTEGER_CLASS:
9818 case X86_64_INTEGERSI_CLASS:
9819 /* Merge TImodes on aligned occasions here too. */
9820 if (i * 8 + 8 > bytes)
9821 tmpmode
9822 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
9823 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
9824 tmpmode = SImode;
9825 else
9826 tmpmode = DImode;
9827 /* We've requested 24 bytes for which we
9828 don't have a mode. Use DImode. */
9829 if (tmpmode == BLKmode)
9830 tmpmode = DImode;
9831 exp [nexps++]
9832 = gen_rtx_EXPR_LIST (VOIDmode,
9833 gen_rtx_REG (tmpmode, *intreg),
9834 GEN_INT (i*8));
9835 intreg++;
9836 break;
9837 case X86_64_SSESF_CLASS:
9838 exp [nexps++]
9839 = gen_rtx_EXPR_LIST (VOIDmode,
9840 gen_rtx_REG (SFmode,
9841 SSE_REGNO (sse_regno)),
9842 GEN_INT (i*8));
9843 sse_regno++;
9844 break;
9845 case X86_64_SSEDF_CLASS:
9846 exp [nexps++]
9847 = gen_rtx_EXPR_LIST (VOIDmode,
9848 gen_rtx_REG (DFmode,
9849 SSE_REGNO (sse_regno)),
9850 GEN_INT (i*8));
9851 sse_regno++;
9852 break;
9853 case X86_64_SSE_CLASS:
9854 pos = i;
9855 switch (n)
9857 case 1:
9858 tmpmode = DImode;
9859 break;
9860 case 2:
9861 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
9863 tmpmode = TImode;
9864 i++;
9866 else
9867 tmpmode = DImode;
9868 break;
9869 case 4:
9870 gcc_assert (i == 0
9871 && regclass[1] == X86_64_SSEUP_CLASS
9872 && regclass[2] == X86_64_SSEUP_CLASS
9873 && regclass[3] == X86_64_SSEUP_CLASS);
9874 tmpmode = OImode;
9875 i += 3;
9876 break;
9877 case 8:
9878 gcc_assert (i == 0
9879 && regclass[1] == X86_64_SSEUP_CLASS
9880 && regclass[2] == X86_64_SSEUP_CLASS
9881 && regclass[3] == X86_64_SSEUP_CLASS
9882 && regclass[4] == X86_64_SSEUP_CLASS
9883 && regclass[5] == X86_64_SSEUP_CLASS
9884 && regclass[6] == X86_64_SSEUP_CLASS
9885 && regclass[7] == X86_64_SSEUP_CLASS);
9886 tmpmode = XImode;
9887 i += 7;
9888 break;
9889 default:
9890 gcc_unreachable ();
9892 exp [nexps++]
9893 = gen_rtx_EXPR_LIST (VOIDmode,
9894 gen_rtx_REG (tmpmode,
9895 SSE_REGNO (sse_regno)),
9896 GEN_INT (pos*8));
9897 sse_regno++;
9898 break;
9899 default:
9900 gcc_unreachable ();
9904 /* Empty aligned struct, union or class. */
9905 if (nexps == 0)
9906 return NULL;
9908 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
9909 for (i = 0; i < nexps; i++)
9910 XVECEXP (ret, 0, i) = exp [i];
9911 return ret;
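/* Sketch of the result: for a hypothetical argument of type
	struct s { long a; double b; };
   the container built above is roughly
	(parallel [(expr_list (reg:DI rdi) (const_int 0))
		   (expr_list (reg:DF xmm0) (const_int 8))])
   placing the integer eightbyte in a general-purpose register and the
   floating-point eightbyte in an SSE register.  */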
9914 /* Update the data in CUM to advance over an argument of mode MODE
9915 and data type TYPE. (TYPE is null for libcalls where that information
9916 may not be available.)
9918 Return the number of integer registers advanced over. */
9920 static int
9921 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
9922 const_tree type, HOST_WIDE_INT bytes,
9923 HOST_WIDE_INT words)
9925 int res = 0;
9926 bool error_p = false;
9928 if (TARGET_IAMCU)
9930 /* Intel MCU psABI passes scalars and aggregates no larger than 8
9931 bytes in registers. */
9932 if (!VECTOR_MODE_P (mode) && bytes <= 8)
9933 goto pass_in_reg;
9934 return res;
9937 switch (mode)
9939 default:
9940 break;
9942 case BLKmode:
9943 if (bytes < 0)
9944 break;
9945 /* FALLTHRU */
9947 case DImode:
9948 case SImode:
9949 case HImode:
9950 case QImode:
9951 pass_in_reg:
9952 cum->words += words;
9953 cum->nregs -= words;
9954 cum->regno += words;
9955 if (cum->nregs >= 0)
9956 res = words;
9957 if (cum->nregs <= 0)
9959 cum->nregs = 0;
9960 cfun->machine->arg_reg_available = false;
9961 cum->regno = 0;
9963 break;
9965 case OImode:
9966 /* OImode shouldn't be used directly. */
9967 gcc_unreachable ();
9969 case DFmode:
9970 if (cum->float_in_sse == -1)
9971 error_p = true;
9972 if (cum->float_in_sse < 2)
9973 break;
9974 /* FALLTHRU */
9975 case SFmode:
9976 if (cum->float_in_sse == -1)
9977 error_p = true;
9978 if (cum->float_in_sse < 1)
9979 break;
9980 /* FALLTHRU */
9982 case V8SFmode:
9983 case V8SImode:
9984 case V64QImode:
9985 case V32HImode:
9986 case V16SImode:
9987 case V8DImode:
9988 case V16SFmode:
9989 case V8DFmode:
9990 case V32QImode:
9991 case V16HImode:
9992 case V4DFmode:
9993 case V4DImode:
9994 case TImode:
9995 case V16QImode:
9996 case V8HImode:
9997 case V4SImode:
9998 case V2DImode:
9999 case V4SFmode:
10000 case V2DFmode:
10001 if (!type || !AGGREGATE_TYPE_P (type))
10003 cum->sse_words += words;
10004 cum->sse_nregs -= 1;
10005 cum->sse_regno += 1;
10006 if (cum->sse_nregs <= 0)
10008 cum->sse_nregs = 0;
10009 cum->sse_regno = 0;
10012 break;
10014 case V8QImode:
10015 case V4HImode:
10016 case V2SImode:
10017 case V2SFmode:
10018 case V1TImode:
10019 case V1DImode:
10020 if (!type || !AGGREGATE_TYPE_P (type))
10022 cum->mmx_words += words;
10023 cum->mmx_nregs -= 1;
10024 cum->mmx_regno += 1;
10025 if (cum->mmx_nregs <= 0)
10027 cum->mmx_nregs = 0;
10028 cum->mmx_regno = 0;
10031 break;
10033 if (error_p)
10035 cum->float_in_sse = 0;
10036 error ("calling %qD with SSE calling convention without "
10037 "SSE/SSE2 enabled", cum->decl);
10038 sorry ("this is a GCC bug that can be worked around by adding "
10039 "attribute used to function called");
10042 return res;
10045 static int
10046 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
10047 const_tree type, HOST_WIDE_INT words, bool named)
10049 int int_nregs, sse_nregs;
10051 /* Unnamed 512-bit and 256-bit vector mode parameters are passed on the stack. */
10052 if (!named && (VALID_AVX512F_REG_MODE (mode)
10053 || VALID_AVX256_REG_MODE (mode)))
10054 return 0;
10056 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
10057 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
10059 cum->nregs -= int_nregs;
10060 cum->sse_nregs -= sse_nregs;
10061 cum->regno += int_nregs;
10062 cum->sse_regno += sse_nregs;
10063 return int_nregs;
10065 else
10067 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
10068 cum->words = ROUND_UP (cum->words, align);
10069 cum->words += words;
10070 return 0;
10074 static int
10075 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
10076 HOST_WIDE_INT words)
10078 /* Otherwise, this should be passed indirectly. */
10079 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
10081 cum->words += words;
10082 if (cum->nregs > 0)
10084 cum->nregs -= 1;
10085 cum->regno += 1;
10086 return 1;
10088 return 0;
10091 /* Update the data in CUM to advance over an argument of mode MODE and
10092 data type TYPE. (TYPE is null for libcalls where that information
10093 may not be available.) */
10095 static void
10096 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
10097 const_tree type, bool named)
10099 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10100 HOST_WIDE_INT bytes, words;
10101 int nregs;
10103 /* The argument of an interrupt handler is a special case and is
10104 handled in ix86_function_arg. */
10105 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
10106 return;
10108 if (mode == BLKmode)
10109 bytes = int_size_in_bytes (type);
10110 else
10111 bytes = GET_MODE_SIZE (mode);
10112 words = CEIL (bytes, UNITS_PER_WORD);
10114 if (type)
10115 mode = type_natural_mode (type, NULL, false);
10117 if ((type && POINTER_BOUNDS_TYPE_P (type))
10118 || POINTER_BOUNDS_MODE_P (mode))
10120 /* If we pass bounds in the BT then just update the remaining bounds count. */
10121 if (cum->bnds_in_bt)
10123 cum->bnds_in_bt--;
10124 return;
10127 /* Update the remaining number of bounds to force. */
10128 if (cum->force_bnd_pass)
10129 cum->force_bnd_pass--;
10131 cum->bnd_regno++;
10133 return;
10136 /* The first arg not going to Bounds Tables resets this counter. */
10137 cum->bnds_in_bt = 0;
10138 /* For unnamed args we always pass bounds to avoid a bounds mismatch when
10139 the passed and received types do not match. If bounds do not follow an
10140 unnamed arg, still pretend the required number of bounds were passed. */
10141 if (cum->force_bnd_pass)
10143 cum->bnd_regno += cum->force_bnd_pass;
10144 cum->force_bnd_pass = 0;
10147 if (TARGET_64BIT)
10149 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10151 if (call_abi == MS_ABI)
10152 nregs = function_arg_advance_ms_64 (cum, bytes, words);
10153 else
10154 nregs = function_arg_advance_64 (cum, mode, type, words, named);
10156 else
10157 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
10159 /* For stdarg we expect bounds to be passed for each value passed
10160 in register. */
10161 if (cum->stdarg)
10162 cum->force_bnd_pass = nregs;
10163 /* For pointers passed in memory we expect bounds passed in Bounds
10164 Table. */
10165 if (!nregs)
10167 /* Track if there are outgoing arguments on stack. */
10168 if (cum->caller)
10169 cfun->machine->outgoing_args_on_stack = true;
10171 cum->bnds_in_bt = chkp_type_bounds_count (type);
10175 /* Define where to put the arguments to a function.
10176 Value is zero to push the argument on the stack,
10177 or a hard register in which to store the argument.
10179 MODE is the argument's machine mode.
10180 TYPE is the data type of the argument (as a tree).
10181 This is null for libcalls where that information may
10182 not be available.
10183 CUM is a variable of type CUMULATIVE_ARGS which gives info about
10184 the preceding args and about the function being called.
10185 NAMED is nonzero if this argument is a named parameter
10186 (otherwise it is an extra parameter matching an ellipsis). */
10188 static rtx
10189 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
10190 machine_mode orig_mode, const_tree type,
10191 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
10193 bool error_p = false;
10195 /* Avoid the AL settings for the Unix64 ABI. */
10196 if (mode == VOIDmode)
10197 return constm1_rtx;
10199 if (TARGET_IAMCU)
10201 /* Intel MCU psABI passes scalars and aggregates no larger than 8
10202 bytes in registers. */
10203 if (!VECTOR_MODE_P (mode) && bytes <= 8)
10204 goto pass_in_reg;
10205 return NULL_RTX;
10208 switch (mode)
10210 default:
10211 break;
10213 case BLKmode:
10214 if (bytes < 0)
10215 break;
10216 /* FALLTHRU */
10217 case DImode:
10218 case SImode:
10219 case HImode:
10220 case QImode:
10221 pass_in_reg:
10222 if (words <= cum->nregs)
10224 int regno = cum->regno;
10226 /* Fastcall allocates the first two DWORD (SImode) or
10227 smaller arguments to ECX and EDX if they aren't
10228 aggregate types. */
10229 if (cum->fastcall)
10231 if (mode == BLKmode
10232 || mode == DImode
10233 || (type && AGGREGATE_TYPE_P (type)))
10234 break;
10236 /* ECX, not EAX, is the first allocated register. */
10237 if (regno == AX_REG)
10238 regno = CX_REG;
10240 return gen_rtx_REG (mode, regno);
10242 break;
10244 case DFmode:
10245 if (cum->float_in_sse == -1)
10246 error_p = true;
10247 if (cum->float_in_sse < 2)
10248 break;
10249 /* FALLTHRU */
10250 case SFmode:
10251 if (cum->float_in_sse == -1)
10252 error_p = true;
10253 if (cum->float_in_sse < 1)
10254 break;
10255 /* FALLTHRU */
10256 case TImode:
10257 /* In 32bit, we pass TImode in xmm registers. */
10258 case V16QImode:
10259 case V8HImode:
10260 case V4SImode:
10261 case V2DImode:
10262 case V4SFmode:
10263 case V2DFmode:
10264 if (!type || !AGGREGATE_TYPE_P (type))
10266 if (cum->sse_nregs)
10267 return gen_reg_or_parallel (mode, orig_mode,
10268 cum->sse_regno + FIRST_SSE_REG);
10270 break;
10272 case OImode:
10273 case XImode:
10274 /* OImode and XImode shouldn't be used directly. */
10275 gcc_unreachable ();
10277 case V64QImode:
10278 case V32HImode:
10279 case V16SImode:
10280 case V8DImode:
10281 case V16SFmode:
10282 case V8DFmode:
10283 case V8SFmode:
10284 case V8SImode:
10285 case V32QImode:
10286 case V16HImode:
10287 case V4DFmode:
10288 case V4DImode:
10289 if (!type || !AGGREGATE_TYPE_P (type))
10291 if (cum->sse_nregs)
10292 return gen_reg_or_parallel (mode, orig_mode,
10293 cum->sse_regno + FIRST_SSE_REG);
10295 break;
10297 case V8QImode:
10298 case V4HImode:
10299 case V2SImode:
10300 case V2SFmode:
10301 case V1TImode:
10302 case V1DImode:
10303 if (!type || !AGGREGATE_TYPE_P (type))
10305 if (cum->mmx_nregs)
10306 return gen_reg_or_parallel (mode, orig_mode,
10307 cum->mmx_regno + FIRST_MMX_REG);
10309 break;
10311 if (error_p)
10313 cum->float_in_sse = 0;
10314 error ("calling %qD with SSE calling convention without "
10315 "SSE/SSE2 enabled", cum->decl);
10316 sorry ("this is a GCC bug that can be worked around by adding "
10317 "attribute used to function called");
10320 return NULL_RTX;
10323 static rtx
10324 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
10325 machine_mode orig_mode, const_tree type, bool named)
10327 /* Handle a hidden AL argument containing number of registers
10328 for varargs x86-64 functions. */
10329 if (mode == VOIDmode)
10330 return GEN_INT (cum->maybe_vaarg
10331 ? (cum->sse_nregs < 0
10332 ? X86_64_SSE_REGPARM_MAX
10333 : cum->sse_regno)
10334 : -1);
10336 switch (mode)
10338 default:
10339 break;
10341 case V8SFmode:
10342 case V8SImode:
10343 case V32QImode:
10344 case V16HImode:
10345 case V4DFmode:
10346 case V4DImode:
10347 case V16SFmode:
10348 case V16SImode:
10349 case V64QImode:
10350 case V32HImode:
10351 case V8DFmode:
10352 case V8DImode:
10353 /* Unnamed 256-bit and 512-bit vector mode parameters are passed on the stack. */
10354 if (!named)
10355 return NULL;
10356 break;
10359 return construct_container (mode, orig_mode, type, 0, cum->nregs,
10360 cum->sse_nregs,
10361 &x86_64_int_parameter_registers [cum->regno],
10362 cum->sse_regno);
10365 static rtx
10366 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
10367 machine_mode orig_mode, bool named,
10368 HOST_WIDE_INT bytes)
10370 unsigned int regno;
10372 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
10373 We use a value of -2 to specify that the current function call is MS ABI. */
10374 if (mode == VOIDmode)
10375 return GEN_INT (-2);
10377 /* If we've run out of registers, it goes on the stack. */
10378 if (cum->nregs == 0)
10379 return NULL_RTX;
10381 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
10383 /* Only floating point modes are passed in anything but integer regs. */
10384 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
10386 if (named)
10387 regno = cum->regno + FIRST_SSE_REG;
10388 else
10390 rtx t1, t2;
10392 /* Unnamed floating parameters are passed in both the
10393 SSE and integer registers. */
10394 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
10395 t2 = gen_rtx_REG (mode, regno);
10396 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
10397 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
10398 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
10401 /* Handle aggregate types passed in registers. */
10402 if (orig_mode == BLKmode)
10404 if (bytes > 0 && bytes <= 8)
10405 mode = (bytes > 4 ? DImode : SImode);
10406 if (mode == BLKmode)
10407 mode = DImode;
10410 return gen_reg_or_parallel (mode, orig_mode, regno);
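/* Illustration: for a hypothetical 64-bit MS-ABI call
	f (1, 2.0, 3);
   the arguments go in %ecx, %xmm1 and %r8d; a double passed to a
   variadic prototype is instead duplicated into both %xmm1 and %rdx by
   the PARALLEL built above.  */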
10413 /* Return where to put the arguments to a function.
10414 Return zero to push the argument on the stack, or a hard register in which to store the argument.
10416 MODE is the argument's machine mode. TYPE is the data type of the
10417 argument. It is null for libcalls where that information may not be
10418 available. CUM gives information about the preceding args and about
10419 the function being called. NAMED is nonzero if this argument is a
10420 named parameter (otherwise it is an extra parameter matching an
10421 ellipsis). */
10423 static rtx
10424 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
10425 const_tree type, bool named)
10427 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10428 machine_mode mode = omode;
10429 HOST_WIDE_INT bytes, words;
10430 rtx arg;
10432 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
10434 gcc_assert (type != NULL_TREE);
10435 if (POINTER_TYPE_P (type))
10437 /* This is the pointer argument. */
10438 gcc_assert (TYPE_MODE (type) == Pmode);
10439 /* It is at -WORD(AP) in the current frame in interrupt and
10440 exception handlers. */
10441 arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD);
10443 else
10445 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
10446 && TREE_CODE (type) == INTEGER_TYPE
10447 && TYPE_MODE (type) == word_mode);
10448 /* The error code is the word-mode integer argument at
10449 -2 * WORD(AP) in the current frame of the exception
10450 handler. */
10451 arg = gen_rtx_MEM (word_mode,
10452 plus_constant (Pmode,
10453 arg_pointer_rtx,
10454 -2 * UNITS_PER_WORD));
10456 return arg;
10459 /* All pointer bounds arguments are handled separately here. */
10460 if ((type && POINTER_BOUNDS_TYPE_P (type))
10461 || POINTER_BOUNDS_MODE_P (mode))
10463 /* Return NULL if bounds are forced to go in Bounds Table. */
10464 if (cum->bnds_in_bt)
10465 arg = NULL;
10466 /* Return the next available bound reg if any. */
10467 else if (cum->bnd_regno <= LAST_BND_REG)
10468 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
10469 /* Return the next special slot number otherwise. */
10470 else
10471 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
10473 return arg;
10476 if (mode == BLKmode)
10477 bytes = int_size_in_bytes (type);
10478 else
10479 bytes = GET_MODE_SIZE (mode);
10480 words = CEIL (bytes, UNITS_PER_WORD);
10482 /* To simplify the code below, represent vector types with a vector mode
10483 even if MMX/SSE are not active. */
10484 if (type && TREE_CODE (type) == VECTOR_TYPE)
10485 mode = type_natural_mode (type, cum, false);
10487 if (TARGET_64BIT)
10489 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10491 if (call_abi == MS_ABI)
10492 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
10493 else
10494 arg = function_arg_64 (cum, mode, omode, type, named);
10496 else
10497 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
10499 /* Track if there are outgoing arguments on stack. */
10500 if (arg == NULL_RTX && cum->caller)
10501 cfun->machine->outgoing_args_on_stack = true;
10503 return arg;
10506 /* A C expression that indicates when an argument must be passed by
10507 reference. If nonzero for an argument, a copy of that argument is
10508 made in memory and a pointer to the argument is passed instead of
10509 the argument itself. The pointer is passed in whatever way is
10510 appropriate for passing a pointer to that type. */
10512 static bool
10513 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
10514 const_tree type, bool)
10516 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10518 /* Bounds are never passed by reference. */
10519 if ((type && POINTER_BOUNDS_TYPE_P (type))
10520 || POINTER_BOUNDS_MODE_P (mode))
10521 return false;
10523 if (TARGET_64BIT)
10525 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10527 /* See Windows x64 Software Convention. */
10528 if (call_abi == MS_ABI)
10530 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
10532 if (type)
10534 /* Arrays are passed by reference. */
10535 if (TREE_CODE (type) == ARRAY_TYPE)
10536 return true;
10538 if (RECORD_OR_UNION_TYPE_P (type))
10540 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
10541 are passed by reference. */
10542 msize = int_size_in_bytes (type);
10546 /* __m128 is passed by reference. */
10547 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
10549 else if (type && int_size_in_bytes (type) == -1)
10550 return true;
10553 return false;
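/* Illustrative sketch (not part of the ABI logic above): under the MS x64
   convention as implemented here,

     struct s4  { int a; };        // 4 bytes  -> passed by value
     struct s12 { int a, b, c; };  // 12 bytes -> passed by reference
     typedef float v4sf __attribute__ ((vector_size (16)));
     v4sf v;                       // 16 bytes (__m128) -> passed by reference
     int arr[3];                   // array     -> passed by reference

   while on the SysV 64-bit side only variable-sized types take the
   by-reference path; large aggregates are simply passed on the stack.  */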
10556 /* Return true when TYPE should be 128bit aligned for 32bit argument
10557 passing ABI. XXX: This function is obsolete and is only used for
10558 checking psABI compatibility with previous versions of GCC. */
10560 static bool
10561 ix86_compat_aligned_value_p (const_tree type)
10563 machine_mode mode = TYPE_MODE (type);
10564 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
10565 || mode == TDmode
10566 || mode == TFmode
10567 || mode == TCmode)
10568 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
10569 return true;
10570 if (TYPE_ALIGN (type) < 128)
10571 return false;
10573 if (AGGREGATE_TYPE_P (type))
10575 /* Walk the aggregates recursively. */
10576 switch (TREE_CODE (type))
10578 case RECORD_TYPE:
10579 case UNION_TYPE:
10580 case QUAL_UNION_TYPE:
10582 tree field;
10584 /* Walk all the structure fields. */
10585 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
10587 if (TREE_CODE (field) == FIELD_DECL
10588 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
10589 return true;
10591 break;
10594 case ARRAY_TYPE:
10595 /* Just for use if some languages pass arrays by value. */
10596 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
10597 return true;
10598 break;
10600 default:
10601 gcc_unreachable ();
10604 return false;
10607 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
10608 XXX: This function is obsolete and is only used for checking psABI
10609 compatibility with previous versions of GCC. */
10611 static unsigned int
10612 ix86_compat_function_arg_boundary (machine_mode mode,
10613 const_tree type, unsigned int align)
10615 /* In 32bit, only _Decimal128 and __float128 are aligned to their
10616 natural boundaries. */
10617 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
10619 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
10620 make an exception for SSE modes since these require 128bit
10621 alignment.
10623 The handling here differs from field_alignment. ICC aligns MMX
10624 arguments to 4 byte boundaries, while structure fields are aligned
10625 to 8 byte boundaries. */
10626 if (!type)
10628 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
10629 align = PARM_BOUNDARY;
10631 else
10633 if (!ix86_compat_aligned_value_p (type))
10634 align = PARM_BOUNDARY;
10637 if (align > BIGGEST_ALIGNMENT)
10638 align = BIGGEST_ALIGNMENT;
10639 return align;
10642 /* Return true when TYPE should be 128bit aligned for 32bit argument
10643 passing ABI. */
10645 static bool
10646 ix86_contains_aligned_value_p (const_tree type)
10648 machine_mode mode = TYPE_MODE (type);
10650 if (mode == XFmode || mode == XCmode)
10651 return false;
10653 if (TYPE_ALIGN (type) < 128)
10654 return false;
10656 if (AGGREGATE_TYPE_P (type))
10658 /* Walk the aggregates recursively. */
10659 switch (TREE_CODE (type))
10661 case RECORD_TYPE:
10662 case UNION_TYPE:
10663 case QUAL_UNION_TYPE:
10665 tree field;
10667 /* Walk all the structure fields. */
10668 for (field = TYPE_FIELDS (type);
10669 field;
10670 field = DECL_CHAIN (field))
10672 if (TREE_CODE (field) == FIELD_DECL
10673 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
10674 return true;
10676 break;
10679 case ARRAY_TYPE:
10680 /* Just for use if some languages pass arrays by value. */
10681 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
10682 return true;
10683 break;
10685 default:
10686 gcc_unreachable ();
10689 else
10690 return TYPE_ALIGN (type) >= 128;
10692 return false;
10695 /* Gives the alignment boundary, in bits, of an argument with the
10696 specified mode and type. */
10698 static unsigned int
10699 ix86_function_arg_boundary (machine_mode mode, const_tree type)
10701 unsigned int align;
10702 if (type)
10704 /* Since the main variant type is used for the call, convert TYPE
10705 to its main variant. */
10706 type = TYPE_MAIN_VARIANT (type);
10707 align = TYPE_ALIGN (type);
10709 else
10710 align = GET_MODE_ALIGNMENT (mode);
10711 if (align < PARM_BOUNDARY)
10712 align = PARM_BOUNDARY;
10713 else
10715 static bool warned;
10716 unsigned int saved_align = align;
10718 if (!TARGET_64BIT)
10720 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
10721 if (!type)
10723 if (mode == XFmode || mode == XCmode)
10724 align = PARM_BOUNDARY;
10726 else if (!ix86_contains_aligned_value_p (type))
10727 align = PARM_BOUNDARY;
10729 if (align < 128)
10730 align = PARM_BOUNDARY;
10733 if (warn_psabi
10734 && !warned
10735 && align != ix86_compat_function_arg_boundary (mode, type,
10736 saved_align))
10738 warned = true;
10739 inform (input_location,
10740 "The ABI for passing parameters with %d-byte"
10741 " alignment has changed in GCC 4.6",
10742 align / BITS_PER_UNIT);
10746 return align;
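/* A rough illustration of the 32-bit result (assuming default options): a
   plain "double" argument keeps PARM_BOUNDARY (32-bit) alignment, a
   "long double" (XFmode) argument is likewise 4-byte aligned, while an SSE
   vector argument such as __m128 is aligned to 128 bits.  The -Wpsabi note
   above fires when this differs from the pre-GCC 4.6 behaviour computed by
   ix86_compat_function_arg_boundary.  */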
10749 /* Return true if N is a possible register number of function value. */
10751 static bool
10752 ix86_function_value_regno_p (const unsigned int regno)
10754 switch (regno)
10756 case AX_REG:
10757 return true;
10758 case DX_REG:
10759 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
10760 case DI_REG:
10761 case SI_REG:
10762 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
10764 case BND0_REG:
10765 case BND1_REG:
10766 return chkp_function_instrumented_p (current_function_decl);
10768 /* Complex values are returned in %st(0)/%st(1) pair. */
10769 case ST0_REG:
10770 case ST1_REG:
10771 /* TODO: The function should depend on current function ABI but
10772 builtins.c would need updating then. Therefore we use the
10773 default ABI. */
10774 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
10775 return false;
10776 return TARGET_FLOAT_RETURNS_IN_80387;
10778 /* Complex values are returned in %xmm0/%xmm1 pair. */
10779 case XMM0_REG:
10780 case XMM1_REG:
10781 return TARGET_SSE;
10783 case MM0_REG:
10784 if (TARGET_MACHO || TARGET_64BIT)
10785 return false;
10786 return TARGET_MMX;
10789 return false;
10792 /* Define how to find the value returned by a function.
10793 VALTYPE is the data type of the value (as a tree).
10794 If the precise function being called is known, FUNC is its FUNCTION_DECL;
10795 otherwise, FUNC is 0. */
10797 static rtx
10798 function_value_32 (machine_mode orig_mode, machine_mode mode,
10799 const_tree fntype, const_tree fn)
10801 unsigned int regno;
10803 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
10804 we normally prevent this case when mmx is not available. However
10805 some ABIs may require the result to be returned like DImode. */
10806 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
10807 regno = FIRST_MMX_REG;
10809 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
10810 we prevent this case when sse is not available. However some ABIs
10811 may require the result to be returned like integer TImode. */
10812 else if (mode == TImode
10813 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
10814 regno = FIRST_SSE_REG;
10816 /* 32-byte vector modes in %ymm0. */
10817 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
10818 regno = FIRST_SSE_REG;
10820 /* 64-byte vector modes in %zmm0. */
10821 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
10822 regno = FIRST_SSE_REG;
10824 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
10825 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
10826 regno = FIRST_FLOAT_REG;
10827 else
10828 /* Most things go in %eax. */
10829 regno = AX_REG;
10831 /* Override FP return register with %xmm0 for local functions when
10832 SSE math is enabled or for functions with sseregparm attribute. */
10833 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
10835 int sse_level = ix86_function_sseregparm (fntype, fn, false);
10836 if (sse_level == -1)
10838 error ("calling %qD with SSE calling convention without "
10839 "SSE/SSE2 enabled", fn);
10840 sorry ("this is a GCC bug that can be worked around by adding "
10841 "the used attribute to the called function");
10843 else if ((sse_level >= 1 && mode == SFmode)
10844 || (sse_level == 2 && mode == DFmode))
10845 regno = FIRST_SSE_REG;
10848 /* OImode shouldn't be used directly. */
10849 gcc_assert (mode != OImode);
10851 return gen_rtx_REG (orig_mode, regno);
10854 static rtx
10855 function_value_64 (machine_mode orig_mode, machine_mode mode,
10856 const_tree valtype)
10858 rtx ret;
10860 /* Handle libcalls, which don't provide a type node. */
10861 if (valtype == NULL)
10863 unsigned int regno;
10865 switch (mode)
10867 case SFmode:
10868 case SCmode:
10869 case DFmode:
10870 case DCmode:
10871 case TFmode:
10872 case SDmode:
10873 case DDmode:
10874 case TDmode:
10875 regno = FIRST_SSE_REG;
10876 break;
10877 case XFmode:
10878 case XCmode:
10879 regno = FIRST_FLOAT_REG;
10880 break;
10881 case TCmode:
10882 return NULL;
10883 default:
10884 regno = AX_REG;
10887 return gen_rtx_REG (mode, regno);
10889 else if (POINTER_TYPE_P (valtype))
10891 /* Pointers are always returned in word_mode. */
10892 mode = word_mode;
10895 ret = construct_container (mode, orig_mode, valtype, 1,
10896 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
10897 x86_64_int_return_registers, 0);
10899 /* For zero-sized structures, construct_container returns NULL, but we
10900 need to keep the rest of the compiler happy by returning a meaningful value. */
10901 if (!ret)
10902 ret = gen_rtx_REG (orig_mode, AX_REG);
10904 return ret;
10907 static rtx
10908 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
10909 const_tree valtype)
10911 unsigned int regno = AX_REG;
10913 if (TARGET_SSE)
10915 switch (GET_MODE_SIZE (mode))
10917 case 16:
10918 if (valtype != NULL_TREE
10919 && !VECTOR_INTEGER_TYPE_P (valtype)
10921 && !INTEGRAL_TYPE_P (valtype)
10922 && !VECTOR_FLOAT_TYPE_P (valtype))
10923 break;
10924 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
10925 && !COMPLEX_MODE_P (mode))
10926 regno = FIRST_SSE_REG;
10927 break;
10928 case 8:
10929 case 4:
10930 if (mode == SFmode || mode == DFmode)
10931 regno = FIRST_SSE_REG;
10932 break;
10933 default:
10934 break;
10937 return gen_rtx_REG (orig_mode, regno);
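/* For illustration (a sketch of the cases above, assuming TARGET_SSE):
   "float" and "double" values come back in %xmm0, 16-byte vector values
   such as __m128 come back in %xmm0 as well, and other scalars (integers,
   pointers) come back in %rax.  */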
10940 static rtx
10941 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
10942 machine_mode orig_mode, machine_mode mode)
10944 const_tree fn, fntype;
10946 fn = NULL_TREE;
10947 if (fntype_or_decl && DECL_P (fntype_or_decl))
10948 fn = fntype_or_decl;
10949 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
10951 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
10952 || POINTER_BOUNDS_MODE_P (mode))
10953 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
10954 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
10955 return function_value_ms_64 (orig_mode, mode, valtype);
10956 else if (TARGET_64BIT)
10957 return function_value_64 (orig_mode, mode, valtype);
10958 else
10959 return function_value_32 (orig_mode, mode, fntype, fn);
10962 static rtx
10963 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
10965 machine_mode mode, orig_mode;
10967 orig_mode = TYPE_MODE (valtype);
10968 mode = type_natural_mode (valtype, NULL, true);
10969 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
10972 /* Return an RTX representing a place where a function returns
10973 or receives pointer bounds, or NULL if no bounds are returned.
10975 VALTYPE is a data type of a value returned by the function.
10977 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
10978 or FUNCTION_TYPE of the function.
10980 If OUTGOING is false, return a place in which the caller will
10981 see the return value. Otherwise, return a place where a
10982 function returns a value. */
10984 static rtx
10985 ix86_function_value_bounds (const_tree valtype,
10986 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
10987 bool outgoing ATTRIBUTE_UNUSED)
10989 rtx res = NULL_RTX;
10991 if (BOUNDED_TYPE_P (valtype))
10992 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
10993 else if (chkp_type_has_pointer (valtype))
10995 bitmap slots;
10996 rtx bounds[2];
10997 bitmap_iterator bi;
10998 unsigned i, bnd_no = 0;
11000 bitmap_obstack_initialize (NULL);
11001 slots = BITMAP_ALLOC (NULL);
11002 chkp_find_bound_slots (valtype, slots);
11004 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
11006 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
11007 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
11008 gcc_assert (bnd_no < 2);
11009 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
11012 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
11014 BITMAP_FREE (slots);
11015 bitmap_obstack_release (NULL);
11017 else
11018 res = NULL_RTX;
11020 return res;
11023 /* Pointer function arguments and return values are promoted to
11024 word_mode for normal functions. */
11026 static machine_mode
11027 ix86_promote_function_mode (const_tree type, machine_mode mode,
11028 int *punsignedp, const_tree fntype,
11029 int for_return)
11031 if (cfun->machine->func_type == TYPE_NORMAL
11032 && type != NULL_TREE
11033 && POINTER_TYPE_P (type))
11035 *punsignedp = POINTERS_EXTEND_UNSIGNED;
11036 return word_mode;
11038 return default_promote_function_mode (type, mode, punsignedp, fntype,
11039 for_return);
11042 /* Return true if a structure, union or array with MODE containing FIELD
11043 should be accessed using BLKmode. */
11045 static bool
11046 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
11048 /* Union with XFmode must be in BLKmode. */
11049 return (mode == XFmode
11050 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
11051 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
11055 ix86_libcall_value (machine_mode mode)
11057 return ix86_function_value_1 (NULL, NULL, mode, mode);
11060 /* Return true iff type is returned in memory. */
11062 static bool
11063 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
11065 #ifdef SUBTARGET_RETURN_IN_MEMORY
11066 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
11067 #else
11068 const machine_mode mode = type_natural_mode (type, NULL, true);
11069 HOST_WIDE_INT size;
11071 if (POINTER_BOUNDS_TYPE_P (type))
11072 return false;
11074 if (TARGET_64BIT)
11076 if (ix86_function_type_abi (fntype) == MS_ABI)
11078 size = int_size_in_bytes (type);
11080 /* __m128 is returned in xmm0. */
11081 if ((!type || VECTOR_INTEGER_TYPE_P (type)
11082 || INTEGRAL_TYPE_P (type)
11083 || VECTOR_FLOAT_TYPE_P (type))
11084 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
11085 && !COMPLEX_MODE_P (mode)
11086 && (GET_MODE_SIZE (mode) == 16 || size == 16))
11087 return false;
11089 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
11090 return size != 1 && size != 2 && size != 4 && size != 8;
11092 else
11094 int needed_intregs, needed_sseregs;
11096 return examine_argument (mode, type, 1,
11097 &needed_intregs, &needed_sseregs);
11100 else
11102 size = int_size_in_bytes (type);
11104 /* Intel MCU psABI returns scalars and aggregates no larger than 8
11105 bytes in registers. */
11106 if (TARGET_IAMCU)
11107 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
11109 if (mode == BLKmode)
11110 return true;
11112 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
11113 return false;
11115 if (VECTOR_MODE_P (mode) || mode == TImode)
11117 /* User-created vectors small enough to fit in EAX. */
11118 if (size < 8)
11119 return false;
11121 /* Unless the ABI prescribes otherwise,
11122 MMX/3dNow values are returned in MM0 if available. */
11124 if (size == 8)
11125 return TARGET_VECT8_RETURNS || !TARGET_MMX;
11127 /* SSE values are returned in XMM0 if available. */
11128 if (size == 16)
11129 return !TARGET_SSE;
11131 /* AVX values are returned in YMM0 if available. */
11132 if (size == 32)
11133 return !TARGET_AVX;
11135 /* AVX512F values are returned in ZMM0 if available. */
11136 if (size == 64)
11137 return !TARGET_AVX512F;
11140 if (mode == XFmode)
11141 return false;
11143 if (size > 12)
11144 return true;
11146 /* OImode shouldn't be used directly. */
11147 gcc_assert (mode != OImode);
11149 return false;
11151 #endif
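/* Rough examples of the above (defaults, no SUBTARGET override): on ia32 a
   16-byte struct is returned in memory, an 8-byte vector is returned in
   %mm0 only when MMX is enabled and the ABI does not prescribe otherwise,
   and -miamcu returns non-vector scalars and aggregates of at most 8 bytes
   in registers.  Under the 64-bit MS ABI only values whose size is 1, 2,
   4 or 8 bytes, or 16-byte vector/integer types, avoid the memory return.  */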
11155 /* Create the va_list data type. */
11157 static tree
11158 ix86_build_builtin_va_list_64 (void)
11160 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
11162 record = lang_hooks.types.make_type (RECORD_TYPE);
11163 type_decl = build_decl (BUILTINS_LOCATION,
11164 TYPE_DECL, get_identifier ("__va_list_tag"), record);
11166 f_gpr = build_decl (BUILTINS_LOCATION,
11167 FIELD_DECL, get_identifier ("gp_offset"),
11168 unsigned_type_node);
11169 f_fpr = build_decl (BUILTINS_LOCATION,
11170 FIELD_DECL, get_identifier ("fp_offset"),
11171 unsigned_type_node);
11172 f_ovf = build_decl (BUILTINS_LOCATION,
11173 FIELD_DECL, get_identifier ("overflow_arg_area"),
11174 ptr_type_node);
11175 f_sav = build_decl (BUILTINS_LOCATION,
11176 FIELD_DECL, get_identifier ("reg_save_area"),
11177 ptr_type_node);
11179 va_list_gpr_counter_field = f_gpr;
11180 va_list_fpr_counter_field = f_fpr;
11182 DECL_FIELD_CONTEXT (f_gpr) = record;
11183 DECL_FIELD_CONTEXT (f_fpr) = record;
11184 DECL_FIELD_CONTEXT (f_ovf) = record;
11185 DECL_FIELD_CONTEXT (f_sav) = record;
11187 TYPE_STUB_DECL (record) = type_decl;
11188 TYPE_NAME (record) = type_decl;
11189 TYPE_FIELDS (record) = f_gpr;
11190 DECL_CHAIN (f_gpr) = f_fpr;
11191 DECL_CHAIN (f_fpr) = f_ovf;
11192 DECL_CHAIN (f_ovf) = f_sav;
11194 layout_type (record);
11196 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
11197 NULL_TREE, TYPE_ATTRIBUTES (record));
11199 /* The correct type is an array type of one element. */
11200 return build_array_type (record, build_index_type (size_zero_node));
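/* For reference, the record built above corresponds roughly to the psABI
   declaration (a sketch, not emitted verbatim by the compiler):

     typedef struct __va_list_tag {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __va_list_tag;
     typedef __va_list_tag va_list[1];
*/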
11203 /* Setup the builtin va_list data type and for 64-bit the additional
11204 calling convention specific va_list data types. */
11206 static tree
11207 ix86_build_builtin_va_list (void)
11209 if (TARGET_64BIT)
11211 /* Initialize ABI specific va_list builtin types.
11213 In lto1, we can encounter two va_list types:
11214 - one as a result of the type-merge across TUs, and
11215 - the one constructed here.
11216 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
11217 a type identity check in canonical_va_list_type based on
11218 TYPE_MAIN_VARIANT (which we used to have) will not work.
11219 Instead, we tag each va_list_type_node with its unique attribute, and
11220 look for the attribute in the type identity check in
11221 canonical_va_list_type.
11223 Tagging sysv_va_list_type_node directly with the attribute is
11224 problematic since it's an array of one record, which will degrade into a
11225 pointer to record when used as parameter (see build_va_arg comments for
11226 an example), dropping the attribute in the process. So we tag the
11227 record instead. */
11229 /* For SYSV_ABI we use an array of one record. */
11230 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
11232 /* For MS_ABI we use plain pointer to argument area. */
11233 tree char_ptr_type = build_pointer_type (char_type_node);
11234 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
11235 TYPE_ATTRIBUTES (char_ptr_type));
11236 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
11238 return ((ix86_abi == MS_ABI)
11239 ? ms_va_list_type_node
11240 : sysv_va_list_type_node);
11242 else
11244 /* For i386 we use plain pointer to argument area. */
11245 return build_pointer_type (char_type_node);
11249 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
11251 static void
11252 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
11254 rtx save_area, mem;
11255 alias_set_type set;
11256 int i, max;
11258 /* GPR size of varargs save area. */
11259 if (cfun->va_list_gpr_size)
11260 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
11261 else
11262 ix86_varargs_gpr_size = 0;
11264 /* FPR size of varargs save area. We don't need it if we don't pass
11265 anything in SSE registers. */
11266 if (TARGET_SSE && cfun->va_list_fpr_size)
11267 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
11268 else
11269 ix86_varargs_fpr_size = 0;
11271 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
11272 return;
11274 save_area = frame_pointer_rtx;
11275 set = get_varargs_alias_set ();
11277 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
11278 if (max > X86_64_REGPARM_MAX)
11279 max = X86_64_REGPARM_MAX;
11281 for (i = cum->regno; i < max; i++)
11283 mem = gen_rtx_MEM (word_mode,
11284 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
11285 MEM_NOTRAP_P (mem) = 1;
11286 set_mem_alias_set (mem, set);
11287 emit_move_insn (mem,
11288 gen_rtx_REG (word_mode,
11289 x86_64_int_parameter_registers[i]));
11292 if (ix86_varargs_fpr_size)
11294 machine_mode smode;
11295 rtx_code_label *label;
11296 rtx test;
11298 /* Now emit code to save SSE registers. The AX parameter contains the number
11299 of SSE parameter registers used to call this function, though all we
11300 actually check here is the zero/non-zero status. */
11302 label = gen_label_rtx ();
11303 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
11304 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
11305 label));
11307 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
11308 we used movdqa (i.e. TImode) instead? Perhaps even better would
11309 be if we could determine the real mode of the data, via a hook
11310 into pass_stdarg. Ignore all that for now. */
11311 smode = V4SFmode;
11312 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
11313 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
11315 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
11316 if (max > X86_64_SSE_REGPARM_MAX)
11317 max = X86_64_SSE_REGPARM_MAX;
11319 for (i = cum->sse_regno; i < max; ++i)
11321 mem = plus_constant (Pmode, save_area,
11322 i * 16 + ix86_varargs_gpr_size);
11323 mem = gen_rtx_MEM (smode, mem);
11324 MEM_NOTRAP_P (mem) = 1;
11325 set_mem_alias_set (mem, set);
11326 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
11328 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
11331 emit_label (label);
11335 static void
11336 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
11338 alias_set_type set = get_varargs_alias_set ();
11339 int i;
11341 /* Reset to zero, as a sysv va_arg may have been used
11342 before. */
11343 ix86_varargs_gpr_size = 0;
11344 ix86_varargs_fpr_size = 0;
11346 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
11348 rtx reg, mem;
11350 mem = gen_rtx_MEM (Pmode,
11351 plus_constant (Pmode, virtual_incoming_args_rtx,
11352 i * UNITS_PER_WORD));
11353 MEM_NOTRAP_P (mem) = 1;
11354 set_mem_alias_set (mem, set);
11356 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
11357 emit_move_insn (mem, reg);
11361 static void
11362 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
11363 tree type, int *, int no_rtl)
11365 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
11366 CUMULATIVE_ARGS next_cum;
11367 tree fntype;
11369 /* This argument doesn't appear to be used anymore. Which is good,
11370 because the old code here didn't suppress rtl generation. */
11371 gcc_assert (!no_rtl);
11373 if (!TARGET_64BIT)
11374 return;
11376 fntype = TREE_TYPE (current_function_decl);
11378 /* For varargs, we do not want to skip the dummy va_dcl argument.
11379 For stdargs, we do want to skip the last named argument. */
11380 next_cum = *cum;
11381 if (stdarg_p (fntype))
11382 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
11383 true);
11385 if (cum->call_abi == MS_ABI)
11386 setup_incoming_varargs_ms_64 (&next_cum);
11387 else
11388 setup_incoming_varargs_64 (&next_cum);
11391 static void
11392 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
11393 machine_mode mode,
11394 tree type,
11395 int *pretend_size ATTRIBUTE_UNUSED,
11396 int no_rtl)
11398 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
11399 CUMULATIVE_ARGS next_cum;
11400 tree fntype;
11401 rtx save_area;
11402 int bnd_reg, i, max;
11404 gcc_assert (!no_rtl);
11406 /* Do nothing if we use plain pointer to argument area. */
11407 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
11408 return;
11410 fntype = TREE_TYPE (current_function_decl);
11412 /* For varargs, we do not want to skip the dummy va_dcl argument.
11413 For stdargs, we do want to skip the last named argument. */
11414 next_cum = *cum;
11415 if (stdarg_p (fntype))
11416 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
11417 true);
11418 save_area = frame_pointer_rtx;
11420 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
11421 if (max > X86_64_REGPARM_MAX)
11422 max = X86_64_REGPARM_MAX;
11424 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
11425 if (chkp_function_instrumented_p (current_function_decl))
11426 for (i = cum->regno; i < max; i++)
11428 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
11429 rtx ptr = gen_rtx_REG (Pmode,
11430 x86_64_int_parameter_registers[i]);
11431 rtx bounds;
11433 if (bnd_reg <= LAST_BND_REG)
11434 bounds = gen_rtx_REG (BNDmode, bnd_reg);
11435 else
11437 rtx ldx_addr =
11438 plus_constant (Pmode, arg_pointer_rtx,
11439 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
11440 bounds = gen_reg_rtx (BNDmode);
11441 emit_insn (BNDmode == BND64mode
11442 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
11443 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
11446 emit_insn (BNDmode == BND64mode
11447 ? gen_bnd64_stx (addr, ptr, bounds)
11448 : gen_bnd32_stx (addr, ptr, bounds));
11450 bnd_reg++;
11455 /* Checks if TYPE is of kind va_list char *. */
11457 static bool
11458 is_va_list_char_pointer (tree type)
11460 tree canonic;
11462 /* For 32-bit it is always true. */
11463 if (!TARGET_64BIT)
11464 return true;
11465 canonic = ix86_canonical_va_list_type (type);
11466 return (canonic == ms_va_list_type_node
11467 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
11470 /* Implement va_start. */
11472 static void
11473 ix86_va_start (tree valist, rtx nextarg)
11475 HOST_WIDE_INT words, n_gpr, n_fpr;
11476 tree f_gpr, f_fpr, f_ovf, f_sav;
11477 tree gpr, fpr, ovf, sav, t;
11478 tree type;
11479 rtx ovf_rtx;
11481 if (flag_split_stack
11482 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11484 unsigned int scratch_regno;
11486 /* When we are splitting the stack, we can't refer to the stack
11487 arguments using internal_arg_pointer, because they may be on
11488 the old stack. The split stack prologue will arrange to
11489 leave a pointer to the old stack arguments in a scratch
11490 register, which we here copy to a pseudo-register. The split
11491 stack prologue can't set the pseudo-register directly because
11492 it (the prologue) runs before any registers have been saved. */
11494 scratch_regno = split_stack_prologue_scratch_regno ();
11495 if (scratch_regno != INVALID_REGNUM)
11497 rtx reg;
11498 rtx_insn *seq;
11500 reg = gen_reg_rtx (Pmode);
11501 cfun->machine->split_stack_varargs_pointer = reg;
11503 start_sequence ();
11504 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
11505 seq = get_insns ();
11506 end_sequence ();
11508 push_topmost_sequence ();
11509 emit_insn_after (seq, entry_of_function ());
11510 pop_topmost_sequence ();
11514 /* Only 64bit target needs something special. */
11515 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11517 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11518 std_expand_builtin_va_start (valist, nextarg);
11519 else
11521 rtx va_r, next;
11523 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
11524 next = expand_binop (ptr_mode, add_optab,
11525 cfun->machine->split_stack_varargs_pointer,
11526 crtl->args.arg_offset_rtx,
11527 NULL_RTX, 0, OPTAB_LIB_WIDEN);
11528 convert_move (va_r, next, 0);
11530 /* Store zero bounds for va_list. */
11531 if (chkp_function_instrumented_p (current_function_decl))
11532 chkp_expand_bounds_reset_for_mem (valist,
11533 make_tree (TREE_TYPE (valist),
11534 next));
11537 return;
11540 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11541 f_fpr = DECL_CHAIN (f_gpr);
11542 f_ovf = DECL_CHAIN (f_fpr);
11543 f_sav = DECL_CHAIN (f_ovf);
11545 valist = build_simple_mem_ref (valist);
11546 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
11547 /* The following should be folded into the MEM_REF offset. */
11548 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
11549 f_gpr, NULL_TREE);
11550 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
11551 f_fpr, NULL_TREE);
11552 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
11553 f_ovf, NULL_TREE);
11554 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
11555 f_sav, NULL_TREE);
11557 /* Count number of gp and fp argument registers used. */
11558 words = crtl->args.info.words;
11559 n_gpr = crtl->args.info.regno;
11560 n_fpr = crtl->args.info.sse_regno;
11562 if (cfun->va_list_gpr_size)
11564 type = TREE_TYPE (gpr);
11565 t = build2 (MODIFY_EXPR, type,
11566 gpr, build_int_cst (type, n_gpr * 8));
11567 TREE_SIDE_EFFECTS (t) = 1;
11568 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11571 if (TARGET_SSE && cfun->va_list_fpr_size)
11573 type = TREE_TYPE (fpr);
11574 t = build2 (MODIFY_EXPR, type, fpr,
11575 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
11576 TREE_SIDE_EFFECTS (t) = 1;
11577 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11580 /* Find the overflow area. */
11581 type = TREE_TYPE (ovf);
11582 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11583 ovf_rtx = crtl->args.internal_arg_pointer;
11584 else
11585 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
11586 t = make_tree (type, ovf_rtx);
11587 if (words != 0)
11588 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
11590 /* Store zero bounds for overflow area pointer. */
11591 if (chkp_function_instrumented_p (current_function_decl))
11592 chkp_expand_bounds_reset_for_mem (ovf, t);
11594 t = build2 (MODIFY_EXPR, type, ovf, t);
11595 TREE_SIDE_EFFECTS (t) = 1;
11596 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11598 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
11600 /* Find the register save area.
11601 The function prologue saves it right above the stack frame. */
11602 type = TREE_TYPE (sav);
11603 t = make_tree (type, frame_pointer_rtx);
11604 if (!ix86_varargs_gpr_size)
11605 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
11607 /* Store zero bounds for save area pointer. */
11608 if (chkp_function_instrumented_p (current_function_decl))
11609 chkp_expand_bounds_reset_for_mem (sav, t);
11611 t = build2 (MODIFY_EXPR, type, sav, t);
11612 TREE_SIDE_EFFECTS (t) = 1;
11613 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
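/* After expansion, va_start roughly initializes the sysv va_list like this
   (a sketch; X86_64_REGPARM_MAX is 6 and X86_64_SSE_REGPARM_MAX is 8):

     gp_offset         = <named GPR args used> * 8;        //  0 .. 48
     fp_offset         = 48 + <named SSE args used> * 16;  // 48 .. 176
     overflow_arg_area = <address of the first stack argument>;
     reg_save_area     = <base of the register save area>;
*/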
11617 /* Implement va_arg. */
11619 static tree
11620 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
11621 gimple_seq *post_p)
11623 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
11624 tree f_gpr, f_fpr, f_ovf, f_sav;
11625 tree gpr, fpr, ovf, sav, t;
11626 int size, rsize;
11627 tree lab_false, lab_over = NULL_TREE;
11628 tree addr, t2;
11629 rtx container;
11630 int indirect_p = 0;
11631 tree ptrtype;
11632 machine_mode nat_mode;
11633 unsigned int arg_boundary;
11635 /* Only 64bit target needs something special. */
11636 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11637 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
11639 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11640 f_fpr = DECL_CHAIN (f_gpr);
11641 f_ovf = DECL_CHAIN (f_fpr);
11642 f_sav = DECL_CHAIN (f_ovf);
11644 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
11645 valist, f_gpr, NULL_TREE);
11647 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
11648 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
11649 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
11651 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
11652 if (indirect_p)
11653 type = build_pointer_type (type);
11654 size = int_size_in_bytes (type);
11655 rsize = CEIL (size, UNITS_PER_WORD);
11657 nat_mode = type_natural_mode (type, NULL, false);
11658 switch (nat_mode)
11660 case V8SFmode:
11661 case V8SImode:
11662 case V32QImode:
11663 case V16HImode:
11664 case V4DFmode:
11665 case V4DImode:
11666 case V16SFmode:
11667 case V16SImode:
11668 case V64QImode:
11669 case V32HImode:
11670 case V8DFmode:
11671 case V8DImode:
11672 /* Unnamed 256-bit and 512-bit vector mode parameters are passed on the stack. */
11673 if (!TARGET_64BIT_MS_ABI)
11675 container = NULL;
11676 break;
11678 /* FALLTHRU */
11680 default:
11681 container = construct_container (nat_mode, TYPE_MODE (type),
11682 type, 0, X86_64_REGPARM_MAX,
11683 X86_64_SSE_REGPARM_MAX, intreg,
11685 break;
11688 /* Pull the value out of the saved registers. */
11690 addr = create_tmp_var (ptr_type_node, "addr");
11692 if (container)
11694 int needed_intregs, needed_sseregs;
11695 bool need_temp;
11696 tree int_addr, sse_addr;
11698 lab_false = create_artificial_label (UNKNOWN_LOCATION);
11699 lab_over = create_artificial_label (UNKNOWN_LOCATION);
11701 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
11703 need_temp = (!REG_P (container)
11704 && ((needed_intregs && TYPE_ALIGN (type) > 64)
11705 || TYPE_ALIGN (type) > 128));
11707 /* In case we are passing a structure, verify that it is a consecutive block
11708 in the register save area. If not, we need to do moves. */
11709 if (!need_temp && !REG_P (container))
11711 /* Verify that all registers are strictly consecutive */
11712 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
11714 int i;
11716 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11718 rtx slot = XVECEXP (container, 0, i);
11719 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
11720 || INTVAL (XEXP (slot, 1)) != i * 16)
11721 need_temp = true;
11724 else
11726 int i;
11728 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11730 rtx slot = XVECEXP (container, 0, i);
11731 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
11732 || INTVAL (XEXP (slot, 1)) != i * 8)
11733 need_temp = true;
11737 if (!need_temp)
11739 int_addr = addr;
11740 sse_addr = addr;
11742 else
11744 int_addr = create_tmp_var (ptr_type_node, "int_addr");
11745 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
11748 /* First ensure that we fit completely in registers. */
11749 if (needed_intregs)
11751 t = build_int_cst (TREE_TYPE (gpr),
11752 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
11753 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
11754 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11755 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11756 gimplify_and_add (t, pre_p);
11758 if (needed_sseregs)
11760 t = build_int_cst (TREE_TYPE (fpr),
11761 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
11762 + X86_64_REGPARM_MAX * 8);
11763 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
11764 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11765 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11766 gimplify_and_add (t, pre_p);
11769 /* Compute index to start of area used for integer regs. */
11770 if (needed_intregs)
11772 /* int_addr = gpr + sav; */
11773 t = fold_build_pointer_plus (sav, gpr);
11774 gimplify_assign (int_addr, t, pre_p);
11776 if (needed_sseregs)
11778 /* sse_addr = fpr + sav; */
11779 t = fold_build_pointer_plus (sav, fpr);
11780 gimplify_assign (sse_addr, t, pre_p);
11782 if (need_temp)
11784 int i, prev_size = 0;
11785 tree temp = create_tmp_var (type, "va_arg_tmp");
11787 /* addr = &temp; */
11788 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
11789 gimplify_assign (addr, t, pre_p);
11791 for (i = 0; i < XVECLEN (container, 0); i++)
11793 rtx slot = XVECEXP (container, 0, i);
11794 rtx reg = XEXP (slot, 0);
11795 machine_mode mode = GET_MODE (reg);
11796 tree piece_type;
11797 tree addr_type;
11798 tree daddr_type;
11799 tree src_addr, src;
11800 int src_offset;
11801 tree dest_addr, dest;
11802 int cur_size = GET_MODE_SIZE (mode);
11804 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
11805 prev_size = INTVAL (XEXP (slot, 1));
11806 if (prev_size + cur_size > size)
11808 cur_size = size - prev_size;
11809 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
11810 if (mode == BLKmode)
11811 mode = QImode;
11813 piece_type = lang_hooks.types.type_for_mode (mode, 1);
11814 if (mode == GET_MODE (reg))
11815 addr_type = build_pointer_type (piece_type);
11816 else
11817 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11818 true);
11819 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11820 true);
11822 if (SSE_REGNO_P (REGNO (reg)))
11824 src_addr = sse_addr;
11825 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
11827 else
11829 src_addr = int_addr;
11830 src_offset = REGNO (reg) * 8;
11832 src_addr = fold_convert (addr_type, src_addr);
11833 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
11835 dest_addr = fold_convert (daddr_type, addr);
11836 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
11837 if (cur_size == GET_MODE_SIZE (mode))
11839 src = build_va_arg_indirect_ref (src_addr);
11840 dest = build_va_arg_indirect_ref (dest_addr);
11842 gimplify_assign (dest, src, pre_p);
11844 else
11846 tree copy
11847 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
11848 3, dest_addr, src_addr,
11849 size_int (cur_size));
11850 gimplify_and_add (copy, pre_p);
11852 prev_size += cur_size;
11856 if (needed_intregs)
11858 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
11859 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
11860 gimplify_assign (gpr, t, pre_p);
11863 if (needed_sseregs)
11865 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
11866 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
11867 gimplify_assign (unshare_expr (fpr), t, pre_p);
11870 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
11872 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
11875 /* ... otherwise out of the overflow area. */
11877 /* When we align a parameter on the stack for the caller, if its
11878 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
11879 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. We match the callee
11880 here with the caller. */
11881 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
11882 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
11883 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
11885 /* Care for on-stack alignment if needed. */
11886 if (arg_boundary <= 64 || size == 0)
11887 t = ovf;
11888 else
11890 HOST_WIDE_INT align = arg_boundary / 8;
11891 t = fold_build_pointer_plus_hwi (ovf, align - 1);
11892 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
11893 build_int_cst (TREE_TYPE (t), -align));
11896 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
11897 gimplify_assign (addr, t, pre_p);
11899 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
11900 gimplify_assign (unshare_expr (ovf), t, pre_p);
11902 if (container)
11903 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
11905 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
11906 addr = fold_convert (ptrtype, addr);
11908 if (indirect_p)
11909 addr = build_va_arg_indirect_ref (addr);
11910 return build_va_arg_indirect_ref (addr);
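/* For a single integer argument, the GIMPLE produced above behaves roughly
   like this pseudo C (a sketch of the register/stack split):

     if (ap->gp_offset < 6 * 8)          // a GPR save slot is still left
       {
         addr = ap->reg_save_area + ap->gp_offset;
         ap->gp_offset += 8;
       }
     else                                // fall back to the overflow area
       {
         addr = ap->overflow_arg_area;
         ap->overflow_arg_area += 8;
       }
     result = *(int *) addr;
*/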
11913 /* Return true if OPNUM's MEM should be matched
11914 in movabs* patterns. */
11916 bool
11917 ix86_check_movabs (rtx insn, int opnum)
11919 rtx set, mem;
11921 set = PATTERN (insn);
11922 if (GET_CODE (set) == PARALLEL)
11923 set = XVECEXP (set, 0, 0);
11924 gcc_assert (GET_CODE (set) == SET);
11925 mem = XEXP (set, opnum);
11926 while (SUBREG_P (mem))
11927 mem = SUBREG_REG (mem);
11928 gcc_assert (MEM_P (mem));
11929 return volatile_ok || !MEM_VOLATILE_P (mem);
11932 /* Return false if INSN contains a MEM with a non-default address space. */
11933 bool
11934 ix86_check_no_addr_space (rtx insn)
11936 subrtx_var_iterator::array_type array;
11937 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
11939 rtx x = *iter;
11940 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
11941 return false;
11943 return true;
11946 /* Initialize the table of extra 80387 mathematical constants. */
11948 static void
11949 init_ext_80387_constants (void)
11951 static const char * cst[5] =
11953 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
11954 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
11955 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
11956 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
11957 "3.1415926535897932385128089594061862044", /* 4: fldpi */
11959 int i;
11961 for (i = 0; i < 5; i++)
11963 real_from_string (&ext_80387_constants_table[i], cst[i]);
11964 /* Ensure each constant is rounded to XFmode precision. */
11965 real_convert (&ext_80387_constants_table[i],
11966 XFmode, &ext_80387_constants_table[i]);
11969 ext_80387_constants_init = 1;
11972 /* Return non-zero if the constant is something that
11973 can be loaded with a special instruction. */
11976 standard_80387_constant_p (rtx x)
11978 machine_mode mode = GET_MODE (x);
11980 const REAL_VALUE_TYPE *r;
11982 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
11983 return -1;
11985 if (x == CONST0_RTX (mode))
11986 return 1;
11987 if (x == CONST1_RTX (mode))
11988 return 2;
11990 r = CONST_DOUBLE_REAL_VALUE (x);
11992 /* For XFmode constants, try to find a special 80387 instruction when
11993 optimizing for size or on those CPUs that benefit from them. */
11994 if (mode == XFmode
11995 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
11997 int i;
11999 if (! ext_80387_constants_init)
12000 init_ext_80387_constants ();
12002 for (i = 0; i < 5; i++)
12003 if (real_identical (r, &ext_80387_constants_table[i]))
12004 return i + 3;
12007 /* Load of the constant -0.0 or -1.0 will be split as
12008 fldz;fchs or fld1;fchs sequence. */
12009 if (real_isnegzero (r))
12010 return 8;
12011 if (real_identical (r, &dconstm1))
12012 return 9;
12014 return 0;
12017 /* Return the opcode of the special instruction to be used to load
12018 the constant X. */
12020 const char *
12021 standard_80387_constant_opcode (rtx x)
12023 switch (standard_80387_constant_p (x))
12025 case 1:
12026 return "fldz";
12027 case 2:
12028 return "fld1";
12029 case 3:
12030 return "fldlg2";
12031 case 4:
12032 return "fldln2";
12033 case 5:
12034 return "fldl2e";
12035 case 6:
12036 return "fldl2t";
12037 case 7:
12038 return "fldpi";
12039 case 8:
12040 case 9:
12041 return "#";
12042 default:
12043 gcc_unreachable ();
12047 /* Return the CONST_DOUBLE representing the 80387 constant that is
12048 loaded by the specified special instruction. The argument IDX
12049 matches the return value from standard_80387_constant_p. */
12052 standard_80387_constant_rtx (int idx)
12054 int i;
12056 if (! ext_80387_constants_init)
12057 init_ext_80387_constants ();
12059 switch (idx)
12061 case 3:
12062 case 4:
12063 case 5:
12064 case 6:
12065 case 7:
12066 i = idx - 3;
12067 break;
12069 default:
12070 gcc_unreachable ();
12073 return const_double_from_real_value (ext_80387_constants_table[i],
12074 XFmode);
12077 /* Return 1 if X is all-zero bits and 2 if X is all-one bits
12078 in a supported SSE/AVX vector mode. */
12081 standard_sse_constant_p (rtx x, machine_mode pred_mode)
12083 machine_mode mode;
12085 if (!TARGET_SSE)
12086 return 0;
12088 mode = GET_MODE (x);
12090 if (x == const0_rtx || const0_operand (x, mode))
12091 return 1;
12093 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
12095 /* VOIDmode integer constant, get mode from the predicate. */
12096 if (mode == VOIDmode)
12097 mode = pred_mode;
12099 switch (GET_MODE_SIZE (mode))
12101 case 64:
12102 if (TARGET_AVX512F)
12103 return 2;
12104 break;
12105 case 32:
12106 if (TARGET_AVX2)
12107 return 2;
12108 break;
12109 case 16:
12110 if (TARGET_SSE2)
12111 return 2;
12112 break;
12113 case 0:
12114 /* VOIDmode */
12115 gcc_unreachable ();
12116 default:
12117 break;
12121 return 0;
12124 /* Return the opcode of the special instruction to be used to load
12125 the constant X. */
12127 const char *
12128 standard_sse_constant_opcode (rtx_insn *insn, rtx x)
12130 machine_mode mode;
12132 gcc_assert (TARGET_SSE);
12134 mode = GET_MODE (x);
12136 if (x == const0_rtx || const0_operand (x, mode))
12138 switch (get_attr_mode (insn))
12140 case MODE_XI:
12141 return "vpxord\t%g0, %g0, %g0";
12142 case MODE_OI:
12143 return (TARGET_AVX512VL
12144 ? "vpxord\t%x0, %x0, %x0"
12145 : "vpxor\t%x0, %x0, %x0");
12146 case MODE_TI:
12147 return (TARGET_AVX512VL
12148 ? "vpxord\t%t0, %t0, %t0"
12149 : "%vpxor\t%0, %d0");
12151 case MODE_V8DF:
12152 return (TARGET_AVX512DQ
12153 ? "vxorpd\t%g0, %g0, %g0"
12154 : "vpxorq\t%g0, %g0, %g0");
12155 case MODE_V4DF:
12156 return "vxorpd\t%x0, %x0, %x0";
12157 case MODE_V2DF:
12158 return "%vxorpd\t%0, %d0";
12160 case MODE_V16SF:
12161 return (TARGET_AVX512DQ
12162 ? "vxorps\t%g0, %g0, %g0"
12163 : "vpxord\t%g0, %g0, %g0");
12164 case MODE_V8SF:
12165 return "vxorps\t%x0, %x0, %x0";
12166 case MODE_V4SF:
12167 return "%vxorps\t%0, %d0";
12169 default:
12170 gcc_unreachable ();
12173 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
12175 enum attr_mode insn_mode = get_attr_mode (insn);
12177 switch (insn_mode)
12179 case MODE_XI:
12180 case MODE_V8DF:
12181 case MODE_V16SF:
12182 gcc_assert (TARGET_AVX512F);
12183 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
12185 case MODE_OI:
12186 case MODE_V4DF:
12187 case MODE_V8SF:
12188 gcc_assert (TARGET_AVX2);
12189 /* FALLTHRU */
12190 case MODE_TI:
12191 case MODE_V2DF:
12192 case MODE_V4SF:
12193 gcc_assert (TARGET_SSE2);
12194 return (TARGET_AVX
12195 ? "vpcmpeqd\t%0, %0, %0"
12196 : "pcmpeqd\t%0, %0");
12198 default:
12199 gcc_unreachable ();
12203 gcc_unreachable ();
12206 /* Returns true if INSN can be transformed from a memory load
12207 to a supported FP constant load. */
12209 bool
12210 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
12212 rtx src = find_constant_src (insn);
12214 gcc_assert (REG_P (dst));
12216 if (src == NULL
12217 || (SSE_REGNO_P (REGNO (dst))
12218 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
12219 || (STACK_REGNO_P (REGNO (dst))
12220 && standard_80387_constant_p (src) < 1))
12221 return false;
12223 return true;
12226 /* Returns true if OP contains a symbol reference */
12228 bool
12229 symbolic_reference_mentioned_p (rtx op)
12231 const char *fmt;
12232 int i;
12234 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
12235 return true;
12237 fmt = GET_RTX_FORMAT (GET_CODE (op));
12238 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
12240 if (fmt[i] == 'E')
12242 int j;
12244 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
12245 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
12246 return true;
12249 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
12250 return true;
12253 return false;
12256 /* Return true if it is appropriate to emit `ret' instructions in the
12257 body of a function. Do this only if the epilogue is simple, needing a
12258 couple of insns. Prior to reloading, we can't tell how many registers
12259 must be saved, so return false then. Return false if there is no frame
12260 marker to de-allocate. */
12262 bool
12263 ix86_can_use_return_insn_p (void)
12265 struct ix86_frame frame;
12267 if (ix86_function_naked (current_function_decl))
12268 return false;
12270 /* Don't use `ret' instruction in interrupt handler. */
12271 if (! reload_completed
12272 || frame_pointer_needed
12273 || cfun->machine->func_type != TYPE_NORMAL)
12274 return 0;
12276 /* Don't allow more than 32k pop, since that's all we can do
12277 with one instruction. */
12278 if (crtl->args.pops_args && crtl->args.size >= 32768)
12279 return 0;
12281 frame = cfun->machine->frame;
12282 return (frame.stack_pointer_offset == UNITS_PER_WORD
12283 && (frame.nregs + frame.nsseregs) == 0);
12286 /* Value should be nonzero if functions must have frame pointers.
12287 Zero means the frame pointer need not be set up (and parms may
12288 be accessed via the stack pointer) in functions that seem suitable. */
12290 static bool
12291 ix86_frame_pointer_required (void)
12293 /* If we accessed previous frames, then the generated code expects
12294 to be able to access the saved ebp value in our frame. */
12295 if (cfun->machine->accesses_prev_frame)
12296 return true;
12298 /* Several x86 OSes need a frame pointer for other reasons,
12299 usually pertaining to setjmp. */
12300 if (SUBTARGET_FRAME_POINTER_REQUIRED)
12301 return true;
12303 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
12304 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
12305 return true;
12307 /* Under Win64 SEH, very large frames need a frame pointer, as the maximum
12308 stack allocation is 4GB. */
12309 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
12310 return true;
12312 /* SSE saves require a frame pointer when the stack is misaligned. */
12313 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
12314 return true;
12316 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
12317 turns off the frame pointer by default. Turn it back on now if
12318 we've not got a leaf function. */
12319 if (TARGET_OMIT_LEAF_FRAME_POINTER
12320 && (!crtl->is_leaf
12321 || ix86_current_function_calls_tls_descriptor))
12322 return true;
12324 if (crtl->profile && !flag_fentry)
12325 return true;
12327 return false;
12330 /* Record that the current function accesses previous call frames. */
12332 void
12333 ix86_setup_frame_addresses (void)
12335 cfun->machine->accesses_prev_frame = 1;
12338 #ifndef USE_HIDDEN_LINKONCE
12339 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
12340 # define USE_HIDDEN_LINKONCE 1
12341 # else
12342 # define USE_HIDDEN_LINKONCE 0
12343 # endif
12344 #endif
12346 static int pic_labels_used;
12348 /* Fills in the label name that should be used for a pc thunk for
12349 the given register. */
12351 static void
12352 get_pc_thunk_name (char name[32], unsigned int regno)
12354 gcc_assert (!TARGET_64BIT);
12356 if (USE_HIDDEN_LINKONCE)
12357 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
12358 else
12359 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
12363 /* This function generates code for -fpic that loads %ebx with
12364 the return address of the caller and then returns. */
12366 static void
12367 ix86_code_end (void)
12369 rtx xops[2];
12370 int regno;
12372 for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++)
12374 char name[32];
12375 tree decl;
12377 if (!(pic_labels_used & (1 << regno)))
12378 continue;
12380 get_pc_thunk_name (name, regno);
12382 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
12383 get_identifier (name),
12384 build_function_type_list (void_type_node, NULL_TREE));
12385 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
12386 NULL_TREE, void_type_node);
12387 TREE_PUBLIC (decl) = 1;
12388 TREE_STATIC (decl) = 1;
12389 DECL_IGNORED_P (decl) = 1;
12391 #if TARGET_MACHO
12392 if (TARGET_MACHO)
12394 switch_to_section (darwin_sections[picbase_thunk_section]);
12395 fputs ("\t.weak_definition\t", asm_out_file);
12396 assemble_name (asm_out_file, name);
12397 fputs ("\n\t.private_extern\t", asm_out_file);
12398 assemble_name (asm_out_file, name);
12399 putc ('\n', asm_out_file);
12400 ASM_OUTPUT_LABEL (asm_out_file, name);
12401 DECL_WEAK (decl) = 1;
12403 else
12404 #endif
12405 if (USE_HIDDEN_LINKONCE)
12407 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
12409 targetm.asm_out.unique_section (decl, 0);
12410 switch_to_section (get_named_section (decl, NULL, 0));
12412 targetm.asm_out.globalize_label (asm_out_file, name);
12413 fputs ("\t.hidden\t", asm_out_file);
12414 assemble_name (asm_out_file, name);
12415 putc ('\n', asm_out_file);
12416 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
12418 else
12420 switch_to_section (text_section);
12421 ASM_OUTPUT_LABEL (asm_out_file, name);
12424 DECL_INITIAL (decl) = make_node (BLOCK);
12425 current_function_decl = decl;
12426 allocate_struct_function (decl, false);
12427 init_function_start (decl);
12428 /* We're about to hide the function body from callees of final_* by
12429 emitting it directly; tell them we're a thunk, if they care. */
12430 cfun->is_thunk = true;
12431 first_function_block_is_cold = false;
12432 /* Make sure unwind info is emitted for the thunk if needed. */
12433 final_start_function (emit_barrier (), asm_out_file, 1);
12435 /* Pad stack IP move with 4 instructions (two NOPs count
12436 as one instruction). */
12437 if (TARGET_PAD_SHORT_FUNCTION)
12439 int i = 8;
12441 while (i--)
12442 fputs ("\tnop\n", asm_out_file);
12445 xops[0] = gen_rtx_REG (Pmode, regno);
12446 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
12447 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
12448 output_asm_insn ("%!ret", NULL);
12449 final_end_function ();
12450 init_insn_lengths ();
12451 free_after_compilation (cfun);
12452 set_cfun (NULL);
12453 current_function_decl = NULL;
12456 if (flag_split_stack)
12457 file_end_indicate_split_stack ();
12460 /* Emit code for the SET_GOT patterns. */
12462 const char *
12463 output_set_got (rtx dest, rtx label)
12465 rtx xops[3];
12467 xops[0] = dest;
12469 if (TARGET_VXWORKS_RTP && flag_pic)
12471 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
12472 xops[2] = gen_rtx_MEM (Pmode,
12473 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
12474 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
12476 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
12477 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
12478 an unadorned address. */
12479 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
12480 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
12481 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
12482 return "";
12485 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
12487 if (flag_pic)
12489 char name[32];
12490 get_pc_thunk_name (name, REGNO (dest));
12491 pic_labels_used |= 1 << REGNO (dest);
12493 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
12494 xops[2] = gen_rtx_MEM (QImode, xops[2]);
12495 output_asm_insn ("%!call\t%X2", xops);
12497 #if TARGET_MACHO
12498 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
12499 This is what will be referenced by the Mach-O PIC subsystem. */
12500 if (machopic_should_output_picbase_label () || !label)
12501 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
12503 /* When we are restoring the pic base at the site of a nonlocal label,
12504 and we decided to emit the pic base above, we will still output a
12505 local label used for calculating the correction offset (even though
12506 the offset will be 0 in that case). */
12507 if (label)
12508 targetm.asm_out.internal_label (asm_out_file, "L",
12509 CODE_LABEL_NUMBER (label));
12510 #endif
12512 else
12514 if (TARGET_MACHO)
12515 /* We don't need a pic base, we're not producing pic. */
12516 gcc_unreachable ();
12518 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
12519 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
12520 targetm.asm_out.internal_label (asm_out_file, "L",
12521 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
12524 if (!TARGET_MACHO)
12525 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
12527 return "";
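/* Illustrative sketch (not emitted verbatim by this function): for 32-bit
   PIC with the GOT pointer in %ebx, the two paths above boil down to roughly

       call  __x86.get_pc_thunk.bx           # thunk loads the return address
       addl  $_GLOBAL_OFFSET_TABLE_, %ebx    # bias it to the GOT base

   where the thunk name comes from get_pc_thunk_name and the label handling
   differs on Mach-O, as noted above.  */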
12530 /* Generate a "push" pattern for input ARG. */
12532 static rtx
12533 gen_push (rtx arg)
12535 struct machine_function *m = cfun->machine;
12537 if (m->fs.cfa_reg == stack_pointer_rtx)
12538 m->fs.cfa_offset += UNITS_PER_WORD;
12539 m->fs.sp_offset += UNITS_PER_WORD;
12541 if (REG_P (arg) && GET_MODE (arg) != word_mode)
12542 arg = gen_rtx_REG (word_mode, REGNO (arg));
12544 return gen_rtx_SET (gen_rtx_MEM (word_mode,
12545 gen_rtx_PRE_DEC (Pmode,
12546 stack_pointer_rtx)),
12547 arg);
12550 /* Generate a "pop" pattern for input ARG. */
12552 static rtx
12553 gen_pop (rtx arg)
12555 if (REG_P (arg) && GET_MODE (arg) != word_mode)
12556 arg = gen_rtx_REG (word_mode, REGNO (arg));
12558 return gen_rtx_SET (arg,
12559 gen_rtx_MEM (word_mode,
12560 gen_rtx_POST_INC (Pmode,
12561 stack_pointer_rtx)));
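/* For reference, on a 64-bit target gen_push above produces the RTL
       (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI ...))
   and gen_pop the mirror image
       (set (reg:DI ...) (mem:DI (post_inc:DI (reg:DI sp))))
   with gen_push additionally updating the tracked cfa_offset/sp_offset;
   in 32-bit mode word_mode and Pmode are SImode instead.  */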
12564 /* Return >= 0 if there is an unused call-clobbered register available
12565 for the entire function. */
12567 static unsigned int
12568 ix86_select_alt_pic_regnum (void)
12570 if (ix86_use_pseudo_pic_reg ())
12571 return INVALID_REGNUM;
12573 if (crtl->is_leaf
12574 && !crtl->profile
12575 && !ix86_current_function_calls_tls_descriptor)
12577 int i, drap;
12578 /* Can't use the same register for both PIC and DRAP. */
12579 if (crtl->drap_reg)
12580 drap = REGNO (crtl->drap_reg);
12581 else
12582 drap = -1;
12583 for (i = 2; i >= 0; --i)
12584 if (i != drap && !df_regs_ever_live_p (i))
12585 return i;
12588 return INVALID_REGNUM;
12591 /* Return true if REGNO is used by the epilogue. */
12593 bool
12594 ix86_epilogue_uses (int regno)
12596 /* If there are no caller-saved registers, we preserve all registers,
12597 except for MMX and x87 registers which aren't supported when saving
12598 and restoring registers. Don't explicitly save SP register since
12599 it is always preserved. */
12600 return (epilogue_completed
12601 && cfun->machine->no_caller_saved_registers
12602 && !fixed_regs[regno]
12603 && !STACK_REGNO_P (regno)
12604 && !MMX_REGNO_P (regno));
12607 /* Return nonzero if register REGNO can be used as a scratch register
12608 in peephole2. */
12610 static bool
12611 ix86_hard_regno_scratch_ok (unsigned int regno)
12613 /* If there are no caller-saved registers, we can't use any register
12614 as a scratch register after epilogue and use REGNO as scratch
12615 register only if it has been used before to avoid saving and
12616 restoring it. */
12617 return (!cfun->machine->no_caller_saved_registers
12618 || (!epilogue_completed
12619 && df_regs_ever_live_p (regno)));
12622 /* Return true if register class CL should be an additional allocno
12623 class. */
12625 static bool
12626 ix86_additional_allocno_class_p (reg_class_t cl)
12628 return cl == MOD4_SSE_REGS;
12631 /* Return TRUE if we need to save REGNO. */
12633 static bool
12634 ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
12636 /* If there are no caller-saved registers, we preserve all registers,
12637 except for MMX and x87 registers which aren't supported when saving
12638 and restoring registers. Don't explicitly save SP register since
12639 it is always preserved. */
12640 if (cfun->machine->no_caller_saved_registers)
12642 /* Don't preserve registers used for function return value. */
12643 rtx reg = crtl->return_rtx;
12644 if (reg)
12646 unsigned int i = REGNO (reg);
12647 unsigned int nregs = hard_regno_nregs[i][GET_MODE (reg)];
12648 while (nregs-- > 0)
12649 if ((i + nregs) == regno)
12650 return false;
12652 reg = crtl->return_bnd;
12653 if (reg)
12655 i = REGNO (reg);
12656 nregs = hard_regno_nregs[i][GET_MODE (reg)];
12657 while (nregs-- > 0)
12658 if ((i + nregs) == regno)
12659 return false;
12663 return (df_regs_ever_live_p (regno)
12664 && !fixed_regs[regno]
12665 && !STACK_REGNO_P (regno)
12666 && !MMX_REGNO_P (regno)
12667 && (regno != HARD_FRAME_POINTER_REGNUM
12668 || !frame_pointer_needed));
12671 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
12672 && pic_offset_table_rtx)
12674 if (ix86_use_pseudo_pic_reg ())
12676 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
12677 _mcount in prologue. */
12678 if (!TARGET_64BIT && flag_pic && crtl->profile)
12679 return true;
12681 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
12682 || crtl->profile
12683 || crtl->calls_eh_return
12684 || crtl->uses_const_pool
12685 || cfun->has_nonlocal_label)
12686 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
12689 if (crtl->calls_eh_return && maybe_eh_return)
12691 unsigned i;
12692 for (i = 0; ; i++)
12694 unsigned test = EH_RETURN_DATA_REGNO (i);
12695 if (test == INVALID_REGNUM)
12696 break;
12697 if (test == regno)
12698 return true;
12702 if (ignore_outlined && cfun->machine->call_ms2sysv)
12704 unsigned count = cfun->machine->call_ms2sysv_extra_regs
12705 + xlogue_layout::MIN_REGS;
12706 if (xlogue_layout::is_stub_managed_reg (regno, count))
12707 return false;
12710 if (crtl->drap_reg
12711 && regno == REGNO (crtl->drap_reg)
12712 && !cfun->machine->no_drap_save_restore)
12713 return true;
12715 return (df_regs_ever_live_p (regno)
12716 && !call_used_regs[regno]
12717 && !fixed_regs[regno]
12718 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
12721 /* Return the number of saved general purpose registers. */
12723 static int
12724 ix86_nsaved_regs (void)
12726 int nregs = 0;
12727 int regno;
12729 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12730 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12731 nregs ++;
12732 return nregs;
12735 /* Return number of saved SSE registers. */
12737 static int
12738 ix86_nsaved_sseregs (void)
12740 int nregs = 0;
12741 int regno;
12743 if (!TARGET_64BIT_MS_ABI)
12744 return 0;
12745 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12746 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12747 nregs ++;
12748 return nregs;
12751 /* Given FROM and TO register numbers, say whether this elimination is
12752 allowed. If stack alignment is needed, we can only replace argument
12753 pointer with hard frame pointer, or replace frame pointer with stack
12754 pointer. Otherwise, frame pointer elimination is automatically
12755 handled and all other eliminations are valid. */
12757 static bool
12758 ix86_can_eliminate (const int from, const int to)
12760 if (stack_realign_fp)
12761 return ((from == ARG_POINTER_REGNUM
12762 && to == HARD_FRAME_POINTER_REGNUM)
12763 || (from == FRAME_POINTER_REGNUM
12764 && to == STACK_POINTER_REGNUM));
12765 else
12766 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
12769 /* Return the offset between two registers, one to be eliminated, and the other
12770 its replacement, at the start of a routine. */
12772 HOST_WIDE_INT
12773 ix86_initial_elimination_offset (int from, int to)
12775 struct ix86_frame frame = cfun->machine->frame;
12777 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
12778 return frame.hard_frame_pointer_offset;
12779 else if (from == FRAME_POINTER_REGNUM
12780 && to == HARD_FRAME_POINTER_REGNUM)
12781 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
12782 else
12784 gcc_assert (to == STACK_POINTER_REGNUM);
12786 if (from == ARG_POINTER_REGNUM)
12787 return frame.stack_pointer_offset;
12789 gcc_assert (from == FRAME_POINTER_REGNUM);
12790 return frame.stack_pointer_offset - frame.frame_pointer_offset;
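/* Illustrative note: these offsets are what the elimination pass adds when
   rewriting one register in terms of the other, so at function entry an
   access to arg_pointer + X becomes sp + (frame.stack_pointer_offset + X)
   once the argument pointer is eliminated into the stack pointer.  */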
12794 /* In a dynamically-aligned function, we can't know the offset from
12795 stack pointer to frame pointer, so we must ensure that setjmp
12796 eliminates fp against the hard fp (%ebp) rather than trying to
12797 index from %esp up to the top of the frame across a gap that is
12798 of unknown (at compile-time) size. */
12799 static rtx
12800 ix86_builtin_setjmp_frame_value (void)
12802 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
12805 /* Emit a warning for unsupported ms_abi-to-sysv prologues/epilogues. */
12806 static void warn_once_call_ms2sysv_xlogues (const char *feature)
12808 static bool warned_once = false;
12809 if (!warned_once)
12811 warning (0, "-mcall-ms2sysv-xlogues is not compatible with %s",
12812 feature);
12813 warned_once = true;
12817 /* When using -fsplit-stack, the allocation routines set a field in
12818 the TCB to the bottom of the stack plus this much space, measured
12819 in bytes. */
12821 #define SPLIT_STACK_AVAILABLE 256
12823 /* Fill the ix86_frame structure describing the frame of the current function. */
12825 static void
12826 ix86_compute_frame_layout (void)
12828 struct ix86_frame *frame = &cfun->machine->frame;
12829 struct machine_function *m = cfun->machine;
12830 unsigned HOST_WIDE_INT stack_alignment_needed;
12831 HOST_WIDE_INT offset;
12832 unsigned HOST_WIDE_INT preferred_alignment;
12833 HOST_WIDE_INT size = get_frame_size ();
12834 HOST_WIDE_INT to_allocate;
12836 /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit
12837 * ms_abi functions that call a sysv function. We now need to prune away
12838 * cases where it should be disabled. */
12839 if (TARGET_64BIT && m->call_ms2sysv)
12841 gcc_assert (TARGET_64BIT_MS_ABI);
12842 gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES);
12843 gcc_assert (!TARGET_SEH);
12844 gcc_assert (TARGET_SSE);
12845 gcc_assert (!ix86_using_red_zone ());
12847 if (crtl->calls_eh_return)
12849 gcc_assert (!reload_completed);
12850 m->call_ms2sysv = false;
12851 warn_once_call_ms2sysv_xlogues ("__builtin_eh_return");
12854 else if (ix86_static_chain_on_stack)
12856 gcc_assert (!reload_completed);
12857 m->call_ms2sysv = false;
12858 warn_once_call_ms2sysv_xlogues ("static call chains");
12861 /* Finally, compute which registers the stub will manage. */
12862 else
12864 unsigned count = xlogue_layout::count_stub_managed_regs ();
12865 m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS;
12866 m->call_ms2sysv_pad_in = 0;
12870 frame->nregs = ix86_nsaved_regs ();
12871 frame->nsseregs = ix86_nsaved_sseregs ();
12873 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
12874 except for function prologues, leaf functions, and when the default
12875 incoming stack boundary is overridden on the command line or via the
12876 force_align_arg_pointer attribute. */
12877 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
12878 && (!crtl->is_leaf || cfun->calls_alloca != 0
12879 || ix86_current_function_calls_tls_descriptor
12880 || ix86_incoming_stack_boundary < 128))
12882 crtl->preferred_stack_boundary = 128;
12883 crtl->stack_alignment_needed = 128;
12886 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
12887 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
12889 gcc_assert (!size || stack_alignment_needed);
12890 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
12891 gcc_assert (preferred_alignment <= stack_alignment_needed);
12893 /* For SEH we have to limit the amount of code movement into the prologue.
12894 At present we do this via a BLOCKAGE, at which point there's very little
12895 scheduling that can be done, which means that there's very little point
12896 in doing anything except PUSHs. */
12897 if (TARGET_SEH)
12898 m->use_fast_prologue_epilogue = false;
12899 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)))
12901 int count = frame->nregs;
12902 struct cgraph_node *node = cgraph_node::get (current_function_decl);
12904 /* The fast prologue uses move instead of push to save registers. This
12905 is significantly longer, but also executes faster as modern hardware
12906 can execute the moves in parallel, but can't do that for push/pop.
12908 Be careful about choosing which prologue to emit: when the function
12909 takes many instructions to execute we may use the slow version, as
12910 well as when the function is known to be outside a hot spot (this is
12911 known only with feedback). Weight the size of the function by the
12912 number of registers to save, as it is cheap to use one or two push
12913 instructions but very slow to use many of them. */
12914 if (count)
12915 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
12916 if (node->frequency < NODE_FREQUENCY_NORMAL
12917 || (flag_branch_probabilities
12918 && node->frequency < NODE_FREQUENCY_HOT))
12919 m->use_fast_prologue_epilogue = false;
12920 else
12921 m->use_fast_prologue_epilogue
12922 = !expensive_function_p (count);
12925 frame->save_regs_using_mov
12926 = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
12927 /* If static stack checking is enabled and done with probes,
12928 the registers need to be saved before allocating the frame. */
12929 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
12931 /* Skip return address and error code in exception handler. */
12932 offset = INCOMING_FRAME_SP_OFFSET;
12934 /* Skip pushed static chain. */
12935 if (ix86_static_chain_on_stack)
12936 offset += UNITS_PER_WORD;
12938 /* Skip saved base pointer. */
12939 if (frame_pointer_needed)
12940 offset += UNITS_PER_WORD;
12941 frame->hfp_save_offset = offset;
12943 /* The traditional frame pointer location is at the top of the frame. */
12944 frame->hard_frame_pointer_offset = offset;
12946 /* Register save area */
12947 offset += frame->nregs * UNITS_PER_WORD;
12948 frame->reg_save_offset = offset;
12950 /* On SEH target, registers are pushed just before the frame pointer
12951 location. */
12952 if (TARGET_SEH)
12953 frame->hard_frame_pointer_offset = offset;
12955 /* When re-aligning the stack frame, but not saving SSE registers, this
12956 is the offset we want to adjust the stack pointer to. */
12957 frame->stack_realign_allocate_offset = offset;
12959 /* The re-aligned stack starts here. Values before this point are not
12960 directly comparable with values below this point. Use sp_valid_at
12961 to determine if the stack pointer is valid for a given offset and
12962 fp_valid_at for the frame pointer. */
12963 if (stack_realign_fp)
12964 offset = ROUND_UP (offset, stack_alignment_needed);
12965 frame->stack_realign_offset = offset;
12967 if (TARGET_64BIT && m->call_ms2sysv)
12969 gcc_assert (stack_alignment_needed >= 16);
12970 gcc_assert (!frame->nsseregs);
12972 m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD);
12973 offset += xlogue_layout::get_instance ().get_stack_space_used ();
12976 /* Align and set SSE register save area. */
12977 else if (frame->nsseregs)
12979 /* The only ABI that has saved SSE registers (Win64) also has a
12980 16-byte aligned default stack. However, many programs violate
12981 the ABI, and Wine64 forces stack realignment to compensate.
12983 If the incoming stack boundary is at least 16 bytes, or DRAP is
12984 required and the DRAP re-alignment boundary is at least 16 bytes,
12985 then we want the SSE register save area properly aligned. */
12986 if (ix86_incoming_stack_boundary >= 128
12987 || (stack_realign_drap && stack_alignment_needed >= 16))
12988 offset = ROUND_UP (offset, 16);
12989 offset += frame->nsseregs * 16;
12990 frame->stack_realign_allocate_offset = offset;
12993 frame->sse_reg_save_offset = offset;
12995 /* Va-arg area */
12996 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
12997 offset += frame->va_arg_size;
12999 /* Align start of frame for local function. */
13000 if (stack_realign_fp
13001 || m->call_ms2sysv
13002 || offset != frame->sse_reg_save_offset
13003 || size != 0
13004 || !crtl->is_leaf
13005 || cfun->calls_alloca
13006 || ix86_current_function_calls_tls_descriptor)
13007 offset = ROUND_UP (offset, stack_alignment_needed);
13009 /* Frame pointer points here. */
13010 frame->frame_pointer_offset = offset;
13012 offset += size;
13014 /* Add outgoing arguments area. Can be skipped if we eliminated
13015 all the function calls as dead code.
13016 Skipping is however impossible when the function calls alloca, as the
13017 alloca expander assumes that the last crtl->outgoing_args_size bytes
13018 of the stack frame are unused. */
13019 if (ACCUMULATE_OUTGOING_ARGS
13020 && (!crtl->is_leaf || cfun->calls_alloca
13021 || ix86_current_function_calls_tls_descriptor))
13023 offset += crtl->outgoing_args_size;
13024 frame->outgoing_arguments_size = crtl->outgoing_args_size;
13026 else
13027 frame->outgoing_arguments_size = 0;
13029 /* Align stack boundary. Only needed if we're calling another function
13030 or using alloca. */
13031 if (!crtl->is_leaf || cfun->calls_alloca
13032 || ix86_current_function_calls_tls_descriptor)
13033 offset = ROUND_UP (offset, preferred_alignment);
13035 /* We've reached end of stack frame. */
13036 frame->stack_pointer_offset = offset;
13038 /* Size prologue needs to allocate. */
13039 to_allocate = offset - frame->sse_reg_save_offset;
13041 if ((!to_allocate && frame->nregs <= 1)
13042 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)))
13043 frame->save_regs_using_mov = false;
13045 if (ix86_using_red_zone ()
13046 && crtl->sp_is_unchanging
13047 && crtl->is_leaf
13048 && !ix86_pc_thunk_call_expanded
13049 && !ix86_current_function_calls_tls_descriptor)
13051 frame->red_zone_size = to_allocate;
13052 if (frame->save_regs_using_mov)
13053 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
13054 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
13055 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
13057 else
13058 frame->red_zone_size = 0;
13059 frame->stack_pointer_offset -= frame->red_zone_size;
13061 /* The SEH frame pointer location is near the bottom of the frame.
13062 This is enforced by the fact that the difference between the
13063 stack pointer and the frame pointer is limited to 240 bytes in
13064 the unwind data structure. */
13065 if (TARGET_SEH)
13067 HOST_WIDE_INT diff;
13069 /* If we can leave the frame pointer where it is, do so. Also, returns
13070 the establisher frame for __builtin_frame_address (0). */
13071 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
13072 if (diff <= SEH_MAX_FRAME_SIZE
13073 && (diff > 240 || (diff & 15) != 0)
13074 && !crtl->accesses_prior_frames)
13076 /* Ideally we'd determine what portion of the local stack frame
13077 (within the constraint of the lowest 240) is most heavily used.
13078 But without that complication, simply bias the frame pointer
13079 by 128 bytes so as to maximize the amount of the local stack
13080 frame that is addressable with 8-bit offsets. */
13081 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
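/* Illustrative summary of the layout computed above, measured downward
   from the CFA (the stack pointer value at the call site):

       return address (+ error code for exception handlers)
       [static chain]  [saved frame pointer]   -> hard_frame_pointer_offset
       GP register save area                   -> reg_save_offset
       [realignment padding, ms2sysv stub area or SSE save area]
                                               -> sse_reg_save_offset
       va_arg register save area + alignment   -> frame_pointer_offset
       local variables, outgoing arguments     -> stack_pointer_offset

   SEH, stack realignment and the red zone adjust individual entries as
   handled case by case above.  */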
13086 /* This is semi-inlined memory_address_length, but simplified
13087 since we know that we're always dealing with reg+offset, and
13088 to avoid having to create and discard all that rtl. */
13090 static inline int
13091 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
13093 int len = 4;
13095 if (offset == 0)
13097 /* EBP and R13 cannot be encoded without an offset. */
13098 len = (regno == BP_REG || regno == R13_REG);
13100 else if (IN_RANGE (offset, -128, 127))
13101 len = 1;
13103 /* ESP and R12 must be encoded with a SIB byte. */
13104 if (regno == SP_REG || regno == R12_REG)
13105 len++;
13107 return len;
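/* Illustrative values of the length computed above (displacement bytes
   plus an optional SIB byte; opcode and ModRM bytes are not counted):
       0(%rax)   -> 0       0(%rbp)    -> 1  (disp8 always required)
       8(%rax)   -> 1       8(%rsp)    -> 2  (disp8 + SIB)
       256(%rax) -> 4       256(%r12)  -> 5  (disp32 + SIB)  */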
13110 /* Determine if the stack pointer is valid for accessing the cfa_offset.
13111 The register is saved at CFA - CFA_OFFSET. */
13113 static inline bool
13114 sp_valid_at (HOST_WIDE_INT cfa_offset)
13116 const struct machine_frame_state &fs = cfun->machine->fs;
13117 return fs.sp_valid && !(fs.sp_realigned
13118 && cfa_offset <= fs.sp_realigned_offset);
13121 /* Determine if the frame pointer is valid for accessing the cfa_offset.
13122 The register is saved at CFA - CFA_OFFSET. */
13124 static inline bool
13125 fp_valid_at (HOST_WIDE_INT cfa_offset)
13127 const struct machine_frame_state &fs = cfun->machine->fs;
13128 return fs.fp_valid && !(fs.sp_valid && fs.sp_realigned
13129 && cfa_offset > fs.sp_realigned_offset);
13132 /* Choose a base register based upon alignment requested, speed and/or
13133 size. */
13135 static void
13136 choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg,
13137 HOST_WIDE_INT &base_offset,
13138 unsigned int align_reqested, unsigned int *align)
13140 const struct machine_function *m = cfun->machine;
13141 unsigned int hfp_align;
13142 unsigned int drap_align;
13143 unsigned int sp_align;
13144 bool hfp_ok = fp_valid_at (cfa_offset);
13145 bool drap_ok = m->fs.drap_valid;
13146 bool sp_ok = sp_valid_at (cfa_offset);
13148 hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY;
13150 /* Filter out any registers that don't meet the requested alignment
13151 criteria. */
13152 if (align_reqested)
13154 if (m->fs.realigned)
13155 hfp_align = drap_align = sp_align = crtl->stack_alignment_needed;
13156 /* SEH unwind code does not currently support REG_CFA_EXPRESSION
13157 notes (which we would need in order to use a realigned stack pointer),
13158 so disable this on SEH targets. */
13159 else if (m->fs.sp_realigned)
13160 sp_align = crtl->stack_alignment_needed;
13162 hfp_ok = hfp_ok && hfp_align >= align_reqested;
13163 drap_ok = drap_ok && drap_align >= align_reqested;
13164 sp_ok = sp_ok && sp_align >= align_reqested;
13167 if (m->use_fast_prologue_epilogue)
13169 /* Choose the base register most likely to allow the most scheduling
13170 opportunities. Generally FP is valid throughout the function,
13171 while DRAP must be reloaded within the epilogue. But choose either
13172 over the SP due to increased encoding size. */
13174 if (hfp_ok)
13176 base_reg = hard_frame_pointer_rtx;
13177 base_offset = m->fs.fp_offset - cfa_offset;
13179 else if (drap_ok)
13181 base_reg = crtl->drap_reg;
13182 base_offset = 0 - cfa_offset;
13184 else if (sp_ok)
13186 base_reg = stack_pointer_rtx;
13187 base_offset = m->fs.sp_offset - cfa_offset;
13190 else
13192 HOST_WIDE_INT toffset;
13193 int len = 16, tlen;
13195 /* Choose the base register with the smallest address encoding.
13196 With a tie, choose FP > DRAP > SP. */
13197 if (sp_ok)
13199 base_reg = stack_pointer_rtx;
13200 base_offset = m->fs.sp_offset - cfa_offset;
13201 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
13203 if (drap_ok)
13205 toffset = 0 - cfa_offset;
13206 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
13207 if (tlen <= len)
13209 base_reg = crtl->drap_reg;
13210 base_offset = toffset;
13211 len = tlen;
13214 if (hfp_ok)
13216 toffset = m->fs.fp_offset - cfa_offset;
13217 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
13218 if (tlen <= len)
13220 base_reg = hard_frame_pointer_rtx;
13221 base_offset = toffset;
13222 len = tlen;
13227 /* Set the align return value. */
13228 if (align)
13230 if (base_reg == stack_pointer_rtx)
13231 *align = sp_align;
13232 else if (base_reg == crtl->drap_reg)
13233 *align = drap_align;
13234 else if (base_reg == hard_frame_pointer_rtx)
13235 *align = hfp_align;
13239 /* Return an RTX that points to CFA_OFFSET within the stack frame and
13240 the alignment of the address. If align is non-null, it should point to
13241 an alignment value (in bits) that is preferred or zero and will
13242 receive the alignment of the base register that was selected. The
13243 valid base registers are taken from CFUN->MACHINE->FS. */
13245 static rtx
13246 choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align)
13248 rtx base_reg = NULL;
13249 HOST_WIDE_INT base_offset = 0;
13251 /* If a specific alignment is requested, try to get a base register
13252 with that alignment first. */
13253 if (align && *align)
13254 choose_basereg (cfa_offset, base_reg, base_offset, *align, align);
13256 if (!base_reg)
13257 choose_basereg (cfa_offset, base_reg, base_offset, 0, align);
13259 gcc_assert (base_reg != NULL);
13260 return plus_constant (Pmode, base_reg, base_offset);
13263 /* Emit code to save registers in the prologue. */
13265 static void
13266 ix86_emit_save_regs (void)
13268 unsigned int regno;
13269 rtx_insn *insn;
13271 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
13272 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
13274 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
13275 RTX_FRAME_RELATED_P (insn) = 1;
13279 /* Emit a single register save at CFA - CFA_OFFSET. */
13281 static void
13282 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
13283 HOST_WIDE_INT cfa_offset)
13285 struct machine_function *m = cfun->machine;
13286 rtx reg = gen_rtx_REG (mode, regno);
13287 rtx mem, addr, base, insn;
13288 unsigned int align = GET_MODE_ALIGNMENT (mode);
13290 addr = choose_baseaddr (cfa_offset, &align);
13291 mem = gen_frame_mem (mode, addr);
13293 /* The location alignment depends upon the base register. */
13294 align = MIN (GET_MODE_ALIGNMENT (mode), align);
13295 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
13296 set_mem_align (mem, align);
13298 insn = emit_insn (gen_rtx_SET (mem, reg));
13299 RTX_FRAME_RELATED_P (insn) = 1;
13301 base = addr;
13302 if (GET_CODE (base) == PLUS)
13303 base = XEXP (base, 0);
13304 gcc_checking_assert (REG_P (base));
13306 /* When saving registers into a re-aligned local stack frame, avoid
13307 any tricky guessing by dwarf2out. */
13308 if (m->fs.realigned)
13310 gcc_checking_assert (stack_realign_drap);
13312 if (regno == REGNO (crtl->drap_reg))
13314 /* A bit of a hack. We force the DRAP register to be saved in
13315 the re-aligned stack frame, which provides us with a copy
13316 of the CFA that will last past the prologue. Install it. */
13317 gcc_checking_assert (cfun->machine->fs.fp_valid);
13318 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
13319 cfun->machine->fs.fp_offset - cfa_offset);
13320 mem = gen_rtx_MEM (mode, addr);
13321 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
13323 else
13325 /* The frame pointer is a stable reference within the
13326 aligned frame. Use it. */
13327 gcc_checking_assert (cfun->machine->fs.fp_valid);
13328 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
13329 cfun->machine->fs.fp_offset - cfa_offset);
13330 mem = gen_rtx_MEM (mode, addr);
13331 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
13335 else if (base == stack_pointer_rtx && m->fs.sp_realigned
13336 && cfa_offset >= m->fs.sp_realigned_offset)
13338 gcc_checking_assert (stack_realign_fp);
13339 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
13342 /* The memory may not be relative to the current CFA register,
13343 which means that we may need to generate a new pattern for
13344 use by the unwind info. */
13345 else if (base != m->fs.cfa_reg)
13347 addr = plus_constant (Pmode, m->fs.cfa_reg,
13348 m->fs.cfa_offset - cfa_offset);
13349 mem = gen_rtx_MEM (mode, addr);
13350 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
13354 /* Emit code to save registers using MOV insns.
13355 First register is stored at CFA - CFA_OFFSET. */
13356 static void
13357 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
13359 unsigned int regno;
13361 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13362 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
13364 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
13365 cfa_offset -= UNITS_PER_WORD;
13369 /* Emit code to save SSE registers using MOV insns.
13370 First register is stored at CFA - CFA_OFFSET. */
13371 static void
13372 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
13374 unsigned int regno;
13376 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13377 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
13379 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
13380 cfa_offset -= GET_MODE_SIZE (V4SFmode);
13384 static GTY(()) rtx queued_cfa_restores;
13386 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
13387 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
13388 Don't add the note if the previously saved value will be left untouched
13389 within the stack red zone until return, as unwinders can find the same
13390 value in the register and on the stack. */
13392 static void
13393 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
13395 if (!crtl->shrink_wrapped
13396 && cfa_offset <= cfun->machine->fs.red_zone_offset)
13397 return;
13399 if (insn)
13401 add_reg_note (insn, REG_CFA_RESTORE, reg);
13402 RTX_FRAME_RELATED_P (insn) = 1;
13404 else
13405 queued_cfa_restores
13406 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
13409 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
13411 static void
13412 ix86_add_queued_cfa_restore_notes (rtx insn)
13414 rtx last;
13415 if (!queued_cfa_restores)
13416 return;
13417 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
13419 XEXP (last, 1) = REG_NOTES (insn);
13420 REG_NOTES (insn) = queued_cfa_restores;
13421 queued_cfa_restores = NULL_RTX;
13422 RTX_FRAME_RELATED_P (insn) = 1;
13425 /* Expand prologue or epilogue stack adjustment.
13426 The pattern exists to put a dependency on all ebp-based memory accesses.
13427 STYLE should be negative if instructions should be marked as frame related,
13428 zero if the %r11 register is live and cannot be freely used, and positive
13429 otherwise. */
13431 static void
13432 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
13433 int style, bool set_cfa)
13435 struct machine_function *m = cfun->machine;
13436 rtx insn;
13437 bool add_frame_related_expr = false;
13439 if (Pmode == SImode)
13440 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
13441 else if (x86_64_immediate_operand (offset, DImode))
13442 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
13443 else
13445 rtx tmp;
13446 /* r11 is used by indirect sibcall return as well, set before the
13447 epilogue and used after the epilogue. */
13448 if (style)
13449 tmp = gen_rtx_REG (DImode, R11_REG);
13450 else
13452 gcc_assert (src != hard_frame_pointer_rtx
13453 && dest != hard_frame_pointer_rtx);
13454 tmp = hard_frame_pointer_rtx;
13456 insn = emit_insn (gen_rtx_SET (tmp, offset));
13457 if (style < 0)
13458 add_frame_related_expr = true;
13460 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
13463 insn = emit_insn (insn);
13464 if (style >= 0)
13465 ix86_add_queued_cfa_restore_notes (insn);
13467 if (set_cfa)
13469 rtx r;
13471 gcc_assert (m->fs.cfa_reg == src);
13472 m->fs.cfa_offset += INTVAL (offset);
13473 m->fs.cfa_reg = dest;
13475 r = gen_rtx_PLUS (Pmode, src, offset);
13476 r = gen_rtx_SET (dest, r);
13477 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
13478 RTX_FRAME_RELATED_P (insn) = 1;
13480 else if (style < 0)
13482 RTX_FRAME_RELATED_P (insn) = 1;
13483 if (add_frame_related_expr)
13485 rtx r = gen_rtx_PLUS (Pmode, src, offset);
13486 r = gen_rtx_SET (dest, r);
13487 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
13491 if (dest == stack_pointer_rtx)
13493 HOST_WIDE_INT ooffset = m->fs.sp_offset;
13494 bool valid = m->fs.sp_valid;
13495 bool realigned = m->fs.sp_realigned;
13497 if (src == hard_frame_pointer_rtx)
13499 valid = m->fs.fp_valid;
13500 realigned = false;
13501 ooffset = m->fs.fp_offset;
13503 else if (src == crtl->drap_reg)
13505 valid = m->fs.drap_valid;
13506 realigned = false;
13507 ooffset = 0;
13509 else
13511 /* Else there are two possibilities: SP itself, which we set
13512 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
13513 taken care of by hand along the eh_return path. */
13514 gcc_checking_assert (src == stack_pointer_rtx
13515 || offset == const0_rtx);
13518 m->fs.sp_offset = ooffset - INTVAL (offset);
13519 m->fs.sp_valid = valid;
13520 m->fs.sp_realigned = realigned;
13524 /* Find an available register to be used as the dynamic realign argument
13525 pointer register. Such a register will be written in the prologue and
13526 used at the beginning of the body, so it must not be
13527 1. parameter passing register.
13528 2. GOT pointer.
13529 We reuse static-chain register if it is available. Otherwise, we
13530 use DI for i386 and R13 for x86-64. We chose R13 since it has
13531 a shorter encoding.
13533 Return: the regno of chosen register. */
13535 static unsigned int
13536 find_drap_reg (void)
13538 tree decl = cfun->decl;
13540 /* Always use callee-saved register if there are no caller-saved
13541 registers. */
13542 if (TARGET_64BIT)
13544 /* Use R13 for a nested function or a function that needs a static
13545 chain. Since a function with a tail call may use any caller-saved
13546 register in the epilogue, DRAP must not use a caller-saved
13547 register in that case. */
13548 if (DECL_STATIC_CHAIN (decl)
13549 || cfun->machine->no_caller_saved_registers
13550 || crtl->tail_call_emit)
13551 return R13_REG;
13553 return R10_REG;
13555 else
13557 /* Use DI for a nested function or a function that needs a static
13558 chain. Since a function with a tail call may use any caller-saved
13559 register in the epilogue, DRAP must not use a caller-saved
13560 register in that case. */
13561 if (DECL_STATIC_CHAIN (decl)
13562 || cfun->machine->no_caller_saved_registers
13563 || crtl->tail_call_emit)
13564 return DI_REG;
13566 /* Reuse static chain register if it isn't used for parameter
13567 passing. */
13568 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
13570 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
13571 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
13572 return CX_REG;
13574 return DI_REG;
13578 /* Handle a "force_align_arg_pointer" attribute. */
13580 static tree
13581 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
13582 tree, int, bool *no_add_attrs)
13584 if (TREE_CODE (*node) != FUNCTION_TYPE
13585 && TREE_CODE (*node) != METHOD_TYPE
13586 && TREE_CODE (*node) != FIELD_DECL
13587 && TREE_CODE (*node) != TYPE_DECL)
13589 warning (OPT_Wattributes, "%qE attribute only applies to functions",
13590 name);
13591 *no_add_attrs = true;
13594 return NULL_TREE;
13597 /* Return minimum incoming stack alignment. */
13599 static unsigned int
13600 ix86_minimum_incoming_stack_boundary (bool sibcall)
13602 unsigned int incoming_stack_boundary;
13604 /* Stack of interrupt handler is aligned to 128 bits in 64bit mode. */
13605 if (cfun->machine->func_type != TYPE_NORMAL)
13606 incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY;
13607 /* Prefer the one specified at command line. */
13608 else if (ix86_user_incoming_stack_boundary)
13609 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
13610 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
13611 if -mstackrealign is used, this isn't for a sibcall check, and the
13612 estimated stack alignment is 128 bits. */
13613 else if (!sibcall
13614 && ix86_force_align_arg_pointer
13615 && crtl->stack_alignment_estimated == 128)
13616 incoming_stack_boundary = MIN_STACK_BOUNDARY;
13617 else
13618 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
13620 /* Incoming stack alignment can be changed on individual functions
13621 via force_align_arg_pointer attribute. We use the smallest
13622 incoming stack boundary. */
13623 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
13624 && lookup_attribute (ix86_force_align_arg_pointer_string,
13625 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
13626 incoming_stack_boundary = MIN_STACK_BOUNDARY;
13628 /* The incoming stack frame has to be aligned at least at
13629 parm_stack_boundary. */
13630 if (incoming_stack_boundary < crtl->parm_stack_boundary)
13631 incoming_stack_boundary = crtl->parm_stack_boundary;
13633 /* The stack at the entrance of main is aligned by the runtime. We use
13634 the smallest incoming stack boundary. */
13635 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
13636 && DECL_NAME (current_function_decl)
13637 && MAIN_NAME_P (DECL_NAME (current_function_decl))
13638 && DECL_FILE_SCOPE_P (current_function_decl))
13639 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
13641 return incoming_stack_boundary;
13644 /* Update incoming stack boundary and estimated stack alignment. */
13646 static void
13647 ix86_update_stack_boundary (void)
13649 ix86_incoming_stack_boundary
13650 = ix86_minimum_incoming_stack_boundary (false);
13652 /* x86_64 varargs need 16-byte stack alignment for the register save
13653 area. */
13654 if (TARGET_64BIT
13655 && cfun->stdarg
13656 && crtl->stack_alignment_estimated < 128)
13657 crtl->stack_alignment_estimated = 128;
13659 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
13660 if (ix86_tls_descriptor_calls_expanded_in_cfun
13661 && crtl->preferred_stack_boundary < 128)
13662 crtl->preferred_stack_boundary = 128;
13665 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
13666 needed or an rtx for DRAP otherwise. */
13668 static rtx
13669 ix86_get_drap_rtx (void)
13671 /* We must use DRAP if there are outgoing arguments on stack and
13672 ACCUMULATE_OUTGOING_ARGS is false. */
13673 if (ix86_force_drap
13674 || (cfun->machine->outgoing_args_on_stack
13675 && !ACCUMULATE_OUTGOING_ARGS))
13676 crtl->need_drap = true;
13678 if (stack_realign_drap)
13680 /* Assign DRAP to vDRAP and return vDRAP. */
13681 unsigned int regno = find_drap_reg ();
13682 rtx drap_vreg;
13683 rtx arg_ptr;
13684 rtx_insn *seq, *insn;
13686 arg_ptr = gen_rtx_REG (Pmode, regno);
13687 crtl->drap_reg = arg_ptr;
13689 start_sequence ();
13690 drap_vreg = copy_to_reg (arg_ptr);
13691 seq = get_insns ();
13692 end_sequence ();
13694 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
13695 if (!optimize)
13697 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
13698 RTX_FRAME_RELATED_P (insn) = 1;
13700 return drap_vreg;
13702 else
13703 return NULL;
13706 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
13708 static rtx
13709 ix86_internal_arg_pointer (void)
13711 return virtual_incoming_args_rtx;
13714 struct scratch_reg {
13715 rtx reg;
13716 bool saved;
13719 /* Return a short-lived scratch register for use on function entry.
13720 In 32-bit mode, it is valid only after the registers are saved
13721 in the prologue. This register must be released by means of
13722 release_scratch_register_on_entry once it is dead. */
13724 static void
13725 get_scratch_register_on_entry (struct scratch_reg *sr)
13727 int regno;
13729 sr->saved = false;
13731 if (TARGET_64BIT)
13733 /* We always use R11 in 64-bit mode. */
13734 regno = R11_REG;
13736 else
13738 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
13739 bool fastcall_p
13740 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13741 bool thiscall_p
13742 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13743 bool static_chain_p = DECL_STATIC_CHAIN (decl);
13744 int regparm = ix86_function_regparm (fntype, decl);
13745 int drap_regno
13746 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
13748 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
13749 for the static chain register. */
13750 if ((regparm < 1 || (fastcall_p && !static_chain_p))
13751 && drap_regno != AX_REG)
13752 regno = AX_REG;
13753 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
13754 for the static chain register. */
13755 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
13756 regno = AX_REG;
13757 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
13758 regno = DX_REG;
13759 /* ecx is the static chain register. */
13760 else if (regparm < 3 && !fastcall_p && !thiscall_p
13761 && !static_chain_p
13762 && drap_regno != CX_REG)
13763 regno = CX_REG;
13764 else if (ix86_save_reg (BX_REG, true, false))
13765 regno = BX_REG;
13766 /* esi is the static chain register. */
13767 else if (!(regparm == 3 && static_chain_p)
13768 && ix86_save_reg (SI_REG, true, false))
13769 regno = SI_REG;
13770 else if (ix86_save_reg (DI_REG, true, false))
13771 regno = DI_REG;
13772 else
13774 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
13775 sr->saved = true;
13779 sr->reg = gen_rtx_REG (Pmode, regno);
13780 if (sr->saved)
13782 rtx_insn *insn = emit_insn (gen_push (sr->reg));
13783 RTX_FRAME_RELATED_P (insn) = 1;
13787 /* Release a scratch register obtained from the preceding function. */
13789 static void
13790 release_scratch_register_on_entry (struct scratch_reg *sr)
13792 if (sr->saved)
13794 struct machine_function *m = cfun->machine;
13795 rtx x, insn = emit_insn (gen_pop (sr->reg));
13797 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
13798 RTX_FRAME_RELATED_P (insn) = 1;
13799 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
13800 x = gen_rtx_SET (stack_pointer_rtx, x);
13801 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
13802 m->fs.sp_offset -= UNITS_PER_WORD;
13806 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
13808 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
13810 static void
13811 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
13813 /* We skip the probe for the first interval + a small dope of 4 words and
13814 probe that many bytes past the specified size to maintain a protection
13815 area at the bottom of the stack. */
13816 const int dope = 4 * UNITS_PER_WORD;
13817 rtx size_rtx = GEN_INT (size), last;
13819 /* See if we have a constant small number of probes to generate. If so,
13820 that's the easy case. The run-time loop is made up of 9 insns in the
13821 generic case while the compile-time loop is made up of 3+2*(n-1) insns
13822 for n # of intervals. */
13823 if (size <= 4 * PROBE_INTERVAL)
13825 HOST_WIDE_INT i, adjust;
13826 bool first_probe = true;
13828 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
13829 values of N from 1 until it exceeds SIZE. If only one probe is
13830 needed, this will not generate any code. Then adjust and probe
13831 to PROBE_INTERVAL + SIZE. */
13832 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
13834 if (first_probe)
13836 adjust = 2 * PROBE_INTERVAL + dope;
13837 first_probe = false;
13839 else
13840 adjust = PROBE_INTERVAL;
13842 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13843 plus_constant (Pmode, stack_pointer_rtx,
13844 -adjust)));
13845 emit_stack_probe (stack_pointer_rtx);
13848 if (first_probe)
13849 adjust = size + PROBE_INTERVAL + dope;
13850 else
13851 adjust = size + PROBE_INTERVAL - i;
13853 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13854 plus_constant (Pmode, stack_pointer_rtx,
13855 -adjust)));
13856 emit_stack_probe (stack_pointer_rtx);
13858 /* Adjust back to account for the additional first interval. */
13859 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13860 plus_constant (Pmode, stack_pointer_rtx,
13861 PROBE_INTERVAL + dope)));
13864 /* Otherwise, do the same as above, but in a loop. Note that we must be
13865 extra careful with variables wrapping around because we might be at
13866 the very top (or the very bottom) of the address space and we have
13867 to be able to handle this case properly; in particular, we use an
13868 equality test for the loop condition. */
13869 else
13871 HOST_WIDE_INT rounded_size;
13872 struct scratch_reg sr;
13874 get_scratch_register_on_entry (&sr);
13877 /* Step 1: round SIZE to the previous multiple of the interval. */
13879 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
13882 /* Step 2: compute initial and final value of the loop counter. */
13884 /* SP = SP_0 + PROBE_INTERVAL. */
13885 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13886 plus_constant (Pmode, stack_pointer_rtx,
13887 - (PROBE_INTERVAL + dope))));
13889 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
13890 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
13891 emit_insn (gen_rtx_SET (sr.reg,
13892 plus_constant (Pmode, stack_pointer_rtx,
13893 -rounded_size)));
13894 else
13896 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
13897 emit_insn (gen_rtx_SET (sr.reg,
13898 gen_rtx_PLUS (Pmode, sr.reg,
13899 stack_pointer_rtx)));
13903 /* Step 3: the loop
13907 SP = SP + PROBE_INTERVAL
13908 probe at SP
13910 while (SP != LAST_ADDR)
13912 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
13913 values of N from 1 until it is equal to ROUNDED_SIZE. */
13915 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
13918 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
13919 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
13921 if (size != rounded_size)
13923 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13924 plus_constant (Pmode, stack_pointer_rtx,
13925 rounded_size - size)));
13926 emit_stack_probe (stack_pointer_rtx);
13929 /* Adjust back to account for the additional first interval. */
13930 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13931 plus_constant (Pmode, stack_pointer_rtx,
13932 PROBE_INTERVAL + dope)));
13934 release_scratch_register_on_entry (&sr);
13937 /* Even if the stack pointer isn't the CFA register, we need to correctly
13938 describe the adjustments made to it, in particular differentiate the
13939 frame-related ones from the frame-unrelated ones. */
13940 if (size > 0)
13942 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
13943 XVECEXP (expr, 0, 0)
13944 = gen_rtx_SET (stack_pointer_rtx,
13945 plus_constant (Pmode, stack_pointer_rtx, -size));
13946 XVECEXP (expr, 0, 1)
13947 = gen_rtx_SET (stack_pointer_rtx,
13948 plus_constant (Pmode, stack_pointer_rtx,
13949 PROBE_INTERVAL + dope + size));
13950 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
13951 RTX_FRAME_RELATED_P (last) = 1;
13953 cfun->machine->fs.sp_offset += size;
13956 /* Make sure nothing is scheduled before we are done. */
13957 emit_insn (gen_blockage ());
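/* Worked example of the small constant-size path above, assuming
   PROBE_INTERVAL is 4096 bytes (STACK_CHECK_PROBE_INTERVAL_EXP == 12),
   64-bit dope of 4 * 8 = 32 bytes, and SIZE = 10000:

       sub $8224, %rsp ; probe     first interval: 2*4096 + 32
       sub $4096, %rsp ; probe
       sub $1808, %rsp ; probe     10000 + 4096 - 12288
       add $4128, %rsp             give back PROBE_INTERVAL + dope

   for a net adjustment of exactly -10000 bytes.  */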
13960 /* Adjust the stack pointer up to REG while probing it. */
13962 const char *
13963 output_adjust_stack_and_probe (rtx reg)
13965 static int labelno = 0;
13966 char loop_lab[32];
13967 rtx xops[2];
13969 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13971 /* Loop. */
13972 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13974 /* SP = SP + PROBE_INTERVAL. */
13975 xops[0] = stack_pointer_rtx;
13976 xops[1] = GEN_INT (PROBE_INTERVAL);
13977 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13979 /* Probe at SP. */
13980 xops[1] = const0_rtx;
13981 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
13983 /* Test if SP == LAST_ADDR. */
13984 xops[0] = stack_pointer_rtx;
13985 xops[1] = reg;
13986 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13988 /* Branch. */
13989 fputs ("\tjne\t", asm_out_file);
13990 assemble_name_raw (asm_out_file, loop_lab);
13991 fputc ('\n', asm_out_file);
13993 return "";
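/* The sequence printed above is, roughly, in 64-bit AT&T syntax:

       .LPSRL0:
           subq  $PROBE_INTERVAL, %rsp
           orq   $0, (%rsp)         # touch the newly exposed page
           cmpq  %reg, %rsp         # REG holds LAST_ADDR
           jne   .LPSRL0

   i.e. keep bumping and probing the stack pointer until it reaches the
   target address computed by the caller.  */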
13996 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
13997 inclusive. These are offsets from the current stack pointer. */
13999 static void
14000 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
14002 /* See if we have a constant small number of probes to generate. If so,
14003 that's the easy case. The run-time loop is made up of 6 insns in the
14004 generic case while the compile-time loop is made up of n insns for n #
14005 of intervals. */
14006 if (size <= 6 * PROBE_INTERVAL)
14008 HOST_WIDE_INT i;
14010 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
14011 it exceeds SIZE. If only one probe is needed, this will not
14012 generate any code. Then probe at FIRST + SIZE. */
14013 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
14014 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
14015 -(first + i)));
14017 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
14018 -(first + size)));
14021 /* Otherwise, do the same as above, but in a loop. Note that we must be
14022 extra careful with variables wrapping around because we might be at
14023 the very top (or the very bottom) of the address space and we have
14024 to be able to handle this case properly; in particular, we use an
14025 equality test for the loop condition. */
14026 else
14028 HOST_WIDE_INT rounded_size, last;
14029 struct scratch_reg sr;
14031 get_scratch_register_on_entry (&sr);
14034 /* Step 1: round SIZE to the previous multiple of the interval. */
14036 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
14039 /* Step 2: compute initial and final value of the loop counter. */
14041 /* TEST_OFFSET = FIRST. */
14042 emit_move_insn (sr.reg, GEN_INT (-first));
14044 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
14045 last = first + rounded_size;
14048 /* Step 3: the loop
14052 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
14053 probe at TEST_ADDR
14055 while (TEST_ADDR != LAST_ADDR)
14057 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
14058 until it is equal to ROUNDED_SIZE. */
14060 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
14063 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
14064 that SIZE is equal to ROUNDED_SIZE. */
14066 if (size != rounded_size)
14067 emit_stack_probe (plus_constant (Pmode,
14068 gen_rtx_PLUS (Pmode,
14069 stack_pointer_rtx,
14070 sr.reg),
14071 rounded_size - size));
14073 release_scratch_register_on_entry (&sr);
14076 /* Make sure nothing is scheduled before we are done. */
14077 emit_insn (gen_blockage ());
14080 /* Probe a range of stack addresses from REG to END, inclusive. These are
14081 offsets from the current stack pointer. */
14083 const char *
14084 output_probe_stack_range (rtx reg, rtx end)
14086 static int labelno = 0;
14087 char loop_lab[32];
14088 rtx xops[3];
14090 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
14092 /* Loop. */
14093 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
14095 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
14096 xops[0] = reg;
14097 xops[1] = GEN_INT (PROBE_INTERVAL);
14098 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
14100 /* Probe at TEST_ADDR. */
14101 xops[0] = stack_pointer_rtx;
14102 xops[1] = reg;
14103 xops[2] = const0_rtx;
14104 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
14106 /* Test if TEST_ADDR == LAST_ADDR. */
14107 xops[0] = reg;
14108 xops[1] = end;
14109 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
14111 /* Branch. */
14112 fputs ("\tjne\t", asm_out_file);
14113 assemble_name_raw (asm_out_file, loop_lab);
14114 fputc ('\n', asm_out_file);
14116 return "";
14119 /* Finalize the stack_realign_needed flag, which guides the prologue and
14120 epilogue to be generated in the correct form. */
14121 static void
14122 ix86_finalize_stack_realign_flags (void)
14124 /* Check if stack realignment is really needed after reload, and
14125 store the result in cfun. */
14126 unsigned int incoming_stack_boundary
14127 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
14128 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
14129 unsigned int stack_realign
14130 = (incoming_stack_boundary
14131 < (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
14132 ? crtl->max_used_stack_slot_alignment
14133 : crtl->stack_alignment_needed));
14134 bool recompute_frame_layout_p = false;
14136 if (crtl->stack_realign_finalized)
14138 /* After stack_realign_needed is finalized, we can no longer
14139 change it. */
14140 gcc_assert (crtl->stack_realign_needed == stack_realign);
14141 return;
14144 /* If the only reason for frame_pointer_needed is that we conservatively
14145 assumed stack realignment might be needed, but in the end nothing that
14146 needed the stack alignment had been spilled, clear frame_pointer_needed
14147 and say we don't need stack realignment. */
14148 if (stack_realign
14149 && frame_pointer_needed
14150 && crtl->is_leaf
14151 && flag_omit_frame_pointer
14152 && crtl->sp_is_unchanging
14153 && !ix86_current_function_calls_tls_descriptor
14154 && !crtl->accesses_prior_frames
14155 && !cfun->calls_alloca
14156 && !crtl->calls_eh_return
14157 /* See ira_setup_eliminable_regset for the rationale. */
14158 && !(STACK_CHECK_MOVING_SP
14159 && flag_stack_check
14160 && flag_exceptions
14161 && cfun->can_throw_non_call_exceptions)
14162 && !ix86_frame_pointer_required ()
14163 && get_frame_size () == 0
14164 && ix86_nsaved_sseregs () == 0
14165 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
14167 HARD_REG_SET set_up_by_prologue, prologue_used;
14168 basic_block bb;
14170 CLEAR_HARD_REG_SET (prologue_used);
14171 CLEAR_HARD_REG_SET (set_up_by_prologue);
14172 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
14173 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
14174 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
14175 HARD_FRAME_POINTER_REGNUM);
14176 FOR_EACH_BB_FN (bb, cfun)
14178 rtx_insn *insn;
14179 FOR_BB_INSNS (bb, insn)
14180 if (NONDEBUG_INSN_P (insn)
14181 && requires_stack_frame_p (insn, prologue_used,
14182 set_up_by_prologue))
14184 if (crtl->stack_realign_needed != stack_realign)
14185 recompute_frame_layout_p = true;
14186 crtl->stack_realign_needed = stack_realign;
14187 crtl->stack_realign_finalized = true;
14188 if (recompute_frame_layout_p)
14189 ix86_compute_frame_layout ();
14190 return;
14194 /* If drap has been set, but it actually isn't live at the start
14195 of the function, there is no reason to set it up. */
14196 if (crtl->drap_reg)
14198 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
14199 if (! REGNO_REG_SET_P (DF_LR_IN (bb), REGNO (crtl->drap_reg)))
14201 crtl->drap_reg = NULL_RTX;
14202 crtl->need_drap = false;
14205 else
14206 cfun->machine->no_drap_save_restore = true;
14208 frame_pointer_needed = false;
14209 stack_realign = false;
14210 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
14211 crtl->stack_alignment_needed = incoming_stack_boundary;
14212 crtl->stack_alignment_estimated = incoming_stack_boundary;
14213 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
14214 crtl->preferred_stack_boundary = incoming_stack_boundary;
14215 df_finish_pass (true);
14216 df_scan_alloc (NULL);
14217 df_scan_blocks ();
14218 df_compute_regs_ever_live (true);
14219 df_analyze ();
14220 recompute_frame_layout_p = true;
14223 if (crtl->stack_realign_needed != stack_realign)
14224 recompute_frame_layout_p = true;
14225 crtl->stack_realign_needed = stack_realign;
14226 crtl->stack_realign_finalized = true;
14227 if (recompute_frame_layout_p)
14228 ix86_compute_frame_layout ();
14231 /* Delete SET_GOT right after entry block if it is allocated to reg. */
14233 static void
14234 ix86_elim_entry_set_got (rtx reg)
14236 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
14237 rtx_insn *c_insn = BB_HEAD (bb);
14238 if (!NONDEBUG_INSN_P (c_insn))
14239 c_insn = next_nonnote_nondebug_insn (c_insn);
14240 if (c_insn && NONJUMP_INSN_P (c_insn))
14242 rtx pat = PATTERN (c_insn);
14243 if (GET_CODE (pat) == PARALLEL)
14245 rtx vec = XVECEXP (pat, 0, 0);
14246 if (GET_CODE (vec) == SET
14247 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
14248 && REGNO (XEXP (vec, 0)) == REGNO (reg))
14249 delete_insn (c_insn);
14254 static rtx
14255 gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store)
14257 rtx addr, mem;
14259 if (offset)
14260 addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset));
14261 mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg);
14262 return gen_rtx_SET (store ? mem : reg, store ? reg : mem);
14265 static inline rtx
14266 gen_frame_load (rtx reg, rtx frame_reg, int offset)
14268 return gen_frame_set (reg, frame_reg, offset, false);
14271 static inline rtx
14272 gen_frame_store (rtx reg, rtx frame_reg, int offset)
14274 return gen_frame_set (reg, frame_reg, offset, true);
14277 static void
14278 ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame)
14280 struct machine_function *m = cfun->machine;
14281 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
14282 + m->call_ms2sysv_extra_regs;
14283 rtvec v = rtvec_alloc (ncregs + 1);
14284 unsigned int align, i, vi = 0;
14285 rtx_insn *insn;
14286 rtx sym, addr;
14287 rtx rax = gen_rtx_REG (word_mode, AX_REG);
14288 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
14289 HOST_WIDE_INT rax_offset = xlogue.get_stub_ptr_offset () + m->fs.sp_offset;
14290 HOST_WIDE_INT stack_alloc_size = frame.stack_pointer_offset - m->fs.sp_offset;
14291 HOST_WIDE_INT stack_align_off_in = xlogue.get_stack_align_off_in ();
14293 /* Verify that the incoming stack 16-byte alignment offset matches the
14294 layout we're using. */
14295 gcc_assert (stack_align_off_in == (m->fs.sp_offset & UNITS_PER_WORD));
14297 /* Get the stub symbol. */
14298 sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP
14299 : XLOGUE_STUB_SAVE);
14300 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
14302 /* Setup RAX as the stub's base pointer. */
14303 align = GET_MODE_ALIGNMENT (V4SFmode);
14304 addr = choose_baseaddr (rax_offset, &align);
14305 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
14306 insn = emit_insn (gen_rtx_SET (rax, addr));
14308 gcc_assert (stack_alloc_size >= xlogue.get_stack_space_used ());
14309 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14310 GEN_INT (-stack_alloc_size), -1,
14311 m->fs.cfa_reg == stack_pointer_rtx);
14312 for (i = 0; i < ncregs; ++i)
14314 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
14315 rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode),
14316 r.regno);
14317 RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset);
14320 gcc_assert (vi == (unsigned)GET_NUM_ELEM (v));
14322 insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v));
14323 RTX_FRAME_RELATED_P (insn) = true;
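/* Sketch of the insn emitted above (added; not from the original source):
   the PARALLEL has the rough shape
       (parallel [(use (symbol_ref <save stub>))
                  (set (mem ...) (reg ...))   ;; one store per clobbered reg
                  ...])
   so the single frame-related insn records the stub symbol (via the USE)
   together with every register save the stub performs, letting the
   unwinder see those saves.  */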
14326 /* Expand the prologue into a bunch of separate insns. */
14328 void
14329 ix86_expand_prologue (void)
14331 struct machine_function *m = cfun->machine;
14332 rtx insn, t;
14333 struct ix86_frame frame;
14334 HOST_WIDE_INT allocate;
14335 bool int_registers_saved;
14336 bool sse_registers_saved;
14337 rtx static_chain = NULL_RTX;
14339 if (ix86_function_naked (current_function_decl))
14340 return;
14342 ix86_finalize_stack_realign_flags ();
14344 /* DRAP should not coexist with stack_realign_fp */
14345 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
14347 memset (&m->fs, 0, sizeof (m->fs));
14349 /* Initialize CFA state for before the prologue. */
14350 m->fs.cfa_reg = stack_pointer_rtx;
14351 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
14353 /* Track SP offset to the CFA. We continue tracking this after we've
14354 swapped the CFA register away from SP. In the case of re-alignment
14355 this is fudged; we're interested in offsets within the local frame. */
14356 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
14357 m->fs.sp_valid = true;
14358 m->fs.sp_realigned = false;
14360 frame = m->frame;
14362 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
14364 /* We should have already generated an error for any use of
14365 ms_hook on a nested function. */
14366 gcc_checking_assert (!ix86_static_chain_on_stack);
14368 /* Check whether profiling is active and we should use the
14369 profile-before-prologue variant.  If so, sorry. */
14370 if (crtl->profile && flag_fentry != 0)
14371 sorry ("ms_hook_prologue attribute isn%'t compatible "
14372 "with -mfentry for 32-bit");
14374 /* In ix86_asm_output_function_label we emitted:
14375 8b ff movl.s %edi,%edi
14376 55 push %ebp
14377 8b ec movl.s %esp,%ebp
14379 This matches the hookable function prologue in Win32 API
14380 functions in Microsoft Windows XP Service Pack 2 and newer.
14381 Wine uses this to enable Windows apps to hook the Win32 API
14382 functions provided by Wine.
14384 What that means is that we've already set up the frame pointer. */
14386 if (frame_pointer_needed
14387 && !(crtl->drap_reg && crtl->stack_realign_needed))
14389 rtx push, mov;
14391 /* We've decided to use the frame pointer already set up.
14392 Describe this to the unwinder by pretending that both
14393 push and mov insns happen right here.
14395 Putting the unwind info here at the end of the ms_hook
14396 is done so that we can make absolutely certain we get
14397 the required byte sequence at the start of the function,
14398 rather than relying on an assembler that can produce
14399 the exact encoding required.
14401 However it does mean (in the unpatched case) that we have
14402 a 1 insn window where the asynchronous unwind info is
14403 incorrect. However, if we placed the unwind info at
14404 its correct location we would have incorrect unwind info
14405 in the patched case. Which is probably all moot since
14406 I don't expect Wine generates dwarf2 unwind info for the
14407 system libraries that use this feature. */
14409 insn = emit_insn (gen_blockage ());
14411 push = gen_push (hard_frame_pointer_rtx);
14412 mov = gen_rtx_SET (hard_frame_pointer_rtx,
14413 stack_pointer_rtx);
14414 RTX_FRAME_RELATED_P (push) = 1;
14415 RTX_FRAME_RELATED_P (mov) = 1;
14417 RTX_FRAME_RELATED_P (insn) = 1;
14418 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14419 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
14421 /* Note that gen_push incremented m->fs.cfa_offset, even
14422 though we didn't emit the push insn here. */
14423 m->fs.cfa_reg = hard_frame_pointer_rtx;
14424 m->fs.fp_offset = m->fs.cfa_offset;
14425 m->fs.fp_valid = true;
14427 else
14429 /* The frame pointer is not needed so pop %ebp again.
14430 This leaves us with a pristine state. */
14431 emit_insn (gen_pop (hard_frame_pointer_rtx));
14435 /* The first insn of a function that accepts its static chain on the
14436 stack is to push the register that would be filled in by a direct
14437 call. This insn will be skipped by the trampoline. */
14438 else if (ix86_static_chain_on_stack)
14440 static_chain = ix86_static_chain (cfun->decl, false);
14441 insn = emit_insn (gen_push (static_chain));
14442 emit_insn (gen_blockage ());
14444 /* We don't want to interpret this push insn as a register save,
14445 only as a stack adjustment. The real copy of the register as
14446 a save will be done later, if needed. */
14447 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
14448 t = gen_rtx_SET (stack_pointer_rtx, t);
14449 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
14450 RTX_FRAME_RELATED_P (insn) = 1;
14453 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
14454 DRAP is needed and stack realignment is really needed after reload. */
14455 if (stack_realign_drap)
14457 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
14459 /* Can't use DRAP in interrupt function. */
14460 if (cfun->machine->func_type != TYPE_NORMAL)
14461 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
14462 "in interrupt service routine. This may be worked "
14463 "around by avoiding functions with aggregate return.");
14465 /* Only need to push parameter pointer reg if it is caller saved. */
14466 if (!call_used_regs[REGNO (crtl->drap_reg)])
14468 /* Push arg pointer reg */
14469 insn = emit_insn (gen_push (crtl->drap_reg));
14470 RTX_FRAME_RELATED_P (insn) = 1;
14473 /* Grab the argument pointer. */
14474 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
14475 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14476 RTX_FRAME_RELATED_P (insn) = 1;
14477 m->fs.cfa_reg = crtl->drap_reg;
14478 m->fs.cfa_offset = 0;
14480 /* Align the stack. */
14481 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
14482 stack_pointer_rtx,
14483 GEN_INT (-align_bytes)));
14484 RTX_FRAME_RELATED_P (insn) = 1;
14486 /* Replicate the return address on the stack so that the return
14487 address can be reached via the (argp - 1) slot.  This is needed
14488 to implement macro RETURN_ADDR_RTX and intrinsic function
14489 expand_builtin_return_addr etc. */
14490 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
14491 t = gen_frame_mem (word_mode, t);
14492 insn = emit_insn (gen_push (t));
14493 RTX_FRAME_RELATED_P (insn) = 1;
14495 /* For the purposes of frame and register save area addressing,
14496 we've started over with a new frame. */
14497 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
14498 m->fs.realigned = true;
14500 if (static_chain)
14502 /* Replicate the static chain on the stack so that the static chain
14503 can be reached via the (argp - 2) slot.  This is needed for
14504 nested functions with stack realignment. */
14505 insn = emit_insn (gen_push (static_chain));
14506 RTX_FRAME_RELATED_P (insn) = 1;
14510 int_registers_saved = (frame.nregs == 0);
14511 sse_registers_saved = (frame.nsseregs == 0);
14513 if (frame_pointer_needed && !m->fs.fp_valid)
14515 /* Note: AT&T enter does NOT have reversed args. Enter is probably
14516 slower on all targets. Also sdb doesn't like it. */
14517 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
14518 RTX_FRAME_RELATED_P (insn) = 1;
14520 /* Push registers now, before setting the frame pointer
14521 on SEH target. */
14522 if (!int_registers_saved
14523 && TARGET_SEH
14524 && !frame.save_regs_using_mov)
14526 ix86_emit_save_regs ();
14527 int_registers_saved = true;
14528 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
14531 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
14533 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
14534 RTX_FRAME_RELATED_P (insn) = 1;
14536 if (m->fs.cfa_reg == stack_pointer_rtx)
14537 m->fs.cfa_reg = hard_frame_pointer_rtx;
14538 m->fs.fp_offset = m->fs.sp_offset;
14539 m->fs.fp_valid = true;
14543 if (!int_registers_saved)
14545 /* If saving registers via PUSH, do so now. */
14546 if (!frame.save_regs_using_mov)
14548 ix86_emit_save_regs ();
14549 int_registers_saved = true;
14550 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
14553 /* When using the red zone we may start register saving before allocating
14554 the stack frame, saving one cycle of the prologue.  However, avoid
14555 doing this if we have to probe the stack; at least on x86_64 the
14556 stack probe can turn into a call that clobbers a red zone location. */
14557 else if (ix86_using_red_zone ()
14558 && (! TARGET_STACK_PROBE
14559 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
14561 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
14562 int_registers_saved = true;
14566 if (stack_realign_fp)
14568 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
14569 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
14571 /* The computation of the size of the re-aligned stack frame means
14572 that we must allocate the size of the register save area before
14573 performing the actual alignment. Otherwise we cannot guarantee
14574 that there's enough storage above the realignment point. */
14575 allocate = frame.stack_realign_allocate_offset - m->fs.sp_offset;
14576 if (allocate && !m->call_ms2sysv)
14577 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14578 GEN_INT (-allocate), -1, false);
14580 /* Align the stack. */
14581 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
14582 stack_pointer_rtx,
14583 GEN_INT (-align_bytes)));
14584 /* For the purposes of register save area addressing, the stack
14585 pointer can no longer be used to access anything in the frame
14586 below m->fs.sp_realigned_offset and the frame pointer cannot be
14587 used for anything at or above. */
14588 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
14589 m->fs.sp_realigned = true;
14590 m->fs.sp_realigned_offset = m->fs.sp_offset - frame.nsseregs * 16;
14591 gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset);
14592 /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which
14593 is needed to describe where a register is saved using a realigned
14594 stack pointer, so we need to invalidate the stack pointer for that
14595 target. */
14596 if (TARGET_SEH)
14597 m->fs.sp_valid = false;
14600 if (m->call_ms2sysv)
14601 ix86_emit_outlined_ms2sysv_save (frame);
14603 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
14605 if (flag_stack_usage_info)
14607 /* We start to count from ARG_POINTER. */
14608 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
14610 /* If it was realigned, take into account the fake frame. */
14611 if (stack_realign_drap)
14613 if (ix86_static_chain_on_stack)
14614 stack_size += UNITS_PER_WORD;
14616 if (!call_used_regs[REGNO (crtl->drap_reg)])
14617 stack_size += UNITS_PER_WORD;
14619 /* This over-estimates by 1 minimal-stack-alignment-unit but
14620 mitigates that by counting in the new return address slot. */
14621 current_function_dynamic_stack_size
14622 += crtl->stack_alignment_needed / BITS_PER_UNIT;
14625 current_function_static_stack_size = stack_size;
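/* Worked example (added; hypothetical numbers, not from the original
   source): for a 64-byte frame that is realigned via DRAP, with the
   static chain pushed on the stack and a call-saved DRAP register, the
   static size reported above would be 64 + 8 + 8 bytes on x86-64,
   while the alignment slack (stack_alignment_needed / BITS_PER_UNIT)
   is added to the dynamic stack size instead.  */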
14628 /* On SEH target with very large frame size, allocate an area to save
14629 SSE registers (as the very large allocation won't be described). */
14630 if (TARGET_SEH
14631 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
14632 && !sse_registers_saved)
14634 HOST_WIDE_INT sse_size =
14635 frame.sse_reg_save_offset - frame.reg_save_offset;
14637 gcc_assert (int_registers_saved);
14639 /* No need to do stack checking as the area will be immediately
14640 written. */
14641 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14642 GEN_INT (-sse_size), -1,
14643 m->fs.cfa_reg == stack_pointer_rtx);
14644 allocate -= sse_size;
14645 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
14646 sse_registers_saved = true;
14649 /* The stack has already been decremented by the instruction calling us
14650 so probe if the size is non-negative to preserve the protection area. */
14651 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
14653 /* We expect the registers to be saved when probes are used. */
14654 gcc_assert (int_registers_saved);
14656 if (STACK_CHECK_MOVING_SP)
14658 if (!(crtl->is_leaf && !cfun->calls_alloca
14659 && allocate <= PROBE_INTERVAL))
14661 ix86_adjust_stack_and_probe (allocate);
14662 allocate = 0;
14665 else
14667 HOST_WIDE_INT size = allocate;
14669 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
14670 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
14672 if (TARGET_STACK_PROBE)
14674 if (crtl->is_leaf && !cfun->calls_alloca)
14676 if (size > PROBE_INTERVAL)
14677 ix86_emit_probe_stack_range (0, size);
14679 else
14680 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
14682 else
14684 if (crtl->is_leaf && !cfun->calls_alloca)
14686 if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT)
14687 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT,
14688 size - STACK_CHECK_PROTECT);
14690 else
14691 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
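/* Worked example (added; not from the original source, and assuming
   STACK_CHECK_PROTECT is 8 KiB): for a 64 KiB allocation in a non-leaf
   function on a target without TARGET_STACK_PROBE, the call just above
   probes the 64 KiB that are about to be allocated, starting 8 KiB
   below the current stack pointer, so the guard area itself is never
   touched.  */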
14696 if (allocate == 0)
14698 else if (!ix86_target_stack_probe ()
14699 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
14701 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14702 GEN_INT (-allocate), -1,
14703 m->fs.cfa_reg == stack_pointer_rtx);
14705 else
14707 rtx eax = gen_rtx_REG (Pmode, AX_REG);
14708 rtx r10 = NULL;
14709 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
14710 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
14711 bool eax_live = ix86_eax_live_at_start_p ();
14712 bool r10_live = false;
14714 if (TARGET_64BIT)
14715 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
14717 if (eax_live)
14719 insn = emit_insn (gen_push (eax));
14720 allocate -= UNITS_PER_WORD;
14721 /* Note that SEH directives need to continue tracking the stack
14722 pointer even after the frame pointer has been set up. */
14723 if (sp_is_cfa_reg || TARGET_SEH)
14725 if (sp_is_cfa_reg)
14726 m->fs.cfa_offset += UNITS_PER_WORD;
14727 RTX_FRAME_RELATED_P (insn) = 1;
14728 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14729 gen_rtx_SET (stack_pointer_rtx,
14730 plus_constant (Pmode, stack_pointer_rtx,
14731 -UNITS_PER_WORD)));
14735 if (r10_live)
14737 r10 = gen_rtx_REG (Pmode, R10_REG);
14738 insn = emit_insn (gen_push (r10));
14739 allocate -= UNITS_PER_WORD;
14740 if (sp_is_cfa_reg || TARGET_SEH)
14742 if (sp_is_cfa_reg)
14743 m->fs.cfa_offset += UNITS_PER_WORD;
14744 RTX_FRAME_RELATED_P (insn) = 1;
14745 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14746 gen_rtx_SET (stack_pointer_rtx,
14747 plus_constant (Pmode, stack_pointer_rtx,
14748 -UNITS_PER_WORD)));
14752 emit_move_insn (eax, GEN_INT (allocate));
14753 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
14755 /* Use the fact that AX still contains ALLOCATE. */
14756 adjust_stack_insn = (Pmode == DImode
14757 ? gen_pro_epilogue_adjust_stack_di_sub
14758 : gen_pro_epilogue_adjust_stack_si_sub);
14760 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
14761 stack_pointer_rtx, eax));
14763 if (sp_is_cfa_reg || TARGET_SEH)
14765 if (sp_is_cfa_reg)
14766 m->fs.cfa_offset += allocate;
14767 RTX_FRAME_RELATED_P (insn) = 1;
14768 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14769 gen_rtx_SET (stack_pointer_rtx,
14770 plus_constant (Pmode, stack_pointer_rtx,
14771 -allocate)));
14773 m->fs.sp_offset += allocate;
14775 /* Use stack_pointer_rtx for relative addressing so that code
14776 works for realigned stack, too. */
14777 if (r10_live && eax_live)
14779 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14780 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
14781 gen_frame_mem (word_mode, t));
14782 t = plus_constant (Pmode, t, UNITS_PER_WORD);
14783 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
14784 gen_frame_mem (word_mode, t));
14786 else if (eax_live || r10_live)
14788 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14789 emit_move_insn (gen_rtx_REG (word_mode,
14790 (eax_live ? AX_REG : R10_REG)),
14791 gen_frame_mem (word_mode, t));
14794 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
14796 /* If we haven't already set up the frame pointer, do so now. */
14797 if (frame_pointer_needed && !m->fs.fp_valid)
14799 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
14800 GEN_INT (frame.stack_pointer_offset
14801 - frame.hard_frame_pointer_offset));
14802 insn = emit_insn (insn);
14803 RTX_FRAME_RELATED_P (insn) = 1;
14804 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
14806 if (m->fs.cfa_reg == stack_pointer_rtx)
14807 m->fs.cfa_reg = hard_frame_pointer_rtx;
14808 m->fs.fp_offset = frame.hard_frame_pointer_offset;
14809 m->fs.fp_valid = true;
14812 if (!int_registers_saved)
14813 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
14814 if (!sse_registers_saved)
14815 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
14817 /* For mcount profiling in 32-bit PIC mode we need to emit SET_GOT
14818 in the prologue. */
14819 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
14821 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
14822 insn = emit_insn (gen_set_got (pic));
14823 RTX_FRAME_RELATED_P (insn) = 1;
14824 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
14825 emit_insn (gen_prologue_use (pic));
14826 /* Delete an already emitted SET_GOT if it exists and is allocated to
14827 REAL_PIC_OFFSET_TABLE_REGNUM. */
14828 ix86_elim_entry_set_got (pic);
14831 if (crtl->drap_reg && !crtl->stack_realign_needed)
14833 /* vDRAP is set up, but after reload it turns out stack realignment
14834 isn't necessary; here we emit prologue code to set up DRAP
14835 without the stack realignment adjustment. */
14836 t = choose_baseaddr (0, NULL);
14837 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14840 /* Prevent instructions from being scheduled into register save push
14841 sequence when access to the redzone area is done through frame pointer.
14842 The offset between the frame pointer and the stack pointer is calculated
14843 relative to the value of the stack pointer at the end of the function
14844 prologue, and moving instructions that access redzone area via frame
14845 pointer inside push sequence violates this assumption. */
14846 if (frame_pointer_needed && frame.red_zone_size)
14847 emit_insn (gen_memory_blockage ());
14849 /* SEH requires that the prologue end within 256 bytes of the start of
14850 the function. Prevent instruction schedules that would extend that.
14851 Further, prevent alloca modifications to the stack pointer from being
14852 combined with prologue modifications. */
14853 if (TARGET_SEH)
14854 emit_insn (gen_prologue_use (stack_pointer_rtx));
14857 /* Emit code to restore REG using a POP insn. */
14859 static void
14860 ix86_emit_restore_reg_using_pop (rtx reg)
14862 struct machine_function *m = cfun->machine;
14863 rtx_insn *insn = emit_insn (gen_pop (reg));
14865 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
14866 m->fs.sp_offset -= UNITS_PER_WORD;
14868 if (m->fs.cfa_reg == crtl->drap_reg
14869 && REGNO (reg) == REGNO (crtl->drap_reg))
14871 /* Previously we'd represented the CFA as an expression
14872 like *(%ebp - 8). We've just popped that value from
14873 the stack, which means we need to reset the CFA to
14874 the drap register. This will remain until we restore
14875 the stack pointer. */
14876 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14877 RTX_FRAME_RELATED_P (insn) = 1;
14879 /* This means that the DRAP register is valid for addressing too. */
14880 m->fs.drap_valid = true;
14881 return;
14884 if (m->fs.cfa_reg == stack_pointer_rtx)
14886 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14887 x = gen_rtx_SET (stack_pointer_rtx, x);
14888 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14889 RTX_FRAME_RELATED_P (insn) = 1;
14891 m->fs.cfa_offset -= UNITS_PER_WORD;
14894 /* When the frame pointer is the CFA, and we pop it, we are
14895 swapping back to the stack pointer as the CFA. This happens
14896 for stack frames that don't allocate other data, so we assume
14897 the stack pointer is now pointing at the return address, i.e.
14898 the function entry state, which makes the offset be 1 word. */
14899 if (reg == hard_frame_pointer_rtx)
14901 m->fs.fp_valid = false;
14902 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14904 m->fs.cfa_reg = stack_pointer_rtx;
14905 m->fs.cfa_offset -= UNITS_PER_WORD;
14907 add_reg_note (insn, REG_CFA_DEF_CFA,
14908 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14909 GEN_INT (m->fs.cfa_offset)));
14910 RTX_FRAME_RELATED_P (insn) = 1;
14915 /* Emit code to restore saved registers using POP insns. */
14917 static void
14918 ix86_emit_restore_regs_using_pop (void)
14920 unsigned int regno;
14922 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14923 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
14924 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
14927 /* Emit code and notes for the LEAVE instruction.  If INSN is non-null,
14928 the emit is omitted and only the notes are attached. */
14930 static void
14931 ix86_emit_leave (rtx_insn *insn)
14933 struct machine_function *m = cfun->machine;
14934 if (!insn)
14935 insn = emit_insn (ix86_gen_leave ());
14937 ix86_add_queued_cfa_restore_notes (insn);
14939 gcc_assert (m->fs.fp_valid);
14940 m->fs.sp_valid = true;
14941 m->fs.sp_realigned = false;
14942 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
14943 m->fs.fp_valid = false;
14945 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14947 m->fs.cfa_reg = stack_pointer_rtx;
14948 m->fs.cfa_offset = m->fs.sp_offset;
14950 add_reg_note (insn, REG_CFA_DEF_CFA,
14951 plus_constant (Pmode, stack_pointer_rtx,
14952 m->fs.sp_offset));
14953 RTX_FRAME_RELATED_P (insn) = 1;
14955 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
14956 m->fs.fp_offset);
14959 /* Emit code to restore saved registers using MOV insns.
14960 First register is restored from CFA - CFA_OFFSET. */
14961 static void
14962 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
14963 bool maybe_eh_return)
14965 struct machine_function *m = cfun->machine;
14966 unsigned int regno;
14968 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14969 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
14971 rtx reg = gen_rtx_REG (word_mode, regno);
14972 rtx mem;
14973 rtx_insn *insn;
14975 mem = choose_baseaddr (cfa_offset, NULL);
14976 mem = gen_frame_mem (word_mode, mem);
14977 insn = emit_move_insn (reg, mem);
14979 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
14981 /* Previously we'd represented the CFA as an expression
14982 like *(%ebp - 8).  We've just restored that value from
14983 the stack, which means we need to reset the CFA to
14984 the drap register. This will remain until we restore
14985 the stack pointer. */
14986 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14987 RTX_FRAME_RELATED_P (insn) = 1;
14989 /* This means that the DRAP register is valid for addressing. */
14990 m->fs.drap_valid = true;
14992 else
14993 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14995 cfa_offset -= UNITS_PER_WORD;
14999 /* Emit code to restore saved SSE registers using MOV insns.
15000 First register is restored from CFA - CFA_OFFSET. */
15001 static void
15002 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
15003 bool maybe_eh_return)
15005 unsigned int regno;
15007 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
15008 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
15010 rtx reg = gen_rtx_REG (V4SFmode, regno);
15011 rtx mem;
15012 unsigned int align = GET_MODE_ALIGNMENT (V4SFmode);
15014 mem = choose_baseaddr (cfa_offset, &align);
15015 mem = gen_rtx_MEM (V4SFmode, mem);
15017 /* The location alignment depends upon the base register. */
15018 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align);
15019 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
15020 set_mem_align (mem, align);
15021 emit_insn (gen_rtx_SET (reg, mem));
15023 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
15025 cfa_offset -= GET_MODE_SIZE (V4SFmode);
15029 static void
15030 ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame,
15031 bool use_call, int style)
15033 struct machine_function *m = cfun->machine;
15034 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
15035 + m->call_ms2sysv_extra_regs;
15036 rtvec v;
15037 unsigned int elems_needed, align, i, vi = 0;
15038 rtx_insn *insn;
15039 rtx sym, tmp;
15040 rtx rsi = gen_rtx_REG (word_mode, SI_REG);
15041 rtx r10 = NULL_RTX;
15042 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
15043 HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset ();
15044 HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset;
15045 rtx rsi_frame_load = NULL_RTX;
15046 HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1;
15047 enum xlogue_stub stub;
15049 gcc_assert (!m->fs.fp_valid || frame_pointer_needed);
15051 /* If using a realigned stack, we should never start with padding. */
15052 gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ());
15054 /* Setup RSI as the stub's base pointer. */
15055 align = GET_MODE_ALIGNMENT (V4SFmode);
15056 tmp = choose_baseaddr (rsi_offset, &align);
15057 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
15058 emit_insn (gen_rtx_SET (rsi, tmp));
15060 /* Get a symbol for the stub. */
15061 if (frame_pointer_needed)
15062 stub = use_call ? XLOGUE_STUB_RESTORE_HFP
15063 : XLOGUE_STUB_RESTORE_HFP_TAIL;
15064 else
15065 stub = use_call ? XLOGUE_STUB_RESTORE
15066 : XLOGUE_STUB_RESTORE_TAIL;
15067 sym = xlogue.get_stub_rtx (stub);
15069 elems_needed = ncregs;
15070 if (use_call)
15071 elems_needed += 1;
15072 else
15073 elems_needed += frame_pointer_needed ? 5 : 3;
15074 v = rtvec_alloc (elems_needed);
15076 /* We call the epilogue stub when we need to pop incoming args or we are
15077 doing a sibling call as the tail. Otherwise, we will emit a jmp to the
15078 epilogue stub and it is the tail-call. */
15079 if (use_call)
15080 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
15081 else
15083 RTVEC_ELT (v, vi++) = ret_rtx;
15084 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
15085 if (frame_pointer_needed)
15087 rtx rbp = gen_rtx_REG (DImode, BP_REG);
15088 gcc_assert (m->fs.fp_valid);
15089 gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx);
15091 tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8));
15092 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp);
15093 RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp));
15094 tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
15095 RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp);
15097 else
15099 /* If no hard frame pointer, we set R10 to the SP restore value. */
15100 gcc_assert (!m->fs.fp_valid);
15101 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
15102 gcc_assert (m->fs.sp_valid);
15104 r10 = gen_rtx_REG (DImode, R10_REG);
15105 tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset));
15106 emit_insn (gen_rtx_SET (r10, tmp));
15108 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10);
15112 /* Generate frame load insns and restore notes. */
15113 for (i = 0; i < ncregs; ++i)
15115 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
15116 machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode;
15117 rtx reg, frame_load;
15119 reg = gen_rtx_REG (mode, r.regno);
15120 frame_load = gen_frame_load (reg, rsi, r.offset);
15122 /* Save RSI frame load insn & note to add last. */
15123 if (r.regno == SI_REG)
15125 gcc_assert (!rsi_frame_load);
15126 rsi_frame_load = frame_load;
15127 rsi_restore_offset = r.offset;
15129 else
15131 RTVEC_ELT (v, vi++) = frame_load;
15132 ix86_add_cfa_restore_note (NULL, reg, r.offset);
15136 /* Add RSI frame load & restore note at the end. */
15137 gcc_assert (rsi_frame_load);
15138 gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1);
15139 RTVEC_ELT (v, vi++) = rsi_frame_load;
15140 ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG),
15141 rsi_restore_offset);
15143 /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */
15144 if (!use_call && !frame_pointer_needed)
15146 gcc_assert (m->fs.sp_valid);
15147 gcc_assert (!m->fs.sp_realigned);
15149 /* At this point, R10 should point to frame.stack_realign_offset. */
15150 if (m->fs.cfa_reg == stack_pointer_rtx)
15151 m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset;
15152 m->fs.sp_offset = frame.stack_realign_offset;
15155 gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v));
15156 tmp = gen_rtx_PARALLEL (VOIDmode, v);
15157 if (use_call)
15158 insn = emit_insn (tmp);
15159 else
15161 insn = emit_jump_insn (tmp);
15162 JUMP_LABEL (insn) = ret_rtx;
15164 if (frame_pointer_needed)
15165 ix86_emit_leave (insn);
15166 else
15168 /* Need CFA adjust note. */
15169 tmp = gen_rtx_SET (stack_pointer_rtx, r10);
15170 add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp);
15174 RTX_FRAME_RELATED_P (insn) = true;
15175 ix86_add_queued_cfa_restore_notes (insn);
15177 /* If we're not doing a tail-call, we need to adjust the stack. */
15178 if (use_call && m->fs.sp_valid)
15180 HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset;
15181 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15182 GEN_INT (dealloc), style,
15183 m->fs.cfa_reg == stack_pointer_rtx);
15187 /* Restore function stack, frame, and registers. */
15189 void
15190 ix86_expand_epilogue (int style)
15192 struct machine_function *m = cfun->machine;
15193 struct machine_frame_state frame_state_save = m->fs;
15194 struct ix86_frame frame;
15195 bool restore_regs_via_mov;
15196 bool using_drap;
15197 bool restore_stub_is_tail = false;
15199 if (ix86_function_naked (current_function_decl))
15201 /* The program should not reach this point. */
15202 emit_insn (gen_trap ());
15203 return;
15206 ix86_finalize_stack_realign_flags ();
15207 frame = m->frame;
15209 m->fs.sp_realigned = stack_realign_fp;
15210 m->fs.sp_valid = stack_realign_fp
15211 || !frame_pointer_needed
15212 || crtl->sp_is_unchanging;
15213 gcc_assert (!m->fs.sp_valid
15214 || m->fs.sp_offset == frame.stack_pointer_offset);
15216 /* The FP must be valid if the frame pointer is present. */
15217 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
15218 gcc_assert (!m->fs.fp_valid
15219 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
15221 /* We must have *some* valid pointer to the stack frame. */
15222 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
15224 /* The DRAP is never valid at this point. */
15225 gcc_assert (!m->fs.drap_valid);
15227 /* See the comment about red zone and frame
15228 pointer usage in ix86_expand_prologue. */
15229 if (frame_pointer_needed && frame.red_zone_size)
15230 emit_insn (gen_memory_blockage ());
15232 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
15233 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
15235 /* Determine the CFA offset of the end of the red-zone. */
15236 m->fs.red_zone_offset = 0;
15237 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
15239 /* The red-zone begins below the return address (and the error code in
15240 an exception handler). */
15241 m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET;
15243 /* When the register save area is in the aligned portion of
15244 the stack, determine the maximum runtime displacement that
15245 matches up with the aligned frame. */
15246 if (stack_realign_drap)
15247 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
15248 + UNITS_PER_WORD);
15251 /* Special care must be taken for the normal return case of a function
15252 using eh_return: the eax and edx registers are marked as saved, but
15253 not restored along this path. Adjust the save location to match. */
15254 if (crtl->calls_eh_return && style != 2)
15255 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
15257 /* EH_RETURN requires the use of moves to function properly. */
15258 if (crtl->calls_eh_return)
15259 restore_regs_via_mov = true;
15260 /* SEH requires the use of pops to identify the epilogue. */
15261 else if (TARGET_SEH)
15262 restore_regs_via_mov = false;
15263 /* If we're only restoring one register and sp cannot be used then
15264 use a move instruction to restore the register, since it's
15265 less work than reloading sp and popping the register. */
15266 else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
15267 restore_regs_via_mov = true;
15268 else if (TARGET_EPILOGUE_USING_MOVE
15269 && cfun->machine->use_fast_prologue_epilogue
15270 && (frame.nregs > 1
15271 || m->fs.sp_offset != frame.reg_save_offset))
15272 restore_regs_via_mov = true;
15273 else if (frame_pointer_needed
15274 && !frame.nregs
15275 && m->fs.sp_offset != frame.reg_save_offset)
15276 restore_regs_via_mov = true;
15277 else if (frame_pointer_needed
15278 && TARGET_USE_LEAVE
15279 && cfun->machine->use_fast_prologue_epilogue
15280 && frame.nregs == 1)
15281 restore_regs_via_mov = true;
15282 else
15283 restore_regs_via_mov = false;
15285 if (restore_regs_via_mov || frame.nsseregs)
15287 /* Ensure that the entire register save area is addressable via
15288 the stack pointer, if we will restore via sp. */
15289 if (TARGET_64BIT
15290 && m->fs.sp_offset > 0x7fffffff
15291 && !(fp_valid_at (frame.stack_realign_offset) || m->fs.drap_valid)
15292 && (frame.nsseregs + frame.nregs) != 0)
15294 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15295 GEN_INT (m->fs.sp_offset
15296 - frame.sse_reg_save_offset),
15297 style,
15298 m->fs.cfa_reg == stack_pointer_rtx);
15302 /* If there are any SSE registers to restore, then we have to do it
15303 via moves, since there's obviously no pop for SSE regs. */
15304 if (frame.nsseregs)
15305 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
15306 style == 2);
15308 if (m->call_ms2sysv)
15310 int pop_incoming_args = crtl->args.pops_args && crtl->args.size;
15312 /* We cannot use a tail-call for the stub if:
15313 1. We have to pop incoming args,
15314 2. We have additional int regs to restore,
15315 3. A sibling call will be the tail-call, or
15316 4. We are emitting an eh_return_internal epilogue.
15318 TODO: Item 4 has not yet been tested!
15320 If any of the above are true, we will call the stub rather than
15321 jump to it. */
15322 restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1);
15323 ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style);
15326 /* If using an out-of-line stub that is a tail-call, then... */
15327 if (m->call_ms2sysv && restore_stub_is_tail)
15329 /* TODO: paranoid tests. (remove eventually) */
15330 gcc_assert (m->fs.sp_valid);
15331 gcc_assert (!m->fs.sp_realigned);
15332 gcc_assert (!m->fs.fp_valid);
15333 gcc_assert (!m->fs.realigned);
15334 gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
15335 gcc_assert (!crtl->drap_reg);
15336 gcc_assert (!frame.nregs);
15338 else if (restore_regs_via_mov)
15340 rtx t;
15342 if (frame.nregs)
15343 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
15345 /* eh_return epilogues need %ecx added to the stack pointer. */
15346 if (style == 2)
15348 rtx sa = EH_RETURN_STACKADJ_RTX;
15349 rtx_insn *insn;
15351 /* %ecx can't be used for both DRAP register and eh_return. */
15352 if (crtl->drap_reg)
15353 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
15355 /* regparm nested functions don't work with eh_return. */
15356 gcc_assert (!ix86_static_chain_on_stack);
15358 if (frame_pointer_needed)
15360 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
15361 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
15362 emit_insn (gen_rtx_SET (sa, t));
15364 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
15365 insn = emit_move_insn (hard_frame_pointer_rtx, t);
15367 /* Note that we use SA as a temporary CFA, as the return
15368 address is at the proper place relative to it. We
15369 pretend this happens at the FP restore insn because
15370 prior to this insn the FP would be stored at the wrong
15371 offset relative to SA, and after this insn we have no
15372 other reasonable register to use for the CFA. We don't
15373 bother resetting the CFA to the SP for the duration of
15374 the return insn. */
15375 add_reg_note (insn, REG_CFA_DEF_CFA,
15376 plus_constant (Pmode, sa, UNITS_PER_WORD));
15377 ix86_add_queued_cfa_restore_notes (insn);
15378 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
15379 RTX_FRAME_RELATED_P (insn) = 1;
15381 m->fs.cfa_reg = sa;
15382 m->fs.cfa_offset = UNITS_PER_WORD;
15383 m->fs.fp_valid = false;
15385 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
15386 const0_rtx, style, false);
15388 else
15390 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
15391 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
15392 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
15393 ix86_add_queued_cfa_restore_notes (insn);
15395 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
15396 if (m->fs.cfa_offset != UNITS_PER_WORD)
15398 m->fs.cfa_offset = UNITS_PER_WORD;
15399 add_reg_note (insn, REG_CFA_DEF_CFA,
15400 plus_constant (Pmode, stack_pointer_rtx,
15401 UNITS_PER_WORD));
15402 RTX_FRAME_RELATED_P (insn) = 1;
15405 m->fs.sp_offset = UNITS_PER_WORD;
15406 m->fs.sp_valid = true;
15407 m->fs.sp_realigned = false;
15410 else
15412 /* SEH requires that the function end with (1) a stack adjustment
15413 if necessary, (2) a sequence of pops, and (3) a return or
15414 jump instruction. Prevent insns from the function body from
15415 being scheduled into this sequence. */
15416 if (TARGET_SEH)
15418 /* Prevent a catch region from being adjacent to the standard
15419 epilogue sequence.  Unfortunately, neither crtl->uses_eh_lsda nor
15420 several other flags that would be interesting to test are
15421 set up yet. */
15422 if (flag_non_call_exceptions)
15423 emit_insn (gen_nops (const1_rtx));
15424 else
15425 emit_insn (gen_blockage ());
15428 /* First step is to deallocate the stack frame so that we can
15429 pop the registers. If the stack pointer was realigned, it needs
15430 to be restored now. Also do it on SEH target for very large
15431 frame as the emitted instructions aren't allowed by the ABI
15432 in epilogues. */
15433 if (!m->fs.sp_valid || m->fs.sp_realigned
15434 || (TARGET_SEH
15435 && (m->fs.sp_offset - frame.reg_save_offset
15436 >= SEH_MAX_FRAME_SIZE)))
15438 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
15439 GEN_INT (m->fs.fp_offset
15440 - frame.reg_save_offset),
15441 style, false);
15443 else if (m->fs.sp_offset != frame.reg_save_offset)
15445 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15446 GEN_INT (m->fs.sp_offset
15447 - frame.reg_save_offset),
15448 style,
15449 m->fs.cfa_reg == stack_pointer_rtx);
15452 ix86_emit_restore_regs_using_pop ();
15455 /* If we used a frame pointer and haven't already got rid of it,
15456 then do so now. */
15457 if (m->fs.fp_valid)
15459 /* If the stack pointer is valid and pointing at the frame
15460 pointer store address, then we only need a pop. */
15461 if (sp_valid_at (frame.hfp_save_offset)
15462 && m->fs.sp_offset == frame.hfp_save_offset)
15463 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
15464 /* Leave results in shorter dependency chains on CPUs that are
15465 able to grok it fast. */
15466 else if (TARGET_USE_LEAVE
15467 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
15468 || !cfun->machine->use_fast_prologue_epilogue)
15469 ix86_emit_leave (NULL);
15470 else
15472 pro_epilogue_adjust_stack (stack_pointer_rtx,
15473 hard_frame_pointer_rtx,
15474 const0_rtx, style, !using_drap);
15475 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
15479 if (using_drap)
15481 int param_ptr_offset = UNITS_PER_WORD;
15482 rtx_insn *insn;
15484 gcc_assert (stack_realign_drap);
15486 if (ix86_static_chain_on_stack)
15487 param_ptr_offset += UNITS_PER_WORD;
15488 if (!call_used_regs[REGNO (crtl->drap_reg)])
15489 param_ptr_offset += UNITS_PER_WORD;
15491 insn = emit_insn (gen_rtx_SET
15492 (stack_pointer_rtx,
15493 gen_rtx_PLUS (Pmode,
15494 crtl->drap_reg,
15495 GEN_INT (-param_ptr_offset))));
15496 m->fs.cfa_reg = stack_pointer_rtx;
15497 m->fs.cfa_offset = param_ptr_offset;
15498 m->fs.sp_offset = param_ptr_offset;
15499 m->fs.realigned = false;
15501 add_reg_note (insn, REG_CFA_DEF_CFA,
15502 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15503 GEN_INT (param_ptr_offset)));
15504 RTX_FRAME_RELATED_P (insn) = 1;
15506 if (!call_used_regs[REGNO (crtl->drap_reg)])
15507 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
15510 /* At this point the stack pointer must be valid, and we must have
15511 restored all of the registers. We may not have deallocated the
15512 entire stack frame. We've delayed this until now because it may
15513 be possible to merge the local stack deallocation with the
15514 deallocation forced by ix86_static_chain_on_stack. */
15515 gcc_assert (m->fs.sp_valid);
15516 gcc_assert (!m->fs.sp_realigned);
15517 gcc_assert (!m->fs.fp_valid);
15518 gcc_assert (!m->fs.realigned);
15519 if (m->fs.sp_offset != UNITS_PER_WORD)
15521 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15522 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
15523 style, true);
15525 else
15526 ix86_add_queued_cfa_restore_notes (get_last_insn ());
15528 /* Sibcall epilogues don't want a return instruction. */
15529 if (style == 0)
15531 m->fs = frame_state_save;
15532 return;
15535 if (cfun->machine->func_type != TYPE_NORMAL)
15536 emit_jump_insn (gen_interrupt_return ());
15537 else if (crtl->args.pops_args && crtl->args.size)
15539 rtx popc = GEN_INT (crtl->args.pops_args);
15541 /* i386 can only pop 64K bytes. If asked to pop more, pop return
15542 address, do explicit add, and jump indirectly to the caller. */
15544 if (crtl->args.pops_args >= 65536)
15546 rtx ecx = gen_rtx_REG (SImode, CX_REG);
15547 rtx_insn *insn;
15549 /* There is no "pascal" calling convention in any 64bit ABI. */
15550 gcc_assert (!TARGET_64BIT);
15552 insn = emit_insn (gen_pop (ecx));
15553 m->fs.cfa_offset -= UNITS_PER_WORD;
15554 m->fs.sp_offset -= UNITS_PER_WORD;
15556 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
15557 x = gen_rtx_SET (stack_pointer_rtx, x);
15558 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
15559 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
15560 RTX_FRAME_RELATED_P (insn) = 1;
15562 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15563 popc, -1, true);
15564 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
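/* Illustrative code sequence for the >64K case handled above (added;
   not from the original source):
       popl  %ecx        # pop the return address
       addl  $N, %esp    # N == crtl->args.pops_args
       jmp   *%ecx       # return to the caller indirectly
   versus a plain "ret $N" when N fits in the 16-bit immediate of ret.  */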
15566 else
15567 emit_jump_insn (gen_simple_return_pop_internal (popc));
15569 else if (!m->call_ms2sysv || !restore_stub_is_tail)
15570 emit_jump_insn (gen_simple_return_internal ());
15572 /* Restore the state back to the state from the prologue,
15573 so that it's correct for the next epilogue. */
15574 m->fs = frame_state_save;
15577 /* Reset from the function's potential modifications. */
15579 static void
15580 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED, HOST_WIDE_INT)
15582 if (pic_offset_table_rtx
15583 && !ix86_use_pseudo_pic_reg ())
15584 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
15586 if (TARGET_MACHO)
15588 rtx_insn *insn = get_last_insn ();
15589 rtx_insn *deleted_debug_label = NULL;
15591 /* Mach-O doesn't support labels at the end of objects, so if
15592 it looks like we might want one, take special action.
15593 First, collect any sequence of deleted debug labels. */
15594 while (insn
15595 && NOTE_P (insn)
15596 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
15598 /* For NOTE_INSN_DELETED_DEBUG_LABEL notes only, don't insert
15599 a nop; instead set their CODE_LABEL_NUMBER to -1.  Otherwise
15600 there would be code generation differences
15601 between -g and -g0. */
15602 if (NOTE_P (insn) && NOTE_KIND (insn)
15603 == NOTE_INSN_DELETED_DEBUG_LABEL)
15604 deleted_debug_label = insn;
15605 insn = PREV_INSN (insn);
15608 /* If we have:
15609 label:
15610 barrier
15611 then this needs to be detected, so skip past the barrier. */
15613 if (insn && BARRIER_P (insn))
15614 insn = PREV_INSN (insn);
15616 /* Up to now we've only seen notes or barriers. */
15617 if (insn)
15619 if (LABEL_P (insn)
15620 || (NOTE_P (insn)
15621 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
15622 /* Trailing label. */
15623 fputs ("\tnop\n", file);
15624 else if (cfun && ! cfun->is_thunk)
15626 /* See if we have a completely empty function body, skipping
15627 the special case of the picbase thunk emitted as asm. */
15628 while (insn && ! INSN_P (insn))
15629 insn = PREV_INSN (insn);
15630 /* If we don't find any insns, we've got an empty function body;
15631 i.e. completely empty, without a return or branch.  This is
15632 taken as the case where a function body has been removed
15633 because it contains an inline __builtin_unreachable(). GCC
15634 declares that reaching __builtin_unreachable() means UB so
15635 we're not obliged to do anything special; however, we want
15636 non-zero-sized function bodies. To meet this, and help the
15637 user out, let's trap the case. */
15638 if (insn == NULL)
15639 fputs ("\tud2\n", file);
15642 else if (deleted_debug_label)
15643 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
15644 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
15645 CODE_LABEL_NUMBER (insn) = -1;
15649 /* Return a scratch register to use in the split stack prologue.  The
15650 split stack prologue is used for -fsplit-stack.  It consists of the first
15651 instructions in the function, emitted even before the regular prologue.
15652 The scratch register can be any caller-saved register which is not
15653 used for parameters or for the static chain. */
15655 static unsigned int
15656 split_stack_prologue_scratch_regno (void)
15658 if (TARGET_64BIT)
15659 return R11_REG;
15660 else
15662 bool is_fastcall, is_thiscall;
15663 int regparm;
15665 is_fastcall = (lookup_attribute ("fastcall",
15666 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
15667 != NULL);
15668 is_thiscall = (lookup_attribute ("thiscall",
15669 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
15670 != NULL);
15671 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
15673 if (is_fastcall)
15675 if (DECL_STATIC_CHAIN (cfun->decl))
15677 sorry ("-fsplit-stack does not support fastcall with "
15678 "nested function");
15679 return INVALID_REGNUM;
15681 return AX_REG;
15683 else if (is_thiscall)
15685 if (!DECL_STATIC_CHAIN (cfun->decl))
15686 return DX_REG;
15687 return AX_REG;
15689 else if (regparm < 3)
15691 if (!DECL_STATIC_CHAIN (cfun->decl))
15692 return CX_REG;
15693 else
15695 if (regparm >= 2)
15697 sorry ("-fsplit-stack does not support 2 register "
15698 "parameters for a nested function");
15699 return INVALID_REGNUM;
15701 return DX_REG;
15704 else
15706 /* FIXME: We could make this work by pushing a register
15707 around the addition and comparison. */
15708 sorry ("-fsplit-stack does not support 3 register parameters");
15709 return INVALID_REGNUM;
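/* Summary of the choices above (added; not from the original source):
   64-bit -> %r11; fastcall -> %eax (sorry when a static chain is needed);
   thiscall -> %edx, or %eax when a static chain is present;
   regparm < 3 -> %ecx, or %edx when a static chain occupies %ecx and
   fewer than two register parameters are used; otherwise no caller-saved
   register is free and a sorry is issued.  */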
15714 /* A SYMBOL_REF for the function which allocates new stack space for
15715 -fsplit-stack. */
15717 static GTY(()) rtx split_stack_fn;
15719 /* A SYMBOL_REF for __morestack_large_model, the more-stack function
15720 used with the large code model. */
15722 static GTY(()) rtx split_stack_fn_large;
15724 /* Handle -fsplit-stack. These are the first instructions in the
15725 function, even before the regular prologue. */
15727 void
15728 ix86_expand_split_stack_prologue (void)
15730 struct ix86_frame frame;
15731 HOST_WIDE_INT allocate;
15732 unsigned HOST_WIDE_INT args_size;
15733 rtx_code_label *label;
15734 rtx limit, current, allocate_rtx, call_insn, call_fusage;
15735 rtx scratch_reg = NULL_RTX;
15736 rtx_code_label *varargs_label = NULL;
15737 rtx fn;
15739 gcc_assert (flag_split_stack && reload_completed);
15741 ix86_finalize_stack_realign_flags ();
15742 frame = cfun->machine->frame;
15743 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
15745 /* This is the label we will branch to if we have enough stack
15746 space. We expect the basic block reordering pass to reverse this
15747 branch if optimizing, so that we branch in the unlikely case. */
15748 label = gen_label_rtx ();
15750 /* We need to compare the stack pointer minus the frame size with
15751 the stack boundary in the TCB. The stack boundary always gives
15752 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
15753 can compare directly. Otherwise we need to do an addition. */
15755 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
15756 UNSPEC_STACK_CHECK);
15757 limit = gen_rtx_CONST (Pmode, limit);
15758 limit = gen_rtx_MEM (Pmode, limit);
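/* Note (added; not from the original source): LIMIT is now a memory
   reference whose address is the UNSPEC_STACK_CHECK constant; when the
   comparison below is output, that address is printed as a
   thread-pointer (TLS segment) relative access, so the branch compares
   against the stack boundary stored in the TCB.  */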
15759 if (allocate < SPLIT_STACK_AVAILABLE)
15760 current = stack_pointer_rtx;
15761 else
15763 unsigned int scratch_regno;
15764 rtx offset;
15766 /* We need a scratch register to hold the stack pointer minus
15767 the required frame size. Since this is the very start of the
15768 function, the scratch register can be any caller-saved
15769 register which is not used for parameters. */
15770 offset = GEN_INT (- allocate);
15771 scratch_regno = split_stack_prologue_scratch_regno ();
15772 if (scratch_regno == INVALID_REGNUM)
15773 return;
15774 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
15775 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
15777 /* We don't use ix86_gen_add3 in this case because it will
15778 want to split to lea, but when not optimizing the insn
15779 will not be split after this point. */
15780 emit_insn (gen_rtx_SET (scratch_reg,
15781 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15782 offset)));
15784 else
15786 emit_move_insn (scratch_reg, offset);
15787 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
15788 stack_pointer_rtx));
15790 current = scratch_reg;
15793 ix86_expand_branch (GEU, current, limit, label);
15794 rtx_insn *jump_insn = get_last_insn ();
15795 JUMP_LABEL (jump_insn) = label;
15797 /* Mark the jump as very likely to be taken. */
15798 add_reg_br_prob_note (jump_insn, profile_probability::very_likely ());
15800 if (split_stack_fn == NULL_RTX)
15802 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
15803 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
15805 fn = split_stack_fn;
15807 /* Get more stack space. We pass in the desired stack space and the
15808 size of the arguments to copy to the new stack. In 32-bit mode
15809 we push the parameters; __morestack will return on a new stack
15810 anyhow. In 64-bit mode we pass the parameters in r10 and
15811 r11. */
15812 allocate_rtx = GEN_INT (allocate);
15813 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
15814 call_fusage = NULL_RTX;
15815 rtx pop = NULL_RTX;
15816 if (TARGET_64BIT)
15818 rtx reg10, reg11;
15820 reg10 = gen_rtx_REG (Pmode, R10_REG);
15821 reg11 = gen_rtx_REG (Pmode, R11_REG);
15823 /* If this function uses a static chain, it will be in %r10.
15824 Preserve it across the call to __morestack. */
15825 if (DECL_STATIC_CHAIN (cfun->decl))
15827 rtx rax;
15829 rax = gen_rtx_REG (word_mode, AX_REG);
15830 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
15831 use_reg (&call_fusage, rax);
15834 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
15835 && !TARGET_PECOFF)
15837 HOST_WIDE_INT argval;
15839 gcc_assert (Pmode == DImode);
15840 /* When using the large model we need to load the address
15841 into a register, and we've run out of registers. So we
15842 switch to a different calling convention, and we call a
15843 different function: __morestack_large_model.  We pass the
15844 argument size in the upper 32 bits of r10 and pass the
15845 frame size in the lower 32 bits. */
15846 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
15847 gcc_assert ((args_size & 0xffffffff) == args_size);
15849 if (split_stack_fn_large == NULL_RTX)
15851 split_stack_fn_large =
15852 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
15853 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
15855 if (ix86_cmodel == CM_LARGE_PIC)
15857 rtx_code_label *label;
15858 rtx x;
15860 label = gen_label_rtx ();
15861 emit_label (label);
15862 LABEL_PRESERVE_P (label) = 1;
15863 emit_insn (gen_set_rip_rex64 (reg10, label));
15864 emit_insn (gen_set_got_offset_rex64 (reg11, label));
15865 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
15866 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
15867 UNSPEC_GOT);
15868 x = gen_rtx_CONST (Pmode, x);
15869 emit_move_insn (reg11, x);
15870 x = gen_rtx_PLUS (Pmode, reg10, reg11);
15871 x = gen_const_mem (Pmode, x);
15872 emit_move_insn (reg11, x);
15874 else
15875 emit_move_insn (reg11, split_stack_fn_large);
15877 fn = reg11;
15879 argval = ((args_size << 16) << 16) + allocate;
15880 emit_move_insn (reg10, GEN_INT (argval));
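/* Worked example (added; hypothetical values, not from the original
   source): with args_size == 0x20 and allocate == 0x1000 the packed
   value is (0x20 << 32) + 0x1000 == 0x0000002000001000, i.e. the
   argument size ends up in the upper 32 bits of %r10 and the frame
   size in the lower 32 bits, as described above.  */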
15882 else
15884 emit_move_insn (reg10, allocate_rtx);
15885 emit_move_insn (reg11, GEN_INT (args_size));
15886 use_reg (&call_fusage, reg11);
15889 use_reg (&call_fusage, reg10);
15891 else
15893 rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
15894 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
15895 insn = emit_insn (gen_push (allocate_rtx));
15896 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
15897 pop = GEN_INT (2 * UNITS_PER_WORD);
15899 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
15900 GEN_INT (UNITS_PER_WORD), constm1_rtx,
15901 pop, false);
15902 add_function_usage_to (call_insn, call_fusage);
15903 if (!TARGET_64BIT)
15904 add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
15905 /* Indicate that this function can't jump to non-local gotos. */
15906 make_reg_eh_region_note_nothrow_nononlocal (as_a <rtx_insn *> (call_insn));
15908 /* In order to make call/return prediction work right, we now need
15909 to execute a return instruction. See
15910 libgcc/config/i386/morestack.S for the details on how this works.
15912 For flow purposes gcc must not see this as a return
15913 instruction--we need control flow to continue at the subsequent
15914 label. Therefore, we use an unspec. */
15915 gcc_assert (crtl->args.pops_args < 65536);
15916 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
15918 /* If we are in 64-bit mode and this function uses a static chain,
15919 we saved %r10 in %rax before calling __morestack. */
15920 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
15921 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
15922 gen_rtx_REG (word_mode, AX_REG));
15924 /* If this function calls va_start, we need to store a pointer to
15925 the arguments on the old stack, because they may not have been
15926 all copied to the new stack. At this point the old stack can be
15927 found at the frame pointer value used by __morestack, because
15928 __morestack has set that up before calling back to us. Here we
15929 store that pointer in a scratch register, and in
15930 ix86_expand_prologue we store the scratch register in a stack
15931 slot. */
15932 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15934 unsigned int scratch_regno;
15935 rtx frame_reg;
15936 int words;
15938 scratch_regno = split_stack_prologue_scratch_regno ();
15939 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
15940 frame_reg = gen_rtx_REG (Pmode, BP_REG);
15942 /* 64-bit:
15943 fp -> old fp value
15944 return address within this function
15945 return address of caller of this function
15946 stack arguments
15947 So we add three words to get to the stack arguments.
15949 32-bit:
15950 fp -> old fp value
15951 return address within this function
15952 first argument to __morestack
15953 second argument to __morestack
15954 return address of caller of this function
15955 stack arguments
15956 So we add five words to get to the stack arguments.
15958 words = TARGET_64BIT ? 3 : 5;
15959 emit_insn (gen_rtx_SET (scratch_reg,
15960 gen_rtx_PLUS (Pmode, frame_reg,
15961 GEN_INT (words * UNITS_PER_WORD))));
15963 varargs_label = gen_label_rtx ();
15964 emit_jump_insn (gen_jump (varargs_label));
15965 JUMP_LABEL (get_last_insn ()) = varargs_label;
15967 emit_barrier ();
15970 emit_label (label);
15971 LABEL_NUSES (label) = 1;
15973 /* If this function calls va_start, we now have to set the scratch
15974 register for the case where we do not call __morestack. In this
15975 case we need to set it based on the stack pointer. */
15976 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15978 emit_insn (gen_rtx_SET (scratch_reg,
15979 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15980 GEN_INT (UNITS_PER_WORD))));
15982 emit_label (varargs_label);
15983 LABEL_NUSES (varargs_label) = 1;
15987 /* We may have to tell the dataflow pass that the split stack prologue
15988 is initializing a scratch register. */
15990 static void
15991 ix86_live_on_entry (bitmap regs)
15993 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15995 gcc_assert (flag_split_stack);
15996 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
16000 /* Extract the parts of an RTL expression that is a valid memory address
16001 for an instruction. Return 0 if the structure of the address is
16002 grossly off. Return -1 if the address contains ASHIFT, so it is not
16003 strictly valid but is still used for computing the length of an lea instruction. */
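/* For illustration (a sketch, not from the original source): an RTX such as
   (plus (plus (mult (reg) (const_int 4)) (reg)) (const_int 16))
   decomposes into out->index = the reg inside the MULT, out->scale = 4,
   out->base = the other reg and out->disp = (const_int 16), i.e. the
   operand of a "16(%base,%index,4)" style address in AT&T syntax.  */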
16006 ix86_decompose_address (rtx addr, struct ix86_address *out)
16008 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
16009 rtx base_reg, index_reg;
16010 HOST_WIDE_INT scale = 1;
16011 rtx scale_rtx = NULL_RTX;
16012 rtx tmp;
16013 int retval = 1;
16014 addr_space_t seg = ADDR_SPACE_GENERIC;
16016 /* Allow zero-extended SImode addresses;
16017 they will be emitted with the addr32 prefix. */
16018 if (TARGET_64BIT && GET_MODE (addr) == DImode)
16020 if (GET_CODE (addr) == ZERO_EXTEND
16021 && GET_MODE (XEXP (addr, 0)) == SImode)
16023 addr = XEXP (addr, 0);
16024 if (CONST_INT_P (addr))
16025 return 0;
16027 else if (GET_CODE (addr) == AND
16028 && const_32bit_mask (XEXP (addr, 1), DImode))
16030 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
16031 if (addr == NULL_RTX)
16032 return 0;
16034 if (CONST_INT_P (addr))
16035 return 0;
16039 /* Allow SImode subregs of DImode addresses;
16040 they will be emitted with the addr32 prefix. */
16041 if (TARGET_64BIT && GET_MODE (addr) == SImode)
16043 if (SUBREG_P (addr)
16044 && GET_MODE (SUBREG_REG (addr)) == DImode)
16046 addr = SUBREG_REG (addr);
16047 if (CONST_INT_P (addr))
16048 return 0;
16052 if (REG_P (addr))
16053 base = addr;
16054 else if (SUBREG_P (addr))
16056 if (REG_P (SUBREG_REG (addr)))
16057 base = addr;
16058 else
16059 return 0;
16061 else if (GET_CODE (addr) == PLUS)
16063 rtx addends[4], op;
16064 int n = 0, i;
16066 op = addr;
16069 if (n >= 4)
16070 return 0;
16071 addends[n++] = XEXP (op, 1);
16072 op = XEXP (op, 0);
16074 while (GET_CODE (op) == PLUS);
16075 if (n >= 4)
16076 return 0;
16077 addends[n] = op;
16079 for (i = n; i >= 0; --i)
16081 op = addends[i];
16082 switch (GET_CODE (op))
16084 case MULT:
16085 if (index)
16086 return 0;
16087 index = XEXP (op, 0);
16088 scale_rtx = XEXP (op, 1);
16089 break;
16091 case ASHIFT:
16092 if (index)
16093 return 0;
16094 index = XEXP (op, 0);
16095 tmp = XEXP (op, 1);
16096 if (!CONST_INT_P (tmp))
16097 return 0;
16098 scale = INTVAL (tmp);
16099 if ((unsigned HOST_WIDE_INT) scale > 3)
16100 return 0;
16101 scale = 1 << scale;
16102 break;
16104 case ZERO_EXTEND:
16105 op = XEXP (op, 0);
16106 if (GET_CODE (op) != UNSPEC)
16107 return 0;
16108 /* FALLTHRU */
16110 case UNSPEC:
16111 if (XINT (op, 1) == UNSPEC_TP
16112 && TARGET_TLS_DIRECT_SEG_REFS
16113 && seg == ADDR_SPACE_GENERIC)
16114 seg = DEFAULT_TLS_SEG_REG;
16115 else
16116 return 0;
16117 break;
16119 case SUBREG:
16120 if (!REG_P (SUBREG_REG (op)))
16121 return 0;
16122 /* FALLTHRU */
16124 case REG:
16125 if (!base)
16126 base = op;
16127 else if (!index)
16128 index = op;
16129 else
16130 return 0;
16131 break;
16133 case CONST:
16134 case CONST_INT:
16135 case SYMBOL_REF:
16136 case LABEL_REF:
16137 if (disp)
16138 return 0;
16139 disp = op;
16140 break;
16142 default:
16143 return 0;
16147 else if (GET_CODE (addr) == MULT)
16149 index = XEXP (addr, 0); /* index*scale */
16150 scale_rtx = XEXP (addr, 1);
16152 else if (GET_CODE (addr) == ASHIFT)
16154 /* We're called for lea too, which implements ashift on occasion. */
16155 index = XEXP (addr, 0);
16156 tmp = XEXP (addr, 1);
16157 if (!CONST_INT_P (tmp))
16158 return 0;
16159 scale = INTVAL (tmp);
16160 if ((unsigned HOST_WIDE_INT) scale > 3)
16161 return 0;
16162 scale = 1 << scale;
16163 retval = -1;
16165 else
16166 disp = addr; /* displacement */
16168 if (index)
16170 if (REG_P (index))
16172 else if (SUBREG_P (index)
16173 && REG_P (SUBREG_REG (index)))
16175 else
16176 return 0;
16179 /* Extract the integral value of scale. */
16180 if (scale_rtx)
16182 if (!CONST_INT_P (scale_rtx))
16183 return 0;
16184 scale = INTVAL (scale_rtx);
16187 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
16188 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
16190 /* Avoid useless 0 displacement. */
16191 if (disp == const0_rtx && (base || index))
16192 disp = NULL_RTX;
16194 /* Allow arg pointer and stack pointer as index if there is no scaling. */
16195 if (base_reg && index_reg && scale == 1
16196 && (REGNO (index_reg) == ARG_POINTER_REGNUM
16197 || REGNO (index_reg) == FRAME_POINTER_REGNUM
16198 || REGNO (index_reg) == SP_REG))
16200 std::swap (base, index);
16201 std::swap (base_reg, index_reg);
16204 /* Special case: %ebp cannot be encoded as a base without a displacement.
16205 Similarly %r13. */
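/* (Illustrative note: in the ModR/M encoding, mod == 00 with an EBP/R13
   base pattern is reserved for the disp32/RIP-relative forms, so e.g.
   "(%ebp)" has to be assembled as "0(%ebp)" with an explicit zero
   displacement byte.)  */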
16206 if (!disp && base_reg
16207 && (REGNO (base_reg) == ARG_POINTER_REGNUM
16208 || REGNO (base_reg) == FRAME_POINTER_REGNUM
16209 || REGNO (base_reg) == BP_REG
16210 || REGNO (base_reg) == R13_REG))
16211 disp = const0_rtx;
16213 /* Special case: on K6, [%esi] causes the instruction to be vector decoded.
16214 Avoid this by transforming it to [%esi+0].
16215 Reload calls address legitimization without cfun defined, so we need
16216 to test that cfun is non-NULL. */
16217 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
16218 && base_reg && !index_reg && !disp
16219 && REGNO (base_reg) == SI_REG)
16220 disp = const0_rtx;
16222 /* Special case: encode reg+reg instead of reg*2. */
16223 if (!base && index && scale == 2)
16224 base = index, base_reg = index_reg, scale = 1;
16226 /* Special case: scaling cannot be encoded without base or displacement. */
16227 if (!base && !disp && index && scale != 1)
16228 disp = const0_rtx;
16230 out->base = base;
16231 out->index = index;
16232 out->disp = disp;
16233 out->scale = scale;
16234 out->seg = seg;
16236 return retval;
16239 /* Return cost of the memory address x.
16240 For i386, it is better to use a complex address than let gcc copy
16241 the address into a reg and make a new pseudo. But not if the address
16242 requires two regs - that would mean more pseudos with longer
16243 lifetimes. */
16244 static int
16245 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
16247 struct ix86_address parts;
16248 int cost = 1;
16249 int ok = ix86_decompose_address (x, &parts);
16251 gcc_assert (ok);
16253 if (parts.base && SUBREG_P (parts.base))
16254 parts.base = SUBREG_REG (parts.base);
16255 if (parts.index && SUBREG_P (parts.index))
16256 parts.index = SUBREG_REG (parts.index);
16258 /* Attempt to minimize the number of registers in the address by increasing
16259 the address cost for each register used. We don't increase the address cost
16260 for "pic_offset_table_rtx". When a mem op with "pic_offset_table_rtx"
16261 is not invariant itself, it most likely means that the base or index is not
16262 invariant. Therefore only "pic_offset_table_rtx" could be hoisted out,
16263 which is not profitable for x86. */
16264 if (parts.base
16265 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
16266 && (current_pass->type == GIMPLE_PASS
16267 || !pic_offset_table_rtx
16268 || !REG_P (parts.base)
16269 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
16270 cost++;
16272 if (parts.index
16273 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
16274 && (current_pass->type == GIMPLE_PASS
16275 || !pic_offset_table_rtx
16276 || !REG_P (parts.index)
16277 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
16278 cost++;
16280 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
16281 since its predecode logic can't detect the length of instructions
16282 and it degenerates to vector decoding. Increase the cost of such
16283 addresses here. The penalty is at least 2 cycles. It may be worthwhile
16284 to split such addresses or even refuse them altogether.
16286 The following addressing modes are affected:
16287 [base+scale*index]
16288 [scale*index+disp]
16289 [base+index]
16291 The first and last cases may be avoidable by explicitly coding the zero
16292 into the memory address, but I don't have an AMD-K6 machine handy to check
16293 this theory. */
16295 if (TARGET_K6
16296 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
16297 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
16298 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
16299 cost += 10;
16301 return cost;
16304 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
16305 this is used to form addresses of local data when -fPIC is in
16306 use. */
16308 static bool
16309 darwin_local_data_pic (rtx disp)
16311 return (GET_CODE (disp) == UNSPEC
16312 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
16315 /* True if operand X should be loaded from GOT. */
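/* (Sketch of the intended effect, assuming a 64-bit ELF target compiled
   without -fpic and with -fno-plt: a call to an external function foo is
   emitted as "call *foo@GOTPCREL(%rip)" instead of going through the PLT.)  */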
16317 bool
16318 ix86_force_load_from_GOT_p (rtx x)
16320 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
16321 && !TARGET_PECOFF && !TARGET_MACHO
16322 && !flag_plt && !flag_pic
16323 && ix86_cmodel != CM_LARGE
16324 && GET_CODE (x) == SYMBOL_REF
16325 && SYMBOL_REF_FUNCTION_P (x)
16326 && !SYMBOL_REF_LOCAL_P (x));
16329 /* Determine if a given RTX is a valid constant. We already know this
16330 satisfies CONSTANT_P. */
16332 static bool
16333 ix86_legitimate_constant_p (machine_mode mode, rtx x)
16335 /* Pointer bounds constants are not valid. */
16336 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
16337 return false;
16339 switch (GET_CODE (x))
16341 case CONST:
16342 x = XEXP (x, 0);
16344 if (GET_CODE (x) == PLUS)
16346 if (!CONST_INT_P (XEXP (x, 1)))
16347 return false;
16348 x = XEXP (x, 0);
16351 if (TARGET_MACHO && darwin_local_data_pic (x))
16352 return true;
16354 /* Only some unspecs are valid as "constants". */
16355 if (GET_CODE (x) == UNSPEC)
16356 switch (XINT (x, 1))
16358 case UNSPEC_GOT:
16359 case UNSPEC_GOTOFF:
16360 case UNSPEC_PLTOFF:
16361 return TARGET_64BIT;
16362 case UNSPEC_TPOFF:
16363 case UNSPEC_NTPOFF:
16364 x = XVECEXP (x, 0, 0);
16365 return (GET_CODE (x) == SYMBOL_REF
16366 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
16367 case UNSPEC_DTPOFF:
16368 x = XVECEXP (x, 0, 0);
16369 return (GET_CODE (x) == SYMBOL_REF
16370 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
16371 default:
16372 return false;
16375 /* We must have drilled down to a symbol. */
16376 if (GET_CODE (x) == LABEL_REF)
16377 return true;
16378 if (GET_CODE (x) != SYMBOL_REF)
16379 return false;
16380 /* FALLTHRU */
16382 case SYMBOL_REF:
16383 /* TLS symbols are never valid. */
16384 if (SYMBOL_REF_TLS_MODEL (x))
16385 return false;
16387 /* DLLIMPORT symbols are never valid. */
16388 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
16389 && SYMBOL_REF_DLLIMPORT_P (x))
16390 return false;
16392 #if TARGET_MACHO
16393 /* mdynamic-no-pic */
16394 if (MACHO_DYNAMIC_NO_PIC_P)
16395 return machopic_symbol_defined_p (x);
16396 #endif
16398 /* External function address should be loaded
16399 via the GOT slot to avoid PLT. */
16400 if (ix86_force_load_from_GOT_p (x))
16401 return false;
16403 break;
16405 CASE_CONST_SCALAR_INT:
16406 switch (mode)
16408 case TImode:
16409 if (TARGET_64BIT)
16410 return true;
16411 /* FALLTHRU */
16412 case OImode:
16413 case XImode:
16414 if (!standard_sse_constant_p (x, mode))
16415 return false;
16416 default:
16417 break;
16419 break;
16421 case CONST_VECTOR:
16422 if (!standard_sse_constant_p (x, mode))
16423 return false;
16425 default:
16426 break;
16429 /* Otherwise we handle everything else in the move patterns. */
16430 return true;
16433 /* Determine if it's legal to put X into the constant pool. This
16434 is not possible for the address of thread-local symbols, which
16435 is checked above. */
16437 static bool
16438 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
16440 /* We can put any immediate constant in memory. */
16441 switch (GET_CODE (x))
16443 CASE_CONST_ANY:
16444 return false;
16446 default:
16447 break;
16450 return !ix86_legitimate_constant_p (mode, x);
16453 /* Return true if the symbol is marked as dllimport or as a stub-variable,
16454 otherwise false. */
16456 static bool
16457 is_imported_p (rtx x)
16459 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
16460 || GET_CODE (x) != SYMBOL_REF)
16461 return false;
16463 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
16467 /* Nonzero if the constant value X is a legitimate general operand
16468 when generating PIC code. It is given that flag_pic is on and
16469 that X satisfies CONSTANT_P. */
16471 bool
16472 legitimate_pic_operand_p (rtx x)
16474 rtx inner;
16476 switch (GET_CODE (x))
16478 case CONST:
16479 inner = XEXP (x, 0);
16480 if (GET_CODE (inner) == PLUS
16481 && CONST_INT_P (XEXP (inner, 1)))
16482 inner = XEXP (inner, 0);
16484 /* Only some unspecs are valid as "constants". */
16485 if (GET_CODE (inner) == UNSPEC)
16486 switch (XINT (inner, 1))
16488 case UNSPEC_GOT:
16489 case UNSPEC_GOTOFF:
16490 case UNSPEC_PLTOFF:
16491 return TARGET_64BIT;
16492 case UNSPEC_TPOFF:
16493 x = XVECEXP (inner, 0, 0);
16494 return (GET_CODE (x) == SYMBOL_REF
16495 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
16496 case UNSPEC_MACHOPIC_OFFSET:
16497 return legitimate_pic_address_disp_p (x);
16498 default:
16499 return false;
16501 /* FALLTHRU */
16503 case SYMBOL_REF:
16504 case LABEL_REF:
16505 return legitimate_pic_address_disp_p (x);
16507 default:
16508 return true;
16512 /* Determine if a given CONST RTX is a valid memory displacement
16513 in PIC mode. */
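/* (Illustrative example only: on ia32 a displacement such as
   (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)) -- i.e. foo@GOTOFF --
   is accepted here, while a bare SYMBOL_REF with a TLS model is not.)  */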
16515 bool
16516 legitimate_pic_address_disp_p (rtx disp)
16518 bool saw_plus;
16520 /* In 64bit mode we can allow direct addresses of symbols and labels
16521 when they are not dynamic symbols. */
16522 if (TARGET_64BIT)
16524 rtx op0 = disp, op1;
16526 switch (GET_CODE (disp))
16528 case LABEL_REF:
16529 return true;
16531 case CONST:
16532 if (GET_CODE (XEXP (disp, 0)) != PLUS)
16533 break;
16534 op0 = XEXP (XEXP (disp, 0), 0);
16535 op1 = XEXP (XEXP (disp, 0), 1);
16536 if (!CONST_INT_P (op1)
16537 || INTVAL (op1) >= 16*1024*1024
16538 || INTVAL (op1) < -16*1024*1024)
16539 break;
16540 if (GET_CODE (op0) == LABEL_REF)
16541 return true;
16542 if (GET_CODE (op0) == CONST
16543 && GET_CODE (XEXP (op0, 0)) == UNSPEC
16544 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
16545 return true;
16546 if (GET_CODE (op0) == UNSPEC
16547 && XINT (op0, 1) == UNSPEC_PCREL)
16548 return true;
16549 if (GET_CODE (op0) != SYMBOL_REF)
16550 break;
16551 /* FALLTHRU */
16553 case SYMBOL_REF:
16554 /* TLS references should always be enclosed in an UNSPEC.
16555 A dllimported symbol always needs to be resolved. */
16556 if (SYMBOL_REF_TLS_MODEL (op0)
16557 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
16558 return false;
16560 if (TARGET_PECOFF)
16562 if (is_imported_p (op0))
16563 return true;
16565 if (SYMBOL_REF_FAR_ADDR_P (op0)
16566 || !SYMBOL_REF_LOCAL_P (op0))
16567 break;
16569 /* Function symbols need to be resolved only for
16570 the large model.
16571 For the small model we don't need to resolve anything
16572 here. */
16573 if ((ix86_cmodel != CM_LARGE_PIC
16574 && SYMBOL_REF_FUNCTION_P (op0))
16575 || ix86_cmodel == CM_SMALL_PIC)
16576 return true;
16577 /* Non-external symbols don't need to be resolved for
16578 the large and medium models. */
16579 if ((ix86_cmodel == CM_LARGE_PIC
16580 || ix86_cmodel == CM_MEDIUM_PIC)
16581 && !SYMBOL_REF_EXTERNAL_P (op0))
16582 return true;
16584 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
16585 && (SYMBOL_REF_LOCAL_P (op0)
16586 || (HAVE_LD_PIE_COPYRELOC
16587 && flag_pie
16588 && !SYMBOL_REF_WEAK (op0)
16589 && !SYMBOL_REF_FUNCTION_P (op0)))
16590 && ix86_cmodel != CM_LARGE_PIC)
16591 return true;
16592 break;
16594 default:
16595 break;
16598 if (GET_CODE (disp) != CONST)
16599 return false;
16600 disp = XEXP (disp, 0);
16602 if (TARGET_64BIT)
16604 /* It is unsafe to allow PLUS expressions. This limits the allowed
16605 distance of GOT references. We should not need these anyway. */
16606 if (GET_CODE (disp) != UNSPEC
16607 || (XINT (disp, 1) != UNSPEC_GOTPCREL
16608 && XINT (disp, 1) != UNSPEC_GOTOFF
16609 && XINT (disp, 1) != UNSPEC_PCREL
16610 && XINT (disp, 1) != UNSPEC_PLTOFF))
16611 return false;
16613 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
16614 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
16615 return false;
16616 return true;
16619 saw_plus = false;
16620 if (GET_CODE (disp) == PLUS)
16622 if (!CONST_INT_P (XEXP (disp, 1)))
16623 return false;
16624 disp = XEXP (disp, 0);
16625 saw_plus = true;
16628 if (TARGET_MACHO && darwin_local_data_pic (disp))
16629 return true;
16631 if (GET_CODE (disp) != UNSPEC)
16632 return false;
16634 switch (XINT (disp, 1))
16636 case UNSPEC_GOT:
16637 if (saw_plus)
16638 return false;
16639 /* We need to check for both symbols and labels because VxWorks loads
16640 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
16641 details. */
16642 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
16643 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
16644 case UNSPEC_GOTOFF:
16645 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
16646 While the ABI also specifies a 32bit relocation, we don't produce it
16647 in the small PIC model at all. */
16648 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
16649 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
16650 && !TARGET_64BIT)
16651 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
16652 return false;
16653 case UNSPEC_GOTTPOFF:
16654 case UNSPEC_GOTNTPOFF:
16655 case UNSPEC_INDNTPOFF:
16656 if (saw_plus)
16657 return false;
16658 disp = XVECEXP (disp, 0, 0);
16659 return (GET_CODE (disp) == SYMBOL_REF
16660 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
16661 case UNSPEC_NTPOFF:
16662 disp = XVECEXP (disp, 0, 0);
16663 return (GET_CODE (disp) == SYMBOL_REF
16664 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
16665 case UNSPEC_DTPOFF:
16666 disp = XVECEXP (disp, 0, 0);
16667 return (GET_CODE (disp) == SYMBOL_REF
16668 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
16671 return false;
16674 /* Determine if OP is a suitable RTX for an address register.
16675 Return the naked register if a register or a register subreg is
16676 found, otherwise return NULL_RTX. */
16678 static rtx
16679 ix86_validate_address_register (rtx op)
16681 machine_mode mode = GET_MODE (op);
16683 /* Only SImode or DImode registers can form the address. */
16684 if (mode != SImode && mode != DImode)
16685 return NULL_RTX;
16687 if (REG_P (op))
16688 return op;
16689 else if (SUBREG_P (op))
16691 rtx reg = SUBREG_REG (op);
16693 if (!REG_P (reg))
16694 return NULL_RTX;
16696 mode = GET_MODE (reg);
16698 /* Don't allow SUBREGs that span more than a word. They can
16699 lead to spill failures when the register is one word out
16700 of a two-word structure. */
16701 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
16702 return NULL_RTX;
16704 /* Allow only SUBREGs of non-eliminable hard registers. */
16705 if (register_no_elim_operand (reg, mode))
16706 return reg;
16709 /* Op is not a register. */
16710 return NULL_RTX;
16713 /* Recognizes RTL expressions that are valid memory addresses for an
16714 instruction. The MODE argument is the machine mode for the MEM
16715 expression that wants to use this address.
16717 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
16718 convert common non-canonical forms to canonical form so that they will
16719 be recognized. */
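/* (Illustrative example, not from the original source: scale factors other
   than 1, 2, 4 and 8 are rejected below, so an address whose index part is
   (mult (reg) (const_int 3)) does not validate and has to be computed by
   separate insns instead.)  */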
16721 static bool
16722 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
16724 struct ix86_address parts;
16725 rtx base, index, disp;
16726 HOST_WIDE_INT scale;
16727 addr_space_t seg;
16729 if (ix86_decompose_address (addr, &parts) <= 0)
16730 /* Decomposition failed. */
16731 return false;
16733 base = parts.base;
16734 index = parts.index;
16735 disp = parts.disp;
16736 scale = parts.scale;
16737 seg = parts.seg;
16739 /* Validate base register. */
16740 if (base)
16742 rtx reg = ix86_validate_address_register (base);
16744 if (reg == NULL_RTX)
16745 return false;
16747 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
16748 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
16749 /* Base is not valid. */
16750 return false;
16753 /* Validate index register. */
16754 if (index)
16756 rtx reg = ix86_validate_address_register (index);
16758 if (reg == NULL_RTX)
16759 return false;
16761 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
16762 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
16763 /* Index is not valid. */
16764 return false;
16767 /* Index and base should have the same mode. */
16768 if (base && index
16769 && GET_MODE (base) != GET_MODE (index))
16770 return false;
16772 /* Address override works only on the (%reg) part of %fs:(%reg). */
16773 if (seg != ADDR_SPACE_GENERIC
16774 && ((base && GET_MODE (base) != word_mode)
16775 || (index && GET_MODE (index) != word_mode)))
16776 return false;
16778 /* Validate scale factor. */
16779 if (scale != 1)
16781 if (!index)
16782 /* Scale without index. */
16783 return false;
16785 if (scale != 2 && scale != 4 && scale != 8)
16786 /* Scale is not a valid multiplier. */
16787 return false;
16790 /* Validate displacement. */
16791 if (disp)
16793 if (GET_CODE (disp) == CONST
16794 && GET_CODE (XEXP (disp, 0)) == UNSPEC
16795 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
16796 switch (XINT (XEXP (disp, 0), 1))
16798 /* Refuse GOTOFF and GOT in 64bit mode since they are always 64bit
16799 when used. While the ABI also specifies 32bit relocations, we
16800 don't produce them at all and use IP-relative addressing instead.
16801 Allow GOT in 32bit mode for both PIC and non-PIC if the symbol
16802 should be loaded via the GOT. */
16803 case UNSPEC_GOT:
16804 if (!TARGET_64BIT
16805 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
16806 goto is_legitimate_pic;
16807 /* FALLTHRU */
16808 case UNSPEC_GOTOFF:
16809 gcc_assert (flag_pic);
16810 if (!TARGET_64BIT)
16811 goto is_legitimate_pic;
16813 /* 64bit address unspec. */
16814 return false;
16816 case UNSPEC_GOTPCREL:
16817 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
16818 goto is_legitimate_pic;
16819 /* FALLTHRU */
16820 case UNSPEC_PCREL:
16821 gcc_assert (flag_pic);
16822 goto is_legitimate_pic;
16824 case UNSPEC_GOTTPOFF:
16825 case UNSPEC_GOTNTPOFF:
16826 case UNSPEC_INDNTPOFF:
16827 case UNSPEC_NTPOFF:
16828 case UNSPEC_DTPOFF:
16829 break;
16831 case UNSPEC_STACK_CHECK:
16832 gcc_assert (flag_split_stack);
16833 break;
16835 default:
16836 /* Invalid address unspec. */
16837 return false;
16840 else if (SYMBOLIC_CONST (disp)
16841 && (flag_pic
16842 || (TARGET_MACHO
16843 #if TARGET_MACHO
16844 && MACHOPIC_INDIRECT
16845 && !machopic_operand_p (disp)
16846 #endif
16850 is_legitimate_pic:
16851 if (TARGET_64BIT && (index || base))
16853 /* foo@dtpoff(%rX) is ok. */
16854 if (GET_CODE (disp) != CONST
16855 || GET_CODE (XEXP (disp, 0)) != PLUS
16856 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
16857 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
16858 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
16859 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
16860 /* Non-constant pic memory reference. */
16861 return false;
16863 else if ((!TARGET_MACHO || flag_pic)
16864 && ! legitimate_pic_address_disp_p (disp))
16865 /* Displacement is an invalid pic construct. */
16866 return false;
16867 #if TARGET_MACHO
16868 else if (MACHO_DYNAMIC_NO_PIC_P
16869 && !ix86_legitimate_constant_p (Pmode, disp))
16870 /* Displacement must be referenced via a non_lazy_pointer. */
16871 return false;
16872 #endif
16874 /* This code used to verify that a symbolic pic displacement
16875 includes the pic_offset_table_rtx register.
16877 While this is a good idea, unfortunately these constructs may
16878 be created by the "adds using lea" optimization for incorrect
16879 code like:
16881 int a;
16882 int foo(int i)
16884 return *(&a+i);
16887 This code is nonsensical, but results in addressing the
16888 GOT table with a pic_offset_table_rtx base. We can't
16889 just refuse it easily, since it gets matched by the
16890 "addsi3" pattern, which later gets split to lea when the
16891 output register differs from the input. While this
16892 could be handled by a separate addsi pattern for this case
16893 that never results in lea, disabling this test seems to be
16894 the easier and correct fix for the crash. */
16896 else if (GET_CODE (disp) != LABEL_REF
16897 && !CONST_INT_P (disp)
16898 && (GET_CODE (disp) != CONST
16899 || !ix86_legitimate_constant_p (Pmode, disp))
16900 && (GET_CODE (disp) != SYMBOL_REF
16901 || !ix86_legitimate_constant_p (Pmode, disp)))
16902 /* Displacement is not constant. */
16903 return false;
16904 else if (TARGET_64BIT
16905 && !x86_64_immediate_operand (disp, VOIDmode))
16906 /* Displacement is out of range. */
16907 return false;
16908 /* In x32 mode, constant addresses are sign-extended to 64 bits, so
16909 we have to reject addresses from 0x80000000 to 0xffffffff. */
16910 else if (TARGET_X32 && !(index || base)
16911 && CONST_INT_P (disp)
16912 && val_signbit_known_set_p (SImode, INTVAL (disp)))
16913 return false;
16916 /* Everything looks valid. */
16917 return true;
16920 /* Determine if a given RTX is a valid constant address. */
16922 bool
16923 constant_address_p (rtx x)
16925 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
16928 /* Return a unique alias set for the GOT. */
16930 static alias_set_type
16931 ix86_GOT_alias_set (void)
16933 static alias_set_type set = -1;
16934 if (set == -1)
16935 set = new_alias_set ();
16936 return set;
16939 /* Return a legitimate reference for ORIG (an address) using the
16940 register REG. If REG is 0, a new pseudo is generated.
16942 There are two types of references that must be handled:
16944 1. Global data references must load the address from the GOT, via
16945 the PIC reg. An insn is emitted to do this load, and the reg is
16946 returned.
16948 2. Static data references, constant pool addresses, and code labels
16949 compute the address as an offset from the GOT, whose base is in
16950 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
16951 differentiate them from global data objects. The returned
16952 address is the PIC reg + an unspec constant.
16954 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
16955 reg also appears in the address. */
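/* (Sketch of the two cases described above, assuming ia32 with the PIC
   register in %ebx: a global object turns into a load of its address from
   the GOT, roughly "movl foo@GOT(%ebx), %reg", while a local/static object
   is addressed directly as "foo@GOTOFF(%ebx)".)  */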
16957 static rtx
16958 legitimize_pic_address (rtx orig, rtx reg)
16960 rtx addr = orig;
16961 rtx new_rtx = orig;
16963 #if TARGET_MACHO
16964 if (TARGET_MACHO && !TARGET_64BIT)
16966 if (reg == 0)
16967 reg = gen_reg_rtx (Pmode);
16968 /* Use the generic Mach-O PIC machinery. */
16969 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
16971 #endif
16973 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16975 rtx tmp = legitimize_pe_coff_symbol (addr, true);
16976 if (tmp)
16977 return tmp;
16980 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
16981 new_rtx = addr;
16982 else if ((!TARGET_64BIT
16983 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
16984 && !TARGET_PECOFF
16985 && gotoff_operand (addr, Pmode))
16987 /* This symbol may be referenced via a displacement
16988 from the PIC base address (@GOTOFF). */
16989 if (GET_CODE (addr) == CONST)
16990 addr = XEXP (addr, 0);
16992 if (GET_CODE (addr) == PLUS)
16994 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
16995 UNSPEC_GOTOFF);
16996 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
16998 else
16999 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
17001 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17003 if (TARGET_64BIT)
17004 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
17006 if (reg != 0)
17008 gcc_assert (REG_P (reg));
17009 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
17010 new_rtx, reg, 1, OPTAB_DIRECT);
17012 else
17013 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
17015 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
17016 /* We can't use @GOTOFF for text labels
17017 on VxWorks, see gotoff_operand. */
17018 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
17020 rtx tmp = legitimize_pe_coff_symbol (addr, true);
17021 if (tmp)
17022 return tmp;
17024 /* For x64 PE-COFF there is no GOT table,
17025 so we use the address directly. */
17026 if (TARGET_64BIT && TARGET_PECOFF)
17028 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
17029 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17031 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
17033 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
17034 UNSPEC_GOTPCREL);
17035 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17036 new_rtx = gen_const_mem (Pmode, new_rtx);
17037 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
17039 else
17041 /* This symbol must be referenced via a load
17042 from the Global Offset Table (@GOT). */
17043 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
17044 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17045 if (TARGET_64BIT)
17046 new_rtx = force_reg (Pmode, new_rtx);
17047 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
17048 new_rtx = gen_const_mem (Pmode, new_rtx);
17049 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
17052 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
17054 else
17056 if (CONST_INT_P (addr)
17057 && !x86_64_immediate_operand (addr, VOIDmode))
17058 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
17059 else if (GET_CODE (addr) == CONST)
17061 addr = XEXP (addr, 0);
17063 /* We must match stuff we generate before. Assume the only
17064 unspecs that can get here are ours. Not that we could do
17065 anything with them anyway.... */
17066 if (GET_CODE (addr) == UNSPEC
17067 || (GET_CODE (addr) == PLUS
17068 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
17069 return orig;
17070 gcc_assert (GET_CODE (addr) == PLUS);
17073 if (GET_CODE (addr) == PLUS)
17075 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
17077 /* Check first to see if this is a constant
17078 offset from a @GOTOFF symbol reference. */
17079 if (!TARGET_PECOFF
17080 && gotoff_operand (op0, Pmode)
17081 && CONST_INT_P (op1))
17083 if (!TARGET_64BIT)
17085 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
17086 UNSPEC_GOTOFF);
17087 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
17088 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17090 if (reg != 0)
17092 gcc_assert (REG_P (reg));
17093 new_rtx = expand_simple_binop (Pmode, PLUS,
17094 pic_offset_table_rtx,
17095 new_rtx, reg, 1,
17096 OPTAB_DIRECT);
17098 else
17099 new_rtx
17100 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
17102 else
17104 if (INTVAL (op1) < -16*1024*1024
17105 || INTVAL (op1) >= 16*1024*1024)
17107 if (!x86_64_immediate_operand (op1, Pmode))
17108 op1 = force_reg (Pmode, op1);
17110 new_rtx
17111 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
17115 else
17117 rtx base = legitimize_pic_address (op0, reg);
17118 machine_mode mode = GET_MODE (base);
17119 new_rtx
17120 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
17122 if (CONST_INT_P (new_rtx))
17124 if (INTVAL (new_rtx) < -16*1024*1024
17125 || INTVAL (new_rtx) >= 16*1024*1024)
17127 if (!x86_64_immediate_operand (new_rtx, mode))
17128 new_rtx = force_reg (mode, new_rtx);
17130 new_rtx
17131 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
17133 else
17134 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
17136 else
17138 /* For %rip addressing, we have to use
17139 just disp32, neither base nor index. */
17140 if (TARGET_64BIT
17141 && (GET_CODE (base) == SYMBOL_REF
17142 || GET_CODE (base) == LABEL_REF))
17143 base = force_reg (mode, base);
17144 if (GET_CODE (new_rtx) == PLUS
17145 && CONSTANT_P (XEXP (new_rtx, 1)))
17147 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
17148 new_rtx = XEXP (new_rtx, 1);
17150 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
17155 return new_rtx;
17158 /* Load the thread pointer. If TO_REG is true, force it into a register. */
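/* (Background note, not from the original source: on typical GNU/Linux
   targets the thread pointer lives in a segment register, so the UNSPEC_TP
   access built below ends up as %fs:0 in 64-bit mode or %gs:0 in 32-bit
   mode.)  */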
17160 static rtx
17161 get_thread_pointer (machine_mode tp_mode, bool to_reg)
17163 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
17165 if (GET_MODE (tp) != tp_mode)
17167 gcc_assert (GET_MODE (tp) == SImode);
17168 gcc_assert (tp_mode == DImode);
17170 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
17173 if (to_reg)
17174 tp = copy_to_mode_reg (tp_mode, tp);
17176 return tp;
17179 /* Construct the SYMBOL_REF for the tls_get_addr function. */
17181 static GTY(()) rtx ix86_tls_symbol;
17183 static rtx
17184 ix86_tls_get_addr (void)
17186 if (!ix86_tls_symbol)
17188 const char *sym
17189 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
17190 ? "___tls_get_addr" : "__tls_get_addr");
17192 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
17195 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
17197 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
17198 UNSPEC_PLTOFF);
17199 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
17200 gen_rtx_CONST (Pmode, unspec));
17203 return ix86_tls_symbol;
17206 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
17208 static GTY(()) rtx ix86_tls_module_base_symbol;
17211 ix86_tls_module_base (void)
17213 if (!ix86_tls_module_base_symbol)
17215 ix86_tls_module_base_symbol
17216 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
17218 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
17219 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
17222 return ix86_tls_module_base_symbol;
17225 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
17226 false if we expect this to be used for a memory address and true if
17227 we expect to load the address into a register. */
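/* (Rough sketch of the resulting sequences on a GNU/Linux target, for
   orientation only: local-exec references reduce to thread-pointer-relative
   accesses such as "%fs:foo@tpoff"; initial-exec loads the offset from the
   GOT, e.g. "movq foo@gottpoff(%rip), %reg"; global- and local-dynamic call
   __tls_get_addr unless TARGET_GNU2_TLS selects the GNU2 descriptor
   sequences.)  */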
17229 static rtx
17230 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
17232 rtx dest, base, off;
17233 rtx pic = NULL_RTX, tp = NULL_RTX;
17234 machine_mode tp_mode = Pmode;
17235 int type;
17237 /* Fall back to the global dynamic model if the tool chain cannot support
17238 local dynamic. */
17239 if (TARGET_SUN_TLS && !TARGET_64BIT
17240 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
17241 && model == TLS_MODEL_LOCAL_DYNAMIC)
17242 model = TLS_MODEL_GLOBAL_DYNAMIC;
17244 switch (model)
17246 case TLS_MODEL_GLOBAL_DYNAMIC:
17247 dest = gen_reg_rtx (Pmode);
17249 if (!TARGET_64BIT)
17251 if (flag_pic && !TARGET_PECOFF)
17252 pic = pic_offset_table_rtx;
17253 else
17255 pic = gen_reg_rtx (Pmode);
17256 emit_insn (gen_set_got (pic));
17260 if (TARGET_GNU2_TLS)
17262 if (TARGET_64BIT)
17263 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
17264 else
17265 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
17267 tp = get_thread_pointer (Pmode, true);
17268 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
17270 if (GET_MODE (x) != Pmode)
17271 x = gen_rtx_ZERO_EXTEND (Pmode, x);
17273 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
17275 else
17277 rtx caddr = ix86_tls_get_addr ();
17279 if (TARGET_64BIT)
17281 rtx rax = gen_rtx_REG (Pmode, AX_REG);
17282 rtx_insn *insns;
17284 start_sequence ();
17285 emit_call_insn
17286 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
17287 insns = get_insns ();
17288 end_sequence ();
17290 if (GET_MODE (x) != Pmode)
17291 x = gen_rtx_ZERO_EXTEND (Pmode, x);
17293 RTL_CONST_CALL_P (insns) = 1;
17294 emit_libcall_block (insns, dest, rax, x);
17296 else
17297 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
17299 break;
17301 case TLS_MODEL_LOCAL_DYNAMIC:
17302 base = gen_reg_rtx (Pmode);
17304 if (!TARGET_64BIT)
17306 if (flag_pic)
17307 pic = pic_offset_table_rtx;
17308 else
17310 pic = gen_reg_rtx (Pmode);
17311 emit_insn (gen_set_got (pic));
17315 if (TARGET_GNU2_TLS)
17317 rtx tmp = ix86_tls_module_base ();
17319 if (TARGET_64BIT)
17320 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
17321 else
17322 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
17324 tp = get_thread_pointer (Pmode, true);
17325 set_unique_reg_note (get_last_insn (), REG_EQUAL,
17326 gen_rtx_MINUS (Pmode, tmp, tp));
17328 else
17330 rtx caddr = ix86_tls_get_addr ();
17332 if (TARGET_64BIT)
17334 rtx rax = gen_rtx_REG (Pmode, AX_REG);
17335 rtx_insn *insns;
17336 rtx eqv;
17338 start_sequence ();
17339 emit_call_insn
17340 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
17341 insns = get_insns ();
17342 end_sequence ();
17344 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
17345 share the LD_BASE result with other LD model accesses. */
17346 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
17347 UNSPEC_TLS_LD_BASE);
17349 RTL_CONST_CALL_P (insns) = 1;
17350 emit_libcall_block (insns, base, rax, eqv);
17352 else
17353 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
17356 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
17357 off = gen_rtx_CONST (Pmode, off);
17359 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
17361 if (TARGET_GNU2_TLS)
17363 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
17365 if (GET_MODE (x) != Pmode)
17366 x = gen_rtx_ZERO_EXTEND (Pmode, x);
17368 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
17370 break;
17372 case TLS_MODEL_INITIAL_EXEC:
17373 if (TARGET_64BIT)
17375 if (TARGET_SUN_TLS && !TARGET_X32)
17377 /* The Sun linker took the AMD64 TLS spec literally
17378 and can only handle %rax as destination of the
17379 initial executable code sequence. */
17381 dest = gen_reg_rtx (DImode);
17382 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
17383 return dest;
17386 /* Generate DImode references to avoid %fs:(%reg32)
17387 problems and the linker IE->LE relaxation bug. */
17388 tp_mode = DImode;
17389 pic = NULL;
17390 type = UNSPEC_GOTNTPOFF;
17392 else if (flag_pic)
17394 pic = pic_offset_table_rtx;
17395 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
17397 else if (!TARGET_ANY_GNU_TLS)
17399 pic = gen_reg_rtx (Pmode);
17400 emit_insn (gen_set_got (pic));
17401 type = UNSPEC_GOTTPOFF;
17403 else
17405 pic = NULL;
17406 type = UNSPEC_INDNTPOFF;
17409 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
17410 off = gen_rtx_CONST (tp_mode, off);
17411 if (pic)
17412 off = gen_rtx_PLUS (tp_mode, pic, off);
17413 off = gen_const_mem (tp_mode, off);
17414 set_mem_alias_set (off, ix86_GOT_alias_set ());
17416 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
17418 base = get_thread_pointer (tp_mode,
17419 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
17420 off = force_reg (tp_mode, off);
17421 dest = gen_rtx_PLUS (tp_mode, base, off);
17422 if (tp_mode != Pmode)
17423 dest = convert_to_mode (Pmode, dest, 1);
17425 else
17427 base = get_thread_pointer (Pmode, true);
17428 dest = gen_reg_rtx (Pmode);
17429 emit_insn (ix86_gen_sub3 (dest, base, off));
17431 break;
17433 case TLS_MODEL_LOCAL_EXEC:
17434 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
17435 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
17436 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
17437 off = gen_rtx_CONST (Pmode, off);
17439 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
17441 base = get_thread_pointer (Pmode,
17442 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
17443 return gen_rtx_PLUS (Pmode, base, off);
17445 else
17447 base = get_thread_pointer (Pmode, true);
17448 dest = gen_reg_rtx (Pmode);
17449 emit_insn (ix86_gen_sub3 (dest, base, off));
17451 break;
17453 default:
17454 gcc_unreachable ();
17457 return dest;
17460 /* Create or return the unique __imp_DECL dllimport symbol corresponding
17461 to symbol DECL if BEIMPORT is true. Otherwise create or return the
17462 unique refptr-DECL symbol corresponding to symbol DECL. */
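/* (Illustrative example, assuming a PE-COFF target: for a dllimported
   "foo", the decl built below gets the assembler name "__imp_foo" and its
   DECL_RTL is a memory reference through that symbol, so uses of foo load
   the real address from the import table at run time.)  */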
17464 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
17466 static inline hashval_t hash (tree_map *m) { return m->hash; }
17467 static inline bool
17468 equal (tree_map *a, tree_map *b)
17470 return a->base.from == b->base.from;
17473 static int
17474 keep_cache_entry (tree_map *&m)
17476 return ggc_marked_p (m->base.from);
17480 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
17482 static tree
17483 get_dllimport_decl (tree decl, bool beimport)
17485 struct tree_map *h, in;
17486 const char *name;
17487 const char *prefix;
17488 size_t namelen, prefixlen;
17489 char *imp_name;
17490 tree to;
17491 rtx rtl;
17493 if (!dllimport_map)
17494 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
17496 in.hash = htab_hash_pointer (decl);
17497 in.base.from = decl;
17498 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
17499 h = *loc;
17500 if (h)
17501 return h->to;
17503 *loc = h = ggc_alloc<tree_map> ();
17504 h->hash = in.hash;
17505 h->base.from = decl;
17506 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
17507 VAR_DECL, NULL, ptr_type_node);
17508 DECL_ARTIFICIAL (to) = 1;
17509 DECL_IGNORED_P (to) = 1;
17510 DECL_EXTERNAL (to) = 1;
17511 TREE_READONLY (to) = 1;
17513 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
17514 name = targetm.strip_name_encoding (name);
17515 if (beimport)
17516 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
17517 ? "*__imp_" : "*__imp__";
17518 else
17519 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
17520 namelen = strlen (name);
17521 prefixlen = strlen (prefix);
17522 imp_name = (char *) alloca (namelen + prefixlen + 1);
17523 memcpy (imp_name, prefix, prefixlen);
17524 memcpy (imp_name + prefixlen, name, namelen + 1);
17526 name = ggc_alloc_string (imp_name, namelen + prefixlen);
17527 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
17528 SET_SYMBOL_REF_DECL (rtl, to);
17529 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
17530 if (!beimport)
17532 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
17533 #ifdef SUB_TARGET_RECORD_STUB
17534 SUB_TARGET_RECORD_STUB (name);
17535 #endif
17538 rtl = gen_const_mem (Pmode, rtl);
17539 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
17541 SET_DECL_RTL (to, rtl);
17542 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
17544 return to;
17547 /* Expand SYMBOL into its corresponding far-address symbol.
17548 WANT_REG is true if we require the result be a register. */
17550 static rtx
17551 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
17553 tree imp_decl;
17554 rtx x;
17556 gcc_assert (SYMBOL_REF_DECL (symbol));
17557 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
17559 x = DECL_RTL (imp_decl);
17560 if (want_reg)
17561 x = force_reg (Pmode, x);
17562 return x;
17565 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
17566 true if we require the result be a register. */
17568 static rtx
17569 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
17571 tree imp_decl;
17572 rtx x;
17574 gcc_assert (SYMBOL_REF_DECL (symbol));
17575 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
17577 x = DECL_RTL (imp_decl);
17578 if (want_reg)
17579 x = force_reg (Pmode, x);
17580 return x;
17583 /* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
17584 is true if we require the result be a register. */
17586 static rtx
17587 legitimize_pe_coff_symbol (rtx addr, bool inreg)
17589 if (!TARGET_PECOFF)
17590 return NULL_RTX;
17592 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
17594 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
17595 return legitimize_dllimport_symbol (addr, inreg);
17596 if (GET_CODE (addr) == CONST
17597 && GET_CODE (XEXP (addr, 0)) == PLUS
17598 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
17599 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
17601 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
17602 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
17606 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
17607 return NULL_RTX;
17608 if (GET_CODE (addr) == SYMBOL_REF
17609 && !is_imported_p (addr)
17610 && SYMBOL_REF_EXTERNAL_P (addr)
17611 && SYMBOL_REF_DECL (addr))
17612 return legitimize_pe_coff_extern_decl (addr, inreg);
17614 if (GET_CODE (addr) == CONST
17615 && GET_CODE (XEXP (addr, 0)) == PLUS
17616 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
17617 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
17618 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
17619 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
17621 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
17622 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
17624 return NULL_RTX;
17627 /* Try machine-dependent ways of modifying an illegitimate address
17628 to be legitimate. If we find one, return the new, valid address.
17629 This macro is used in only one place: `memory_address' in explow.c.
17631 OLDX is the address as it was before break_out_memory_refs was called.
17632 In some cases it is useful to look at this to decide what needs to be done.
17634 It is always safe for this macro to do nothing. It exists to recognize
17635 opportunities to optimize the output.
17637 For the 80386, we handle X+REG by loading X into a register R and
17638 using R+REG. R will go in a general reg and indexing will be used.
17639 However, if REG is a broken-out memory address or multiplication,
17640 nothing needs to be done because REG can certainly go in a general reg.
17642 When -fpic is used, special handling is needed for symbolic references.
17643 See comments by legitimize_pic_address in i386.c for details. */
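/* (For illustration, a sketch only: an input like
   (plus (reg) (ashift (reg) (const_int 2))) is canonicalized below into
   (plus (reg) (mult (reg) (const_int 4))), which ix86_decompose_address
   then recognizes as a base + index*4 address.)  */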
17645 static rtx
17646 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
17648 bool changed = false;
17649 unsigned log;
17651 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
17652 if (log)
17653 return legitimize_tls_address (x, (enum tls_model) log, false);
17654 if (GET_CODE (x) == CONST
17655 && GET_CODE (XEXP (x, 0)) == PLUS
17656 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
17657 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
17659 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
17660 (enum tls_model) log, false);
17661 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
17664 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
17666 rtx tmp = legitimize_pe_coff_symbol (x, true);
17667 if (tmp)
17668 return tmp;
17671 if (flag_pic && SYMBOLIC_CONST (x))
17672 return legitimize_pic_address (x, 0);
17674 #if TARGET_MACHO
17675 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
17676 return machopic_indirect_data_reference (x, 0);
17677 #endif
17679 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
17680 if (GET_CODE (x) == ASHIFT
17681 && CONST_INT_P (XEXP (x, 1))
17682 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
17684 changed = true;
17685 log = INTVAL (XEXP (x, 1));
17686 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
17687 GEN_INT (1 << log));
17690 if (GET_CODE (x) == PLUS)
17692 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
17694 if (GET_CODE (XEXP (x, 0)) == ASHIFT
17695 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
17696 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
17698 changed = true;
17699 log = INTVAL (XEXP (XEXP (x, 0), 1));
17700 XEXP (x, 0) = gen_rtx_MULT (Pmode,
17701 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
17702 GEN_INT (1 << log));
17705 if (GET_CODE (XEXP (x, 1)) == ASHIFT
17706 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
17707 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
17709 changed = true;
17710 log = INTVAL (XEXP (XEXP (x, 1), 1));
17711 XEXP (x, 1) = gen_rtx_MULT (Pmode,
17712 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
17713 GEN_INT (1 << log));
17716 /* Put multiply first if it isn't already. */
17717 if (GET_CODE (XEXP (x, 1)) == MULT)
17719 std::swap (XEXP (x, 0), XEXP (x, 1));
17720 changed = true;
17723 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
17724 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
17725 created by virtual register instantiation, register elimination, and
17726 similar optimizations. */
17727 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
17729 changed = true;
17730 x = gen_rtx_PLUS (Pmode,
17731 gen_rtx_PLUS (Pmode, XEXP (x, 0),
17732 XEXP (XEXP (x, 1), 0)),
17733 XEXP (XEXP (x, 1), 1));
17736 /* Canonicalize
17737 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
17738 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
17739 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
17740 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
17741 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
17742 && CONSTANT_P (XEXP (x, 1)))
17744 rtx constant;
17745 rtx other = NULL_RTX;
17747 if (CONST_INT_P (XEXP (x, 1)))
17749 constant = XEXP (x, 1);
17750 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
17752 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
17754 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
17755 other = XEXP (x, 1);
17757 else
17758 constant = 0;
17760 if (constant)
17762 changed = true;
17763 x = gen_rtx_PLUS (Pmode,
17764 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
17765 XEXP (XEXP (XEXP (x, 0), 1), 0)),
17766 plus_constant (Pmode, other,
17767 INTVAL (constant)));
17771 if (changed && ix86_legitimate_address_p (mode, x, false))
17772 return x;
17774 if (GET_CODE (XEXP (x, 0)) == MULT)
17776 changed = true;
17777 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
17780 if (GET_CODE (XEXP (x, 1)) == MULT)
17782 changed = true;
17783 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
17786 if (changed
17787 && REG_P (XEXP (x, 1))
17788 && REG_P (XEXP (x, 0)))
17789 return x;
17791 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
17793 changed = true;
17794 x = legitimize_pic_address (x, 0);
17797 if (changed && ix86_legitimate_address_p (mode, x, false))
17798 return x;
17800 if (REG_P (XEXP (x, 0)))
17802 rtx temp = gen_reg_rtx (Pmode);
17803 rtx val = force_operand (XEXP (x, 1), temp);
17804 if (val != temp)
17806 val = convert_to_mode (Pmode, val, 1);
17807 emit_move_insn (temp, val);
17810 XEXP (x, 1) = temp;
17811 return x;
17814 else if (REG_P (XEXP (x, 1)))
17816 rtx temp = gen_reg_rtx (Pmode);
17817 rtx val = force_operand (XEXP (x, 0), temp);
17818 if (val != temp)
17820 val = convert_to_mode (Pmode, val, 1);
17821 emit_move_insn (temp, val);
17824 XEXP (x, 0) = temp;
17825 return x;
17829 return x;
17832 /* Print an integer constant expression in assembler syntax. Addition
17833 and subtraction are the only arithmetic that may appear in these
17834 expressions. FILE is the stdio stream to write to, X is the rtx, and
17835 CODE is the operand print code from the output string. */
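/* (Example of the output produced below, for orientation only: the RTX
   (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)) prints as
   "foo@GOTOFF", and UNSPEC_GOTPCREL prints as "foo@GOTPCREL(%rip)" in
   AT&T syntax.)  */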
17837 static void
17838 output_pic_addr_const (FILE *file, rtx x, int code)
17840 char buf[256];
17842 switch (GET_CODE (x))
17844 case PC:
17845 gcc_assert (flag_pic);
17846 putc ('.', file);
17847 break;
17849 case SYMBOL_REF:
17850 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
17851 output_addr_const (file, x);
17852 else
17854 const char *name = XSTR (x, 0);
17856 /* Mark the decl as referenced so that cgraph will
17857 output the function. */
17858 if (SYMBOL_REF_DECL (x))
17859 mark_decl_referenced (SYMBOL_REF_DECL (x));
17861 #if TARGET_MACHO
17862 if (MACHOPIC_INDIRECT
17863 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
17864 name = machopic_indirection_name (x, /*stub_p=*/true);
17865 #endif
17866 assemble_name (file, name);
17868 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
17869 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
17870 fputs ("@PLT", file);
17871 break;
17873 case LABEL_REF:
17874 x = XEXP (x, 0);
17875 /* FALLTHRU */
17876 case CODE_LABEL:
17877 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
17878 assemble_name (asm_out_file, buf);
17879 break;
17881 case CONST_INT:
17882 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
17883 break;
17885 case CONST:
17886 /* This used to output parentheses around the expression,
17887 but that does not work on the 386 (either ATT or BSD assembler). */
17888 output_pic_addr_const (file, XEXP (x, 0), code);
17889 break;
17891 case CONST_DOUBLE:
17892 /* We can't handle floating point constants;
17893 TARGET_PRINT_OPERAND must handle them. */
17894 output_operand_lossage ("floating constant misused");
17895 break;
17897 case PLUS:
17898 /* Some assemblers need integer constants to appear first. */
17899 if (CONST_INT_P (XEXP (x, 0)))
17901 output_pic_addr_const (file, XEXP (x, 0), code);
17902 putc ('+', file);
17903 output_pic_addr_const (file, XEXP (x, 1), code);
17905 else
17907 gcc_assert (CONST_INT_P (XEXP (x, 1)));
17908 output_pic_addr_const (file, XEXP (x, 1), code);
17909 putc ('+', file);
17910 output_pic_addr_const (file, XEXP (x, 0), code);
17912 break;
17914 case MINUS:
17915 if (!TARGET_MACHO)
17916 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
17917 output_pic_addr_const (file, XEXP (x, 0), code);
17918 putc ('-', file);
17919 output_pic_addr_const (file, XEXP (x, 1), code);
17920 if (!TARGET_MACHO)
17921 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
17922 break;
17924 case UNSPEC:
17925 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
17927 bool f = i386_asm_output_addr_const_extra (file, x);
17928 gcc_assert (f);
17929 break;
17932 gcc_assert (XVECLEN (x, 0) == 1);
17933 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
17934 switch (XINT (x, 1))
17936 case UNSPEC_GOT:
17937 fputs ("@GOT", file);
17938 break;
17939 case UNSPEC_GOTOFF:
17940 fputs ("@GOTOFF", file);
17941 break;
17942 case UNSPEC_PLTOFF:
17943 fputs ("@PLTOFF", file);
17944 break;
17945 case UNSPEC_PCREL:
17946 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17947 "(%rip)" : "[rip]", file);
17948 break;
17949 case UNSPEC_GOTPCREL:
17950 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17951 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
17952 break;
17953 case UNSPEC_GOTTPOFF:
17954 /* FIXME: This might be @TPOFF in Sun ld too. */
17955 fputs ("@gottpoff", file);
17956 break;
17957 case UNSPEC_TPOFF:
17958 fputs ("@tpoff", file);
17959 break;
17960 case UNSPEC_NTPOFF:
17961 if (TARGET_64BIT)
17962 fputs ("@tpoff", file);
17963 else
17964 fputs ("@ntpoff", file);
17965 break;
17966 case UNSPEC_DTPOFF:
17967 fputs ("@dtpoff", file);
17968 break;
17969 case UNSPEC_GOTNTPOFF:
17970 if (TARGET_64BIT)
17971 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17972 "@gottpoff(%rip)": "@gottpoff[rip]", file);
17973 else
17974 fputs ("@gotntpoff", file);
17975 break;
17976 case UNSPEC_INDNTPOFF:
17977 fputs ("@indntpoff", file);
17978 break;
17979 #if TARGET_MACHO
17980 case UNSPEC_MACHOPIC_OFFSET:
17981 putc ('-', file);
17982 machopic_output_function_base_name (file);
17983 break;
17984 #endif
17985 default:
17986 output_operand_lossage ("invalid UNSPEC as operand");
17987 break;
17989 break;
17991 default:
17992 output_operand_lossage ("invalid expression as operand");
17996 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
17997 We need to emit DTP-relative relocations. */
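/* (Example of the directives emitted below, assuming ASM_LONG is ".long":
   a 4-byte DTP-relative reference to foo comes out as ".long foo@dtpoff",
   and the 8-byte case appends ", 0" for the upper half.)  */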
17999 static void ATTRIBUTE_UNUSED
18000 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
18002 fputs (ASM_LONG, file);
18003 output_addr_const (file, x);
18004 fputs ("@dtpoff", file);
18005 switch (size)
18007 case 4:
18008 break;
18009 case 8:
18010 fputs (", 0", file);
18011 break;
18012 default:
18013 gcc_unreachable ();
18017 /* Return true if X is a representation of the PIC register. This copes
18018 with calls from ix86_find_base_term, where the register might have
18019 been replaced by a cselib value. */
18021 static bool
18022 ix86_pic_register_p (rtx x)
18024 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
18025 return (pic_offset_table_rtx
18026 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
18027 else if (!REG_P (x))
18028 return false;
18029 else if (pic_offset_table_rtx)
18031 if (REGNO (x) == REGNO (pic_offset_table_rtx))
18032 return true;
18033 if (HARD_REGISTER_P (x)
18034 && !HARD_REGISTER_P (pic_offset_table_rtx)
18035 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
18036 return true;
18037 return false;
18039 else
18040 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
18043 /* Helper function for ix86_delegitimize_address.
18044 Attempt to delegitimize TLS local-exec accesses. */
18046 static rtx
18047 ix86_delegitimize_tls_address (rtx orig_x)
18049 rtx x = orig_x, unspec;
18050 struct ix86_address addr;
18052 if (!TARGET_TLS_DIRECT_SEG_REFS)
18053 return orig_x;
18054 if (MEM_P (x))
18055 x = XEXP (x, 0);
18056 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
18057 return orig_x;
18058 if (ix86_decompose_address (x, &addr) == 0
18059 || addr.seg != DEFAULT_TLS_SEG_REG
18060 || addr.disp == NULL_RTX
18061 || GET_CODE (addr.disp) != CONST)
18062 return orig_x;
18063 unspec = XEXP (addr.disp, 0);
18064 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
18065 unspec = XEXP (unspec, 0);
18066 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
18067 return orig_x;
18068 x = XVECEXP (unspec, 0, 0);
18069 gcc_assert (GET_CODE (x) == SYMBOL_REF);
18070 if (unspec != XEXP (addr.disp, 0))
18071 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
18072 if (addr.index)
18074 rtx idx = addr.index;
18075 if (addr.scale != 1)
18076 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
18077 x = gen_rtx_PLUS (Pmode, idx, x);
18079 if (addr.base)
18080 x = gen_rtx_PLUS (Pmode, addr.base, x);
18081 if (MEM_P (orig_x))
18082 x = replace_equiv_address_nv (orig_x, x);
18083 return x;
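/* ix86_delegitimize_tls_address, roughly: a local-exec TLS access whose
   address decomposes with seg == DEFAULT_TLS_SEG_REG and a displacement of
   (const (unspec [(symbol_ref "foo")] UNSPEC_NTPOFF)) gets the displacement
   replaced by the SYMBOL_REF "foo" itself, with any base, index and
   constant offset re-applied.  */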
18086 /* In the name of slightly smaller debug output, and to cater to
18087 general assembler lossage, recognize PIC+GOTOFF and turn it back
18088 into a direct symbol reference.
18090 On Darwin, this is necessary to avoid a crash, because Darwin
18091 has a different PIC label for each routine but the DWARF debugging
18092 information is not associated with any particular routine, so it's
18093 necessary to remove references to the PIC label from RTL stored by
18094 the DWARF output code.
18096 This helper is used in the normal ix86_delegitimize_address
18097 entrypoint (e.g. used in the target delegitimization hook) and
18098 in ix86_find_base_term. As a compile-time memory optimization, we
18099 avoid allocating rtxes that will not change anything in the outcome
18100 of the callers (find_base_value and find_base_term). */
18102 static inline rtx
18103 ix86_delegitimize_address_1 (rtx x, bool base_term_p)
18105 rtx orig_x = delegitimize_mem_from_attrs (x);
18106 /* addend is NULL or some rtx if x is something+GOTOFF where
18107 something doesn't include the PIC register. */
18108 rtx addend = NULL_RTX;
18109 /* reg_addend is NULL or a multiple of some register. */
18110 rtx reg_addend = NULL_RTX;
18111 /* const_addend is NULL or a const_int. */
18112 rtx const_addend = NULL_RTX;
18113 /* This is the result, or NULL. */
18114 rtx result = NULL_RTX;
18116 x = orig_x;
18118 if (MEM_P (x))
18119 x = XEXP (x, 0);
18121 if (TARGET_64BIT)
18123 if (GET_CODE (x) == CONST
18124 && GET_CODE (XEXP (x, 0)) == PLUS
18125 && GET_MODE (XEXP (x, 0)) == Pmode
18126 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
18127 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
18128 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
18130 /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
18131 base. A CONST can't be arg_pointer_rtx based. */
18132 if (base_term_p && MEM_P (orig_x))
18133 return orig_x;
18134 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
18135 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
18136 if (MEM_P (orig_x))
18137 x = replace_equiv_address_nv (orig_x, x);
18138 return x;
18141 if (GET_CODE (x) == CONST
18142 && GET_CODE (XEXP (x, 0)) == UNSPEC
18143 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
18144 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
18145 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
18147 x = XVECEXP (XEXP (x, 0), 0, 0);
18148 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
18150 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
18151 if (x == NULL_RTX)
18152 return orig_x;
18154 return x;
18157 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
18158 return ix86_delegitimize_tls_address (orig_x);
18160 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
18161 and -mcmodel=medium -fpic. */
18164 if (GET_CODE (x) != PLUS
18165 || GET_CODE (XEXP (x, 1)) != CONST)
18166 return ix86_delegitimize_tls_address (orig_x);
18168 if (ix86_pic_register_p (XEXP (x, 0)))
18169 /* %ebx + GOT/GOTOFF */
18171 else if (GET_CODE (XEXP (x, 0)) == PLUS)
18173 /* %ebx + %reg * scale + GOT/GOTOFF */
18174 reg_addend = XEXP (x, 0);
18175 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
18176 reg_addend = XEXP (reg_addend, 1);
18177 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
18178 reg_addend = XEXP (reg_addend, 0);
18179 else
18181 reg_addend = NULL_RTX;
18182 addend = XEXP (x, 0);
18185 else
18186 addend = XEXP (x, 0);
18188 x = XEXP (XEXP (x, 1), 0);
18189 if (GET_CODE (x) == PLUS
18190 && CONST_INT_P (XEXP (x, 1)))
18192 const_addend = XEXP (x, 1);
18193 x = XEXP (x, 0);
18196 if (GET_CODE (x) == UNSPEC
18197 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
18198 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
18199 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
18200 && !MEM_P (orig_x) && !addend)))
18201 result = XVECEXP (x, 0, 0);
18203 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
18204 && !MEM_P (orig_x))
18205 result = XVECEXP (x, 0, 0);
18207 if (! result)
18208 return ix86_delegitimize_tls_address (orig_x);
18210 /* For (PLUS something CONST_INT) both find_base_{value,term} just
18211 recurse on the first operand. */
18212 if (const_addend && !base_term_p)
18213 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
18214 if (reg_addend)
18215 result = gen_rtx_PLUS (Pmode, reg_addend, result);
18216 if (addend)
18218 /* If the rest of original X doesn't involve the PIC register, add
18219 addend and subtract pic_offset_table_rtx. This can happen e.g.
18220 for code like:
18221 leal (%ebx, %ecx, 4), %ecx
18223 movl foo@GOTOFF(%ecx), %edx
18224 in which case we return (%ecx - %ebx) + foo
18225 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
18226 and reload has completed. */
18227 if (pic_offset_table_rtx
18228 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
18229 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
18230 pic_offset_table_rtx),
18231 result);
18232 else if (pic_offset_table_rtx && !TARGET_MACHO && !TARGET_VXWORKS_RTP)
18234 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
18235 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
18236 result = gen_rtx_PLUS (Pmode, tmp, result);
18238 else
18239 return orig_x;
18241 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
18243 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
18244 if (result == NULL_RTX)
18245 return orig_x;
18247 return result;
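/* A minimal example of the common -m32 case, assuming %ebx is the PIC
   register: the legitimized address
   (plus (reg:SI ebx) (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)))
   is turned back into the SYMBOL_REF "foo" itself, with reg_addend,
   const_addend and addend re-applied when they were present.  */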
18250 /* The normal instantiation of the above template. */
18252 static rtx
18253 ix86_delegitimize_address (rtx x)
18255 return ix86_delegitimize_address_1 (x, false);
18258 /* If X is a machine specific address (i.e. a symbol or label being
18259 referenced as a displacement from the GOT implemented using an
18260 UNSPEC), then return the base term. Otherwise return X. */
18263 ix86_find_base_term (rtx x)
18265 rtx term;
18267 if (TARGET_64BIT)
18269 if (GET_CODE (x) != CONST)
18270 return x;
18271 term = XEXP (x, 0);
18272 if (GET_CODE (term) == PLUS
18273 && CONST_INT_P (XEXP (term, 1)))
18274 term = XEXP (term, 0);
18275 if (GET_CODE (term) != UNSPEC
18276 || (XINT (term, 1) != UNSPEC_GOTPCREL
18277 && XINT (term, 1) != UNSPEC_PCREL))
18278 return x;
18280 return XVECEXP (term, 0, 0);
18283 return ix86_delegitimize_address_1 (x, true);
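/* Output the suffix ("e", "ne", "a", "b", "ge", ...) of the setcc/cmov/jcc
   mnemonic for comparison CODE in mode MODE to FILE.  REVERSE selects the
   reversed condition; FP selects the spellings used for fcmov where they
   differ (e.g. "nbe" rather than "a" for GTU).  */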
18286 static void
18287 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
18288 bool fp, FILE *file)
18290 const char *suffix;
18292 if (mode == CCFPmode || mode == CCFPUmode)
18294 code = ix86_fp_compare_code_to_integer (code);
18295 mode = CCmode;
18297 if (reverse)
18298 code = reverse_condition (code);
18300 switch (code)
18302 case EQ:
18303 switch (mode)
18305 case CCAmode:
18306 suffix = "a";
18307 break;
18308 case CCCmode:
18309 suffix = "c";
18310 break;
18311 case CCOmode:
18312 suffix = "o";
18313 break;
18314 case CCPmode:
18315 suffix = "p";
18316 break;
18317 case CCSmode:
18318 suffix = "s";
18319 break;
18320 default:
18321 suffix = "e";
18322 break;
18324 break;
18325 case NE:
18326 switch (mode)
18328 case CCAmode:
18329 suffix = "na";
18330 break;
18331 case CCCmode:
18332 suffix = "nc";
18333 break;
18334 case CCOmode:
18335 suffix = "no";
18336 break;
18337 case CCPmode:
18338 suffix = "np";
18339 break;
18340 case CCSmode:
18341 suffix = "ns";
18342 break;
18343 default:
18344 suffix = "ne";
18345 break;
18347 break;
18348 case GT:
18349 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
18350 suffix = "g";
18351 break;
18352 case GTU:
18353 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
18354 Those same assemblers have the same but opposite lossage on cmov. */
18355 if (mode == CCmode)
18356 suffix = fp ? "nbe" : "a";
18357 else
18358 gcc_unreachable ();
18359 break;
18360 case LT:
18361 switch (mode)
18363 case CCNOmode:
18364 case CCGOCmode:
18365 suffix = "s";
18366 break;
18368 case CCmode:
18369 case CCGCmode:
18370 suffix = "l";
18371 break;
18373 default:
18374 gcc_unreachable ();
18376 break;
18377 case LTU:
18378 if (mode == CCmode)
18379 suffix = "b";
18380 else if (mode == CCCmode)
18381 suffix = fp ? "b" : "c";
18382 else
18383 gcc_unreachable ();
18384 break;
18385 case GE:
18386 switch (mode)
18388 case CCNOmode:
18389 case CCGOCmode:
18390 suffix = "ns";
18391 break;
18393 case CCmode:
18394 case CCGCmode:
18395 suffix = "ge";
18396 break;
18398 default:
18399 gcc_unreachable ();
18401 break;
18402 case GEU:
18403 if (mode == CCmode)
18404 suffix = "nb";
18405 else if (mode == CCCmode)
18406 suffix = fp ? "nb" : "nc";
18407 else
18408 gcc_unreachable ();
18409 break;
18410 case LE:
18411 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
18412 suffix = "le";
18413 break;
18414 case LEU:
18415 if (mode == CCmode)
18416 suffix = "be";
18417 else
18418 gcc_unreachable ();
18419 break;
18420 case UNORDERED:
18421 suffix = fp ? "u" : "p";
18422 break;
18423 case ORDERED:
18424 suffix = fp ? "nu" : "np";
18425 break;
18426 default:
18427 gcc_unreachable ();
18429 fputs (suffix, file);
18432 /* Print the name of register X to FILE based on its machine mode and number.
18433 If CODE is 'w', pretend the mode is HImode.
18434 If CODE is 'b', pretend the mode is QImode.
18435 If CODE is 'k', pretend the mode is SImode.
18436 If CODE is 'q', pretend the mode is DImode.
18437 If CODE is 'x', pretend the mode is V4SFmode.
18438 If CODE is 't', pretend the mode is V8SFmode.
18439 If CODE is 'g', pretend the mode is V16SFmode.
18440 If CODE is 'h', pretend the reg is the 'high' byte register.
18441 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
18442 If CODE is 'd', duplicate the operand for AVX instruction.
18445 void
18446 print_reg (rtx x, int code, FILE *file)
18448 const char *reg;
18449 int msize;
18450 unsigned int regno;
18451 bool duplicated;
18453 if (ASSEMBLER_DIALECT == ASM_ATT)
18454 putc ('%', file);
18456 if (x == pc_rtx)
18458 gcc_assert (TARGET_64BIT);
18459 fputs ("rip", file);
18460 return;
18463 if (code == 'y' && STACK_TOP_P (x))
18465 fputs ("st(0)", file);
18466 return;
18469 if (code == 'w')
18470 msize = 2;
18471 else if (code == 'b')
18472 msize = 1;
18473 else if (code == 'k')
18474 msize = 4;
18475 else if (code == 'q')
18476 msize = 8;
18477 else if (code == 'h')
18478 msize = 0;
18479 else if (code == 'x')
18480 msize = 16;
18481 else if (code == 't')
18482 msize = 32;
18483 else if (code == 'g')
18484 msize = 64;
18485 else
18486 msize = GET_MODE_SIZE (GET_MODE (x));
18488 regno = REGNO (x);
18490 if (regno == ARG_POINTER_REGNUM
18491 || regno == FRAME_POINTER_REGNUM
18492 || regno == FPSR_REG
18493 || regno == FPCR_REG)
18495 output_operand_lossage
18496 ("invalid use of register '%s'", reg_names[regno]);
18497 return;
18499 else if (regno == FLAGS_REG)
18501 output_operand_lossage ("invalid use of asm flag output");
18502 return;
18505 duplicated = code == 'd' && TARGET_AVX;
18507 switch (msize)
18509 case 16:
18510 case 12:
18511 case 8:
18512 if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode))
18513 warning (0, "unsupported size for integer register");
18514 /* FALLTHRU */
18515 case 4:
18516 if (LEGACY_INT_REGNO_P (regno))
18517 putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file);
18518 /* FALLTHRU */
18519 case 2:
18520 normal:
18521 reg = hi_reg_name[regno];
18522 break;
18523 case 1:
18524 if (regno >= ARRAY_SIZE (qi_reg_name))
18525 goto normal;
18526 if (!ANY_QI_REGNO_P (regno))
18527 error ("unsupported size for integer register");
18528 reg = qi_reg_name[regno];
18529 break;
18530 case 0:
18531 if (regno >= ARRAY_SIZE (qi_high_reg_name))
18532 goto normal;
18533 reg = qi_high_reg_name[regno];
18534 break;
18535 case 32:
18536 case 64:
18537 if (SSE_REGNO_P (regno))
18539 gcc_assert (!duplicated);
18540 putc (msize == 32 ? 'y' : 'z', file);
18541 reg = hi_reg_name[regno] + 1;
18542 break;
18544 goto normal;
18545 default:
18546 gcc_unreachable ();
18549 fputs (reg, file);
18551 /* Irritatingly, AMD extended registers use a
18552 different naming convention: "r%d[bwd]". */
18553 if (REX_INT_REGNO_P (regno))
18555 gcc_assert (TARGET_64BIT);
18556 switch (msize)
18558 case 0:
18559 error ("extended registers have no high halves");
18560 break;
18561 case 1:
18562 putc ('b', file);
18563 break;
18564 case 2:
18565 putc ('w', file);
18566 break;
18567 case 4:
18568 putc ('d', file);
18569 break;
18570 case 8:
18571 /* no suffix */
18572 break;
18573 default:
18574 error ("unsupported operand size for extended register");
18575 break;
18577 return;
18580 if (duplicated)
18582 if (ASSEMBLER_DIALECT == ASM_ATT)
18583 fprintf (file, ", %%%s", reg);
18584 else
18585 fprintf (file, ", %s", reg);
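/* For example, with (reg:SI ax) and AT&T syntax, print_reg emits "%al" for
   code 'b', "%ax" for 'w', "%eax" for 'k' and "%rax" for 'q' (the last only
   on TARGET_64BIT).  */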
18589 /* Meaning of CODE:
18590 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
18591 C -- print opcode suffix for set/cmov insn.
18592 c -- like C, but print reversed condition
18593 F,f -- likewise, but for floating-point.
18594 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
18595 otherwise nothing
18596 R -- print embedded rounding and sae.
18597 r -- print only sae.
18598 z -- print the opcode suffix for the size of the current operand.
18599 Z -- likewise, with special suffixes for x87 instructions.
18600 * -- print a star (in certain assembler syntax)
18601 A -- print an absolute memory reference.
18602 E -- print address with DImode register names if TARGET_64BIT.
18603 w -- print the operand as if it's a "word" (HImode) even if it isn't.
18604 s -- print a shift double count, followed by the assembler's argument
18605 delimiter.
18606 b -- print the QImode name of the register for the indicated operand.
18607 %b0 would print %al if operands[0] is reg 0.
18608 w -- likewise, print the HImode name of the register.
18609 k -- likewise, print the SImode name of the register.
18610 q -- likewise, print the DImode name of the register.
18611 x -- likewise, print the V4SFmode name of the register.
18612 t -- likewise, print the V8SFmode name of the register.
18613 g -- likewise, print the V16SFmode name of the register.
18614 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
18615 y -- print "st(0)" instead of "st" as a register.
18616 d -- print duplicated register operand for AVX instruction.
18617 D -- print condition for SSE cmp instruction.
18618 P -- if PIC, print an @PLT suffix.
18619 p -- print raw symbol name.
18620 X -- don't print any sort of PIC '@' suffix for a symbol.
18621 & -- print some in-use local-dynamic symbol name.
18622 H -- print a memory address offset by 8; used for sse high-parts
18623 Y -- print condition for XOP pcom* instruction.
18624 + -- print a branch hint as 'cs' or 'ds' prefix
18625 ; -- print a semicolon (after prefixes due to bug in older gas).
18626 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
18627 @ -- print a segment register of thread base pointer load
18628 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
18629 ! -- print MPX prefix for jxx/call/ret instructions if required.
18632 void
18633 ix86_print_operand (FILE *file, rtx x, int code)
18635 if (code)
18637 switch (code)
18639 case 'A':
18640 switch (ASSEMBLER_DIALECT)
18642 case ASM_ATT:
18643 putc ('*', file);
18644 break;
18646 case ASM_INTEL:
18647 /* Intel syntax. For absolute addresses, registers should not
18648 be surrounded by brackets. */
18649 if (!REG_P (x))
18651 putc ('[', file);
18652 ix86_print_operand (file, x, 0);
18653 putc (']', file);
18654 return;
18656 break;
18658 default:
18659 gcc_unreachable ();
18662 ix86_print_operand (file, x, 0);
18663 return;
18665 case 'E':
18666 /* Wrap address in an UNSPEC to declare special handling. */
18667 if (TARGET_64BIT)
18668 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
18670 output_address (VOIDmode, x);
18671 return;
18673 case 'L':
18674 if (ASSEMBLER_DIALECT == ASM_ATT)
18675 putc ('l', file);
18676 return;
18678 case 'W':
18679 if (ASSEMBLER_DIALECT == ASM_ATT)
18680 putc ('w', file);
18681 return;
18683 case 'B':
18684 if (ASSEMBLER_DIALECT == ASM_ATT)
18685 putc ('b', file);
18686 return;
18688 case 'Q':
18689 if (ASSEMBLER_DIALECT == ASM_ATT)
18690 putc ('l', file);
18691 return;
18693 case 'S':
18694 if (ASSEMBLER_DIALECT == ASM_ATT)
18695 putc ('s', file);
18696 return;
18698 case 'T':
18699 if (ASSEMBLER_DIALECT == ASM_ATT)
18700 putc ('t', file);
18701 return;
18703 case 'O':
18704 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
18705 if (ASSEMBLER_DIALECT != ASM_ATT)
18706 return;
18708 switch (GET_MODE_SIZE (GET_MODE (x)))
18710 case 2:
18711 putc ('w', file);
18712 break;
18714 case 4:
18715 putc ('l', file);
18716 break;
18718 case 8:
18719 putc ('q', file);
18720 break;
18722 default:
18723 output_operand_lossage ("invalid operand size for operand "
18724 "code 'O'");
18725 return;
18728 putc ('.', file);
18729 #endif
18730 return;
18732 case 'z':
18733 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18735 /* Opcodes don't get size suffixes when using Intel syntax. */
18736 if (ASSEMBLER_DIALECT == ASM_INTEL)
18737 return;
18739 switch (GET_MODE_SIZE (GET_MODE (x)))
18741 case 1:
18742 putc ('b', file);
18743 return;
18745 case 2:
18746 putc ('w', file);
18747 return;
18749 case 4:
18750 putc ('l', file);
18751 return;
18753 case 8:
18754 putc ('q', file);
18755 return;
18757 default:
18758 output_operand_lossage ("invalid operand size for operand "
18759 "code 'z'");
18760 return;
18764 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18765 warning (0, "non-integer operand used with operand code 'z'");
18766 /* FALLTHRU */
18768 case 'Z':
18769 /* 387 opcodes don't get size suffixes when using Intel syntax. */
18770 if (ASSEMBLER_DIALECT == ASM_INTEL)
18771 return;
18773 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18775 switch (GET_MODE_SIZE (GET_MODE (x)))
18777 case 2:
18778 #ifdef HAVE_AS_IX86_FILDS
18779 putc ('s', file);
18780 #endif
18781 return;
18783 case 4:
18784 putc ('l', file);
18785 return;
18787 case 8:
18788 #ifdef HAVE_AS_IX86_FILDQ
18789 putc ('q', file);
18790 #else
18791 fputs ("ll", file);
18792 #endif
18793 return;
18795 default:
18796 break;
18799 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18801 /* 387 opcodes don't get size suffixes
18802 if the operands are registers. */
18803 if (STACK_REG_P (x))
18804 return;
18806 switch (GET_MODE_SIZE (GET_MODE (x)))
18808 case 4:
18809 putc ('s', file);
18810 return;
18812 case 8:
18813 putc ('l', file);
18814 return;
18816 case 12:
18817 case 16:
18818 putc ('t', file);
18819 return;
18821 default:
18822 break;
18825 else
18827 output_operand_lossage ("invalid operand type used with "
18828 "operand code 'Z'");
18829 return;
18832 output_operand_lossage ("invalid operand size for operand code 'Z'");
18833 return;
18835 case 'd':
18836 case 'b':
18837 case 'w':
18838 case 'k':
18839 case 'q':
18840 case 'h':
18841 case 't':
18842 case 'g':
18843 case 'y':
18844 case 'x':
18845 case 'X':
18846 case 'P':
18847 case 'p':
18848 break;
18850 case 's':
18851 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
18853 ix86_print_operand (file, x, 0);
18854 fputs (", ", file);
18856 return;
18858 case 'Y':
18859 switch (GET_CODE (x))
18861 case NE:
18862 fputs ("neq", file);
18863 break;
18864 case EQ:
18865 fputs ("eq", file);
18866 break;
18867 case GE:
18868 case GEU:
18869 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
18870 break;
18871 case GT:
18872 case GTU:
18873 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
18874 break;
18875 case LE:
18876 case LEU:
18877 fputs ("le", file);
18878 break;
18879 case LT:
18880 case LTU:
18881 fputs ("lt", file);
18882 break;
18883 case UNORDERED:
18884 fputs ("unord", file);
18885 break;
18886 case ORDERED:
18887 fputs ("ord", file);
18888 break;
18889 case UNEQ:
18890 fputs ("ueq", file);
18891 break;
18892 case UNGE:
18893 fputs ("nlt", file);
18894 break;
18895 case UNGT:
18896 fputs ("nle", file);
18897 break;
18898 case UNLE:
18899 fputs ("ule", file);
18900 break;
18901 case UNLT:
18902 fputs ("ult", file);
18903 break;
18904 case LTGT:
18905 fputs ("une", file);
18906 break;
18907 default:
18908 output_operand_lossage ("operand is not a condition code, "
18909 "invalid operand code 'Y'");
18910 return;
18912 return;
18914 case 'D':
18915 /* Little bit of braindamage here. The SSE compare instructions
18916 use completely different names for the comparisons than the
18917 fp conditional moves do. */
18918 switch (GET_CODE (x))
18920 case UNEQ:
18921 if (TARGET_AVX)
18923 fputs ("eq_us", file);
18924 break;
18926 /* FALLTHRU */
18927 case EQ:
18928 fputs ("eq", file);
18929 break;
18930 case UNLT:
18931 if (TARGET_AVX)
18933 fputs ("nge", file);
18934 break;
18936 /* FALLTHRU */
18937 case LT:
18938 fputs ("lt", file);
18939 break;
18940 case UNLE:
18941 if (TARGET_AVX)
18943 fputs ("ngt", file);
18944 break;
18946 /* FALLTHRU */
18947 case LE:
18948 fputs ("le", file);
18949 break;
18950 case UNORDERED:
18951 fputs ("unord", file);
18952 break;
18953 case LTGT:
18954 if (TARGET_AVX)
18956 fputs ("neq_oq", file);
18957 break;
18959 /* FALLTHRU */
18960 case NE:
18961 fputs ("neq", file);
18962 break;
18963 case GE:
18964 if (TARGET_AVX)
18966 fputs ("ge", file);
18967 break;
18969 /* FALLTHRU */
18970 case UNGE:
18971 fputs ("nlt", file);
18972 break;
18973 case GT:
18974 if (TARGET_AVX)
18976 fputs ("gt", file);
18977 break;
18979 /* FALLTHRU */
18980 case UNGT:
18981 fputs ("nle", file);
18982 break;
18983 case ORDERED:
18984 fputs ("ord", file);
18985 break;
18986 default:
18987 output_operand_lossage ("operand is not a condition code, "
18988 "invalid operand code 'D'");
18989 return;
18991 return;
18993 case 'F':
18994 case 'f':
18995 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
18996 if (ASSEMBLER_DIALECT == ASM_ATT)
18997 putc ('.', file);
18998 gcc_fallthrough ();
18999 #endif
19001 case 'C':
19002 case 'c':
19003 if (!COMPARISON_P (x))
19005 output_operand_lossage ("operand is not a condition code, "
19006 "invalid operand code '%c'", code);
19007 return;
19009 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
19010 code == 'c' || code == 'f',
19011 code == 'F' || code == 'f',
19012 file);
19013 return;
19015 case 'H':
19016 if (!offsettable_memref_p (x))
19018 output_operand_lossage ("operand is not an offsettable memory "
19019 "reference, invalid operand code 'H'");
19020 return;
19022 /* It doesn't actually matter what mode we use here, as we're
19023 only going to use this for printing. */
19024 x = adjust_address_nv (x, DImode, 8);
19025 /* Output 'qword ptr' for intel assembler dialect. */
19026 if (ASSEMBLER_DIALECT == ASM_INTEL)
19027 code = 'q';
19028 break;
19030 case 'K':
19031 if (!CONST_INT_P (x))
19033 output_operand_lossage ("operand is not an integer, invalid "
19034 "operand code 'K'");
19035 return;
19038 if (INTVAL (x) & IX86_HLE_ACQUIRE)
19039 #ifdef HAVE_AS_IX86_HLE
19040 fputs ("xacquire ", file);
19041 #else
19042 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
19043 #endif
19044 else if (INTVAL (x) & IX86_HLE_RELEASE)
19045 #ifdef HAVE_AS_IX86_HLE
19046 fputs ("xrelease ", file);
19047 #else
19048 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
19049 #endif
19050 /* We do not want to print the value of the operand. */
19051 return;
19053 case 'N':
19054 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
19055 fputs ("{z}", file);
19056 return;
19058 case 'r':
19059 if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
19061 output_operand_lossage ("operand is not a specific integer, "
19062 "invalid operand code 'r'");
19063 return;
19066 if (ASSEMBLER_DIALECT == ASM_INTEL)
19067 fputs (", ", file);
19069 fputs ("{sae}", file);
19071 if (ASSEMBLER_DIALECT == ASM_ATT)
19072 fputs (", ", file);
19074 return;
19076 case 'R':
19077 if (!CONST_INT_P (x))
19079 output_operand_lossage ("operand is not an integer, invalid "
19080 "operand code 'R'");
19081 return;
19084 if (ASSEMBLER_DIALECT == ASM_INTEL)
19085 fputs (", ", file);
19087 switch (INTVAL (x))
19089 case ROUND_NEAREST_INT | ROUND_SAE:
19090 fputs ("{rn-sae}", file);
19091 break;
19092 case ROUND_NEG_INF | ROUND_SAE:
19093 fputs ("{rd-sae}", file);
19094 break;
19095 case ROUND_POS_INF | ROUND_SAE:
19096 fputs ("{ru-sae}", file);
19097 break;
19098 case ROUND_ZERO | ROUND_SAE:
19099 fputs ("{rz-sae}", file);
19100 break;
19101 default:
19102 output_operand_lossage ("operand is not a specific integer, "
19103 "invalid operand code 'R'");
19106 if (ASSEMBLER_DIALECT == ASM_ATT)
19107 fputs (", ", file);
19109 return;
19111 case '*':
19112 if (ASSEMBLER_DIALECT == ASM_ATT)
19113 putc ('*', file);
19114 return;
19116 case '&':
19118 const char *name = get_some_local_dynamic_name ();
19119 if (name == NULL)
19120 output_operand_lossage ("'%%&' used without any "
19121 "local dynamic TLS references");
19122 else
19123 assemble_name (file, name);
19124 return;
19127 case '+':
19129 rtx x;
19131 if (!optimize
19132 || optimize_function_for_size_p (cfun)
19133 || !TARGET_BRANCH_PREDICTION_HINTS)
19134 return;
19136 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
19137 if (x)
19139 int pred_val = profile_probability::from_reg_br_prob_note
19140 (XINT (x, 0)).to_reg_br_prob_base ();
19142 if (pred_val < REG_BR_PROB_BASE * 45 / 100
19143 || pred_val > REG_BR_PROB_BASE * 55 / 100)
19145 bool taken = pred_val > REG_BR_PROB_BASE / 2;
19146 bool cputaken
19147 = final_forward_branch_p (current_output_insn) == 0;
19149 /* Emit hints only in the case default branch prediction
19150 heuristics would fail. */
19151 if (taken != cputaken)
19153 /* We use 3e (DS) prefix for taken branches and
19154 2e (CS) prefix for not taken branches. */
19155 if (taken)
19156 fputs ("ds ; ", file);
19157 else
19158 fputs ("cs ; ", file);
19162 return;
19165 case ';':
19166 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
19167 putc (';', file);
19168 #endif
19169 return;
19171 case '@':
19172 if (ASSEMBLER_DIALECT == ASM_ATT)
19173 putc ('%', file);
19175 /* The kernel uses a different segment register for performance
19176 reasons; this way a system call does not have to trash the userspace
19177 segment register, which would be expensive. */
19178 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
19179 fputs ("fs", file);
19180 else
19181 fputs ("gs", file);
19182 return;
19184 case '~':
19185 putc (TARGET_AVX2 ? 'i' : 'f', file);
19186 return;
19188 case '^':
19189 if (TARGET_64BIT && Pmode != word_mode)
19190 fputs ("addr32 ", file);
19191 return;
19193 case '!':
19194 if (ix86_bnd_prefixed_insn_p (current_output_insn))
19195 fputs ("bnd ", file);
19196 return;
19198 default:
19199 output_operand_lossage ("invalid operand code '%c'", code);
19203 if (REG_P (x))
19204 print_reg (x, code, file);
19206 else if (MEM_P (x))
19208 rtx addr = XEXP (x, 0);
19210 /* No `byte ptr' prefix for call instructions ... */
19211 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
19213 machine_mode mode = GET_MODE (x);
19214 const char *size;
19216 /* Check for explicit size override codes. */
19217 if (code == 'b')
19218 size = "BYTE";
19219 else if (code == 'w')
19220 size = "WORD";
19221 else if (code == 'k')
19222 size = "DWORD";
19223 else if (code == 'q')
19224 size = "QWORD";
19225 else if (code == 'x')
19226 size = "XMMWORD";
19227 else if (code == 't')
19228 size = "YMMWORD";
19229 else if (code == 'g')
19230 size = "ZMMWORD";
19231 else if (mode == BLKmode)
19232 /* ... or BLKmode operands, when not overridden. */
19233 size = NULL;
19234 else
19235 switch (GET_MODE_SIZE (mode))
19237 case 1: size = "BYTE"; break;
19238 case 2: size = "WORD"; break;
19239 case 4: size = "DWORD"; break;
19240 case 8: size = "QWORD"; break;
19241 case 12: size = "TBYTE"; break;
19242 case 16:
19243 if (mode == XFmode)
19244 size = "TBYTE";
19245 else
19246 size = "XMMWORD";
19247 break;
19248 case 32: size = "YMMWORD"; break;
19249 case 64: size = "ZMMWORD"; break;
19250 default:
19251 gcc_unreachable ();
19253 if (size)
19255 fputs (size, file);
19256 fputs (" PTR ", file);
19260 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
19261 output_operand_lossage ("invalid constraints for operand");
19262 else
19263 ix86_print_operand_address_as
19264 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
19267 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
19269 long l;
19271 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
19273 if (ASSEMBLER_DIALECT == ASM_ATT)
19274 putc ('$', file);
19275 /* Sign extend 32bit SFmode immediate to 8 bytes. */
19276 if (code == 'q')
19277 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
19278 (unsigned long long) (int) l);
19279 else
19280 fprintf (file, "0x%08x", (unsigned int) l);
19283 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
19285 long l[2];
19287 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
19289 if (ASSEMBLER_DIALECT == ASM_ATT)
19290 putc ('$', file);
19291 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
19294 /* These float cases don't actually occur as immediate operands. */
19295 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
19297 char dstr[30];
19299 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
19300 fputs (dstr, file);
19303 else
19305 /* We have patterns that allow zero sets of memory, for instance.
19306 In 64-bit mode, we should probably support all 8-byte vectors,
19307 since we can in fact encode that into an immediate. */
19308 if (GET_CODE (x) == CONST_VECTOR)
19310 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
19311 x = const0_rtx;
19314 if (code != 'P' && code != 'p')
19316 if (CONST_INT_P (x))
19318 if (ASSEMBLER_DIALECT == ASM_ATT)
19319 putc ('$', file);
19321 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
19322 || GET_CODE (x) == LABEL_REF)
19324 if (ASSEMBLER_DIALECT == ASM_ATT)
19325 putc ('$', file);
19326 else
19327 fputs ("OFFSET FLAT:", file);
19330 if (CONST_INT_P (x))
19331 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
19332 else if (flag_pic || MACHOPIC_INDIRECT)
19333 output_pic_addr_const (file, x, code);
19334 else
19335 output_addr_const (file, x);
19339 static bool
19340 ix86_print_operand_punct_valid_p (unsigned char code)
19342 return (code == '@' || code == '*' || code == '+' || code == '&'
19343 || code == ';' || code == '~' || code == '^' || code == '!');
19346 /* Print a memory operand whose address is ADDR. */
19348 static void
19349 ix86_print_operand_address_as (FILE *file, rtx addr,
19350 addr_space_t as, bool no_rip)
19352 struct ix86_address parts;
19353 rtx base, index, disp;
19354 int scale;
19355 int ok;
19356 bool vsib = false;
19357 int code = 0;
19359 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
19361 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
19362 gcc_assert (parts.index == NULL_RTX);
19363 parts.index = XVECEXP (addr, 0, 1);
19364 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
19365 addr = XVECEXP (addr, 0, 0);
19366 vsib = true;
19368 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
19370 gcc_assert (TARGET_64BIT);
19371 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
19372 code = 'q';
19374 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
19376 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
19377 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
19378 if (parts.base != NULL_RTX)
19380 parts.index = parts.base;
19381 parts.scale = 1;
19383 parts.base = XVECEXP (addr, 0, 0);
19384 addr = XVECEXP (addr, 0, 0);
19386 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
19388 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
19389 gcc_assert (parts.index == NULL_RTX);
19390 parts.index = XVECEXP (addr, 0, 1);
19391 addr = XVECEXP (addr, 0, 0);
19393 else
19394 ok = ix86_decompose_address (addr, &parts);
19396 gcc_assert (ok);
19398 base = parts.base;
19399 index = parts.index;
19400 disp = parts.disp;
19401 scale = parts.scale;
19403 if (ADDR_SPACE_GENERIC_P (as))
19404 as = parts.seg;
19405 else
19406 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
19408 if (!ADDR_SPACE_GENERIC_P (as))
19410 const char *string;
19412 if (as == ADDR_SPACE_SEG_FS)
19413 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
19414 else if (as == ADDR_SPACE_SEG_GS)
19415 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
19416 else
19417 gcc_unreachable ();
19418 fputs (string, file);
19421 /* Use one-byte shorter RIP-relative addressing for 64bit mode. */
19422 if (TARGET_64BIT && !base && !index && !no_rip)
19424 rtx symbol = disp;
19426 if (GET_CODE (disp) == CONST
19427 && GET_CODE (XEXP (disp, 0)) == PLUS
19428 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
19429 symbol = XEXP (XEXP (disp, 0), 0);
19431 if (GET_CODE (symbol) == LABEL_REF
19432 || (GET_CODE (symbol) == SYMBOL_REF
19433 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
19434 base = pc_rtx;
19437 if (!base && !index)
19439 /* A displacement-only address requires special attention. */
19440 if (CONST_INT_P (disp))
19442 if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as))
19443 fputs ("ds:", file);
19444 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
19446 /* Load the external function address via the GOT slot to avoid PLT. */
19447 else if (GET_CODE (disp) == CONST
19448 && GET_CODE (XEXP (disp, 0)) == UNSPEC
19449 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
19450 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
19451 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
19452 output_pic_addr_const (file, disp, 0);
19453 else if (flag_pic)
19454 output_pic_addr_const (file, disp, 0);
19455 else
19456 output_addr_const (file, disp);
19458 else
19460 /* Print SImode register names to force addr32 prefix. */
19461 if (SImode_address_operand (addr, VOIDmode))
19463 if (flag_checking)
19465 gcc_assert (TARGET_64BIT);
19466 switch (GET_CODE (addr))
19468 case SUBREG:
19469 gcc_assert (GET_MODE (addr) == SImode);
19470 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
19471 break;
19472 case ZERO_EXTEND:
19473 case AND:
19474 gcc_assert (GET_MODE (addr) == DImode);
19475 break;
19476 default:
19477 gcc_unreachable ();
19480 gcc_assert (!code);
19481 code = 'k';
19483 else if (code == 0
19484 && TARGET_X32
19485 && disp
19486 && CONST_INT_P (disp)
19487 && INTVAL (disp) < -16*1024*1024)
19489 /* X32 runs in 64-bit mode, where displacement, DISP, in
19490 address DISP(%r64), is encoded as 32-bit immediate sign-
19491 extended from 32-bit to 64-bit. For -0x40000300(%r64),
19492 address is %r64 + 0xffffffffbffffd00. When %r64 <
19493 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
19494 which is invalid for x32. The correct address is %r64
19495 - 0x40000300 == 0xf7ffdd64. To properly encode
19496 -0x40000300(%r64) for x32, we zero-extend negative
19497 displacement by forcing addr32 prefix which truncates
19498 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
19499 zero-extend all negative displacements, including -1(%rsp).
19500 However, for small negative displacements, sign-extension
19501 won't cause overflow. We only zero-extend negative
19502 displacements if they are < -16*1024*1024, which is also used
19503 to check legitimate address displacements for PIC. */
19504 code = 'k';
19507 if (ASSEMBLER_DIALECT == ASM_ATT)
19509 if (disp)
19511 if (flag_pic)
19512 output_pic_addr_const (file, disp, 0);
19513 else if (GET_CODE (disp) == LABEL_REF)
19514 output_asm_label (disp);
19515 else
19516 output_addr_const (file, disp);
19519 putc ('(', file);
19520 if (base)
19521 print_reg (base, code, file);
19522 if (index)
19524 putc (',', file);
19525 print_reg (index, vsib ? 0 : code, file);
19526 if (scale != 1 || vsib)
19527 fprintf (file, ",%d", scale);
19529 putc (')', file);
19531 else
19533 rtx offset = NULL_RTX;
19535 if (disp)
19537 /* Pull out the offset of a symbol; print any symbol itself. */
19538 if (GET_CODE (disp) == CONST
19539 && GET_CODE (XEXP (disp, 0)) == PLUS
19540 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
19542 offset = XEXP (XEXP (disp, 0), 1);
19543 disp = gen_rtx_CONST (VOIDmode,
19544 XEXP (XEXP (disp, 0), 0));
19547 if (flag_pic)
19548 output_pic_addr_const (file, disp, 0);
19549 else if (GET_CODE (disp) == LABEL_REF)
19550 output_asm_label (disp);
19551 else if (CONST_INT_P (disp))
19552 offset = disp;
19553 else
19554 output_addr_const (file, disp);
19557 putc ('[', file);
19558 if (base)
19560 print_reg (base, code, file);
19561 if (offset)
19563 if (INTVAL (offset) >= 0)
19564 putc ('+', file);
19565 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
19568 else if (offset)
19569 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
19570 else
19571 putc ('0', file);
19573 if (index)
19575 putc ('+', file);
19576 print_reg (index, vsib ? 0 : code, file);
19577 if (scale != 1 || vsib)
19578 fprintf (file, "*%d", scale);
19580 putc (']', file);
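/* ix86_print_operand_address_as thus renders an address with base %ebx,
   index %ecx, scale 4 and displacement 16 as "16(%ebx,%ecx,4)" in AT&T
   syntax and as "[ebx+16+ecx*4]" in Intel syntax.  */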
19585 static void
19586 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
19588 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
19591 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
19593 static bool
19594 i386_asm_output_addr_const_extra (FILE *file, rtx x)
19596 rtx op;
19598 if (GET_CODE (x) != UNSPEC)
19599 return false;
19601 op = XVECEXP (x, 0, 0);
19602 switch (XINT (x, 1))
19604 case UNSPEC_GOTTPOFF:
19605 output_addr_const (file, op);
19606 /* FIXME: This might be @TPOFF in Sun ld. */
19607 fputs ("@gottpoff", file);
19608 break;
19609 case UNSPEC_TPOFF:
19610 output_addr_const (file, op);
19611 fputs ("@tpoff", file);
19612 break;
19613 case UNSPEC_NTPOFF:
19614 output_addr_const (file, op);
19615 if (TARGET_64BIT)
19616 fputs ("@tpoff", file);
19617 else
19618 fputs ("@ntpoff", file);
19619 break;
19620 case UNSPEC_DTPOFF:
19621 output_addr_const (file, op);
19622 fputs ("@dtpoff", file);
19623 break;
19624 case UNSPEC_GOTNTPOFF:
19625 output_addr_const (file, op);
19626 if (TARGET_64BIT)
19627 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
19628 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
19629 else
19630 fputs ("@gotntpoff", file);
19631 break;
19632 case UNSPEC_INDNTPOFF:
19633 output_addr_const (file, op);
19634 fputs ("@indntpoff", file);
19635 break;
19636 #if TARGET_MACHO
19637 case UNSPEC_MACHOPIC_OFFSET:
19638 output_addr_const (file, op);
19639 putc ('-', file);
19640 machopic_output_function_base_name (file);
19641 break;
19642 #endif
19644 case UNSPEC_STACK_CHECK:
19646 int offset;
19648 gcc_assert (flag_split_stack);
19650 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
19651 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
19652 #else
19653 gcc_unreachable ();
19654 #endif
19656 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
19658 break;
19660 default:
19661 return false;
19664 return true;
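/* For instance, (unspec [(symbol_ref "x")] UNSPEC_NTPOFF) is printed here
   as "x@tpoff" on 64-bit targets and as "x@ntpoff" on 32-bit ones.  */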
19667 /* Split one or more double-mode RTL references into pairs of half-mode
19668 references. The RTL can be REG, offsettable MEM, integer constant, or
19669 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
19670 split and "num" is its length. lo_half and hi_half are output arrays
19671 that parallel "operands". */
19673 void
19674 split_double_mode (machine_mode mode, rtx operands[],
19675 int num, rtx lo_half[], rtx hi_half[])
19677 machine_mode half_mode;
19678 unsigned int byte;
19680 switch (mode)
19682 case TImode:
19683 half_mode = DImode;
19684 break;
19685 case DImode:
19686 half_mode = SImode;
19687 break;
19688 default:
19689 gcc_unreachable ();
19692 byte = GET_MODE_SIZE (half_mode);
19694 while (num--)
19696 rtx op = operands[num];
19698 /* simplify_subreg refuses to split volatile memory addresses,
19699 but we still have to handle them. */
19700 if (MEM_P (op))
19702 lo_half[num] = adjust_address (op, half_mode, 0);
19703 hi_half[num] = adjust_address (op, half_mode, byte);
19705 else
19707 lo_half[num] = simplify_gen_subreg (half_mode, op,
19708 GET_MODE (op) == VOIDmode
19709 ? mode : GET_MODE (op), 0);
19710 hi_half[num] = simplify_gen_subreg (half_mode, op,
19711 GET_MODE (op) == VOIDmode
19712 ? mode : GET_MODE (op), byte);
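/* E.g. split_double_mode on the DImode constant 0x100000002 produces the
   SImode pair lo_half = (const_int 2), hi_half = (const_int 1), matching
   the little-endian layout of the two halves in memory.  */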
19717 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
19718 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
19719 is the expression of the binary operation. The output may either be
19720 emitted here, or returned to the caller, like all output_* functions.
19722 There is no guarantee that the operands are in the same mode, as they
19723 might be within FLOAT or FLOAT_EXTEND expressions. */
19725 #ifndef SYSV386_COMPAT
19726 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
19727 wants to fix the assemblers because that causes incompatibility
19728 with gcc. No-one wants to fix gcc because that causes
19729 incompatibility with assemblers... You can use the option of
19730 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
19731 #define SYSV386_COMPAT 1
19732 #endif
19734 const char *
19735 output_387_binary_op (rtx_insn *insn, rtx *operands)
19737 static char buf[40];
19738 const char *p;
19739 const char *ssep;
19740 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
19742 /* Even if we do not want to check the inputs, this documents the input
19743 constraints, which helps in understanding the following code. */
19744 if (flag_checking)
19746 if (STACK_REG_P (operands[0])
19747 && ((REG_P (operands[1])
19748 && REGNO (operands[0]) == REGNO (operands[1])
19749 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
19750 || (REG_P (operands[2])
19751 && REGNO (operands[0]) == REGNO (operands[2])
19752 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
19753 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
19754 ; /* ok */
19755 else
19756 gcc_assert (is_sse);
19759 switch (GET_CODE (operands[3]))
19761 case PLUS:
19762 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19763 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19764 p = "fiadd";
19765 else
19766 p = "fadd";
19767 ssep = "vadd";
19768 break;
19770 case MINUS:
19771 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19772 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19773 p = "fisub";
19774 else
19775 p = "fsub";
19776 ssep = "vsub";
19777 break;
19779 case MULT:
19780 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19781 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19782 p = "fimul";
19783 else
19784 p = "fmul";
19785 ssep = "vmul";
19786 break;
19788 case DIV:
19789 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19790 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19791 p = "fidiv";
19792 else
19793 p = "fdiv";
19794 ssep = "vdiv";
19795 break;
19797 default:
19798 gcc_unreachable ();
19801 if (is_sse)
19803 if (TARGET_AVX)
19805 strcpy (buf, ssep);
19806 if (GET_MODE (operands[0]) == SFmode)
19807 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
19808 else
19809 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
19811 else
19813 strcpy (buf, ssep + 1);
19814 if (GET_MODE (operands[0]) == SFmode)
19815 strcat (buf, "ss\t{%2, %0|%0, %2}");
19816 else
19817 strcat (buf, "sd\t{%2, %0|%0, %2}");
19819 return buf;
19821 strcpy (buf, p);
19823 switch (GET_CODE (operands[3]))
19825 case MULT:
19826 case PLUS:
19827 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
19828 std::swap (operands[1], operands[2]);
19830 /* We know operands[0] == operands[1]. */
19832 if (MEM_P (operands[2]))
19834 p = "%Z2\t%2";
19835 break;
19838 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19840 if (STACK_TOP_P (operands[0]))
19841 /* How is it that we are storing to a dead operand[2]?
19842 Well, presumably operands[1] is dead too. We can't
19843 store the result to st(0) as st(0) gets popped on this
19844 instruction. Instead store to operands[2] (which I
19845 think has to be st(1)). st(1) will be popped later.
19846 gcc <= 2.8.1 didn't have this check and generated
19847 assembly code that the Unixware assembler rejected. */
19848 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
19849 else
19850 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
19851 break;
19854 if (STACK_TOP_P (operands[0]))
19855 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
19856 else
19857 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
19858 break;
19860 case MINUS:
19861 case DIV:
19862 if (MEM_P (operands[1]))
19864 p = "r%Z1\t%1";
19865 break;
19868 if (MEM_P (operands[2]))
19870 p = "%Z2\t%2";
19871 break;
19874 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19876 #if SYSV386_COMPAT
19877 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
19878 derived assemblers, confusingly reverse the direction of
19879 the operation for fsub{r} and fdiv{r} when the
19880 destination register is not st(0). The Intel assembler
19881 doesn't have this brain damage. Read !SYSV386_COMPAT to
19882 figure out what the hardware really does. */
19883 if (STACK_TOP_P (operands[0]))
19884 p = "{p\t%0, %2|rp\t%2, %0}";
19885 else
19886 p = "{rp\t%2, %0|p\t%0, %2}";
19887 #else
19888 if (STACK_TOP_P (operands[0]))
19889 /* As above for fmul/fadd, we can't store to st(0). */
19890 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
19891 else
19892 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
19893 #endif
19894 break;
19897 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
19899 #if SYSV386_COMPAT
19900 if (STACK_TOP_P (operands[0]))
19901 p = "{rp\t%0, %1|p\t%1, %0}";
19902 else
19903 p = "{p\t%1, %0|rp\t%0, %1}";
19904 #else
19905 if (STACK_TOP_P (operands[0]))
19906 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
19907 else
19908 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
19909 #endif
19910 break;
19913 if (STACK_TOP_P (operands[0]))
19915 if (STACK_TOP_P (operands[1]))
19916 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
19917 else
19918 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
19919 break;
19921 else if (STACK_TOP_P (operands[1]))
19923 #if SYSV386_COMPAT
19924 p = "{\t%1, %0|r\t%0, %1}";
19925 #else
19926 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
19927 #endif
19929 else
19931 #if SYSV386_COMPAT
19932 p = "{r\t%2, %0|\t%0, %2}";
19933 #else
19934 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
19935 #endif
19937 break;
19939 default:
19940 gcc_unreachable ();
19943 strcat (buf, p);
19944 return buf;
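/* As an illustration, for an SFmode PLUS with SSE operands this returns
   "vaddss\t{%2, %1, %0|%0, %1, %2}" when TARGET_AVX and
   "addss\t{%2, %0|%0, %2}" otherwise.  */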
19947 /* Return needed mode for entity in optimize_mode_switching pass. */
19949 static int
19950 ix86_dirflag_mode_needed (rtx_insn *insn)
19952 if (CALL_P (insn))
19954 if (cfun->machine->func_type == TYPE_NORMAL)
19955 return X86_DIRFLAG_ANY;
19956 else
19957 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
19958 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
19961 if (recog_memoized (insn) < 0)
19962 return X86_DIRFLAG_ANY;
19964 if (get_attr_type (insn) == TYPE_STR)
19966 /* Emit cld instruction if stringops are used in the function. */
19967 if (cfun->machine->func_type == TYPE_NORMAL)
19968 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
19969 else
19970 return X86_DIRFLAG_RESET;
19973 return X86_DIRFLAG_ANY;
19976 /* Check if a 256bit AVX register is referenced inside of EXP. */
19978 static bool
19979 ix86_check_avx256_register (const_rtx exp)
19981 if (SUBREG_P (exp))
19982 exp = SUBREG_REG (exp);
19984 return (REG_P (exp)
19985 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)));
19988 /* Return needed mode for entity in optimize_mode_switching pass. */
19990 static int
19991 ix86_avx_u128_mode_needed (rtx_insn *insn)
19993 if (CALL_P (insn))
19995 rtx link;
19997 /* Needed mode is set to AVX_U128_CLEAN if there are
19998 no 256bit modes used in function arguments. */
19999 for (link = CALL_INSN_FUNCTION_USAGE (insn);
20000 link;
20001 link = XEXP (link, 1))
20003 if (GET_CODE (XEXP (link, 0)) == USE)
20005 rtx arg = XEXP (XEXP (link, 0), 0);
20007 if (ix86_check_avx256_register (arg))
20008 return AVX_U128_DIRTY;
20012 return AVX_U128_CLEAN;
20015 /* Require DIRTY mode if a 256bit AVX register is referenced. The hardware
20016 changes state only when a 256bit register is written to, but we need
20017 to prevent the compiler from moving the optimal insertion point above
20018 an eventual read from a 256bit register. */
20019 subrtx_iterator::array_type array;
20020 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
20021 if (ix86_check_avx256_register (*iter))
20022 return AVX_U128_DIRTY;
20024 return AVX_U128_ANY;
20027 /* Return mode that i387 must be switched into
20028 prior to the execution of insn. */
20030 static int
20031 ix86_i387_mode_needed (int entity, rtx_insn *insn)
20033 enum attr_i387_cw mode;
20035 /* The mode UNINITIALIZED is used to store the control word after a
20036 function call or ASM pattern. The mode ANY specifies that the function
20037 has no requirements on the control word and makes no changes to the
20038 bits we are interested in. */
20040 if (CALL_P (insn)
20041 || (NONJUMP_INSN_P (insn)
20042 && (asm_noperands (PATTERN (insn)) >= 0
20043 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
20044 return I387_CW_UNINITIALIZED;
20046 if (recog_memoized (insn) < 0)
20047 return I387_CW_ANY;
20049 mode = get_attr_i387_cw (insn);
20051 switch (entity)
20053 case I387_TRUNC:
20054 if (mode == I387_CW_TRUNC)
20055 return mode;
20056 break;
20058 case I387_FLOOR:
20059 if (mode == I387_CW_FLOOR)
20060 return mode;
20061 break;
20063 case I387_CEIL:
20064 if (mode == I387_CW_CEIL)
20065 return mode;
20066 break;
20068 case I387_MASK_PM:
20069 if (mode == I387_CW_MASK_PM)
20070 return mode;
20071 break;
20073 default:
20074 gcc_unreachable ();
20077 return I387_CW_ANY;
20080 /* Return mode that entity must be switched into
20081 prior to the execution of insn. */
20083 static int
20084 ix86_mode_needed (int entity, rtx_insn *insn)
20086 switch (entity)
20088 case X86_DIRFLAG:
20089 return ix86_dirflag_mode_needed (insn);
20090 case AVX_U128:
20091 return ix86_avx_u128_mode_needed (insn);
20092 case I387_TRUNC:
20093 case I387_FLOOR:
20094 case I387_CEIL:
20095 case I387_MASK_PM:
20096 return ix86_i387_mode_needed (entity, insn);
20097 default:
20098 gcc_unreachable ();
20100 return 0;
20103 /* Check if a 256bit AVX register is referenced in stores. */
20105 static void
20106 ix86_check_avx256_stores (rtx dest, const_rtx, void *data)
20108 if (ix86_check_avx256_register (dest))
20110 bool *used = (bool *) data;
20111 *used = true;
20115 /* Calculate mode of upper 128bit AVX registers after the insn. */
20117 static int
20118 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
20120 rtx pat = PATTERN (insn);
20122 if (vzeroupper_operation (pat, VOIDmode)
20123 || vzeroall_operation (pat, VOIDmode))
20124 return AVX_U128_CLEAN;
20126 /* We know that the state is clean after a CALL insn if there are no
20127 256bit registers used for the function return value. */
20128 if (CALL_P (insn))
20130 bool avx_reg256_found = false;
20131 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
20133 return avx_reg256_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
20136 /* Otherwise, return the current mode. Remember that if the insn
20137 references AVX 256bit registers, the mode was already changed
20138 to DIRTY by MODE_NEEDED. */
20139 return mode;
20142 /* Return the mode that an insn results in. */
20144 static int
20145 ix86_mode_after (int entity, int mode, rtx_insn *insn)
20147 switch (entity)
20149 case X86_DIRFLAG:
20150 return mode;
20151 case AVX_U128:
20152 return ix86_avx_u128_mode_after (mode, insn);
20153 case I387_TRUNC:
20154 case I387_FLOOR:
20155 case I387_CEIL:
20156 case I387_MASK_PM:
20157 return mode;
20158 default:
20159 gcc_unreachable ();
20163 static int
20164 ix86_dirflag_mode_entry (void)
20166 /* For TARGET_CLD or in an interrupt handler we can't assume the
20167 direction flag state at function entry. */
20168 if (TARGET_CLD
20169 || cfun->machine->func_type != TYPE_NORMAL)
20170 return X86_DIRFLAG_ANY;
20172 return X86_DIRFLAG_RESET;
20175 static int
20176 ix86_avx_u128_mode_entry (void)
20178 tree arg;
20180 /* Entry mode is set to AVX_U128_DIRTY if there are
20181 256bit modes used in function arguments. */
20182 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
20183 arg = TREE_CHAIN (arg))
20185 rtx incoming = DECL_INCOMING_RTL (arg);
20187 if (incoming && ix86_check_avx256_register (incoming))
20188 return AVX_U128_DIRTY;
20191 return AVX_U128_CLEAN;
20194 /* Return a mode that ENTITY is assumed to be
20195 switched to at function entry. */
20197 static int
20198 ix86_mode_entry (int entity)
20200 switch (entity)
20202 case X86_DIRFLAG:
20203 return ix86_dirflag_mode_entry ();
20204 case AVX_U128:
20205 return ix86_avx_u128_mode_entry ();
20206 case I387_TRUNC:
20207 case I387_FLOOR:
20208 case I387_CEIL:
20209 case I387_MASK_PM:
20210 return I387_CW_ANY;
20211 default:
20212 gcc_unreachable ();
20216 static int
20217 ix86_avx_u128_mode_exit (void)
20219 rtx reg = crtl->return_rtx;
20221 /* Exit mode is set to AVX_U128_DIRTY if there are
20222 256bit modes used in the function return register. */
20223 if (reg && ix86_check_avx256_register (reg))
20224 return AVX_U128_DIRTY;
20226 return AVX_U128_CLEAN;
20229 /* Return a mode that ENTITY is assumed to be
20230 switched to at function exit. */
20232 static int
20233 ix86_mode_exit (int entity)
20235 switch (entity)
20237 case X86_DIRFLAG:
20238 return X86_DIRFLAG_ANY;
20239 case AVX_U128:
20240 return ix86_avx_u128_mode_exit ();
20241 case I387_TRUNC:
20242 case I387_FLOOR:
20243 case I387_CEIL:
20244 case I387_MASK_PM:
20245 return I387_CW_ANY;
20246 default:
20247 gcc_unreachable ();
20251 static int
20252 ix86_mode_priority (int, int n)
20254 return n;
20257 /* Output code to initialize the control word copies used by the trunc?f?i
20258 and rounding patterns. The current control word is saved to a stack
20259 slot, and a copy modified according to MODE is stored in another. */
20261 static void
20262 emit_i387_cw_initialization (int mode)
20264 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
20265 rtx new_mode;
20267 enum ix86_stack_slot slot;
20269 rtx reg = gen_reg_rtx (HImode);
20271 emit_insn (gen_x86_fnstcw_1 (stored_mode));
20272 emit_move_insn (reg, copy_rtx (stored_mode));
20274 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
20275 || optimize_insn_for_size_p ())
20277 switch (mode)
20279 case I387_CW_TRUNC:
20280 /* round toward zero (truncate) */
20281 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
20282 slot = SLOT_CW_TRUNC;
20283 break;
20285 case I387_CW_FLOOR:
20286 /* round down toward -oo */
20287 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
20288 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
20289 slot = SLOT_CW_FLOOR;
20290 break;
20292 case I387_CW_CEIL:
20293 /* round up toward +oo */
20294 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
20295 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
20296 slot = SLOT_CW_CEIL;
20297 break;
20299 case I387_CW_MASK_PM:
20300 /* mask precision exception for nearbyint() */
20301 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
20302 slot = SLOT_CW_MASK_PM;
20303 break;
20305 default:
20306 gcc_unreachable ();
20309 else
20311 switch (mode)
20313 case I387_CW_TRUNC:
20314 /* round toward zero (truncate) */
20315 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
20316 slot = SLOT_CW_TRUNC;
20317 break;
20319 case I387_CW_FLOOR:
20320 /* round down toward -oo */
20321 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
20322 slot = SLOT_CW_FLOOR;
20323 break;
20325 case I387_CW_CEIL:
20326 /* round up toward +oo */
20327 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
20328 slot = SLOT_CW_CEIL;
20329 break;
20331 case I387_CW_MASK_PM:
20332 /* mask precision exception for nearbyint() */
20333 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
20334 slot = SLOT_CW_MASK_PM;
20335 break;
20337 default:
20338 gcc_unreachable ();
20342 gcc_assert (slot < MAX_386_STACK_LOCALS);
20344 new_mode = assign_386_stack_local (HImode, slot);
20345 emit_move_insn (new_mode, reg);
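/* As a plain C sketch of the HImode or/and sequence above, assuming the
   standard x87 control word layout (rounding control in bits 10-11,
   precision-exception mask in bit 5), the four adjusted control words
   computed from the fnstcw value CW are:

     cw_trunc   = cw | 0x0c00;               RC = 11: round toward zero
     cw_floor   = (cw & ~0x0c00) | 0x0400;   RC = 01: round toward -inf
     cw_ceil    = (cw & ~0x0c00) | 0x0800;   RC = 10: round toward +inf
     cw_mask_pm = cw | 0x0020;               PM = 1: mask the precision exception  */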
20348 /* Emit vzeroupper. */
20350 void
20351 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
20353 int i;
20355 /* Cancel automatic vzeroupper insertion if there are
20356 live call-saved SSE registers at the insertion point. */
20358 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
20359 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
20360 return;
20362 if (TARGET_64BIT)
20363 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
20364 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
20365 return;
20367 emit_insn (gen_avx_vzeroupper ());
20372 /* Generate one or more insns to set ENTITY to MODE.  REGS_LIVE
20373 is the set of hard registers live at the point where the insn(s)
20374 are to be inserted. */
20376 static void
20377 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
20378 HARD_REG_SET regs_live)
20380 switch (entity)
20382 case X86_DIRFLAG:
20383 if (mode == X86_DIRFLAG_RESET)
20384 emit_insn (gen_cld ());
20385 break;
20386 case AVX_U128:
20387 if (mode == AVX_U128_CLEAN)
20388 ix86_avx_emit_vzeroupper (regs_live);
20389 break;
20390 case I387_TRUNC:
20391 case I387_FLOOR:
20392 case I387_CEIL:
20393 case I387_MASK_PM:
20394 if (mode != I387_CW_ANY
20395 && mode != I387_CW_UNINITIALIZED)
20396 emit_i387_cw_initialization (mode);
20397 break;
20398 default:
20399 gcc_unreachable ();
20403 /* Output code for INSN to convert a float to a signed int. OPERANDS
20404 are the insn operands. The output may be [HSD]Imode and the input
20405 operand may be [SDX]Fmode. */
20407 const char *
20408 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
20410 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
20411 int dimode_p = GET_MODE (operands[0]) == DImode;
20412 int round_mode = get_attr_i387_cw (insn);
20414 /* Jump through a hoop or two for DImode, since the hardware has no
20415 non-popping instruction. We used to do this a different way, but
20416 that was somewhat fragile and broke with post-reload splitters. */
20417 if ((dimode_p || fisttp) && !stack_top_dies)
20418 output_asm_insn ("fld\t%y1", operands);
20420 gcc_assert (STACK_TOP_P (operands[1]));
20421 gcc_assert (MEM_P (operands[0]));
20422 gcc_assert (GET_MODE (operands[1]) != TFmode);
20424 if (fisttp)
20425 output_asm_insn ("fisttp%Z0\t%0", operands);
20426 else
20428 if (round_mode != I387_CW_ANY)
20429 output_asm_insn ("fldcw\t%3", operands);
20430 if (stack_top_dies || dimode_p)
20431 output_asm_insn ("fistp%Z0\t%0", operands);
20432 else
20433 output_asm_insn ("fist%Z0\t%0", operands);
20434 if (round_mode != I387_CW_ANY)
20435 output_asm_insn ("fldcw\t%2", operands);
20438 return "";
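/* For illustration, a DImode truncation whose stack top stays live would
   be emitted roughly as the following sequence (AT&T syntax; the names
   new_cw and saved_cw stand for the control-word stack slots %3 and %2):

       fld     %st(0)        duplicate the value, since fistp pops
       fldcw   new_cw        switch to the round-toward-zero control word
       fistpll mem           store the 64-bit integer result and pop
       fldcw   saved_cw      restore the original control word  */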
20441 /* Output code for x87 ffreep insn. The OPNO argument, which may only
20442 have the values zero or one, indicates the ffreep insn's operand
20443 from the OPERANDS array. */
20445 static const char *
20446 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
20448 if (TARGET_USE_FFREEP)
20449 #ifdef HAVE_AS_IX86_FFREEP
20450 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
20451 #else
20453 static char retval[32];
20454 int regno = REGNO (operands[opno]);
20456 gcc_assert (STACK_REGNO_P (regno));
20458 regno -= FIRST_STACK_REG;
20460 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
20461 return retval;
20463 #endif
20465 return opno ? "fstp\t%y1" : "fstp\t%y0";
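/* The raw data-directive fallback above relies on ffreep %st(N) being
   encoded as the bytes DF C0+N: emitting the 16-bit value 0xcNdf on a
   little-endian target lays down DF first and then C0+N.  For example, a
   stack register number of 3 produces ASM_SHORT "0xc3df", i.e. the bytes
   0xDF 0xC3, which is ffreep %st(3).  */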
20469 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
20470 should be used. UNORDERED_P is true when fucom should be used. */
20472 const char *
20473 output_fp_compare (rtx_insn *insn, rtx *operands, bool eflags_p, bool unordered_p)
20475 int stack_top_dies;
20476 rtx cmp_op0, cmp_op1;
20477 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
20479 if (eflags_p)
20481 cmp_op0 = operands[0];
20482 cmp_op1 = operands[1];
20484 else
20486 cmp_op0 = operands[1];
20487 cmp_op1 = operands[2];
20490 if (is_sse)
20492 if (GET_MODE (operands[0]) == SFmode)
20493 if (unordered_p)
20494 return "%vucomiss\t{%1, %0|%0, %1}";
20495 else
20496 return "%vcomiss\t{%1, %0|%0, %1}";
20497 else
20498 if (unordered_p)
20499 return "%vucomisd\t{%1, %0|%0, %1}";
20500 else
20501 return "%vcomisd\t{%1, %0|%0, %1}";
20504 gcc_assert (STACK_TOP_P (cmp_op0));
20506 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
20508 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
20510 if (stack_top_dies)
20512 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
20513 return output_387_ffreep (operands, 1);
20515 else
20516 return "ftst\n\tfnstsw\t%0";
20519 if (STACK_REG_P (cmp_op1)
20520 && stack_top_dies
20521 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
20522 && REGNO (cmp_op1) != FIRST_STACK_REG)
20524 /* If both the top of the 387 stack and the other operand
20525 (also a stack register) die, then this must be a
20526 `fcompp' float compare. */
20528 if (eflags_p)
20530 /* There is no double popping fcomi variant. Fortunately,
20531 eflags is immune from the fstp's cc clobbering. */
20532 if (unordered_p)
20533 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
20534 else
20535 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
20536 return output_387_ffreep (operands, 0);
20538 else
20540 if (unordered_p)
20541 return "fucompp\n\tfnstsw\t%0";
20542 else
20543 return "fcompp\n\tfnstsw\t%0";
20546 else
20548 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
20550 static const char * const alt[16] =
20552 "fcom%Z2\t%y2\n\tfnstsw\t%0",
20553 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
20554 "fucom%Z2\t%y2\n\tfnstsw\t%0",
20555 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
20557 "ficom%Z2\t%y2\n\tfnstsw\t%0",
20558 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
20559 NULL,
20560 NULL,
20562 "fcomi\t{%y1, %0|%0, %y1}",
20563 "fcomip\t{%y1, %0|%0, %y1}",
20564 "fucomi\t{%y1, %0|%0, %y1}",
20565 "fucomip\t{%y1, %0|%0, %y1}",
20567 NULL,
20568 NULL,
20569 NULL,
20570 NULL
20573 int mask;
20574 const char *ret;
20576 mask = eflags_p << 3;
20577 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
20578 mask |= unordered_p << 1;
20579 mask |= stack_top_dies;
20581 gcc_assert (mask < 16);
20582 ret = alt[mask];
20583 gcc_assert (ret);
20585 return ret;
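/* A worked example of the table lookup above: an ordered fcomi-style
   compare of two FP stack registers where the stack top dies has
   eflags_p = 1, a non-integer second operand, unordered_p = 0 and
   stack_top_dies = 1, so mask = (1 << 3) | 1 = 9 and alt[9] selects
   "fcomip\t{%y1, %0|%0, %y1}".  */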
20589 void
20590 ix86_output_addr_vec_elt (FILE *file, int value)
20592 const char *directive = ASM_LONG;
20594 #ifdef ASM_QUAD
20595 if (TARGET_LP64)
20596 directive = ASM_QUAD;
20597 #else
20598 gcc_assert (!TARGET_64BIT);
20599 #endif
20601 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
20604 void
20605 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
20607 const char *directive = ASM_LONG;
20609 #ifdef ASM_QUAD
20610 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
20611 directive = ASM_QUAD;
20612 #else
20613 gcc_assert (!TARGET_64BIT);
20614 #endif
20615 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
20616 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
20617 fprintf (file, "%s%s%d-%s%d\n",
20618 directive, LPREFIX, value, LPREFIX, rel);
20619 else if (HAVE_AS_GOTOFF_IN_DATA)
20620 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
20621 #if TARGET_MACHO
20622 else if (TARGET_MACHO)
20624 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
20625 machopic_output_function_base_name (file);
20626 putc ('\n', file);
20628 #endif
20629 else
20630 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
20631 GOT_SYMBOL_NAME, LPREFIX, value);
20634 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
20635 for the target. */
20637 void
20638 ix86_expand_clear (rtx dest)
20640 rtx tmp;
20642 /* We play register width games, which are only valid after reload. */
20643 gcc_assert (reload_completed);
20645 /* Avoid HImode and its attendant prefix byte. */
20646 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
20647 dest = gen_rtx_REG (SImode, REGNO (dest));
20648 tmp = gen_rtx_SET (dest, const0_rtx);
20650 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
20652 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20653 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
20656 emit_insn (tmp);
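/* The CLOBBER matters because "xor %eax, %eax" (2 bytes) zeroes the
   register but clobbers EFLAGS, while "mov $0, %eax" (5 bytes) leaves
   the flags untouched; only the xor form emitted above therefore needs
   the explicit clobber of FLAGS_REG.  */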
20659 /* X is an unchanging MEM. If it is a constant pool reference, return
20660 the constant pool rtx, else NULL. */
20663 maybe_get_pool_constant (rtx x)
20665 x = ix86_delegitimize_address (XEXP (x, 0));
20667 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
20668 return get_pool_constant (x);
20670 return NULL_RTX;
20673 void
20674 ix86_expand_move (machine_mode mode, rtx operands[])
20676 rtx op0, op1;
20677 rtx tmp, addend = NULL_RTX;
20678 enum tls_model model;
20680 op0 = operands[0];
20681 op1 = operands[1];
20683 switch (GET_CODE (op1))
20685 case CONST:
20686 tmp = XEXP (op1, 0);
20688 if (GET_CODE (tmp) != PLUS
20689 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
20690 break;
20692 op1 = XEXP (tmp, 0);
20693 addend = XEXP (tmp, 1);
20694 /* FALLTHRU */
20696 case SYMBOL_REF:
20697 model = SYMBOL_REF_TLS_MODEL (op1);
20699 if (model)
20700 op1 = legitimize_tls_address (op1, model, true);
20701 else if (ix86_force_load_from_GOT_p (op1))
20703 /* Load the external function address via GOT slot to avoid PLT. */
20704 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
20705 (TARGET_64BIT
20706 ? UNSPEC_GOTPCREL
20707 : UNSPEC_GOT));
20708 op1 = gen_rtx_CONST (Pmode, op1);
20709 op1 = gen_const_mem (Pmode, op1);
20710 set_mem_alias_set (op1, ix86_GOT_alias_set ());
20712 else
20714 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
20715 if (tmp)
20717 op1 = tmp;
20718 if (!addend)
20719 break;
20721 else
20723 op1 = operands[1];
20724 break;
20728 if (addend)
20730 op1 = force_operand (op1, NULL_RTX);
20731 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
20732 op0, 1, OPTAB_DIRECT);
20734 else
20735 op1 = force_operand (op1, op0);
20737 if (op1 == op0)
20738 return;
20740 op1 = convert_to_mode (mode, op1, 1);
20742 default:
20743 break;
20746 if ((flag_pic || MACHOPIC_INDIRECT)
20747 && symbolic_operand (op1, mode))
20749 if (TARGET_MACHO && !TARGET_64BIT)
20751 #if TARGET_MACHO
20752 /* dynamic-no-pic */
20753 if (MACHOPIC_INDIRECT)
20755 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
20756 ? op0 : gen_reg_rtx (Pmode);
20757 op1 = machopic_indirect_data_reference (op1, temp);
20758 if (MACHOPIC_PURE)
20759 op1 = machopic_legitimize_pic_address (op1, mode,
20760 temp == op1 ? 0 : temp);
20762 if (op0 != op1 && GET_CODE (op0) != MEM)
20764 rtx insn = gen_rtx_SET (op0, op1);
20765 emit_insn (insn);
20766 return;
20768 if (GET_CODE (op0) == MEM)
20769 op1 = force_reg (Pmode, op1);
20770 else
20772 rtx temp = op0;
20773 if (GET_CODE (temp) != REG)
20774 temp = gen_reg_rtx (Pmode);
20775 temp = legitimize_pic_address (op1, temp);
20776 if (temp == op0)
20777 return;
20778 op1 = temp;
20780 /* dynamic-no-pic */
20781 #endif
20783 else
20785 if (MEM_P (op0))
20786 op1 = force_reg (mode, op1);
20787 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
20789 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
20790 op1 = legitimize_pic_address (op1, reg);
20791 if (op0 == op1)
20792 return;
20793 op1 = convert_to_mode (mode, op1, 1);
20797 else
20799 if (MEM_P (op0)
20800 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
20801 || !push_operand (op0, mode))
20802 && MEM_P (op1))
20803 op1 = force_reg (mode, op1);
20805 if (push_operand (op0, mode)
20806 && ! general_no_elim_operand (op1, mode))
20807 op1 = copy_to_mode_reg (mode, op1);
20809 /* Force large constants in 64bit compilation into register
20810 to get them CSEed. */
20811 if (can_create_pseudo_p ()
20812 && (mode == DImode) && TARGET_64BIT
20813 && immediate_operand (op1, mode)
20814 && !x86_64_zext_immediate_operand (op1, VOIDmode)
20815 && !register_operand (op0, mode)
20816 && optimize)
20817 op1 = copy_to_mode_reg (mode, op1);
20819 if (can_create_pseudo_p ()
20820 && CONST_DOUBLE_P (op1))
20822 /* If we are loading a floating point constant to a register,
20823 force the value to memory now, since we'll get better code
20824 out the back end. */
20826 op1 = validize_mem (force_const_mem (mode, op1));
20827 if (!register_operand (op0, mode))
20829 rtx temp = gen_reg_rtx (mode);
20830 emit_insn (gen_rtx_SET (temp, op1));
20831 emit_move_insn (op0, temp);
20832 return;
20837 emit_insn (gen_rtx_SET (op0, op1));
20840 void
20841 ix86_expand_vector_move (machine_mode mode, rtx operands[])
20843 rtx op0 = operands[0], op1 = operands[1];
20844 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
20845 psABI since the biggest alignment is 4 bytes for the IA MCU psABI. */
20846 unsigned int align = (TARGET_IAMCU
20847 ? GET_MODE_BITSIZE (mode)
20848 : GET_MODE_ALIGNMENT (mode));
20850 if (push_operand (op0, VOIDmode))
20851 op0 = emit_move_resolve_push (mode, op0);
20853 /* Force constants other than zero into memory. We do not know how
20854 the instructions used to build constants modify the upper 64 bits
20855 of the register; once we have that information we may be able
20856 to handle some of them more efficiently. */
20857 if (can_create_pseudo_p ()
20858 && (CONSTANT_P (op1)
20859 || (SUBREG_P (op1)
20860 && CONSTANT_P (SUBREG_REG (op1))))
20861 && ((register_operand (op0, mode)
20862 && !standard_sse_constant_p (op1, mode))
20863 /* ix86_expand_vector_move_misalign() does not like constants. */
20864 || (SSE_REG_MODE_P (mode)
20865 && MEM_P (op0)
20866 && MEM_ALIGN (op0) < align)))
20868 if (SUBREG_P (op1))
20870 machine_mode imode = GET_MODE (SUBREG_REG (op1));
20871 rtx r = force_const_mem (imode, SUBREG_REG (op1));
20872 if (r)
20873 r = validize_mem (r);
20874 else
20875 r = force_reg (imode, SUBREG_REG (op1));
20876 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
20878 else
20879 op1 = validize_mem (force_const_mem (mode, op1));
20882 /* We need to check memory alignment for SSE mode since attributes
20883 can make operands unaligned. */
20884 if (can_create_pseudo_p ()
20885 && SSE_REG_MODE_P (mode)
20886 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
20887 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
20889 rtx tmp[2];
20891 /* ix86_expand_vector_move_misalign() does not like both
20892 arguments in memory. */
20893 if (!register_operand (op0, mode)
20894 && !register_operand (op1, mode))
20895 op1 = force_reg (mode, op1);
20897 tmp[0] = op0; tmp[1] = op1;
20898 ix86_expand_vector_move_misalign (mode, tmp);
20899 return;
20902 /* Make operand1 a register if it isn't already. */
20903 if (can_create_pseudo_p ()
20904 && !register_operand (op0, mode)
20905 && !register_operand (op1, mode))
20907 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
20908 return;
20911 emit_insn (gen_rtx_SET (op0, op1));
20914 /* Split 32-byte AVX unaligned load and store if needed. */
20916 static void
20917 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
20919 rtx m;
20920 rtx (*extract) (rtx, rtx, rtx);
20921 machine_mode mode;
20923 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
20924 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
20926 emit_insn (gen_rtx_SET (op0, op1));
20927 return;
20930 rtx orig_op0 = NULL_RTX;
20931 mode = GET_MODE (op0);
20932 switch (GET_MODE_CLASS (mode))
20934 case MODE_VECTOR_INT:
20935 case MODE_INT:
20936 if (mode != V32QImode)
20938 if (!MEM_P (op0))
20940 orig_op0 = op0;
20941 op0 = gen_reg_rtx (V32QImode);
20943 else
20944 op0 = gen_lowpart (V32QImode, op0);
20945 op1 = gen_lowpart (V32QImode, op1);
20946 mode = V32QImode;
20948 break;
20949 case MODE_VECTOR_FLOAT:
20950 break;
20951 default:
20952 gcc_unreachable ();
20955 switch (mode)
20957 default:
20958 gcc_unreachable ();
20959 case V32QImode:
20960 extract = gen_avx_vextractf128v32qi;
20961 mode = V16QImode;
20962 break;
20963 case V8SFmode:
20964 extract = gen_avx_vextractf128v8sf;
20965 mode = V4SFmode;
20966 break;
20967 case V4DFmode:
20968 extract = gen_avx_vextractf128v4df;
20969 mode = V2DFmode;
20970 break;
20973 if (MEM_P (op1))
20975 rtx r = gen_reg_rtx (mode);
20976 m = adjust_address (op1, mode, 0);
20977 emit_move_insn (r, m);
20978 m = adjust_address (op1, mode, 16);
20979 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
20980 emit_move_insn (op0, r);
20982 else if (MEM_P (op0))
20984 m = adjust_address (op0, mode, 0);
20985 emit_insn (extract (m, op1, const0_rtx));
20986 m = adjust_address (op0, mode, 16);
20987 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
20989 else
20990 gcc_unreachable ();
20992 if (orig_op0)
20993 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
20996 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
20997 straight to ix86_expand_vector_move. */
20998 /* Code generation for scalar reg-reg moves of single and double precision data:
20999 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
21000 movaps reg, reg
21001 else
21002 movss reg, reg
21003 if (x86_sse_partial_reg_dependency == true)
21004 movapd reg, reg
21005 else
21006 movsd reg, reg
21008 Code generation for scalar loads of double precision data:
21009 if (x86_sse_split_regs == true)
21010 movlpd mem, reg (gas syntax)
21011 else
21012 movsd mem, reg
21014 Code generation for unaligned packed loads of single precision data
21015 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
21016 if (x86_sse_unaligned_move_optimal)
21017 movups mem, reg
21019 if (x86_sse_partial_reg_dependency == true)
21021 xorps reg, reg
21022 movlps mem, reg
21023 movhps mem+8, reg
21025 else
21027 movlps mem, reg
21028 movhps mem+8, reg
21031 Code generation for unaligned packed loads of double precision data
21032 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
21033 if (x86_sse_unaligned_move_optimal)
21034 movupd mem, reg
21036 if (x86_sse_split_regs == true)
21038 movlpd mem, reg
21039 movhpd mem+8, reg
21041 else
21043 movsd mem, reg
21044 movhpd mem+8, reg
21048 void
21049 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
21051 rtx op0, op1, m;
21053 op0 = operands[0];
21054 op1 = operands[1];
21056 /* Use unaligned load/store for AVX512 or when optimizing for size. */
21057 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
21059 emit_insn (gen_rtx_SET (op0, op1));
21060 return;
21063 if (TARGET_AVX)
21065 if (GET_MODE_SIZE (mode) == 32)
21066 ix86_avx256_split_vector_move_misalign (op0, op1);
21067 else
21068 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
21069 emit_insn (gen_rtx_SET (op0, op1));
21070 return;
21073 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
21074 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
21076 emit_insn (gen_rtx_SET (op0, op1));
21077 return;
21080 /* ??? If we have typed data, then it would appear that using
21081 movdqu is the only way to get unaligned data loaded with
21082 integer type. */
21083 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
21085 emit_insn (gen_rtx_SET (op0, op1));
21086 return;
21089 if (MEM_P (op1))
21091 if (TARGET_SSE2 && mode == V2DFmode)
21093 rtx zero;
21095 /* When SSE registers are split into halves, we can avoid
21096 writing to the top half twice. */
21097 if (TARGET_SSE_SPLIT_REGS)
21099 emit_clobber (op0);
21100 zero = op0;
21102 else
21104 /* ??? Not sure about the best option for the Intel chips.
21105 The following would seem to satisfy; the register is
21106 entirely cleared, breaking the dependency chain. We
21107 then store to the upper half, with a dependency depth
21108 of one. A rumor has it that Intel recommends two movsd
21109 followed by an unpacklpd, but this is unconfirmed. And
21110 given that the dependency depth of the unpacklpd would
21111 still be one, I'm not sure why this would be better. */
21112 zero = CONST0_RTX (V2DFmode);
21115 m = adjust_address (op1, DFmode, 0);
21116 emit_insn (gen_sse2_loadlpd (op0, zero, m));
21117 m = adjust_address (op1, DFmode, 8);
21118 emit_insn (gen_sse2_loadhpd (op0, op0, m));
21120 else
21122 rtx t;
21124 if (mode != V4SFmode)
21125 t = gen_reg_rtx (V4SFmode);
21126 else
21127 t = op0;
21129 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
21130 emit_move_insn (t, CONST0_RTX (V4SFmode));
21131 else
21132 emit_clobber (t);
21134 m = adjust_address (op1, V2SFmode, 0);
21135 emit_insn (gen_sse_loadlps (t, t, m));
21136 m = adjust_address (op1, V2SFmode, 8);
21137 emit_insn (gen_sse_loadhps (t, t, m));
21138 if (mode != V4SFmode)
21139 emit_move_insn (op0, gen_lowpart (mode, t));
21142 else if (MEM_P (op0))
21144 if (TARGET_SSE2 && mode == V2DFmode)
21146 m = adjust_address (op0, DFmode, 0);
21147 emit_insn (gen_sse2_storelpd (m, op1));
21148 m = adjust_address (op0, DFmode, 8);
21149 emit_insn (gen_sse2_storehpd (m, op1));
21151 else
21153 if (mode != V4SFmode)
21154 op1 = gen_lowpart (V4SFmode, op1);
21156 m = adjust_address (op0, V2SFmode, 0);
21157 emit_insn (gen_sse_storelps (m, op1));
21158 m = adjust_address (op0, V2SFmode, 8);
21159 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
21162 else
21163 gcc_unreachable ();
21166 /* Helper function of ix86_fixup_binary_operands to canonicalize
21167 operand order. Returns true if the operands should be swapped. */
21169 static bool
21170 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
21171 rtx operands[])
21173 rtx dst = operands[0];
21174 rtx src1 = operands[1];
21175 rtx src2 = operands[2];
21177 /* If the operation is not commutative, we can't do anything. */
21178 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
21179 return false;
21181 /* Highest priority is that src1 should match dst. */
21182 if (rtx_equal_p (dst, src1))
21183 return false;
21184 if (rtx_equal_p (dst, src2))
21185 return true;
21187 /* Next highest priority is that immediate constants come second. */
21188 if (immediate_operand (src2, mode))
21189 return false;
21190 if (immediate_operand (src1, mode))
21191 return true;
21193 /* Lowest priority is that memory references should come second. */
21194 if (MEM_P (src2))
21195 return false;
21196 if (MEM_P (src1))
21197 return true;
21199 return false;
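/* For example, with a commutative PLUS where operands[0] and operands[2]
   are the same register (dst = src1 + dst), the predicate above returns
   true; swapping gives dst = dst + src1, which the two-address "add"
   form can match without an extra copy.  */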
21203 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
21204 destination to use for the operation. If different from the true
21205 destination in operands[0], a copy operation will be required. */
21208 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
21209 rtx operands[])
21211 rtx dst = operands[0];
21212 rtx src1 = operands[1];
21213 rtx src2 = operands[2];
21215 /* Canonicalize operand order. */
21216 if (ix86_swap_binary_operands_p (code, mode, operands))
21218 /* It is invalid to swap operands of different modes. */
21219 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
21221 std::swap (src1, src2);
21224 /* Both source operands cannot be in memory. */
21225 if (MEM_P (src1) && MEM_P (src2))
21227 /* Optimization: Only read from memory once. */
21228 if (rtx_equal_p (src1, src2))
21230 src2 = force_reg (mode, src2);
21231 src1 = src2;
21233 else if (rtx_equal_p (dst, src1))
21234 src2 = force_reg (mode, src2);
21235 else
21236 src1 = force_reg (mode, src1);
21239 /* If the destination is memory, and we do not have matching source
21240 operands, do things in registers. */
21241 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
21242 dst = gen_reg_rtx (mode);
21244 /* Source 1 cannot be a constant. */
21245 if (CONSTANT_P (src1))
21246 src1 = force_reg (mode, src1);
21248 /* Source 1 cannot be a non-matching memory. */
21249 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
21250 src1 = force_reg (mode, src1);
21252 /* Improve address combine. */
21253 if (code == PLUS
21254 && GET_MODE_CLASS (mode) == MODE_INT
21255 && MEM_P (src2))
21256 src2 = force_reg (mode, src2);
21258 operands[1] = src1;
21259 operands[2] = src2;
21260 return dst;
21263 /* Similarly, but assume that the destination has already been
21264 set up properly. */
21266 void
21267 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
21268 machine_mode mode, rtx operands[])
21270 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
21271 gcc_assert (dst == operands[0]);
21274 /* Attempt to expand a binary operator. Make the expansion closer to the
21275 actual machine, than just general_operand, which will allow 3 separate
21276 memory references (one output, two input) in a single insn. */
21278 void
21279 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
21280 rtx operands[])
21282 rtx src1, src2, dst, op, clob;
21284 dst = ix86_fixup_binary_operands (code, mode, operands);
21285 src1 = operands[1];
21286 src2 = operands[2];
21288 /* Emit the instruction. */
21290 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
21292 if (reload_completed
21293 && code == PLUS
21294 && !rtx_equal_p (dst, src1))
21296 /* This is going to be an LEA; avoid splitting it later. */
21297 emit_insn (op);
21299 else
21301 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21302 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21305 /* Fix up the destination if needed. */
21306 if (dst != operands[0])
21307 emit_move_insn (operands[0], dst);
21310 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
21311 the given OPERANDS. */
21313 void
21314 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
21315 rtx operands[])
21317 rtx op1 = NULL_RTX, op2 = NULL_RTX;
21318 if (SUBREG_P (operands[1]))
21320 op1 = operands[1];
21321 op2 = operands[2];
21323 else if (SUBREG_P (operands[2]))
21325 op1 = operands[2];
21326 op2 = operands[1];
21328 /* Optimize (__m128i) d | (__m128i) e and similar code
21329 when d and e are float vectors into float vector logical
21330 insn. In C/C++ without using intrinsics there is no other way
21331 to express vector logical operation on float vectors than
21332 to cast them temporarily to integer vectors. */
21333 if (op1
21334 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
21335 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
21336 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
21337 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
21338 && SUBREG_BYTE (op1) == 0
21339 && (GET_CODE (op2) == CONST_VECTOR
21340 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
21341 && SUBREG_BYTE (op2) == 0))
21342 && can_create_pseudo_p ())
21344 rtx dst;
21345 switch (GET_MODE (SUBREG_REG (op1)))
21347 case V4SFmode:
21348 case V8SFmode:
21349 case V16SFmode:
21350 case V2DFmode:
21351 case V4DFmode:
21352 case V8DFmode:
21353 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
21354 if (GET_CODE (op2) == CONST_VECTOR)
21356 op2 = gen_lowpart (GET_MODE (dst), op2);
21357 op2 = force_reg (GET_MODE (dst), op2);
21359 else
21361 op1 = operands[1];
21362 op2 = SUBREG_REG (operands[2]);
21363 if (!vector_operand (op2, GET_MODE (dst)))
21364 op2 = force_reg (GET_MODE (dst), op2);
21366 op1 = SUBREG_REG (op1);
21367 if (!vector_operand (op1, GET_MODE (dst)))
21368 op1 = force_reg (GET_MODE (dst), op1);
21369 emit_insn (gen_rtx_SET (dst,
21370 gen_rtx_fmt_ee (code, GET_MODE (dst),
21371 op1, op2)));
21372 emit_move_insn (operands[0], gen_lowpart (mode, dst));
21373 return;
21374 default:
21375 break;
21378 if (!vector_operand (operands[1], mode))
21379 operands[1] = force_reg (mode, operands[1]);
21380 if (!vector_operand (operands[2], mode))
21381 operands[2] = force_reg (mode, operands[2]);
21382 ix86_fixup_binary_operands_no_copy (code, mode, operands);
21383 emit_insn (gen_rtx_SET (operands[0],
21384 gen_rtx_fmt_ee (code, mode, operands[1],
21385 operands[2])));
21388 /* Return TRUE or FALSE depending on whether the binary operator meets the
21389 appropriate constraints. */
21391 bool
21392 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
21393 rtx operands[3])
21395 rtx dst = operands[0];
21396 rtx src1 = operands[1];
21397 rtx src2 = operands[2];
21399 /* Both source operands cannot be in memory. */
21400 if (MEM_P (src1) && MEM_P (src2))
21401 return false;
21403 /* Canonicalize operand order for commutative operators. */
21404 if (ix86_swap_binary_operands_p (code, mode, operands))
21405 std::swap (src1, src2);
21407 /* If the destination is memory, we must have a matching source operand. */
21408 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
21409 return false;
21411 /* Source 1 cannot be a constant. */
21412 if (CONSTANT_P (src1))
21413 return false;
21415 /* Source 1 cannot be a non-matching memory. */
21416 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
21417 /* Support "andhi/andsi/anddi" as a zero-extending move. */
21418 return (code == AND
21419 && (mode == HImode
21420 || mode == SImode
21421 || (TARGET_64BIT && mode == DImode))
21422 && satisfies_constraint_L (src2));
21424 return true;
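/* The final case above is what allows e.g. (set (reg:SI r) (and:SI
   (mem:SI m) (const_int 255))) even though the memory source does not
   match the destination: assuming the usual i386.md definition of
   constraint "L" (the masks 0xff and 0xffff), the insn can be emitted
   as a zero-extending load instead of a read-modify-write and.  */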
21427 /* Attempt to expand a unary operator. Make the expansion closer to the
21428 actual machine, than just general_operand, which will allow 2 separate
21429 memory references (one output, one input) in a single insn. */
21431 void
21432 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
21433 rtx operands[])
21435 bool matching_memory = false;
21436 rtx src, dst, op, clob;
21438 dst = operands[0];
21439 src = operands[1];
21441 /* If the destination is memory, and we do not have matching source
21442 operands, do things in registers. */
21443 if (MEM_P (dst))
21445 if (rtx_equal_p (dst, src))
21446 matching_memory = true;
21447 else
21448 dst = gen_reg_rtx (mode);
21451 /* When source operand is memory, destination must match. */
21452 if (MEM_P (src) && !matching_memory)
21453 src = force_reg (mode, src);
21455 /* Emit the instruction. */
21457 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
21459 if (code == NOT)
21460 emit_insn (op);
21461 else
21463 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21464 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21467 /* Fix up the destination if needed. */
21468 if (dst != operands[0])
21469 emit_move_insn (operands[0], dst);
21472 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
21473 divisor are within the range [0-255]. */
21475 void
21476 ix86_split_idivmod (machine_mode mode, rtx operands[],
21477 bool signed_p)
21479 rtx_code_label *end_label, *qimode_label;
21480 rtx div, mod;
21481 rtx_insn *insn;
21482 rtx scratch, tmp0, tmp1, tmp2;
21483 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
21484 rtx (*gen_zero_extend) (rtx, rtx);
21485 rtx (*gen_test_ccno_1) (rtx, rtx);
21487 switch (mode)
21489 case SImode:
21490 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
21491 gen_test_ccno_1 = gen_testsi_ccno_1;
21492 gen_zero_extend = gen_zero_extendqisi2;
21493 break;
21494 case DImode:
21495 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
21496 gen_test_ccno_1 = gen_testdi_ccno_1;
21497 gen_zero_extend = gen_zero_extendqidi2;
21498 break;
21499 default:
21500 gcc_unreachable ();
21503 end_label = gen_label_rtx ();
21504 qimode_label = gen_label_rtx ();
21506 scratch = gen_reg_rtx (mode);
21508 /* Use 8bit unsigned divmod if dividend and divisor are within
21509 the range [0-255]. */
21510 emit_move_insn (scratch, operands[2]);
21511 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
21512 scratch, 1, OPTAB_DIRECT);
21513 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
21514 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
21515 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
21516 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
21517 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
21518 pc_rtx);
21519 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
21520 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21521 JUMP_LABEL (insn) = qimode_label;
21523 /* Generate the original signed/unsigned divmod. */
21524 div = gen_divmod4_1 (operands[0], operands[1],
21525 operands[2], operands[3]);
21526 emit_insn (div);
21528 /* Branch to the end. */
21529 emit_jump_insn (gen_jump (end_label));
21530 emit_barrier ();
21532 /* Generate 8bit unsigned divide. */
21533 emit_label (qimode_label);
21534 /* Don't use operands[0] for result of 8bit divide since not all
21535 registers support QImode ZERO_EXTRACT. */
21536 tmp0 = lowpart_subreg (HImode, scratch, mode);
21537 tmp1 = lowpart_subreg (HImode, operands[2], mode);
21538 tmp2 = lowpart_subreg (QImode, operands[3], mode);
21539 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
21541 if (signed_p)
21543 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
21544 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
21546 else
21548 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
21549 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
21552 /* Extract remainder from AH. */
21553 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
21554 if (REG_P (operands[1]))
21555 insn = emit_move_insn (operands[1], tmp1);
21556 else
21558 /* Need a new scratch register since the old one has result
21559 of 8bit divide. */
21560 scratch = gen_reg_rtx (mode);
21561 emit_move_insn (scratch, tmp1);
21562 insn = emit_move_insn (operands[1], scratch);
21564 set_unique_reg_note (insn, REG_EQUAL, mod);
21566 /* Zero extend quotient from AL. */
21567 tmp1 = gen_lowpart (QImode, tmp0);
21568 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
21569 set_unique_reg_note (insn, REG_EQUAL, div);
21571 emit_label (end_label);
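/* The entry test above works because OR-ing the dividend and the divisor
   and testing the result against -0x100 (i.e. ~0xff) sets ZF exactly
   when neither value has a bit above bit 7 set; for signed operands this
   also rules out negative values.  A plain C model of the check, purely
   for illustration:

     static int
     both_fit_in_byte (unsigned int a, unsigned int b)
     {
       return ((a | b) & ~0xffU) == 0;
     }
*/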
21574 #define LEA_MAX_STALL (3)
21575 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
21577 /* Increase given DISTANCE in half-cycles according to
21578 dependencies between PREV and NEXT instructions.
21579 Add 1 half-cycle if there is no dependency and
21580 go to the next cycle if there is some dependency. */
21582 static unsigned int
21583 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
21585 df_ref def, use;
21587 if (!prev || !next)
21588 return distance + (distance & 1) + 2;
21590 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
21591 return distance + 1;
21593 FOR_EACH_INSN_USE (use, next)
21594 FOR_EACH_INSN_DEF (def, prev)
21595 if (!DF_REF_IS_ARTIFICIAL (def)
21596 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
21597 return distance + (distance & 1) + 2;
21599 return distance + 1;
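/* Distances are counted in half-cycles, so "go to the next cycle" above
   is the expression distance + (distance & 1) + 2: an odd distance is
   first rounded up to the next cycle boundary and then one full cycle
   (two half-cycles) is added.  For example, distance 3 becomes
   3 + 1 + 2 = 6 and distance 4 becomes 4 + 0 + 2 = 6.  */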
21602 /* Function checks if instruction INSN defines register number
21603 REGNO1 or REGNO2. */
21605 static bool
21606 insn_defines_reg (unsigned int regno1, unsigned int regno2,
21607 rtx_insn *insn)
21609 df_ref def;
21611 FOR_EACH_INSN_DEF (def, insn)
21612 if (DF_REF_REG_DEF_P (def)
21613 && !DF_REF_IS_ARTIFICIAL (def)
21614 && (regno1 == DF_REF_REGNO (def)
21615 || regno2 == DF_REF_REGNO (def)))
21616 return true;
21618 return false;
21621 /* Function checks if instruction INSN uses register number
21622 REGNO as a part of address expression. */
21624 static bool
21625 insn_uses_reg_mem (unsigned int regno, rtx insn)
21627 df_ref use;
21629 FOR_EACH_INSN_USE (use, insn)
21630 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
21631 return true;
21633 return false;
21636 /* Search backward for non-agu definition of register number REGNO1
21637 or register number REGNO2 in basic block starting from instruction
21638 START up to head of basic block or instruction INSN.
21640 Function puts true value into *FOUND var if definition was found
21641 and false otherwise.
21643 Distance in half-cycles between START and found instruction or head
21644 of BB is added to DISTANCE and returned. */
21646 static int
21647 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
21648 rtx_insn *insn, int distance,
21649 rtx_insn *start, bool *found)
21651 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
21652 rtx_insn *prev = start;
21653 rtx_insn *next = NULL;
21655 *found = false;
21657 while (prev
21658 && prev != insn
21659 && distance < LEA_SEARCH_THRESHOLD)
21661 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
21663 distance = increase_distance (prev, next, distance);
21664 if (insn_defines_reg (regno1, regno2, prev))
21666 if (recog_memoized (prev) < 0
21667 || get_attr_type (prev) != TYPE_LEA)
21669 *found = true;
21670 return distance;
21674 next = prev;
21676 if (prev == BB_HEAD (bb))
21677 break;
21679 prev = PREV_INSN (prev);
21682 return distance;
21685 /* Search backward for non-agu definition of register number REGNO1
21686 or register number REGNO2 in INSN's basic block until
21687 1. Pass LEA_SEARCH_THRESHOLD instructions, or
21688 2. Reach neighbor BBs boundary, or
21689 3. Reach agu definition.
21690 Returns the distance between the non-agu definition point and INSN.
21691 If no definition point, returns -1. */
21693 static int
21694 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
21695 rtx_insn *insn)
21697 basic_block bb = BLOCK_FOR_INSN (insn);
21698 int distance = 0;
21699 bool found = false;
21701 if (insn != BB_HEAD (bb))
21702 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
21703 distance, PREV_INSN (insn),
21704 &found);
21706 if (!found && distance < LEA_SEARCH_THRESHOLD)
21708 edge e;
21709 edge_iterator ei;
21710 bool simple_loop = false;
21712 FOR_EACH_EDGE (e, ei, bb->preds)
21713 if (e->src == bb)
21715 simple_loop = true;
21716 break;
21719 if (simple_loop)
21720 distance = distance_non_agu_define_in_bb (regno1, regno2,
21721 insn, distance,
21722 BB_END (bb), &found);
21723 else
21725 int shortest_dist = -1;
21726 bool found_in_bb = false;
21728 FOR_EACH_EDGE (e, ei, bb->preds)
21730 int bb_dist
21731 = distance_non_agu_define_in_bb (regno1, regno2,
21732 insn, distance,
21733 BB_END (e->src),
21734 &found_in_bb);
21735 if (found_in_bb)
21737 if (shortest_dist < 0)
21738 shortest_dist = bb_dist;
21739 else if (bb_dist > 0)
21740 shortest_dist = MIN (bb_dist, shortest_dist);
21742 found = true;
21746 distance = shortest_dist;
21750 /* get_attr_type may modify recog data. We want to make sure
21751 that recog data is valid for instruction INSN, on which
21752 distance_non_agu_define is called. INSN is unchanged here. */
21753 extract_insn_cached (insn);
21755 if (!found)
21756 return -1;
21758 return distance >> 1;
21761 /* Return the distance in half-cycles between INSN and the next
21762 insn that uses register number REGNO in memory address added
21763 to DISTANCE. Return -1 if REGNO0 is set.
21765 Put true value into *FOUND if register usage was found and
21766 false otherwise.
21767 Put true value into *REDEFINED if register redefinition was
21768 found and false otherwise. */
21770 static int
21771 distance_agu_use_in_bb (unsigned int regno,
21772 rtx_insn *insn, int distance, rtx_insn *start,
21773 bool *found, bool *redefined)
21775 basic_block bb = NULL;
21776 rtx_insn *next = start;
21777 rtx_insn *prev = NULL;
21779 *found = false;
21780 *redefined = false;
21782 if (start != NULL_RTX)
21784 bb = BLOCK_FOR_INSN (start);
21785 if (start != BB_HEAD (bb))
21786 /* If insn and start belong to the same bb, set prev to insn,
21787 so the call to increase_distance will increase the distance
21788 between insns by 1. */
21789 prev = insn;
21792 while (next
21793 && next != insn
21794 && distance < LEA_SEARCH_THRESHOLD)
21796 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
21798 distance = increase_distance(prev, next, distance);
21799 if (insn_uses_reg_mem (regno, next))
21801 /* Return DISTANCE if OP0 is used in memory
21802 address in NEXT. */
21803 *found = true;
21804 return distance;
21807 if (insn_defines_reg (regno, INVALID_REGNUM, next))
21809 /* Return -1 if OP0 is set in NEXT. */
21810 *redefined = true;
21811 return -1;
21814 prev = next;
21817 if (next == BB_END (bb))
21818 break;
21820 next = NEXT_INSN (next);
21823 return distance;
21826 /* Return the distance between INSN and the next insn that uses
21827 register number REGNO0 in a memory address. Return -1 if no such
21828 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
21830 static int
21831 distance_agu_use (unsigned int regno0, rtx_insn *insn)
21833 basic_block bb = BLOCK_FOR_INSN (insn);
21834 int distance = 0;
21835 bool found = false;
21836 bool redefined = false;
21838 if (insn != BB_END (bb))
21839 distance = distance_agu_use_in_bb (regno0, insn, distance,
21840 NEXT_INSN (insn),
21841 &found, &redefined);
21843 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
21845 edge e;
21846 edge_iterator ei;
21847 bool simple_loop = false;
21849 FOR_EACH_EDGE (e, ei, bb->succs)
21850 if (e->dest == bb)
21852 simple_loop = true;
21853 break;
21856 if (simple_loop)
21857 distance = distance_agu_use_in_bb (regno0, insn,
21858 distance, BB_HEAD (bb),
21859 &found, &redefined);
21860 else
21862 int shortest_dist = -1;
21863 bool found_in_bb = false;
21864 bool redefined_in_bb = false;
21866 FOR_EACH_EDGE (e, ei, bb->succs)
21868 int bb_dist
21869 = distance_agu_use_in_bb (regno0, insn,
21870 distance, BB_HEAD (e->dest),
21871 &found_in_bb, &redefined_in_bb);
21872 if (found_in_bb)
21874 if (shortest_dist < 0)
21875 shortest_dist = bb_dist;
21876 else if (bb_dist > 0)
21877 shortest_dist = MIN (bb_dist, shortest_dist);
21879 found = true;
21883 distance = shortest_dist;
21887 if (!found || redefined)
21888 return -1;
21890 return distance >> 1;
21893 /* Define this macro to tune LEA priority vs ADD; it takes effect when
21894 there is a dilemma of choosing LEA or ADD.
21895 Negative value: ADD is preferred over LEA
21896 Zero: Neutral
21897 Positive value: LEA is preferred over ADD. */
21898 #define IX86_LEA_PRIORITY 0
21900 /* Return true if use of lea INSN has a performance advantage
21901 over a sequence of instructions. The instruction sequence has
21902 SPLIT_COST cycles higher latency than the lea latency. */
21904 static bool
21905 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
21906 unsigned int regno2, int split_cost, bool has_scale)
21908 int dist_define, dist_use;
21910 /* For Silvermont, if using a 2-source or 3-source LEA for
21911 non-destructive destination purposes, or because the scale
21912 factor is needed, the use of LEA is justified. */
21913 if (TARGET_SILVERMONT || TARGET_INTEL)
21915 if (has_scale)
21916 return true;
21917 if (split_cost < 1)
21918 return false;
21919 if (regno0 == regno1 || regno0 == regno2)
21920 return false;
21921 return true;
21924 dist_define = distance_non_agu_define (regno1, regno2, insn);
21925 dist_use = distance_agu_use (regno0, insn);
21927 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
21929 /* If there is no non-AGU operand definition, no AGU
21930 operand usage and the split cost is 0, then both the lea
21931 and non-lea variants have the same priority. Currently
21932 we prefer lea for 64-bit code and non-lea for 32-bit
21933 code. */
21934 if (dist_use < 0 && split_cost == 0)
21935 return TARGET_64BIT || IX86_LEA_PRIORITY;
21936 else
21937 return true;
21940 /* With a longer definition distance, lea is preferable.
21941 Here we adjust it to take into account the splitting cost and
21942 the lea priority. */
21943 dist_define += split_cost + IX86_LEA_PRIORITY;
21945 /* If there is no use in a memory address, then we just check
21946 that the split cost exceeds the AGU stall. */
21947 if (dist_use < 0)
21948 return dist_define > LEA_MAX_STALL;
21950 /* If this insn has both backward non-agu dependence and forward
21951 agu dependence, the one with the shorter distance takes effect. */
21952 return dist_define >= dist_use;
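/* A worked example of the heuristic above, with split_cost = 1 and
   IX86_LEA_PRIORITY = 0: a definition distance of 1 cycle is adjusted
   to 1 + 1 + 0 = 2.  If the address is consumed 3 cycles later
   (dist_use = 3), 2 >= 3 fails and the lea is split; if the consumer is
   only 1 cycle away, 2 >= 1 holds and the lea is kept.  */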
21955 /* Return true if it is legal to clobber flags by INSN and
21956 false otherwise. */
21958 static bool
21959 ix86_ok_to_clobber_flags (rtx_insn *insn)
21961 basic_block bb = BLOCK_FOR_INSN (insn);
21962 df_ref use;
21963 bitmap live;
21965 while (insn)
21967 if (NONDEBUG_INSN_P (insn))
21969 FOR_EACH_INSN_USE (use, insn)
21970 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
21971 return false;
21973 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
21974 return true;
21977 if (insn == BB_END (bb))
21978 break;
21980 insn = NEXT_INSN (insn);
21983 live = df_get_live_out(bb);
21984 return !REGNO_REG_SET_P (live, FLAGS_REG);
21987 /* Return true if we need to split op0 = op1 + op2 into a sequence of
21988 move and add to avoid AGU stalls. */
21990 bool
21991 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
21993 unsigned int regno0, regno1, regno2;
21995 /* Check if we need to optimize. */
21996 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21997 return false;
21999 /* Check it is correct to split here. */
22000 if (!ix86_ok_to_clobber_flags(insn))
22001 return false;
22003 regno0 = true_regnum (operands[0]);
22004 regno1 = true_regnum (operands[1]);
22005 regno2 = true_regnum (operands[2]);
22007 /* We need to split only adds with a non-destructive
22008 destination operand. */
22009 if (regno0 == regno1 || regno0 == regno2)
22010 return false;
22011 else
22012 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
22015 /* Return true if we should emit lea instruction instead of mov
22016 instruction. */
22018 bool
22019 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
22021 unsigned int regno0, regno1;
22023 /* Check if we need to optimize. */
22024 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
22025 return false;
22027 /* Use lea for reg to reg moves only. */
22028 if (!REG_P (operands[0]) || !REG_P (operands[1]))
22029 return false;
22031 regno0 = true_regnum (operands[0]);
22032 regno1 = true_regnum (operands[1]);
22034 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
22037 /* Return true if we need to split lea into a sequence of
22038 instructions to avoid AGU stalls. */
22040 bool
22041 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
22043 unsigned int regno0, regno1, regno2;
22044 int split_cost;
22045 struct ix86_address parts;
22046 int ok;
22048 /* Check if we need to optimize. */
22049 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
22050 return false;
22052 /* The "at least two components" test below might not catch simple
22053 move or zero extension insns if parts.base is non-NULL and parts.disp
22054 is const0_rtx as the only components in the address, e.g. if the
22055 register is %rbp or %r13. As this test is much cheaper and moves or
22056 zero extensions are the common case, do this check first. */
22057 if (REG_P (operands[1])
22058 || (SImode_address_operand (operands[1], VOIDmode)
22059 && REG_P (XEXP (operands[1], 0))))
22060 return false;
22062 /* Check if it is OK to split here. */
22063 if (!ix86_ok_to_clobber_flags (insn))
22064 return false;
22066 ok = ix86_decompose_address (operands[1], &parts);
22067 gcc_assert (ok);
22069 /* There should be at least two components in the address. */
22070 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
22071 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
22072 return false;
22074 /* We should not split into add if a non-legitimate PIC
22075 operand is used as the displacement. */
22076 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
22077 return false;
22079 regno0 = true_regnum (operands[0]);
22080 regno1 = INVALID_REGNUM;
22081 regno2 = INVALID_REGNUM;
22083 if (parts.base)
22084 regno1 = true_regnum (parts.base);
22085 if (parts.index)
22086 regno2 = true_regnum (parts.index);
22088 split_cost = 0;
22090 /* Compute how many cycles we will add to the execution time
22091 if we split the lea into a sequence of instructions. */
22092 if (parts.base || parts.index)
22094 /* Have to use a mov instruction if the non-destructive
22095 destination form is used. */
22096 if (regno1 != regno0 && regno2 != regno0)
22097 split_cost += 1;
22099 /* Have to add index to base if both exist. */
22100 if (parts.base && parts.index)
22101 split_cost += 1;
22103 /* Have to use shift and adds if scale is 2 or greater. */
22104 if (parts.scale > 1)
22106 if (regno0 != regno1)
22107 split_cost += 1;
22108 else if (regno2 == regno0)
22109 split_cost += 4;
22110 else
22111 split_cost += parts.scale;
22114 /* Have to use an add instruction with an immediate if
22115 disp is nonzero. */
22116 if (parts.disp && parts.disp != const0_rtx)
22117 split_cost += 1;
22119 /* Subtract the price of lea. */
22120 split_cost -= 1;
22123 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
22124 parts.scale > 1);
22127 /* Emit x86 binary operand CODE in mode MODE, where the first operand
22128 matches destination. RTX includes clobber of FLAGS_REG. */
22130 static void
22131 ix86_emit_binop (enum rtx_code code, machine_mode mode,
22132 rtx dst, rtx src)
22134 rtx op, clob;
22136 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
22137 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
22139 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
22142 /* Return true if regno1 def is nearest to the insn. */
22144 static bool
22145 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
22147 rtx_insn *prev = insn;
22148 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
22150 if (insn == start)
22151 return false;
22152 while (prev && prev != start)
22154 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
22156 prev = PREV_INSN (prev);
22157 continue;
22159 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
22160 return true;
22161 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
22162 return false;
22163 prev = PREV_INSN (prev);
22166 /* None of the regs is defined in the bb. */
22167 return false;
22170 /* Split lea instructions into a sequence of instructions
22171 which are executed on ALU to avoid AGU stalls.
22172 It is assumed that it is allowed to clobber flags register
22173 at lea position. */
22175 void
22176 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
22178 unsigned int regno0, regno1, regno2;
22179 struct ix86_address parts;
22180 rtx target, tmp;
22181 int ok, adds;
22183 ok = ix86_decompose_address (operands[1], &parts);
22184 gcc_assert (ok);
22186 target = gen_lowpart (mode, operands[0]);
22188 regno0 = true_regnum (target);
22189 regno1 = INVALID_REGNUM;
22190 regno2 = INVALID_REGNUM;
22192 if (parts.base)
22194 parts.base = gen_lowpart (mode, parts.base);
22195 regno1 = true_regnum (parts.base);
22198 if (parts.index)
22200 parts.index = gen_lowpart (mode, parts.index);
22201 regno2 = true_regnum (parts.index);
22204 if (parts.disp)
22205 parts.disp = gen_lowpart (mode, parts.disp);
22207 if (parts.scale > 1)
22209 /* Case r1 = r1 + ... */
22210 if (regno1 == regno0)
22212 /* If we have a case r1 = r1 + C * r2 then we
22213 would have to use multiplication, which is very
22214 expensive. Assume the cost model is wrong if we
22215 have such a case here. */
22216 gcc_assert (regno2 != regno0);
22218 for (adds = parts.scale; adds > 0; adds--)
22219 ix86_emit_binop (PLUS, mode, target, parts.index);
22221 else
22223 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
22224 if (regno0 != regno2)
22225 emit_insn (gen_rtx_SET (target, parts.index));
22227 /* Use shift for scaling. */
22228 ix86_emit_binop (ASHIFT, mode, target,
22229 GEN_INT (exact_log2 (parts.scale)));
22231 if (parts.base)
22232 ix86_emit_binop (PLUS, mode, target, parts.base);
22234 if (parts.disp && parts.disp != const0_rtx)
22235 ix86_emit_binop (PLUS, mode, target, parts.disp);
22238 else if (!parts.base && !parts.index)
22240 gcc_assert(parts.disp);
22241 emit_insn (gen_rtx_SET (target, parts.disp));
22243 else
22245 if (!parts.base)
22247 if (regno0 != regno2)
22248 emit_insn (gen_rtx_SET (target, parts.index));
22250 else if (!parts.index)
22252 if (regno0 != regno1)
22253 emit_insn (gen_rtx_SET (target, parts.base));
22255 else
22257 if (regno0 == regno1)
22258 tmp = parts.index;
22259 else if (regno0 == regno2)
22260 tmp = parts.base;
22261 else
22263 rtx tmp1;
22265 /* Find better operand for SET instruction, depending
22266 on which definition is farther from the insn. */
22267 if (find_nearest_reg_def (insn, regno1, regno2))
22268 tmp = parts.index, tmp1 = parts.base;
22269 else
22270 tmp = parts.base, tmp1 = parts.index;
22272 emit_insn (gen_rtx_SET (target, tmp));
22274 if (parts.disp && parts.disp != const0_rtx)
22275 ix86_emit_binop (PLUS, mode, target, parts.disp);
22277 ix86_emit_binop (PLUS, mode, target, tmp1);
22278 return;
22281 ix86_emit_binop (PLUS, mode, target, tmp);
22284 if (parts.disp && parts.disp != const0_rtx)
22285 ix86_emit_binop (PLUS, mode, target, parts.disp);
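/* As an illustration of the splitting above (register names are purely
   schematic), an address computation r0 = r1 + r2*4 + 8 with r0, r1 and
   r2 all distinct becomes roughly:

       mov  r2, r0        copy the index
       shl  $2, r0        scale by 4 using a shift
       add  r1, r0        add the base
       add  $8, r0        add the displacement  */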
22289 /* Return true if it is ok to optimize an ADD operation to LEA
22290 operation to avoid flag register consumption. For most processors,
22291 ADD is faster than LEA. For processors like BONNELL, if the
22292 destination register of LEA holds an actual address which will be
22293 used soon, LEA is better, otherwise ADD is better. */
22295 bool
22296 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
22298 unsigned int regno0 = true_regnum (operands[0]);
22299 unsigned int regno1 = true_regnum (operands[1]);
22300 unsigned int regno2 = true_regnum (operands[2]);
22302 /* If a = b + c, (a!=b && a!=c), must use lea form. */
22303 if (regno0 != regno1 && regno0 != regno2)
22304 return true;
22306 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
22307 return false;
22309 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
22312 /* Return true if destination reg of SET_BODY is shift count of
22313 USE_BODY. */
22315 static bool
22316 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
22318 rtx set_dest;
22319 rtx shift_rtx;
22320 int i;
22322 /* Retrieve destination of SET_BODY. */
22323 switch (GET_CODE (set_body))
22325 case SET:
22326 set_dest = SET_DEST (set_body);
22327 if (!set_dest || !REG_P (set_dest))
22328 return false;
22329 break;
22330 case PARALLEL:
22331 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
22332 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
22333 use_body))
22334 return true;
22335 /* FALLTHROUGH */
22336 default:
22337 return false;
22340 /* Retrieve shift count of USE_BODY. */
22341 switch (GET_CODE (use_body))
22343 case SET:
22344 shift_rtx = XEXP (use_body, 1);
22345 break;
22346 case PARALLEL:
22347 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
22348 if (ix86_dep_by_shift_count_body (set_body,
22349 XVECEXP (use_body, 0, i)))
22350 return true;
22351 /* FALLTHROUGH */
22352 default:
22353 return false;
22356 if (shift_rtx
22357 && (GET_CODE (shift_rtx) == ASHIFT
22358 || GET_CODE (shift_rtx) == LSHIFTRT
22359 || GET_CODE (shift_rtx) == ASHIFTRT
22360 || GET_CODE (shift_rtx) == ROTATE
22361 || GET_CODE (shift_rtx) == ROTATERT))
22363 rtx shift_count = XEXP (shift_rtx, 1);
22365 /* Return true if shift count is dest of SET_BODY. */
22366 if (REG_P (shift_count))
22368 /* Add this check since it can be invoked before register
22369 allocation by the pre-reload scheduler. */
22370 if (reload_completed
22371 && true_regnum (set_dest) == true_regnum (shift_count))
22372 return true;
22373 else if (REGNO(set_dest) == REGNO(shift_count))
22374 return true;
22378 return false;
22381 /* Return true if destination reg of SET_INSN is shift count of
22382 USE_INSN. */
22384 bool
22385 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
22387 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
22388 PATTERN (use_insn));
22391 /* Return TRUE or FALSE depending on whether the unary operator meets the
22392 appropriate constraints. */
22394 bool
22395 ix86_unary_operator_ok (enum rtx_code,
22396 machine_mode,
22397 rtx operands[2])
22399 /* If one of operands is memory, source and destination must match. */
22400 if ((MEM_P (operands[0])
22401 || MEM_P (operands[1]))
22402 && ! rtx_equal_p (operands[0], operands[1]))
22403 return false;
22404 return true;
22407 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
22408 are ok, keeping in mind the possible movddup alternative. */
22410 bool
22411 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
22413 if (MEM_P (operands[0]))
22414 return rtx_equal_p (operands[0], operands[1 + high]);
22415 if (MEM_P (operands[1]) && MEM_P (operands[2]))
22416 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
22417 return true;
22420 /* Post-reload splitter for converting an SF or DFmode value in an
22421 SSE register into an unsigned SImode. */
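/* Rough idea (illustrative numbers): values below 2^31 convert directly
   with the signed cvtt* instruction; for e.g. 3e9 the code below subtracts
   2^31 first (giving 852516352), converts that, and then XORs 0x80000000
   back in via the shifted compare mask, recovering 3000000000.  */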
22423 void
22424 ix86_split_convert_uns_si_sse (rtx operands[])
22426 machine_mode vecmode;
22427 rtx value, large, zero_or_two31, input, two31, x;
22429 large = operands[1];
22430 zero_or_two31 = operands[2];
22431 input = operands[3];
22432 two31 = operands[4];
22433 vecmode = GET_MODE (large);
22434 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
22436 /* Load up the value into the low element. We must ensure that the other
22437 elements are valid floats -- zero is the easiest such value. */
22438 if (MEM_P (input))
22440 if (vecmode == V4SFmode)
22441 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
22442 else
22443 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
22445 else
22447 input = gen_rtx_REG (vecmode, REGNO (input));
22448 emit_move_insn (value, CONST0_RTX (vecmode));
22449 if (vecmode == V4SFmode)
22450 emit_insn (gen_sse_movss (value, value, input));
22451 else
22452 emit_insn (gen_sse2_movsd (value, value, input));
22455 emit_move_insn (large, two31);
22456 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
22458 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
22459 emit_insn (gen_rtx_SET (large, x));
22461 x = gen_rtx_AND (vecmode, zero_or_two31, large);
22462 emit_insn (gen_rtx_SET (zero_or_two31, x));
22464 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
22465 emit_insn (gen_rtx_SET (value, x));
22467 large = gen_rtx_REG (V4SImode, REGNO (large));
22468 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
22470 x = gen_rtx_REG (V4SImode, REGNO (value));
22471 if (vecmode == V4SFmode)
22472 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
22473 else
22474 emit_insn (gen_sse2_cvttpd2dq (x, value));
22475 value = x;
22477 emit_insn (gen_xorv4si3 (value, value, large));
22480 /* Convert an unsigned DImode value into a DFmode, using only SSE.
22481 Expects the 64-bit DImode to be supplied in a pair of integral
22482 registers. Requires SSE2; will use SSE3 if available. For x86_32,
22483 -mfpmath=sse, !optimize_size only. */
22485 void
22486 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
22488 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
22489 rtx int_xmm, fp_xmm;
22490 rtx biases, exponents;
22491 rtx x;
22493 int_xmm = gen_reg_rtx (V4SImode);
22494 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
22495 emit_insn (gen_movdi_to_sse (int_xmm, input));
22496 else if (TARGET_SSE_SPLIT_REGS)
22498 emit_clobber (int_xmm);
22499 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
22501 else
22503 x = gen_reg_rtx (V2DImode);
22504 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
22505 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
22508 x = gen_rtx_CONST_VECTOR (V4SImode,
22509 gen_rtvec (4, GEN_INT (0x43300000UL),
22510 GEN_INT (0x45300000UL),
22511 const0_rtx, const0_rtx));
22512 exponents = validize_mem (force_const_mem (V4SImode, x));
22514 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
22515 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
22517 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_lo_xmm)
22518 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
22519 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
22520 (0x1.0p84 + double(fp_value_hi_xmm)).
22521 Note these exponents differ by 32. */
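/* A worked example (illustrative): for the input 0x100000005 (2^32 + 5),
   the low word 5 juxtaposed with 0x43300000 is the double 2^52 + 5 and the
   high word 1 juxtaposed with 0x45300000 is the double 2^84 + 2^32; after
   subtracting the 2^52 and 2^84 biases below and adding the halves, the
   result is 4294967301.0 = 2^32 + 5, as desired.  */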
22523 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
22525 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
22526 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
22527 real_ldexp (&bias_lo_rvt, &dconst1, 52);
22528 real_ldexp (&bias_hi_rvt, &dconst1, 84);
22529 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
22530 x = const_double_from_real_value (bias_hi_rvt, DFmode);
22531 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
22532 biases = validize_mem (force_const_mem (V2DFmode, biases));
22533 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
22535 /* Add the upper and lower DFmode values together. */
22536 if (TARGET_SSE3)
22537 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
22538 else
22540 x = copy_to_mode_reg (V2DFmode, fp_xmm);
22541 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
22542 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
22545 ix86_expand_vector_extract (false, target, fp_xmm, 0);
22548 /* Not used, but eases macroization of patterns. */
22549 void
22550 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
22552 gcc_unreachable ();
22555 /* Convert an unsigned SImode value into a DFmode. Only currently used
22556 for SSE, but applicable anywhere. */
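/* A sketch of the trick: adding INT_MIN reinterprets the unsigned input as
   a signed value shifted down by 2^31, which floatsidf2 converts exactly;
   adding 2^31.0 back afterwards restores the original magnitude
   (e.g. 3000000000 -> 852516352 -> 852516352.0 -> 3000000000.0).  */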
22558 void
22559 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
22561 REAL_VALUE_TYPE TWO31r;
22562 rtx x, fp;
22564 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
22565 NULL, 1, OPTAB_DIRECT);
22567 fp = gen_reg_rtx (DFmode);
22568 emit_insn (gen_floatsidf2 (fp, x));
22570 real_ldexp (&TWO31r, &dconst1, 31);
22571 x = const_double_from_real_value (TWO31r, DFmode);
22573 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
22574 if (x != target)
22575 emit_move_insn (target, x);
22578 /* Convert a signed DImode value into a DFmode. Only used for SSE in
22579 32-bit mode; otherwise we have a direct convert instruction. */
22581 void
22582 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
22584 REAL_VALUE_TYPE TWO32r;
22585 rtx fp_lo, fp_hi, x;
22587 fp_lo = gen_reg_rtx (DFmode);
22588 fp_hi = gen_reg_rtx (DFmode);
22590 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
22592 real_ldexp (&TWO32r, &dconst1, 32);
22593 x = const_double_from_real_value (TWO32r, DFmode);
22594 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
22596 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
22598 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
22599 0, OPTAB_DIRECT);
22600 if (x != target)
22601 emit_move_insn (target, x);
22604 /* Convert an unsigned SImode value into a SFmode, using only SSE.
22605 For x86_32, -mfpmath=sse, !optimize_size only. */
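/* Outline (illustrative): the input is split into 16-bit halves, each half
   is converted exactly by the signed SImode->SFmode conversion, and the
   result is recombined as hi * 65536.0f + lo, so no unsigned conversion
   instruction is needed.  */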
22606 void
22607 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
22609 REAL_VALUE_TYPE ONE16r;
22610 rtx fp_hi, fp_lo, int_hi, int_lo, x;
22612 real_ldexp (&ONE16r, &dconst1, 16);
22613 x = const_double_from_real_value (ONE16r, SFmode);
22614 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
22615 NULL, 0, OPTAB_DIRECT);
22616 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
22617 NULL, 0, OPTAB_DIRECT);
22618 fp_hi = gen_reg_rtx (SFmode);
22619 fp_lo = gen_reg_rtx (SFmode);
22620 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
22621 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
22622 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
22623 0, OPTAB_DIRECT);
22624 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
22625 0, OPTAB_DIRECT);
22626 if (!rtx_equal_p (target, fp_hi))
22627 emit_move_insn (target, fp_hi);
22630 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
22631 a vector of unsigned ints VAL to vector of floats TARGET. */
22633 void
22634 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
22636 rtx tmp[8];
22637 REAL_VALUE_TYPE TWO16r;
22638 machine_mode intmode = GET_MODE (val);
22639 machine_mode fltmode = GET_MODE (target);
22640 rtx (*cvt) (rtx, rtx);
22642 if (intmode == V4SImode)
22643 cvt = gen_floatv4siv4sf2;
22644 else
22645 cvt = gen_floatv8siv8sf2;
22646 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
22647 tmp[0] = force_reg (intmode, tmp[0]);
22648 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
22649 OPTAB_DIRECT);
22650 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
22651 NULL_RTX, 1, OPTAB_DIRECT);
22652 tmp[3] = gen_reg_rtx (fltmode);
22653 emit_insn (cvt (tmp[3], tmp[1]));
22654 tmp[4] = gen_reg_rtx (fltmode);
22655 emit_insn (cvt (tmp[4], tmp[2]));
22656 real_ldexp (&TWO16r, &dconst1, 16);
22657 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
22658 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
22659 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
22660 OPTAB_DIRECT);
22661 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
22662 OPTAB_DIRECT);
22663 if (tmp[7] != target)
22664 emit_move_insn (target, tmp[7]);
22667 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
22668 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
22669 This is done by doing just signed conversion if < 0x1p31, and otherwise by
22670 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
22672 rtx
22673 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
22675 REAL_VALUE_TYPE TWO31r;
22676 rtx two31r, tmp[4];
22677 machine_mode mode = GET_MODE (val);
22678 machine_mode scalarmode = GET_MODE_INNER (mode);
22679 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
22680 rtx (*cmp) (rtx, rtx, rtx, rtx);
22681 int i;
22683 for (i = 0; i < 3; i++)
22684 tmp[i] = gen_reg_rtx (mode);
22685 real_ldexp (&TWO31r, &dconst1, 31);
22686 two31r = const_double_from_real_value (TWO31r, scalarmode);
22687 two31r = ix86_build_const_vector (mode, 1, two31r);
22688 two31r = force_reg (mode, two31r);
22689 switch (mode)
22691 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
22692 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
22693 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
22694 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
22695 default: gcc_unreachable ();
22697 tmp[3] = gen_rtx_LE (mode, two31r, val);
22698 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
22699 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
22700 0, OPTAB_DIRECT);
22701 if (intmode == V4SImode || TARGET_AVX2)
22702 *xorp = expand_simple_binop (intmode, ASHIFT,
22703 gen_lowpart (intmode, tmp[0]),
22704 GEN_INT (31), NULL_RTX, 0,
22705 OPTAB_DIRECT);
22706 else
22708 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
22709 two31 = ix86_build_const_vector (intmode, 1, two31);
22710 *xorp = expand_simple_binop (intmode, AND,
22711 gen_lowpart (intmode, tmp[0]),
22712 two31, NULL_RTX, 0,
22713 OPTAB_DIRECT);
22715 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
22716 0, OPTAB_DIRECT);
22719 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
22720 then replicate the value for all elements of the vector
22721 register. */
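/* For instance (illustrative): ix86_build_const_vector (V4SImode, true,
   GEN_INT (0xffff)) returns a V4SImode CONST_VECTOR with all four elements
   equal to 0xffff, while passing VECT as false leaves elements 1..3 zero.  */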
22723 rtx
22724 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
22726 int i, n_elt;
22727 rtvec v;
22728 machine_mode scalar_mode;
22730 switch (mode)
22732 case V64QImode:
22733 case V32QImode:
22734 case V16QImode:
22735 case V32HImode:
22736 case V16HImode:
22737 case V8HImode:
22738 case V16SImode:
22739 case V8SImode:
22740 case V4SImode:
22741 case V8DImode:
22742 case V4DImode:
22743 case V2DImode:
22744 gcc_assert (vect);
22745 /* FALLTHRU */
22746 case V16SFmode:
22747 case V8SFmode:
22748 case V4SFmode:
22749 case V8DFmode:
22750 case V4DFmode:
22751 case V2DFmode:
22752 n_elt = GET_MODE_NUNITS (mode);
22753 v = rtvec_alloc (n_elt);
22754 scalar_mode = GET_MODE_INNER (mode);
22756 RTVEC_ELT (v, 0) = value;
22758 for (i = 1; i < n_elt; ++i)
22759 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
22761 return gen_rtx_CONST_VECTOR (mode, v);
22763 default:
22764 gcc_unreachable ();
22768 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
22769 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
22770 for an SSE register. If VECT is true, then replicate the mask for
22771 all elements of the vector register. If INVERT is true, then create
22772 a mask excluding the sign bit. */
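/* Concretely (illustrative): for V4SFmode with INVERT false this builds a
   vector of 0x80000000 masks, suitable for flipping or extracting sign bits
   with xorps/andps, and with INVERT true a vector of 0x7fffffff masks for
   clearing them.  */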
22774 rtx
22775 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
22777 machine_mode vec_mode, imode;
22778 wide_int w;
22779 rtx mask, v;
22781 switch (mode)
22783 case V16SImode:
22784 case V16SFmode:
22785 case V8SImode:
22786 case V4SImode:
22787 case V8SFmode:
22788 case V4SFmode:
22789 vec_mode = mode;
22790 imode = SImode;
22791 break;
22793 case V8DImode:
22794 case V4DImode:
22795 case V2DImode:
22796 case V8DFmode:
22797 case V4DFmode:
22798 case V2DFmode:
22799 vec_mode = mode;
22800 imode = DImode;
22801 break;
22803 case TImode:
22804 case TFmode:
22805 vec_mode = VOIDmode;
22806 imode = TImode;
22807 break;
22809 default:
22810 gcc_unreachable ();
22813 machine_mode inner_mode = GET_MODE_INNER (mode);
22814 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
22815 GET_MODE_BITSIZE (inner_mode));
22816 if (invert)
22817 w = wi::bit_not (w);
22819 /* Force this value into the low part of a fp vector constant. */
22820 mask = immed_wide_int_const (w, imode);
22821 mask = gen_lowpart (inner_mode, mask);
22823 if (vec_mode == VOIDmode)
22824 return force_reg (inner_mode, mask);
22826 v = ix86_build_const_vector (vec_mode, vect, mask);
22827 return force_reg (vec_mode, v);
22830 /* Generate code for floating point ABS or NEG. */
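/* As a rough sketch of the SSE path below: NEG becomes an XOR with the
   sign-bit mask (e.g. xorps for SFmode) and ABS an AND with the inverted
   mask (e.g. andps); without SSE the plain rtx is emitted and matches the
   x87 fchs/fabs patterns instead.  */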
22832 void
22833 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
22834 rtx operands[])
22836 rtx mask, set, dst, src;
22837 bool use_sse = false;
22838 bool vector_mode = VECTOR_MODE_P (mode);
22839 machine_mode vmode = mode;
22841 if (vector_mode)
22842 use_sse = true;
22843 else if (mode == TFmode)
22844 use_sse = true;
22845 else if (TARGET_SSE_MATH)
22847 use_sse = SSE_FLOAT_MODE_P (mode);
22848 if (mode == SFmode)
22849 vmode = V4SFmode;
22850 else if (mode == DFmode)
22851 vmode = V2DFmode;
22854 /* NEG and ABS performed with SSE use bitwise mask operations.
22855 Create the appropriate mask now. */
22856 if (use_sse)
22857 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
22858 else
22859 mask = NULL_RTX;
22861 dst = operands[0];
22862 src = operands[1];
22864 set = gen_rtx_fmt_e (code, mode, src);
22865 set = gen_rtx_SET (dst, set);
22867 if (mask)
22869 rtx use, clob;
22870 rtvec par;
22872 use = gen_rtx_USE (VOIDmode, mask);
22873 if (vector_mode)
22874 par = gen_rtvec (2, set, use);
22875 else
22877 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
22878 par = gen_rtvec (3, set, use, clob);
22880 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
22882 else
22883 emit_insn (set);
22886 /* Expand a copysign operation. Special case operand 0 being a constant. */
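/* In outline (illustrative): copysign (a, b) is computed bitwise as
   (a & ~SIGNMASK) | (b & SIGNMASK), with SIGNMASK built by
   ix86_build_signbit_mask; when the magnitude operand is a constant the
   first AND can be dropped because |a| is already known.  */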
22888 void
22889 ix86_expand_copysign (rtx operands[])
22891 machine_mode mode, vmode;
22892 rtx dest, op0, op1, mask, nmask;
22894 dest = operands[0];
22895 op0 = operands[1];
22896 op1 = operands[2];
22898 mode = GET_MODE (dest);
22900 if (mode == SFmode)
22901 vmode = V4SFmode;
22902 else if (mode == DFmode)
22903 vmode = V2DFmode;
22904 else
22905 vmode = mode;
22907 if (CONST_DOUBLE_P (op0))
22909 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
22911 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
22912 op0 = simplify_unary_operation (ABS, mode, op0, mode);
22914 if (mode == SFmode || mode == DFmode)
22916 if (op0 == CONST0_RTX (mode))
22917 op0 = CONST0_RTX (vmode);
22918 else
22920 rtx v = ix86_build_const_vector (vmode, false, op0);
22922 op0 = force_reg (vmode, v);
22925 else if (op0 != CONST0_RTX (mode))
22926 op0 = force_reg (mode, op0);
22928 mask = ix86_build_signbit_mask (vmode, 0, 0);
22930 if (mode == SFmode)
22931 copysign_insn = gen_copysignsf3_const;
22932 else if (mode == DFmode)
22933 copysign_insn = gen_copysigndf3_const;
22934 else
22935 copysign_insn = gen_copysigntf3_const;
22937 emit_insn (copysign_insn (dest, op0, op1, mask));
22939 else
22941 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
22943 nmask = ix86_build_signbit_mask (vmode, 0, 1);
22944 mask = ix86_build_signbit_mask (vmode, 0, 0);
22946 if (mode == SFmode)
22947 copysign_insn = gen_copysignsf3_var;
22948 else if (mode == DFmode)
22949 copysign_insn = gen_copysigndf3_var;
22950 else
22951 copysign_insn = gen_copysigntf3_var;
22953 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
22957 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
22958 be a constant, and so has already been expanded into a vector constant. */
22960 void
22961 ix86_split_copysign_const (rtx operands[])
22963 machine_mode mode, vmode;
22964 rtx dest, op0, mask, x;
22966 dest = operands[0];
22967 op0 = operands[1];
22968 mask = operands[3];
22970 mode = GET_MODE (dest);
22971 vmode = GET_MODE (mask);
22973 dest = lowpart_subreg (vmode, dest, mode);
22974 x = gen_rtx_AND (vmode, dest, mask);
22975 emit_insn (gen_rtx_SET (dest, x));
22977 if (op0 != CONST0_RTX (vmode))
22979 x = gen_rtx_IOR (vmode, dest, op0);
22980 emit_insn (gen_rtx_SET (dest, x));
22984 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
22985 so we have to do two masks. */
22987 void
22988 ix86_split_copysign_var (rtx operands[])
22990 machine_mode mode, vmode;
22991 rtx dest, scratch, op0, op1, mask, nmask, x;
22993 dest = operands[0];
22994 scratch = operands[1];
22995 op0 = operands[2];
22996 op1 = operands[3];
22997 nmask = operands[4];
22998 mask = operands[5];
23000 mode = GET_MODE (dest);
23001 vmode = GET_MODE (mask);
23003 if (rtx_equal_p (op0, op1))
23005 /* Shouldn't happen often (it's useless, obviously), but when it does
23006 we'd generate incorrect code if we continue below. */
23007 emit_move_insn (dest, op0);
23008 return;
23011 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
23013 gcc_assert (REGNO (op1) == REGNO (scratch));
23015 x = gen_rtx_AND (vmode, scratch, mask);
23016 emit_insn (gen_rtx_SET (scratch, x));
23018 dest = mask;
23019 op0 = lowpart_subreg (vmode, op0, mode);
23020 x = gen_rtx_NOT (vmode, dest);
23021 x = gen_rtx_AND (vmode, x, op0);
23022 emit_insn (gen_rtx_SET (dest, x));
23024 else
23026 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
23028 x = gen_rtx_AND (vmode, scratch, mask);
23030 else /* alternative 2,4 */
23032 gcc_assert (REGNO (mask) == REGNO (scratch));
23033 op1 = lowpart_subreg (vmode, op1, mode);
23034 x = gen_rtx_AND (vmode, scratch, op1);
23036 emit_insn (gen_rtx_SET (scratch, x));
23038 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
23040 dest = lowpart_subreg (vmode, op0, mode);
23041 x = gen_rtx_AND (vmode, dest, nmask);
23043 else /* alternative 3,4 */
23045 gcc_assert (REGNO (nmask) == REGNO (dest));
23046 dest = nmask;
23047 op0 = lowpart_subreg (vmode, op0, mode);
23048 x = gen_rtx_AND (vmode, dest, op0);
23050 emit_insn (gen_rtx_SET (dest, x));
23053 x = gen_rtx_IOR (vmode, dest, scratch);
23054 emit_insn (gen_rtx_SET (dest, x));
23057 /* Return TRUE or FALSE depending on whether the first SET in INSN
23058 has source and destination with matching CC modes, and that the
23059 CC mode is at least as constrained as REQ_MODE. */
23061 bool
23062 ix86_match_ccmode (rtx insn, machine_mode req_mode)
23064 rtx set;
23065 machine_mode set_mode;
23067 set = PATTERN (insn);
23068 if (GET_CODE (set) == PARALLEL)
23069 set = XVECEXP (set, 0, 0);
23070 gcc_assert (GET_CODE (set) == SET);
23071 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
23073 set_mode = GET_MODE (SET_DEST (set));
23074 switch (set_mode)
23076 case CCNOmode:
23077 if (req_mode != CCNOmode
23078 && (req_mode != CCmode
23079 || XEXP (SET_SRC (set), 1) != const0_rtx))
23080 return false;
23081 break;
23082 case CCmode:
23083 if (req_mode == CCGCmode)
23084 return false;
23085 /* FALLTHRU */
23086 case CCGCmode:
23087 if (req_mode == CCGOCmode || req_mode == CCNOmode)
23088 return false;
23089 /* FALLTHRU */
23090 case CCGOCmode:
23091 if (req_mode == CCZmode)
23092 return false;
23093 /* FALLTHRU */
23094 case CCZmode:
23095 break;
23097 case CCAmode:
23098 case CCCmode:
23099 case CCOmode:
23100 case CCPmode:
23101 case CCSmode:
23102 if (set_mode != req_mode)
23103 return false;
23104 break;
23106 default:
23107 gcc_unreachable ();
23110 return GET_MODE (SET_SRC (set)) == set_mode;
23113 /* Generate insn patterns to do an integer compare of OPERANDS. */
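/* The emitted pattern has the rough shape (mode chosen by SELECT_CC_MODE):
       (set (reg:CCZ flags) (compare:CCZ (reg:SI a) (reg:SI b)))
   and the returned rtx, e.g. (eq (reg:CCZ flags) (const_int 0)), is what
   the caller plugs into a jump, setcc or cmov pattern.  */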
23115 static rtx
23116 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
23118 machine_mode cmpmode;
23119 rtx tmp, flags;
23121 cmpmode = SELECT_CC_MODE (code, op0, op1);
23122 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
23124 /* This is very simple, but making the interface the same as in the
23125 FP case makes the rest of the code easier. */
23126 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
23127 emit_insn (gen_rtx_SET (flags, tmp));
23129 /* Return the test that should be put into the flags user, i.e.
23130 the bcc, scc, or cmov instruction. */
23131 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
23134 /* Figure out whether to use ordered or unordered fp comparisons.
23135 Return the appropriate mode to use. */
23137 machine_mode
23138 ix86_fp_compare_mode (enum rtx_code)
23140 /* ??? In order to make all comparisons reversible, we do all comparisons
23141 non-trapping when compiling for IEEE. Once gcc is able to distinguish
23142 all forms of trapping and nontrapping comparisons, we can make inequality
23143 comparisons trapping again, since that results in better code when using
23144 FCOM based compares. */
23145 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
23148 machine_mode
23149 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
23151 machine_mode mode = GET_MODE (op0);
23153 if (SCALAR_FLOAT_MODE_P (mode))
23155 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
23156 return ix86_fp_compare_mode (code);
23159 switch (code)
23161 /* Only zero flag is needed. */
23162 case EQ: /* ZF=0 */
23163 case NE: /* ZF!=0 */
23164 return CCZmode;
23165 /* Codes needing carry flag. */
23166 case GEU: /* CF=0 */
23167 case LTU: /* CF=1 */
23168 /* Detect overflow checks. They need just the carry flag. */
23169 if (GET_CODE (op0) == PLUS
23170 && (rtx_equal_p (op1, XEXP (op0, 0))
23171 || rtx_equal_p (op1, XEXP (op0, 1))))
23172 return CCCmode;
23173 else
23174 return CCmode;
23175 case GTU: /* CF=0 & ZF=0 */
23176 case LEU: /* CF=1 | ZF=1 */
23177 return CCmode;
23178 /* Codes possibly doable only with sign flag when
23179 comparing against zero. */
23180 case GE: /* SF=OF or SF=0 */
23181 case LT: /* SF<>OF or SF=1 */
23182 if (op1 == const0_rtx)
23183 return CCGOCmode;
23184 else
23185 /* For other cases Carry flag is not required. */
23186 return CCGCmode;
23187 /* Codes doable only with the sign flag when comparing
23188 against zero, but we miss the jump instruction for it,
23189 so we need to use relational tests against overflow,
23190 which thus needs to be zero. */
23191 case GT: /* ZF=0 & SF=OF */
23192 case LE: /* ZF=1 | SF<>OF */
23193 if (op1 == const0_rtx)
23194 return CCNOmode;
23195 else
23196 return CCGCmode;
23197 /* The strcmp pattern does (use flags) and combine may ask us for the
23198 proper mode. */
23199 case USE:
23200 return CCmode;
23201 default:
23202 gcc_unreachable ();
23206 /* Return the fixed registers used for condition codes. */
23208 static bool
23209 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
23211 *p1 = FLAGS_REG;
23212 *p2 = FPSR_REG;
23213 return true;
23216 /* If two condition code modes are compatible, return a condition code
23217 mode which is compatible with both. Otherwise, return
23218 VOIDmode. */
23220 static machine_mode
23221 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
23223 if (m1 == m2)
23224 return m1;
23226 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
23227 return VOIDmode;
23229 if ((m1 == CCGCmode && m2 == CCGOCmode)
23230 || (m1 == CCGOCmode && m2 == CCGCmode))
23231 return CCGCmode;
23233 if ((m1 == CCNOmode && m2 == CCGOCmode)
23234 || (m1 == CCGOCmode && m2 == CCNOmode))
23235 return CCNOmode;
23237 if (m1 == CCZmode
23238 && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode))
23239 return m2;
23240 else if (m2 == CCZmode
23241 && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode))
23242 return m1;
23244 switch (m1)
23246 default:
23247 gcc_unreachable ();
23249 case CCmode:
23250 case CCGCmode:
23251 case CCGOCmode:
23252 case CCNOmode:
23253 case CCAmode:
23254 case CCCmode:
23255 case CCOmode:
23256 case CCPmode:
23257 case CCSmode:
23258 case CCZmode:
23259 switch (m2)
23261 default:
23262 return VOIDmode;
23264 case CCmode:
23265 case CCGCmode:
23266 case CCGOCmode:
23267 case CCNOmode:
23268 case CCAmode:
23269 case CCCmode:
23270 case CCOmode:
23271 case CCPmode:
23272 case CCSmode:
23273 case CCZmode:
23274 return CCmode;
23277 case CCFPmode:
23278 case CCFPUmode:
23279 /* These are only compatible with themselves, which we already
23280 checked above. */
23281 return VOIDmode;
23286 /* Return a comparison we can do and that it is equivalent to
23287 swap_condition (code) apart possibly from orderedness.
23288 But, never change orderedness if TARGET_IEEE_FP, returning
23289 UNKNOWN in that case if necessary. */
23291 static enum rtx_code
23292 ix86_fp_swap_condition (enum rtx_code code)
23294 switch (code)
23296 case GT: /* GTU - CF=0 & ZF=0 */
23297 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
23298 case GE: /* GEU - CF=0 */
23299 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
23300 case UNLT: /* LTU - CF=1 */
23301 return TARGET_IEEE_FP ? UNKNOWN : GT;
23302 case UNLE: /* LEU - CF=1 | ZF=1 */
23303 return TARGET_IEEE_FP ? UNKNOWN : GE;
23304 default:
23305 return swap_condition (code);
23309 /* Return the cost of comparison CODE using the best strategy for performance.
23310 All following functions use the number of instructions as a cost metric.
23311 In the future this should be tweaked to compute bytes for optimize_size and
23312 take into account the performance of various instructions on various CPUs. */
23314 static int
23315 ix86_fp_comparison_cost (enum rtx_code code)
23317 int arith_cost;
23319 /* The cost of code using bit-twiddling on %ah. */
23320 switch (code)
23322 case UNLE:
23323 case UNLT:
23324 case LTGT:
23325 case GT:
23326 case GE:
23327 case UNORDERED:
23328 case ORDERED:
23329 case UNEQ:
23330 arith_cost = 4;
23331 break;
23332 case LT:
23333 case NE:
23334 case EQ:
23335 case UNGE:
23336 arith_cost = TARGET_IEEE_FP ? 5 : 4;
23337 break;
23338 case LE:
23339 case UNGT:
23340 arith_cost = TARGET_IEEE_FP ? 6 : 4;
23341 break;
23342 default:
23343 gcc_unreachable ();
23346 switch (ix86_fp_comparison_strategy (code))
23348 case IX86_FPCMP_COMI:
23349 return arith_cost > 4 ? 3 : 2;
23350 case IX86_FPCMP_SAHF:
23351 return arith_cost > 4 ? 4 : 3;
23352 default:
23353 return arith_cost;
23357 /* Return strategy to use for floating-point. We assume that fcomi is always
23358 preferable where available, since that is also true when looking at size
23359 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
23361 enum ix86_fpcmp_strategy
23362 ix86_fp_comparison_strategy (enum rtx_code)
23364 /* Do fcomi/sahf based test when profitable. */
23366 if (TARGET_CMOVE)
23367 return IX86_FPCMP_COMI;
23369 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
23370 return IX86_FPCMP_SAHF;
23372 return IX86_FPCMP_ARITH;
23375 /* Swap, force into registers, or otherwise massage the two operands
23376 to an fp comparison. The operands are updated in place; the new
23377 comparison code is returned. */
23379 static enum rtx_code
23380 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
23382 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
23383 rtx op0 = *pop0, op1 = *pop1;
23384 machine_mode op_mode = GET_MODE (op0);
23385 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
23387 /* All of the unordered compare instructions only work on registers.
23388 The same is true of the fcomi compare instructions. The XFmode
23389 compare instructions require registers except when comparing
23390 against zero or when converting operand 1 from fixed point to
23391 floating point. */
23393 if (!is_sse
23394 && (fpcmp_mode == CCFPUmode
23395 || (op_mode == XFmode
23396 && ! (standard_80387_constant_p (op0) == 1
23397 || standard_80387_constant_p (op1) == 1)
23398 && GET_CODE (op1) != FLOAT)
23399 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
23401 op0 = force_reg (op_mode, op0);
23402 op1 = force_reg (op_mode, op1);
23404 else
23406 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
23407 things around if they appear profitable, otherwise force op0
23408 into a register. */
23410 if (standard_80387_constant_p (op0) == 0
23411 || (MEM_P (op0)
23412 && ! (standard_80387_constant_p (op1) == 0
23413 || MEM_P (op1))))
23415 enum rtx_code new_code = ix86_fp_swap_condition (code);
23416 if (new_code != UNKNOWN)
23418 std::swap (op0, op1);
23419 code = new_code;
23423 if (!REG_P (op0))
23424 op0 = force_reg (op_mode, op0);
23426 if (CONSTANT_P (op1))
23428 int tmp = standard_80387_constant_p (op1);
23429 if (tmp == 0)
23430 op1 = validize_mem (force_const_mem (op_mode, op1));
23431 else if (tmp == 1)
23433 if (TARGET_CMOVE)
23434 op1 = force_reg (op_mode, op1);
23436 else
23437 op1 = force_reg (op_mode, op1);
23441 /* Try to rearrange the comparison to make it cheaper. */
23442 if (ix86_fp_comparison_cost (code)
23443 > ix86_fp_comparison_cost (swap_condition (code))
23444 && (REG_P (op1) || can_create_pseudo_p ()))
23446 std::swap (op0, op1);
23447 code = swap_condition (code);
23448 if (!REG_P (op0))
23449 op0 = force_reg (op_mode, op0);
23452 *pop0 = op0;
23453 *pop1 = op1;
23454 return code;
23457 /* Convert comparison codes we use to represent FP comparison to integer
23458 code that will result in proper branch. Return UNKNOWN if no such code
23459 is available. */
23461 enum rtx_code
23462 ix86_fp_compare_code_to_integer (enum rtx_code code)
23464 switch (code)
23466 case GT:
23467 return GTU;
23468 case GE:
23469 return GEU;
23470 case ORDERED:
23471 case UNORDERED:
23472 return code;
23473 case UNEQ:
23474 return EQ;
23475 case UNLT:
23476 return LTU;
23477 case UNLE:
23478 return LEU;
23479 case LTGT:
23480 return NE;
23481 default:
23482 return UNKNOWN;
23486 /* Generate insn patterns to do a floating point compare of OPERANDS. */
23488 static rtx
23489 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
23491 machine_mode fpcmp_mode, intcmp_mode;
23492 rtx tmp, tmp2;
23494 fpcmp_mode = ix86_fp_compare_mode (code);
23495 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
23497 /* Do fcomi/sahf based test when profitable. */
23498 switch (ix86_fp_comparison_strategy (code))
23500 case IX86_FPCMP_COMI:
23501 intcmp_mode = fpcmp_mode;
23502 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
23503 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
23504 emit_insn (tmp);
23505 break;
23507 case IX86_FPCMP_SAHF:
23508 intcmp_mode = fpcmp_mode;
23509 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
23510 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
23512 if (!scratch)
23513 scratch = gen_reg_rtx (HImode);
23514 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
23515 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
23516 break;
23518 case IX86_FPCMP_ARITH:
23519 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
23520 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
23521 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
23522 if (!scratch)
23523 scratch = gen_reg_rtx (HImode);
23524 emit_insn (gen_rtx_SET (scratch, tmp2));
23526 /* In the unordered case, we have to check C2 for NaN's, which
23527 doesn't happen to work out to anything nice combination-wise.
23528 So do some bit twiddling on the value we've got in AH to come
23529 up with an appropriate set of condition codes. */
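/* For reference (a sketch, not normative): after fnstsw the AH byte holds
   C0 in bit 0 (0x01), C2 in bit 2 (0x04) and C3 in bit 6 (0x40), so 0x45
   masks all three; fcom sets C3:C2:C0 to 000 for >, 001 for <, 100 for ==
   and 111 for unordered, which is what the tests below decode.  */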
23531 intcmp_mode = CCNOmode;
23532 switch (code)
23534 case GT:
23535 case UNGT:
23536 if (code == GT || !TARGET_IEEE_FP)
23538 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
23539 code = EQ;
23541 else
23543 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23544 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
23545 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
23546 intcmp_mode = CCmode;
23547 code = GEU;
23549 break;
23550 case LT:
23551 case UNLT:
23552 if (code == LT && TARGET_IEEE_FP)
23554 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23555 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
23556 intcmp_mode = CCmode;
23557 code = EQ;
23559 else
23561 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
23562 code = NE;
23564 break;
23565 case GE:
23566 case UNGE:
23567 if (code == GE || !TARGET_IEEE_FP)
23569 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
23570 code = EQ;
23572 else
23574 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23575 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
23576 code = NE;
23578 break;
23579 case LE:
23580 case UNLE:
23581 if (code == LE && TARGET_IEEE_FP)
23583 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23584 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
23585 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
23586 intcmp_mode = CCmode;
23587 code = LTU;
23589 else
23591 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
23592 code = NE;
23594 break;
23595 case EQ:
23596 case UNEQ:
23597 if (code == EQ && TARGET_IEEE_FP)
23599 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23600 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
23601 intcmp_mode = CCmode;
23602 code = EQ;
23604 else
23606 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
23607 code = NE;
23609 break;
23610 case NE:
23611 case LTGT:
23612 if (code == NE && TARGET_IEEE_FP)
23614 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23615 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
23616 GEN_INT (0x40)));
23617 code = NE;
23619 else
23621 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
23622 code = EQ;
23624 break;
23626 case UNORDERED:
23627 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
23628 code = NE;
23629 break;
23630 case ORDERED:
23631 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
23632 code = EQ;
23633 break;
23635 default:
23636 gcc_unreachable ();
23638 break;
23640 default:
23641 gcc_unreachable();
23644 /* Return the test that should be put into the flags user, i.e.
23645 the bcc, scc, or cmov instruction. */
23646 return gen_rtx_fmt_ee (code, VOIDmode,
23647 gen_rtx_REG (intcmp_mode, FLAGS_REG),
23648 const0_rtx);
23651 static rtx
23652 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
23654 rtx ret;
23656 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
23657 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
23659 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
23661 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
23662 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
23664 else
23665 ret = ix86_expand_int_compare (code, op0, op1);
23667 return ret;
23670 void
23671 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
23673 machine_mode mode = GET_MODE (op0);
23674 rtx tmp;
23676 /* Handle the special case of a vector comparison with a boolean result;
23677 transform it using the ptest instruction. */
23678 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
23680 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
23681 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
23683 gcc_assert (code == EQ || code == NE);
23684 /* Generate XOR since we can't check that one operand is zero vector. */
23685 tmp = gen_reg_rtx (mode);
23686 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
23687 tmp = gen_lowpart (p_mode, tmp);
23688 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
23689 gen_rtx_UNSPEC (CCmode,
23690 gen_rtvec (2, tmp, tmp),
23691 UNSPEC_PTEST)));
23692 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
23693 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23694 gen_rtx_LABEL_REF (VOIDmode, label),
23695 pc_rtx);
23696 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
23697 return;
23700 switch (mode)
23702 case SFmode:
23703 case DFmode:
23704 case XFmode:
23705 case QImode:
23706 case HImode:
23707 case SImode:
23708 simple:
23709 tmp = ix86_expand_compare (code, op0, op1);
23710 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23711 gen_rtx_LABEL_REF (VOIDmode, label),
23712 pc_rtx);
23713 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
23714 return;
23716 case DImode:
23717 if (TARGET_64BIT)
23718 goto simple;
23719 /* For a 32-bit target, DImode comparisons may be performed on
23720 SSE registers. To allow this we should avoid splitting into
23721 SImode, which is achieved by doing the xor in DImode
23722 and then comparing with zero (which is recognized by the
23723 STV pass). We don't compare using xor when optimizing
23724 for size. */
23725 if (!optimize_insn_for_size_p ()
23726 && TARGET_STV
23727 && (code == EQ || code == NE))
23729 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
23730 op1 = const0_rtx;
23732 /* FALLTHRU */
23733 case TImode:
23734 /* Expand DImode branch into multiple compare+branch. */
23736 rtx lo[2], hi[2];
23737 rtx_code_label *label2;
23738 enum rtx_code code1, code2, code3;
23739 machine_mode submode;
23741 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
23743 std::swap (op0, op1);
23744 code = swap_condition (code);
23747 split_double_mode (mode, &op0, 1, lo+0, hi+0);
23748 split_double_mode (mode, &op1, 1, lo+1, hi+1);
23750 submode = mode == DImode ? SImode : DImode;
23752 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
23753 avoid two branches. This costs one extra insn, so disable when
23754 optimizing for size. */
23756 if ((code == EQ || code == NE)
23757 && (!optimize_insn_for_size_p ()
23758 || hi[1] == const0_rtx || lo[1] == const0_rtx))
23760 rtx xor0, xor1;
23762 xor1 = hi[0];
23763 if (hi[1] != const0_rtx)
23764 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
23765 NULL_RTX, 0, OPTAB_WIDEN);
23767 xor0 = lo[0];
23768 if (lo[1] != const0_rtx)
23769 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
23770 NULL_RTX, 0, OPTAB_WIDEN);
23772 tmp = expand_binop (submode, ior_optab, xor1, xor0,
23773 NULL_RTX, 0, OPTAB_WIDEN);
23775 ix86_expand_branch (code, tmp, const0_rtx, label);
23776 return;
23779 /* Otherwise, if we are doing less-than or greater-or-equal-than,
23780 op1 is a constant and the low word is zero, then we can just
23781 examine the high word. Similarly for low word -1 and
23782 less-or-equal-than or greater-than. */
23784 if (CONST_INT_P (hi[1]))
23785 switch (code)
23787 case LT: case LTU: case GE: case GEU:
23788 if (lo[1] == const0_rtx)
23790 ix86_expand_branch (code, hi[0], hi[1], label);
23791 return;
23793 break;
23794 case LE: case LEU: case GT: case GTU:
23795 if (lo[1] == constm1_rtx)
23797 ix86_expand_branch (code, hi[0], hi[1], label);
23798 return;
23800 break;
23801 default:
23802 break;
23805 /* Otherwise, we need two or three jumps. */
23807 label2 = gen_label_rtx ();
23809 code1 = code;
23810 code2 = swap_condition (code);
23811 code3 = unsigned_condition (code);
23813 switch (code)
23815 case LT: case GT: case LTU: case GTU:
23816 break;
23818 case LE: code1 = LT; code2 = GT; break;
23819 case GE: code1 = GT; code2 = LT; break;
23820 case LEU: code1 = LTU; code2 = GTU; break;
23821 case GEU: code1 = GTU; code2 = LTU; break;
23823 case EQ: code1 = UNKNOWN; code2 = NE; break;
23824 case NE: code2 = UNKNOWN; break;
23826 default:
23827 gcc_unreachable ();
23831 * a < b =>
23832 * if (hi(a) < hi(b)) goto true;
23833 * if (hi(a) > hi(b)) goto false;
23834 * if (lo(a) < lo(b)) goto true;
23835 * false:
23838 if (code1 != UNKNOWN)
23839 ix86_expand_branch (code1, hi[0], hi[1], label);
23840 if (code2 != UNKNOWN)
23841 ix86_expand_branch (code2, hi[0], hi[1], label2);
23843 ix86_expand_branch (code3, lo[0], lo[1], label);
23845 if (code2 != UNKNOWN)
23846 emit_label (label2);
23847 return;
23850 default:
23851 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
23852 goto simple;
23856 /* Split branch based on floating point condition. */
23857 void
23858 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
23859 rtx target1, rtx target2, rtx tmp)
23861 rtx condition;
23862 rtx_insn *i;
23864 if (target2 != pc_rtx)
23866 std::swap (target1, target2);
23867 code = reverse_condition_maybe_unordered (code);
23870 condition = ix86_expand_fp_compare (code, op1, op2,
23871 tmp);
23873 i = emit_jump_insn (gen_rtx_SET
23874 (pc_rtx,
23875 gen_rtx_IF_THEN_ELSE (VOIDmode,
23876 condition, target1, target2)));
23877 if (split_branch_probability.initialized_p ())
23878 add_reg_br_prob_note (i, split_branch_probability);
23881 void
23882 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
23884 rtx ret;
23886 gcc_assert (GET_MODE (dest) == QImode);
23888 ret = ix86_expand_compare (code, op0, op1);
23889 PUT_MODE (ret, QImode);
23890 emit_insn (gen_rtx_SET (dest, ret));
23893 /* Expand comparison setting or clearing carry flag. Return true when
23894 successful and set pop for the operation. */
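/* A few illustrative transformations performed below: a == 0 becomes
   (unsigned) a < 1, a > 5U becomes a >= 6U, and a >= 0 on a signed SImode
   value becomes (unsigned) a < 0x80000000, all of which read the result
   straight from the carry flag via LTU/GEU.  */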
23895 static bool
23896 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
23898 machine_mode mode =
23899 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
23901 /* Do not handle double-mode compares that go through the special path. */
23902 if (mode == (TARGET_64BIT ? TImode : DImode))
23903 return false;
23905 if (SCALAR_FLOAT_MODE_P (mode))
23907 rtx compare_op;
23908 rtx_insn *compare_seq;
23910 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
23912 /* Shortcut: the following common codes never translate
23913 into carry flag compares. */
23914 if (code == EQ || code == NE || code == UNEQ || code == LTGT
23915 || code == ORDERED || code == UNORDERED)
23916 return false;
23918 /* These comparisons require the zero flag; swap operands so they don't. */
23919 if ((code == GT || code == UNLE || code == LE || code == UNGT)
23920 && !TARGET_IEEE_FP)
23922 std::swap (op0, op1);
23923 code = swap_condition (code);
23926 /* Try to expand the comparison and verify that we end up with
23927 a carry flag based comparison. This fails to be true only when
23928 we decide to expand the comparison using arithmetic, which is
23929 not a common scenario. */
23930 start_sequence ();
23931 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
23932 compare_seq = get_insns ();
23933 end_sequence ();
23935 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
23936 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
23937 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
23938 else
23939 code = GET_CODE (compare_op);
23941 if (code != LTU && code != GEU)
23942 return false;
23944 emit_insn (compare_seq);
23945 *pop = compare_op;
23946 return true;
23949 if (!INTEGRAL_MODE_P (mode))
23950 return false;
23952 switch (code)
23954 case LTU:
23955 case GEU:
23956 break;
23958 /* Convert a==0 into (unsigned)a<1. */
23959 case EQ:
23960 case NE:
23961 if (op1 != const0_rtx)
23962 return false;
23963 op1 = const1_rtx;
23964 code = (code == EQ ? LTU : GEU);
23965 break;
23967 /* Convert a>b into b<a or a>=b+1. */
23968 case GTU:
23969 case LEU:
23970 if (CONST_INT_P (op1))
23972 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
23973 /* Bail out on overflow. We could still swap the operands, but that
23974 would force loading of the constant into a register. */
23975 if (op1 == const0_rtx
23976 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
23977 return false;
23978 code = (code == GTU ? GEU : LTU);
23980 else
23982 std::swap (op0, op1);
23983 code = (code == GTU ? LTU : GEU);
23985 break;
23987 /* Convert a>=0 into (unsigned)a<0x80000000. */
23988 case LT:
23989 case GE:
23990 if (mode == DImode || op1 != const0_rtx)
23991 return false;
23992 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
23993 code = (code == LT ? GEU : LTU);
23994 break;
23995 case LE:
23996 case GT:
23997 if (mode == DImode || op1 != constm1_rtx)
23998 return false;
23999 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
24000 code = (code == LE ? GEU : LTU);
24001 break;
24003 default:
24004 return false;
24006 /* Swapping operands may cause a constant to appear as the first operand. */
24007 if (!nonimmediate_operand (op0, VOIDmode))
24009 if (!can_create_pseudo_p ())
24010 return false;
24011 op0 = force_reg (mode, op0);
24013 *pop = ix86_expand_compare (code, op0, op1);
24014 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
24015 return true;
24018 bool
24019 ix86_expand_int_movcc (rtx operands[])
24021 enum rtx_code code = GET_CODE (operands[1]), compare_code;
24022 rtx_insn *compare_seq;
24023 rtx compare_op;
24024 machine_mode mode = GET_MODE (operands[0]);
24025 bool sign_bit_compare_p = false;
24026 rtx op0 = XEXP (operands[1], 0);
24027 rtx op1 = XEXP (operands[1], 1);
24029 if (GET_MODE (op0) == TImode
24030 || (GET_MODE (op0) == DImode
24031 && !TARGET_64BIT))
24032 return false;
24034 start_sequence ();
24035 compare_op = ix86_expand_compare (code, op0, op1);
24036 compare_seq = get_insns ();
24037 end_sequence ();
24039 compare_code = GET_CODE (compare_op);
24041 if ((op1 == const0_rtx && (code == GE || code == LT))
24042 || (op1 == constm1_rtx && (code == GT || code == LE)))
24043 sign_bit_compare_p = true;
24045 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
24046 HImode insns, we'd be swallowed in word prefix ops. */
24048 if ((mode != HImode || TARGET_FAST_PREFIX)
24049 && (mode != (TARGET_64BIT ? TImode : DImode))
24050 && CONST_INT_P (operands[2])
24051 && CONST_INT_P (operands[3]))
24053 rtx out = operands[0];
24054 HOST_WIDE_INT ct = INTVAL (operands[2]);
24055 HOST_WIDE_INT cf = INTVAL (operands[3]);
24056 HOST_WIDE_INT diff;
24058 diff = ct - cf;
24059 /* Sign bit compares are better done using shifts than using
24060 sbb. */
24061 if (sign_bit_compare_p
24062 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
24064 /* Detect overlap between destination and compare sources. */
24065 rtx tmp = out;
24067 if (!sign_bit_compare_p)
24069 rtx flags;
24070 bool fpcmp = false;
24072 compare_code = GET_CODE (compare_op);
24074 flags = XEXP (compare_op, 0);
24076 if (GET_MODE (flags) == CCFPmode
24077 || GET_MODE (flags) == CCFPUmode)
24079 fpcmp = true;
24080 compare_code
24081 = ix86_fp_compare_code_to_integer (compare_code);
24084 /* To simplify the rest of the code, restrict to the GEU case. */
24085 if (compare_code == LTU)
24087 std::swap (ct, cf);
24088 compare_code = reverse_condition (compare_code);
24089 code = reverse_condition (code);
24091 else
24093 if (fpcmp)
24094 PUT_CODE (compare_op,
24095 reverse_condition_maybe_unordered
24096 (GET_CODE (compare_op)));
24097 else
24098 PUT_CODE (compare_op,
24099 reverse_condition (GET_CODE (compare_op)));
24101 diff = ct - cf;
24103 if (reg_overlap_mentioned_p (out, op0)
24104 || reg_overlap_mentioned_p (out, op1))
24105 tmp = gen_reg_rtx (mode);
24107 if (mode == DImode)
24108 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
24109 else
24110 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
24111 flags, compare_op));
24113 else
24115 if (code == GT || code == GE)
24116 code = reverse_condition (code);
24117 else
24119 std::swap (ct, cf);
24120 diff = ct - cf;
24122 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
24125 if (diff == 1)
24128 * cmpl op0,op1
24129 * sbbl dest,dest
24130 * [addl dest, ct]
24132 * Size 5 - 8.
24134 if (ct)
24135 tmp = expand_simple_binop (mode, PLUS,
24136 tmp, GEN_INT (ct),
24137 copy_rtx (tmp), 1, OPTAB_DIRECT);
24139 else if (cf == -1)
24142 * cmpl op0,op1
24143 * sbbl dest,dest
24144 * orl $ct, dest
24146 * Size 8.
24148 tmp = expand_simple_binop (mode, IOR,
24149 tmp, GEN_INT (ct),
24150 copy_rtx (tmp), 1, OPTAB_DIRECT);
24152 else if (diff == -1 && ct)
24155 * cmpl op0,op1
24156 * sbbl dest,dest
24157 * notl dest
24158 * [addl dest, cf]
24160 * Size 8 - 11.
24162 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
24163 if (cf)
24164 tmp = expand_simple_binop (mode, PLUS,
24165 copy_rtx (tmp), GEN_INT (cf),
24166 copy_rtx (tmp), 1, OPTAB_DIRECT);
24168 else
24171 * cmpl op0,op1
24172 * sbbl dest,dest
24173 * [notl dest]
24174 * andl cf - ct, dest
24175 * [addl dest, ct]
24177 * Size 8 - 11.
24180 if (cf == 0)
24182 cf = ct;
24183 ct = 0;
24184 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
24187 tmp = expand_simple_binop (mode, AND,
24188 copy_rtx (tmp),
24189 gen_int_mode (cf - ct, mode),
24190 copy_rtx (tmp), 1, OPTAB_DIRECT);
24191 if (ct)
24192 tmp = expand_simple_binop (mode, PLUS,
24193 copy_rtx (tmp), GEN_INT (ct),
24194 copy_rtx (tmp), 1, OPTAB_DIRECT);
24197 if (!rtx_equal_p (tmp, out))
24198 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
24200 return true;
24203 if (diff < 0)
24205 machine_mode cmp_mode = GET_MODE (op0);
24206 enum rtx_code new_code;
24208 if (SCALAR_FLOAT_MODE_P (cmp_mode))
24210 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
24212 /* We may be reversing an unordered compare to a normal compare; that
24213 is not valid in general (we may convert a non-trapping condition
24214 to a trapping one), however on i386 we currently emit all
24215 comparisons unordered. */
24216 new_code = reverse_condition_maybe_unordered (code);
24218 else
24219 new_code = ix86_reverse_condition (code, cmp_mode);
24220 if (new_code != UNKNOWN)
24222 std::swap (ct, cf);
24223 diff = -diff;
24224 code = new_code;
24228 compare_code = UNKNOWN;
24229 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
24230 && CONST_INT_P (op1))
24232 if (op1 == const0_rtx
24233 && (code == LT || code == GE))
24234 compare_code = code;
24235 else if (op1 == constm1_rtx)
24237 if (code == LE)
24238 compare_code = LT;
24239 else if (code == GT)
24240 compare_code = GE;
24244 /* Optimize dest = (op0 < 0) ? -1 : cf. */
24245 if (compare_code != UNKNOWN
24246 && GET_MODE (op0) == GET_MODE (out)
24247 && (cf == -1 || ct == -1))
24249 /* If lea code below could be used, only optimize
24250 if it results in a 2 insn sequence. */
24252 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
24253 || diff == 3 || diff == 5 || diff == 9)
24254 || (compare_code == LT && ct == -1)
24255 || (compare_code == GE && cf == -1))
24258 * notl op1 (if necessary)
24259 * sarl $31, op1
24260 * orl cf, op1
24262 if (ct != -1)
24264 cf = ct;
24265 ct = -1;
24266 code = reverse_condition (code);
24269 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
24271 out = expand_simple_binop (mode, IOR,
24272 out, GEN_INT (cf),
24273 out, 1, OPTAB_DIRECT);
24274 if (out != operands[0])
24275 emit_move_insn (operands[0], out);
24277 return true;
24282 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
24283 || diff == 3 || diff == 5 || diff == 9)
24284 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
24285 && (mode != DImode
24286 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
24289 * xorl dest,dest
24290 * cmpl op1,op2
24291 * setcc dest
24292 * lea cf(dest*(ct-cf)),dest
24294 * Size 14.
24296 * This also catches the degenerate setcc-only case.
24299 rtx tmp;
24300 int nops;
24302 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
24304 nops = 0;
24305 /* On x86_64 the lea instruction operates on Pmode, so we need
24306 to get the arithmetic done in the proper mode to match. */
24307 if (diff == 1)
24308 tmp = copy_rtx (out);
24309 else
24311 rtx out1;
24312 out1 = copy_rtx (out);
24313 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
24314 nops++;
24315 if (diff & 1)
24317 tmp = gen_rtx_PLUS (mode, tmp, out1);
24318 nops++;
24321 if (cf != 0)
24323 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
24324 nops++;
24326 if (!rtx_equal_p (tmp, out))
24328 if (nops == 1)
24329 out = force_operand (tmp, copy_rtx (out));
24330 else
24331 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
24333 if (!rtx_equal_p (out, operands[0]))
24334 emit_move_insn (operands[0], copy_rtx (out));
24336 return true;
24340 * General case: Jumpful:
24341 * xorl dest,dest cmpl op1, op2
24342 * cmpl op1, op2 movl ct, dest
24343 * setcc dest jcc 1f
24344 * decl dest movl cf, dest
24345 * andl (cf-ct),dest 1:
24346 * addl ct,dest
24348 * Size 20. Size 14.
24350 * This is reasonably steep, but branch mispredict costs are
24351 * high on modern cpus, so consider failing only if optimizing
24352 * for space.
24355 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
24356 && BRANCH_COST (optimize_insn_for_speed_p (),
24357 false) >= 2)
24359 if (cf == 0)
24361 machine_mode cmp_mode = GET_MODE (op0);
24362 enum rtx_code new_code;
24364 if (SCALAR_FLOAT_MODE_P (cmp_mode))
24366 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
24368 /* We may be reversing an unordered compare to a normal compare,
24369 which is not valid in general (we may convert a non-trapping
24370 condition to a trapping one), however on i386 we currently
24371 emit all comparisons unordered. */
24372 new_code = reverse_condition_maybe_unordered (code);
24374 else
24376 new_code = ix86_reverse_condition (code, cmp_mode);
24377 if (compare_code != UNKNOWN && new_code != UNKNOWN)
24378 compare_code = reverse_condition (compare_code);
24381 if (new_code != UNKNOWN)
24383 cf = ct;
24384 ct = 0;
24385 code = new_code;
24389 if (compare_code != UNKNOWN)
24391 /* notl op1 (if needed)
24392 sarl $31, op1
24393 andl (cf-ct), op1
24394 addl ct, op1
24396 For x < 0 (resp. x <= -1) there will be no notl,
24397 so if possible swap the constants to get rid of the
24398 complement.
24399 True/false will be -1/0 while code below (store flag
24400 followed by decrement) is 0/-1, so the constants need
24401 to be exchanged once more. */
24403 if (compare_code == GE || !cf)
24405 code = reverse_condition (code);
24406 compare_code = LT;
24408 else
24409 std::swap (ct, cf);
24411 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
24413 else
24415 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
24417 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
24418 constm1_rtx,
24419 copy_rtx (out), 1, OPTAB_DIRECT);
24422 out = expand_simple_binop (mode, AND, copy_rtx (out),
24423 gen_int_mode (cf - ct, mode),
24424 copy_rtx (out), 1, OPTAB_DIRECT);
24425 if (ct)
24426 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
24427 copy_rtx (out), 1, OPTAB_DIRECT);
24428 if (!rtx_equal_p (out, operands[0]))
24429 emit_move_insn (operands[0], copy_rtx (out));
24431 return true;
24435 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
24437 /* Try a few more things with specific constants and a variable. */
24439 optab op;
24440 rtx var, orig_out, out, tmp;
24442 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
24443 return false;
24445 /* If one of the two operands is an interesting constant, load a
24446 constant with the above and mask it in with a logical operation. */
24448 if (CONST_INT_P (operands[2]))
24450 var = operands[3];
24451 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
24452 operands[3] = constm1_rtx, op = and_optab;
24453 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
24454 operands[3] = const0_rtx, op = ior_optab;
24455 else
24456 return false;
24458 else if (CONST_INT_P (operands[3]))
24460 var = operands[2];
24461 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
24462 operands[2] = constm1_rtx, op = and_optab;
24463 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
24464 operands[2] = const0_rtx, op = ior_optab;
24465 else
24466 return false;
24468 else
24469 return false;
24471 orig_out = operands[0];
24472 tmp = gen_reg_rtx (mode);
24473 operands[0] = tmp;
24475 /* Recurse to get the constant loaded. */
24476 if (!ix86_expand_int_movcc (operands))
24477 return false;
24479 /* Mask in the interesting variable. */
24480 out = expand_binop (mode, op, var, tmp, orig_out, 0,
24481 OPTAB_WIDEN);
24482 if (!rtx_equal_p (out, orig_out))
24483 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
24485 return true;
24489 * For comparison with above,
24491 * movl cf,dest
24492 * movl ct,tmp
24493 * cmpl op1,op2
24494 * cmovcc tmp,dest
24496 * Size 15.
24499 if (! nonimmediate_operand (operands[2], mode))
24500 operands[2] = force_reg (mode, operands[2]);
24501 if (! nonimmediate_operand (operands[3], mode))
24502 operands[3] = force_reg (mode, operands[3]);
24504 if (! register_operand (operands[2], VOIDmode)
24505 && (mode == QImode
24506 || ! register_operand (operands[3], VOIDmode)))
24507 operands[2] = force_reg (mode, operands[2]);
24509 if (mode == QImode
24510 && ! register_operand (operands[3], VOIDmode))
24511 operands[3] = force_reg (mode, operands[3]);
24513 emit_insn (compare_seq);
24514 emit_insn (gen_rtx_SET (operands[0],
24515 gen_rtx_IF_THEN_ELSE (mode,
24516 compare_op, operands[2],
24517 operands[3])));
24518 return true;
24521 /* Swap, force into registers, or otherwise massage the two operands
24522 to an sse comparison with a mask result. Thus we differ a bit from
24523 ix86_prepare_fp_compare_args which expects to produce a flags result.
24525 The DEST operand exists to help determine whether to commute commutative
24526 operators. The POP0/POP1 operands are updated in place. The new
24527 comparison code is returned, or UNKNOWN if not implementable. */
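   /* Background for the swapping below: before AVX, the cmpps/cmpss
      immediate only encodes the EQ, LT, LE, UNORD, NEQ, NLT, NLE and ORD
      predicates, so e.g. a GT comparison has to be rewritten as LT with
      the operands exchanged.  */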
24529 static enum rtx_code
24530 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
24531 rtx *pop0, rtx *pop1)
24533 switch (code)
24535 case LTGT:
24536 case UNEQ:
24537 /* AVX supports all the needed comparisons. */
24538 if (TARGET_AVX)
24539 break;
24540 /* We have no LTGT as an operator. We could implement it with
24541 NE & ORDERED, but this requires an extra temporary. It's
24542 not clear that it's worth it. */
24543 return UNKNOWN;
24545 case LT:
24546 case LE:
24547 case UNGT:
24548 case UNGE:
24549 /* These are supported directly. */
24550 break;
24552 case EQ:
24553 case NE:
24554 case UNORDERED:
24555 case ORDERED:
24556 /* AVX has 3 operand comparisons, no need to swap anything. */
24557 if (TARGET_AVX)
24558 break;
24559 /* For commutative operators, try to canonicalize the destination
24560 operand to be first in the comparison - this helps reload to
24561 avoid extra moves. */
24562 if (!dest || !rtx_equal_p (dest, *pop1))
24563 break;
24564 /* FALLTHRU */
24566 case GE:
24567 case GT:
24568 case UNLE:
24569 case UNLT:
24570 /* These are not supported directly before AVX, and furthermore
24571 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
24572 comparison operands to transform into something that is
24573 supported. */
24574 std::swap (*pop0, *pop1);
24575 code = swap_condition (code);
24576 break;
24578 default:
24579 gcc_unreachable ();
24582 return code;
24585 /* Detect conditional moves that exactly match min/max operational
24586 semantics. Note that this is IEEE safe, as long as we don't
24587 interchange the operands.
24589 Returns FALSE if this conditional move doesn't match a MIN/MAX,
24590 and TRUE if the operation is successful and instructions are emitted. */
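   /* Note that the SSE min/max instructions are asymmetric: when the
      operands compare unordered (or are both zero of either sign) they
      return the second operand, so interchanging the operands would change
      the result for NaNs and signed zeros.  This is why the
      UNSPEC_IEEE_MIN/MAX path below keeps the original operand order
      unless finite math without signed zeros is assumed.  */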
24592 static bool
24593 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
24594 rtx cmp_op1, rtx if_true, rtx if_false)
24596 machine_mode mode;
24597 bool is_min;
24598 rtx tmp;
24600 if (code == LT)
24602 else if (code == UNGE)
24603 std::swap (if_true, if_false);
24604 else
24605 return false;
24607 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
24608 is_min = true;
24609 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
24610 is_min = false;
24611 else
24612 return false;
24614 mode = GET_MODE (dest);
24616 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
24617 but MODE may be a vector mode and thus not appropriate. */
24618 if (!flag_finite_math_only || flag_signed_zeros)
24620 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
24621 rtvec v;
24623 if_true = force_reg (mode, if_true);
24624 v = gen_rtvec (2, if_true, if_false);
24625 tmp = gen_rtx_UNSPEC (mode, v, u);
24627 else
24629 code = is_min ? SMIN : SMAX;
24630 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
24633 emit_insn (gen_rtx_SET (dest, tmp));
24634 return true;
24637 /* Expand an sse vector comparison. Return the register with the result. */
24639 static rtx
24640 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
24641 rtx op_true, rtx op_false)
24643 machine_mode mode = GET_MODE (dest);
24644 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
24646 /* In the general case the result of the comparison can differ from the operands' type. */
24647 machine_mode cmp_mode;
24649 /* In AVX512F the result of comparison is an integer mask. */
24650 bool maskcmp = false;
24651 rtx x;
24653 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
24655 cmp_mode = mode_for_size (GET_MODE_NUNITS (cmp_ops_mode), MODE_INT, 0);
24656 gcc_assert (cmp_mode != BLKmode);
24658 maskcmp = true;
24660 else
24661 cmp_mode = cmp_ops_mode;
24664 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
24665 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
24666 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
24668 if (optimize
24669 || (maskcmp && cmp_mode != mode)
24670 || (op_true && reg_overlap_mentioned_p (dest, op_true))
24671 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
24672 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
24674 /* Compare patterns for int modes are unspec in AVX512F only. */
24675 if (maskcmp && (code == GT || code == EQ))
24677 rtx (*gen)(rtx, rtx, rtx);
24679 switch (cmp_ops_mode)
24681 case V64QImode:
24682 gcc_assert (TARGET_AVX512BW);
24683 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
24684 break;
24685 case V32HImode:
24686 gcc_assert (TARGET_AVX512BW);
24687 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
24688 break;
24689 case V16SImode:
24690 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
24691 break;
24692 case V8DImode:
24693 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
24694 break;
24695 default:
24696 gen = NULL;
24699 if (gen)
24701 emit_insn (gen (dest, cmp_op0, cmp_op1));
24702 return dest;
24705 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
24707 if (cmp_mode != mode && !maskcmp)
24709 x = force_reg (cmp_ops_mode, x);
24710 convert_move (dest, x, false);
24712 else
24713 emit_insn (gen_rtx_SET (dest, x));
24715 return dest;
24718 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
24719 operations. This is used for both scalar and vector conditional moves. */
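/* Informally, the no-blend fallback at the end of this function computes
     dest = (cmp & op_true) | (~cmp & op_false)
   which is correct because CMP is an all-ones / all-zeros mask per element
   in the non-mask-register case.  */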
24721 void
24722 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
24724 machine_mode mode = GET_MODE (dest);
24725 machine_mode cmpmode = GET_MODE (cmp);
24727 /* In AVX512F the result of comparison is an integer mask. */
24728 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
24730 rtx t2, t3, x;
24732 /* If we have an integer mask and an FP value then we need
24733 to cast the mask to FP mode. */
24734 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
24736 cmp = force_reg (cmpmode, cmp);
24737 cmp = gen_rtx_SUBREG (mode, cmp, 0);
24740 if (vector_all_ones_operand (op_true, mode)
24741 && rtx_equal_p (op_false, CONST0_RTX (mode))
24742 && !maskcmp)
24744 emit_insn (gen_rtx_SET (dest, cmp));
24746 else if (op_false == CONST0_RTX (mode)
24747 && !maskcmp)
24749 op_true = force_reg (mode, op_true);
24750 x = gen_rtx_AND (mode, cmp, op_true);
24751 emit_insn (gen_rtx_SET (dest, x));
24753 else if (op_true == CONST0_RTX (mode)
24754 && !maskcmp)
24756 op_false = force_reg (mode, op_false);
24757 x = gen_rtx_NOT (mode, cmp);
24758 x = gen_rtx_AND (mode, x, op_false);
24759 emit_insn (gen_rtx_SET (dest, x));
24761 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
24762 && !maskcmp)
24764 op_false = force_reg (mode, op_false);
24765 x = gen_rtx_IOR (mode, cmp, op_false);
24766 emit_insn (gen_rtx_SET (dest, x));
24768 else if (TARGET_XOP
24769 && !maskcmp)
24771 op_true = force_reg (mode, op_true);
24773 if (!nonimmediate_operand (op_false, mode))
24774 op_false = force_reg (mode, op_false);
24776 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
24777 op_true,
24778 op_false)));
24780 else
24782 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24783 rtx d = dest;
24785 if (!nonimmediate_operand (op_true, mode))
24786 op_true = force_reg (mode, op_true);
24788 op_false = force_reg (mode, op_false);
24790 switch (mode)
24792 case V4SFmode:
24793 if (TARGET_SSE4_1)
24794 gen = gen_sse4_1_blendvps;
24795 break;
24796 case V2DFmode:
24797 if (TARGET_SSE4_1)
24798 gen = gen_sse4_1_blendvpd;
24799 break;
24800 case V16QImode:
24801 case V8HImode:
24802 case V4SImode:
24803 case V2DImode:
24804 if (TARGET_SSE4_1)
24806 gen = gen_sse4_1_pblendvb;
24807 if (mode != V16QImode)
24808 d = gen_reg_rtx (V16QImode);
24809 op_false = gen_lowpart (V16QImode, op_false);
24810 op_true = gen_lowpart (V16QImode, op_true);
24811 cmp = gen_lowpart (V16QImode, cmp);
24813 break;
24814 case V8SFmode:
24815 if (TARGET_AVX)
24816 gen = gen_avx_blendvps256;
24817 break;
24818 case V4DFmode:
24819 if (TARGET_AVX)
24820 gen = gen_avx_blendvpd256;
24821 break;
24822 case V32QImode:
24823 case V16HImode:
24824 case V8SImode:
24825 case V4DImode:
24826 if (TARGET_AVX2)
24828 gen = gen_avx2_pblendvb;
24829 if (mode != V32QImode)
24830 d = gen_reg_rtx (V32QImode);
24831 op_false = gen_lowpart (V32QImode, op_false);
24832 op_true = gen_lowpart (V32QImode, op_true);
24833 cmp = gen_lowpart (V32QImode, cmp);
24835 break;
24837 case V64QImode:
24838 gen = gen_avx512bw_blendmv64qi;
24839 break;
24840 case V32HImode:
24841 gen = gen_avx512bw_blendmv32hi;
24842 break;
24843 case V16SImode:
24844 gen = gen_avx512f_blendmv16si;
24845 break;
24846 case V8DImode:
24847 gen = gen_avx512f_blendmv8di;
24848 break;
24849 case V8DFmode:
24850 gen = gen_avx512f_blendmv8df;
24851 break;
24852 case V16SFmode:
24853 gen = gen_avx512f_blendmv16sf;
24854 break;
24856 default:
24857 break;
24860 if (gen != NULL)
24862 emit_insn (gen (d, op_false, op_true, cmp));
24863 if (d != dest)
24864 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
24866 else
24868 op_true = force_reg (mode, op_true);
24870 t2 = gen_reg_rtx (mode);
24871 if (optimize)
24872 t3 = gen_reg_rtx (mode);
24873 else
24874 t3 = dest;
24876 x = gen_rtx_AND (mode, op_true, cmp);
24877 emit_insn (gen_rtx_SET (t2, x));
24879 x = gen_rtx_NOT (mode, cmp);
24880 x = gen_rtx_AND (mode, x, op_false);
24881 emit_insn (gen_rtx_SET (t3, x));
24883 x = gen_rtx_IOR (mode, t3, t2);
24884 emit_insn (gen_rtx_SET (dest, x));
24889 /* Expand a floating-point conditional move. Return true if successful. */
24891 bool
24892 ix86_expand_fp_movcc (rtx operands[])
24894 machine_mode mode = GET_MODE (operands[0]);
24895 enum rtx_code code = GET_CODE (operands[1]);
24896 rtx tmp, compare_op;
24897 rtx op0 = XEXP (operands[1], 0);
24898 rtx op1 = XEXP (operands[1], 1);
24900 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
24902 machine_mode cmode;
24904 /* Since we've no cmove for sse registers, don't force bad register
24905 allocation just to gain access to it. Deny movcc when the
24906 comparison mode doesn't match the move mode. */
24907 cmode = GET_MODE (op0);
24908 if (cmode == VOIDmode)
24909 cmode = GET_MODE (op1);
24910 if (cmode != mode)
24911 return false;
24913 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
24914 if (code == UNKNOWN)
24915 return false;
24917 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
24918 operands[2], operands[3]))
24919 return true;
24921 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
24922 operands[2], operands[3]);
24923 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
24924 return true;
24927 if (GET_MODE (op0) == TImode
24928 || (GET_MODE (op0) == DImode
24929 && !TARGET_64BIT))
24930 return false;
24932 /* The floating point conditional move instructions don't directly
24933 support conditions resulting from a signed integer comparison. */
24935 compare_op = ix86_expand_compare (code, op0, op1);
24936 if (!fcmov_comparison_operator (compare_op, VOIDmode))
24938 tmp = gen_reg_rtx (QImode);
24939 ix86_expand_setcc (tmp, code, op0, op1);
24941 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
24944 emit_insn (gen_rtx_SET (operands[0],
24945 gen_rtx_IF_THEN_ELSE (mode, compare_op,
24946 operands[2], operands[3])));
24948 return true;
24951 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
24953 static int
24954 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
24956 switch (code)
24958 case EQ:
24959 return 0;
24960 case LT:
24961 case LTU:
24962 return 1;
24963 case LE:
24964 case LEU:
24965 return 2;
24966 case NE:
24967 return 4;
24968 case GE:
24969 case GEU:
24970 return 5;
24971 case GT:
24972 case GTU:
24973 return 6;
24974 default:
24975 gcc_unreachable ();
24979 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
24981 static int
24982 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
24984 switch (code)
24986 case EQ:
24987 return 0x00;
24988 case NE:
24989 return 0x04;
24990 case GT:
24991 return 0x0e;
24992 case LE:
24993 return 0x02;
24994 case GE:
24995 return 0x0d;
24996 case LT:
24997 return 0x01;
24998 case UNLE:
24999 return 0x0a;
25000 case UNLT:
25001 return 0x09;
25002 case UNGE:
25003 return 0x05;
25004 case UNGT:
25005 return 0x06;
25006 case UNEQ:
25007 return 0x18;
25008 case LTGT:
25009 return 0x0c;
25010 case ORDERED:
25011 return 0x07;
25012 case UNORDERED:
25013 return 0x03;
25014 default:
25015 gcc_unreachable ();
25019 /* Return immediate value to be used in UNSPEC_PCMP
25020 for comparison CODE in MODE. */
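/* Informational note: the values returned above become the immediate
   operand of the AVX-512 vpcmp / vcmpp{s,d} instructions via UNSPEC_PCMP.
   For the floating-point table they appear to follow the VCMP predicate
   encoding (e.g. 0x01 = LT_OS, 0x0e = GT_OS); the sse.md patterns are the
   authoritative reference for the exact semantics.  */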
25022 static int
25023 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
25025 if (FLOAT_MODE_P (mode))
25026 return ix86_fp_cmp_code_to_pcmp_immediate (code);
25027 return ix86_int_cmp_code_to_pcmp_immediate (code);
25030 /* Expand AVX-512 vector comparison. */
25032 bool
25033 ix86_expand_mask_vec_cmp (rtx operands[])
25035 machine_mode mask_mode = GET_MODE (operands[0]);
25036 machine_mode cmp_mode = GET_MODE (operands[2]);
25037 enum rtx_code code = GET_CODE (operands[1]);
25038 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
25039 int unspec_code;
25040 rtx unspec;
25042 switch (code)
25044 case LEU:
25045 case GTU:
25046 case GEU:
25047 case LTU:
25048 unspec_code = UNSPEC_UNSIGNED_PCMP;
25049 break;
25051 default:
25052 unspec_code = UNSPEC_PCMP;
25055 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
25056 operands[3], imm),
25057 unspec_code);
25058 emit_insn (gen_rtx_SET (operands[0], unspec));
25060 return true;
25063 /* Expand fp vector comparison. */
25065 bool
25066 ix86_expand_fp_vec_cmp (rtx operands[])
25068 enum rtx_code code = GET_CODE (operands[1]);
25069 rtx cmp;
25071 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
25072 &operands[2], &operands[3]);
25073 if (code == UNKNOWN)
25075 rtx temp;
25076 switch (GET_CODE (operands[1]))
25078 case LTGT:
25079 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
25080 operands[3], NULL, NULL);
25081 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
25082 operands[3], NULL, NULL);
25083 code = AND;
25084 break;
25085 case UNEQ:
25086 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
25087 operands[3], NULL, NULL);
25088 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
25089 operands[3], NULL, NULL);
25090 code = IOR;
25091 break;
25092 default:
25093 gcc_unreachable ();
25095 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
25096 OPTAB_DIRECT);
25098 else
25099 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
25100 operands[1], operands[2]);
25102 if (operands[0] != cmp)
25103 emit_move_insn (operands[0], cmp);
25105 return true;
25108 static rtx
25109 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
25110 rtx op_true, rtx op_false, bool *negate)
25112 machine_mode data_mode = GET_MODE (dest);
25113 machine_mode mode = GET_MODE (cop0);
25114 rtx x;
25116 *negate = false;
25118 /* XOP supports all of the comparisons on all 128-bit vector int types. */
25119 if (TARGET_XOP
25120 && (mode == V16QImode || mode == V8HImode
25121 || mode == V4SImode || mode == V2DImode))
25123 else
25125 /* Canonicalize the comparison to EQ, GT, GTU. */
25126 switch (code)
25128 case EQ:
25129 case GT:
25130 case GTU:
25131 break;
25133 case NE:
25134 case LE:
25135 case LEU:
25136 code = reverse_condition (code);
25137 *negate = true;
25138 break;
25140 case GE:
25141 case GEU:
25142 code = reverse_condition (code);
25143 *negate = true;
25144 /* FALLTHRU */
25146 case LT:
25147 case LTU:
25148 std::swap (cop0, cop1);
25149 code = swap_condition (code);
25150 break;
25152 default:
25153 gcc_unreachable ();
25156 /* Only SSE4.1/SSE4.2 supports V2DImode. */
25157 if (mode == V2DImode)
25159 switch (code)
25161 case EQ:
25162 /* SSE4.1 supports EQ. */
25163 if (!TARGET_SSE4_1)
25164 return NULL;
25165 break;
25167 case GT:
25168 case GTU:
25169 /* SSE4.2 supports GT/GTU. */
25170 if (!TARGET_SSE4_2)
25171 return NULL;
25172 break;
25174 default:
25175 gcc_unreachable ();
25179 /* Unsigned parallel compare is not supported by the hardware.
25180 Play some tricks to turn this into a signed comparison
25181 against 0. */
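	 /* The trick rests on the identity
	      x >u y  <==>  (x - 0x80...0) >s (y - 0x80...0)
	    since subtracting the sign bit (mod 2^n) turns an unsigned
	    comparison into the equivalent signed one.  For the narrower
	    element modes below, x >u y is instead rewritten as
	    (x -us y) != 0 using unsigned saturating subtraction.  */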
25182 if (code == GTU)
25184 cop0 = force_reg (mode, cop0);
25186 switch (mode)
25188 case V16SImode:
25189 case V8DImode:
25190 case V8SImode:
25191 case V4DImode:
25192 case V4SImode:
25193 case V2DImode:
25195 rtx t1, t2, mask;
25196 rtx (*gen_sub3) (rtx, rtx, rtx);
25198 switch (mode)
25200 case V16SImode: gen_sub3 = gen_subv16si3; break;
25201 case V8DImode: gen_sub3 = gen_subv8di3; break;
25202 case V8SImode: gen_sub3 = gen_subv8si3; break;
25203 case V4DImode: gen_sub3 = gen_subv4di3; break;
25204 case V4SImode: gen_sub3 = gen_subv4si3; break;
25205 case V2DImode: gen_sub3 = gen_subv2di3; break;
25206 default:
25207 gcc_unreachable ();
25209 /* Subtract (-(INT MAX) - 1) from both operands to make
25210 them signed. */
25211 mask = ix86_build_signbit_mask (mode, true, false);
25212 t1 = gen_reg_rtx (mode);
25213 emit_insn (gen_sub3 (t1, cop0, mask));
25215 t2 = gen_reg_rtx (mode);
25216 emit_insn (gen_sub3 (t2, cop1, mask));
25218 cop0 = t1;
25219 cop1 = t2;
25220 code = GT;
25222 break;
25224 case V64QImode:
25225 case V32HImode:
25226 case V32QImode:
25227 case V16HImode:
25228 case V16QImode:
25229 case V8HImode:
25230 /* Perform a parallel unsigned saturating subtraction. */
25231 x = gen_reg_rtx (mode);
25232 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
25233 cop1)));
25235 cop0 = x;
25236 cop1 = CONST0_RTX (mode);
25237 code = EQ;
25238 *negate = !*negate;
25239 break;
25241 default:
25242 gcc_unreachable ();
25247 if (*negate)
25248 std::swap (op_true, op_false);
25250 /* Allow the comparison to be done in one mode, but the movcc to
25251 happen in another mode. */
25252 if (data_mode == mode)
25254 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
25255 op_true, op_false);
25257 else
25259 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
25260 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
25261 op_true, op_false);
25262 if (GET_MODE (x) == mode)
25263 x = gen_lowpart (data_mode, x);
25266 return x;
25269 /* Expand integer vector comparison. */
25271 bool
25272 ix86_expand_int_vec_cmp (rtx operands[])
25274 rtx_code code = GET_CODE (operands[1]);
25275 bool negate = false;
25276 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
25277 operands[3], NULL, NULL, &negate);
25279 if (!cmp)
25280 return false;
25282 if (negate)
25283 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
25284 CONST0_RTX (GET_MODE (cmp)),
25285 NULL, NULL, &negate);
25287 gcc_assert (!negate);
25289 if (operands[0] != cmp)
25290 emit_move_insn (operands[0], cmp);
25292 return true;
25295 /* Expand a floating-point vector conditional move; a vcond operation
25296 rather than a movcc operation. */
25298 bool
25299 ix86_expand_fp_vcond (rtx operands[])
25301 enum rtx_code code = GET_CODE (operands[3]);
25302 rtx cmp;
25304 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
25305 &operands[4], &operands[5]);
25306 if (code == UNKNOWN)
25308 rtx temp;
25309 switch (GET_CODE (operands[3]))
25311 case LTGT:
25312 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
25313 operands[5], operands[0], operands[0]);
25314 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
25315 operands[5], operands[1], operands[2]);
25316 code = AND;
25317 break;
25318 case UNEQ:
25319 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
25320 operands[5], operands[0], operands[0]);
25321 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
25322 operands[5], operands[1], operands[2]);
25323 code = IOR;
25324 break;
25325 default:
25326 gcc_unreachable ();
25328 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
25329 OPTAB_DIRECT);
25330 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
25331 return true;
25334 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
25335 operands[5], operands[1], operands[2]))
25336 return true;
25338 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
25339 operands[1], operands[2]);
25340 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
25341 return true;
25344 /* Expand a signed/unsigned integral vector conditional move. */
25346 bool
25347 ix86_expand_int_vcond (rtx operands[])
25349 machine_mode data_mode = GET_MODE (operands[0]);
25350 machine_mode mode = GET_MODE (operands[4]);
25351 enum rtx_code code = GET_CODE (operands[3]);
25352 bool negate = false;
25353 rtx x, cop0, cop1;
25355 cop0 = operands[4];
25356 cop1 = operands[5];
25358 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
25359 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
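  /* The GE form is just the LT form with the arms exchanged:
     "x >= 0 ? 0 : v" selects the same values as "x < 0 ? v : 0",
     which is what the operands[1 + (code == LT)] and
     operands[2 - (code == LT)] indexing below accounts for.  */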
25360 if ((code == LT || code == GE)
25361 && data_mode == mode
25362 && cop1 == CONST0_RTX (mode)
25363 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
25364 && GET_MODE_UNIT_SIZE (data_mode) > 1
25365 && GET_MODE_UNIT_SIZE (data_mode) <= 8
25366 && (GET_MODE_SIZE (data_mode) == 16
25367 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
25369 rtx negop = operands[2 - (code == LT)];
25370 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
25371 if (negop == CONST1_RTX (data_mode))
25373 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
25374 operands[0], 1, OPTAB_DIRECT);
25375 if (res != operands[0])
25376 emit_move_insn (operands[0], res);
25377 return true;
25379 else if (GET_MODE_INNER (data_mode) != DImode
25380 && vector_all_ones_operand (negop, data_mode))
25382 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
25383 operands[0], 0, OPTAB_DIRECT);
25384 if (res != operands[0])
25385 emit_move_insn (operands[0], res);
25386 return true;
25390 if (!nonimmediate_operand (cop1, mode))
25391 cop1 = force_reg (mode, cop1);
25392 if (!general_operand (operands[1], data_mode))
25393 operands[1] = force_reg (data_mode, operands[1]);
25394 if (!general_operand (operands[2], data_mode))
25395 operands[2] = force_reg (data_mode, operands[2]);
25397 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
25398 operands[1], operands[2], &negate);
25400 if (!x)
25401 return false;
25403 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
25404 operands[2-negate]);
25405 return true;
25408 /* AVX512F does support 64-byte integer vector operations,
25409 thus the longest vector we are faced with is V64QImode. */
25410 #define MAX_VECT_LEN 64
25412 struct expand_vec_perm_d
25414 rtx target, op0, op1;
25415 unsigned char perm[MAX_VECT_LEN];
25416 machine_mode vmode;
25417 unsigned char nelt;
25418 bool one_operand_p;
25419 bool testing_p;
25422 static bool
25423 ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
25424 struct expand_vec_perm_d *d)
25426 /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
25427 expander, so args are either in d, or in op0, op1 etc. */
25428 machine_mode mode = GET_MODE (d ? d->op0 : op0);
25429 machine_mode maskmode = mode;
25430 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
25432 switch (mode)
25434 case V8HImode:
25435 if (TARGET_AVX512VL && TARGET_AVX512BW)
25436 gen = gen_avx512vl_vpermi2varv8hi3;
25437 break;
25438 case V16HImode:
25439 if (TARGET_AVX512VL && TARGET_AVX512BW)
25440 gen = gen_avx512vl_vpermi2varv16hi3;
25441 break;
25442 case V64QImode:
25443 if (TARGET_AVX512VBMI)
25444 gen = gen_avx512bw_vpermi2varv64qi3;
25445 break;
25446 case V32HImode:
25447 if (TARGET_AVX512BW)
25448 gen = gen_avx512bw_vpermi2varv32hi3;
25449 break;
25450 case V4SImode:
25451 if (TARGET_AVX512VL)
25452 gen = gen_avx512vl_vpermi2varv4si3;
25453 break;
25454 case V8SImode:
25455 if (TARGET_AVX512VL)
25456 gen = gen_avx512vl_vpermi2varv8si3;
25457 break;
25458 case V16SImode:
25459 if (TARGET_AVX512F)
25460 gen = gen_avx512f_vpermi2varv16si3;
25461 break;
25462 case V4SFmode:
25463 if (TARGET_AVX512VL)
25465 gen = gen_avx512vl_vpermi2varv4sf3;
25466 maskmode = V4SImode;
25468 break;
25469 case V8SFmode:
25470 if (TARGET_AVX512VL)
25472 gen = gen_avx512vl_vpermi2varv8sf3;
25473 maskmode = V8SImode;
25475 break;
25476 case V16SFmode:
25477 if (TARGET_AVX512F)
25479 gen = gen_avx512f_vpermi2varv16sf3;
25480 maskmode = V16SImode;
25482 break;
25483 case V2DImode:
25484 if (TARGET_AVX512VL)
25485 gen = gen_avx512vl_vpermi2varv2di3;
25486 break;
25487 case V4DImode:
25488 if (TARGET_AVX512VL)
25489 gen = gen_avx512vl_vpermi2varv4di3;
25490 break;
25491 case V8DImode:
25492 if (TARGET_AVX512F)
25493 gen = gen_avx512f_vpermi2varv8di3;
25494 break;
25495 case V2DFmode:
25496 if (TARGET_AVX512VL)
25498 gen = gen_avx512vl_vpermi2varv2df3;
25499 maskmode = V2DImode;
25501 break;
25502 case V4DFmode:
25503 if (TARGET_AVX512VL)
25505 gen = gen_avx512vl_vpermi2varv4df3;
25506 maskmode = V4DImode;
25508 break;
25509 case V8DFmode:
25510 if (TARGET_AVX512F)
25512 gen = gen_avx512f_vpermi2varv8df3;
25513 maskmode = V8DImode;
25515 break;
25516 default:
25517 break;
25520 if (gen == NULL)
25521 return false;
25523 /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
25524 expander, so args are either in d, or in op0, op1 etc. */
25525 if (d)
25527 rtx vec[64];
25528 target = d->target;
25529 op0 = d->op0;
25530 op1 = d->op1;
25531 for (int i = 0; i < d->nelt; ++i)
25532 vec[i] = GEN_INT (d->perm[i]);
25533 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
25536 emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
25537 return true;
25540 /* Expand a variable vector permutation. */
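/* Overview of the generic path below: when no single variable-permute
   instruction applies, everything is reduced to V16QImode pshufb.  The
   control vector is masked with w-1 (2*w-1 for XOP's three-input vpperm),
   word indices are scaled to byte indices, each input is shuffled
   separately, and the two partial results are merged by comparing the
   original control elements against w.  */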
25542 void
25543 ix86_expand_vec_perm (rtx operands[])
25545 rtx target = operands[0];
25546 rtx op0 = operands[1];
25547 rtx op1 = operands[2];
25548 rtx mask = operands[3];
25549 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
25550 machine_mode mode = GET_MODE (op0);
25551 machine_mode maskmode = GET_MODE (mask);
25552 int w, e, i;
25553 bool one_operand_shuffle = rtx_equal_p (op0, op1);
25555 /* Number of elements in the vector. */
25556 w = GET_MODE_NUNITS (mode);
25557 e = GET_MODE_UNIT_SIZE (mode);
25558 gcc_assert (w <= 64);
25560 if (TARGET_AVX512F && one_operand_shuffle)
25562 rtx (*gen) (rtx, rtx, rtx) = NULL;
25563 switch (mode)
25565 case V16SImode:
25566 gen = gen_avx512f_permvarv16si;
25567 break;
25568 case V16SFmode:
25569 gen = gen_avx512f_permvarv16sf;
25570 break;
25571 case V8DImode:
25572 gen = gen_avx512f_permvarv8di;
25573 break;
25574 case V8DFmode:
25575 gen = gen_avx512f_permvarv8df;
25576 break;
25577 default:
25578 break;
25580 if (gen != NULL)
25582 emit_insn (gen (target, op0, mask));
25583 return;
25587 if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1, NULL))
25588 return;
25590 if (TARGET_AVX2)
25592 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
25594 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
25595 a constant shuffle operand. With a tiny bit of effort we can
25596 use VPERMD instead. A re-interpretation stall for V4DFmode is
25597 unfortunate but there's no avoiding it.
25598 Similarly, for V16HImode we don't have instructions for variable
25599 shuffling, while for V32QImode we can, after preparing suitable
25600 masks, use vpshufb; vpshufb; vpermq; vpor. */
25602 if (mode == V16HImode)
25604 maskmode = mode = V32QImode;
25605 w = 32;
25606 e = 1;
25608 else
25610 maskmode = mode = V8SImode;
25611 w = 8;
25612 e = 4;
25614 t1 = gen_reg_rtx (maskmode);
25616 /* Replicate the low bits of the V4DImode mask into V8SImode:
25617 mask = { A B C D }
25618 t1 = { A A B B C C D D }. */
25619 for (i = 0; i < w / 2; ++i)
25620 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
25621 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25622 vt = force_reg (maskmode, vt);
25623 mask = gen_lowpart (maskmode, mask);
25624 if (maskmode == V8SImode)
25625 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
25626 else
25627 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
25629 /* Multiply the shuffle indices by two. */
25630 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
25631 OPTAB_DIRECT);
25633 /* Add one to the odd shuffle indices:
25634 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
25635 for (i = 0; i < w / 2; ++i)
25637 vec[i * 2] = const0_rtx;
25638 vec[i * 2 + 1] = const1_rtx;
25640 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25641 vt = validize_mem (force_const_mem (maskmode, vt));
25642 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
25643 OPTAB_DIRECT);
25645 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
25646 operands[3] = mask = t1;
25647 target = gen_reg_rtx (mode);
25648 op0 = gen_lowpart (mode, op0);
25649 op1 = gen_lowpart (mode, op1);
25652 switch (mode)
25654 case V8SImode:
25655 /* The VPERMD and VPERMPS instructions already properly ignore
25656 the high bits of the shuffle elements. No need for us to
25657 perform an AND ourselves. */
25658 if (one_operand_shuffle)
25660 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
25661 if (target != operands[0])
25662 emit_move_insn (operands[0],
25663 gen_lowpart (GET_MODE (operands[0]), target));
25665 else
25667 t1 = gen_reg_rtx (V8SImode);
25668 t2 = gen_reg_rtx (V8SImode);
25669 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
25670 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
25671 goto merge_two;
25673 return;
25675 case V8SFmode:
25676 mask = gen_lowpart (V8SImode, mask);
25677 if (one_operand_shuffle)
25678 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
25679 else
25681 t1 = gen_reg_rtx (V8SFmode);
25682 t2 = gen_reg_rtx (V8SFmode);
25683 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
25684 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
25685 goto merge_two;
25687 return;
25689 case V4SImode:
25690 /* By combining the two 128-bit input vectors into one 256-bit
25691 input vector, we can use VPERMD and VPERMPS for the full
25692 two-operand shuffle. */
25693 t1 = gen_reg_rtx (V8SImode);
25694 t2 = gen_reg_rtx (V8SImode);
25695 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
25696 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
25697 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
25698 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
25699 return;
25701 case V4SFmode:
25702 t1 = gen_reg_rtx (V8SFmode);
25703 t2 = gen_reg_rtx (V8SImode);
25704 mask = gen_lowpart (V4SImode, mask);
25705 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
25706 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
25707 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
25708 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
25709 return;
25711 case V32QImode:
25712 t1 = gen_reg_rtx (V32QImode);
25713 t2 = gen_reg_rtx (V32QImode);
25714 t3 = gen_reg_rtx (V32QImode);
25715 vt2 = GEN_INT (-128);
25716 for (i = 0; i < 32; i++)
25717 vec[i] = vt2;
25718 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
25719 vt = force_reg (V32QImode, vt);
25720 for (i = 0; i < 32; i++)
25721 vec[i] = i < 16 ? vt2 : const0_rtx;
25722 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
25723 vt2 = force_reg (V32QImode, vt2);
25724 /* From mask create two adjusted masks, which contain the same
25725 bits as mask in the low 7 bits of each vector element.
25726 The first mask will have the most significant bit clear
25727 if it requests element from the same 128-bit lane
25728 and MSB set if it requests element from the other 128-bit lane.
25729 The second mask will have the opposite values of the MSB,
25730 and additionally will have its 128-bit lanes swapped.
25731 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
25732 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
25733 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
25734 stands for other 12 bytes. */
25735 /* The bit whether element is from the same lane or the other
25736 lane is bit 4, so shift it up by 3 to the MSB position. */
25737 t5 = gen_reg_rtx (V4DImode);
25738 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
25739 GEN_INT (3)));
25740 /* Clear MSB bits from the mask just in case it had them set. */
25741 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
25742 /* After this t1 will have MSB set for elements from other lane. */
25743 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
25744 /* Clear bits other than MSB. */
25745 emit_insn (gen_andv32qi3 (t1, t1, vt));
25746 /* Or in the lower bits from mask into t3. */
25747 emit_insn (gen_iorv32qi3 (t3, t1, t2));
25748 /* And invert MSB bits in t1, so MSB is set for elements from the same
25749 lane. */
25750 emit_insn (gen_xorv32qi3 (t1, t1, vt));
25751 /* Swap 128-bit lanes in t3. */
25752 t6 = gen_reg_rtx (V4DImode);
25753 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
25754 const2_rtx, GEN_INT (3),
25755 const0_rtx, const1_rtx));
25756 /* And or in the lower bits from mask into t1. */
25757 emit_insn (gen_iorv32qi3 (t1, t1, t2));
25758 if (one_operand_shuffle)
25760 /* Each of these shuffles will put 0s in places where an
25761 element from the other 128-bit lane is needed; otherwise it
25762 will shuffle in the requested value.
25763 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
25764 gen_lowpart (V32QImode, t6)));
25765 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
25766 /* For t3 the 128-bit lanes are swapped again. */
25767 t7 = gen_reg_rtx (V4DImode);
25768 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
25769 const2_rtx, GEN_INT (3),
25770 const0_rtx, const1_rtx));
25771 /* And oring both together leads to the result. */
25772 emit_insn (gen_iorv32qi3 (target, t1,
25773 gen_lowpart (V32QImode, t7)));
25774 if (target != operands[0])
25775 emit_move_insn (operands[0],
25776 gen_lowpart (GET_MODE (operands[0]), target));
25777 return;
25780 t4 = gen_reg_rtx (V32QImode);
25781 /* Similar to the above one_operand_shuffle code,
25782 just repeated twice for each operand. The merge_two:
25783 code below will merge the two results together. */
25784 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
25785 gen_lowpart (V32QImode, t6)));
25786 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
25787 gen_lowpart (V32QImode, t6)));
25788 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
25789 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
25790 t7 = gen_reg_rtx (V4DImode);
25791 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
25792 const2_rtx, GEN_INT (3),
25793 const0_rtx, const1_rtx));
25794 t8 = gen_reg_rtx (V4DImode);
25795 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
25796 const2_rtx, GEN_INT (3),
25797 const0_rtx, const1_rtx));
25798 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
25799 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
25800 t1 = t4;
25801 t2 = t3;
25802 goto merge_two;
25804 default:
25805 gcc_assert (GET_MODE_SIZE (mode) <= 16);
25806 break;
25810 if (TARGET_XOP)
25812 /* The XOP VPPERM insn supports three inputs. By ignoring the
25813 one_operand_shuffle special case, we avoid creating another
25814 set of constant vectors in memory. */
25815 one_operand_shuffle = false;
25817 /* mask = mask & {2*w-1, ...} */
25818 vt = GEN_INT (2*w - 1);
25820 else
25822 /* mask = mask & {w-1, ...} */
25823 vt = GEN_INT (w - 1);
25826 for (i = 0; i < w; i++)
25827 vec[i] = vt;
25828 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25829 mask = expand_simple_binop (maskmode, AND, mask, vt,
25830 NULL_RTX, 0, OPTAB_DIRECT);
25832 /* For non-QImode operations, convert the word permutation control
25833 into a byte permutation control. */
25834 if (mode != V16QImode)
25836 mask = expand_simple_binop (maskmode, ASHIFT, mask,
25837 GEN_INT (exact_log2 (e)),
25838 NULL_RTX, 0, OPTAB_DIRECT);
25840 /* Convert mask to vector of chars. */
25841 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
25843 /* Replicate each of the input bytes into byte positions:
25844 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
25845 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
25846 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
25847 for (i = 0; i < 16; ++i)
25848 vec[i] = GEN_INT (i/e * e);
25849 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25850 vt = validize_mem (force_const_mem (V16QImode, vt));
25851 if (TARGET_XOP)
25852 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
25853 else
25854 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
25856 /* Convert it into the byte positions by doing
25857 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
25858 for (i = 0; i < 16; ++i)
25859 vec[i] = GEN_INT (i % e);
25860 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25861 vt = validize_mem (force_const_mem (V16QImode, vt));
25862 emit_insn (gen_addv16qi3 (mask, mask, vt));
25865 /* The actual shuffle operations all operate on V16QImode. */
25866 op0 = gen_lowpart (V16QImode, op0);
25867 op1 = gen_lowpart (V16QImode, op1);
25869 if (TARGET_XOP)
25871 if (GET_MODE (target) != V16QImode)
25872 target = gen_reg_rtx (V16QImode);
25873 emit_insn (gen_xop_pperm (target, op0, op1, mask));
25874 if (target != operands[0])
25875 emit_move_insn (operands[0],
25876 gen_lowpart (GET_MODE (operands[0]), target));
25878 else if (one_operand_shuffle)
25880 if (GET_MODE (target) != V16QImode)
25881 target = gen_reg_rtx (V16QImode);
25882 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
25883 if (target != operands[0])
25884 emit_move_insn (operands[0],
25885 gen_lowpart (GET_MODE (operands[0]), target));
25887 else
25889 rtx xops[6];
25890 bool ok;
25892 /* Shuffle the two input vectors independently. */
25893 t1 = gen_reg_rtx (V16QImode);
25894 t2 = gen_reg_rtx (V16QImode);
25895 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
25896 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
25898 merge_two:
25899 /* Then merge them together. The key is whether any given control
25900 element contained a bit set that indicates the second word. */
25901 mask = operands[3];
25902 vt = GEN_INT (w);
25903 if (maskmode == V2DImode && !TARGET_SSE4_1)
25905 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
25906 more shuffle to convert the V2DI input mask into a V4SI
25907 input mask, at which point the masking performed by
25908 ix86_expand_int_vcond will work as desired. */
25909 rtx t3 = gen_reg_rtx (V4SImode);
25910 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
25911 const0_rtx, const0_rtx,
25912 const2_rtx, const2_rtx));
25913 mask = t3;
25914 maskmode = V4SImode;
25915 e = w = 4;
25918 for (i = 0; i < w; i++)
25919 vec[i] = vt;
25920 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25921 vt = force_reg (maskmode, vt);
25922 mask = expand_simple_binop (maskmode, AND, mask, vt,
25923 NULL_RTX, 0, OPTAB_DIRECT);
25925 if (GET_MODE (target) != mode)
25926 target = gen_reg_rtx (mode);
25927 xops[0] = target;
25928 xops[1] = gen_lowpart (mode, t2);
25929 xops[2] = gen_lowpart (mode, t1);
25930 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
25931 xops[4] = mask;
25932 xops[5] = vt;
25933 ok = ix86_expand_int_vcond (xops);
25934 gcc_assert (ok);
25935 if (target != operands[0])
25936 emit_move_insn (operands[0],
25937 gen_lowpart (GET_MODE (operands[0]), target));
25941 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
25942 true if we should do zero extension, else sign extension. HIGH_P is
25943 true if we want the N/2 high elements, else the low elements. */
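/* For example, unpacking the low half of a V8HImode vector
   {0,1,2,3,4,5,6,7} yields the V4SImode vector {0,1,2,3}, zero- or
   sign-extended as requested.  Without SSE4.1 the same effect is obtained
   by interleaving SRC with either a zero vector (zero extension) or with a
   per-element sign mask produced by a GT comparison against zero (sign
   extension).  */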
25945 void
25946 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
25948 machine_mode imode = GET_MODE (src);
25949 rtx tmp;
25951 if (TARGET_SSE4_1)
25953 rtx (*unpack)(rtx, rtx);
25954 rtx (*extract)(rtx, rtx) = NULL;
25955 machine_mode halfmode = BLKmode;
25957 switch (imode)
25959 case V64QImode:
25960 if (unsigned_p)
25961 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
25962 else
25963 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
25964 halfmode = V32QImode;
25965 extract
25966 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
25967 break;
25968 case V32QImode:
25969 if (unsigned_p)
25970 unpack = gen_avx2_zero_extendv16qiv16hi2;
25971 else
25972 unpack = gen_avx2_sign_extendv16qiv16hi2;
25973 halfmode = V16QImode;
25974 extract
25975 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
25976 break;
25977 case V32HImode:
25978 if (unsigned_p)
25979 unpack = gen_avx512f_zero_extendv16hiv16si2;
25980 else
25981 unpack = gen_avx512f_sign_extendv16hiv16si2;
25982 halfmode = V16HImode;
25983 extract
25984 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
25985 break;
25986 case V16HImode:
25987 if (unsigned_p)
25988 unpack = gen_avx2_zero_extendv8hiv8si2;
25989 else
25990 unpack = gen_avx2_sign_extendv8hiv8si2;
25991 halfmode = V8HImode;
25992 extract
25993 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
25994 break;
25995 case V16SImode:
25996 if (unsigned_p)
25997 unpack = gen_avx512f_zero_extendv8siv8di2;
25998 else
25999 unpack = gen_avx512f_sign_extendv8siv8di2;
26000 halfmode = V8SImode;
26001 extract
26002 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
26003 break;
26004 case V8SImode:
26005 if (unsigned_p)
26006 unpack = gen_avx2_zero_extendv4siv4di2;
26007 else
26008 unpack = gen_avx2_sign_extendv4siv4di2;
26009 halfmode = V4SImode;
26010 extract
26011 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
26012 break;
26013 case V16QImode:
26014 if (unsigned_p)
26015 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
26016 else
26017 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
26018 break;
26019 case V8HImode:
26020 if (unsigned_p)
26021 unpack = gen_sse4_1_zero_extendv4hiv4si2;
26022 else
26023 unpack = gen_sse4_1_sign_extendv4hiv4si2;
26024 break;
26025 case V4SImode:
26026 if (unsigned_p)
26027 unpack = gen_sse4_1_zero_extendv2siv2di2;
26028 else
26029 unpack = gen_sse4_1_sign_extendv2siv2di2;
26030 break;
26031 default:
26032 gcc_unreachable ();
26035 if (GET_MODE_SIZE (imode) >= 32)
26037 tmp = gen_reg_rtx (halfmode);
26038 emit_insn (extract (tmp, src));
26040 else if (high_p)
26042 /* Shift higher 8 bytes to lower 8 bytes. */
26043 tmp = gen_reg_rtx (V1TImode);
26044 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
26045 GEN_INT (64)));
26046 tmp = gen_lowpart (imode, tmp);
26048 else
26049 tmp = src;
26051 emit_insn (unpack (dest, tmp));
26053 else
26055 rtx (*unpack)(rtx, rtx, rtx);
26057 switch (imode)
26059 case V16QImode:
26060 if (high_p)
26061 unpack = gen_vec_interleave_highv16qi;
26062 else
26063 unpack = gen_vec_interleave_lowv16qi;
26064 break;
26065 case V8HImode:
26066 if (high_p)
26067 unpack = gen_vec_interleave_highv8hi;
26068 else
26069 unpack = gen_vec_interleave_lowv8hi;
26070 break;
26071 case V4SImode:
26072 if (high_p)
26073 unpack = gen_vec_interleave_highv4si;
26074 else
26075 unpack = gen_vec_interleave_lowv4si;
26076 break;
26077 default:
26078 gcc_unreachable ();
26081 if (unsigned_p)
26082 tmp = force_reg (imode, CONST0_RTX (imode));
26083 else
26084 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
26085 src, pc_rtx, pc_rtx);
26087 rtx tmp2 = gen_reg_rtx (imode);
26088 emit_insn (unpack (tmp2, src, tmp));
26089 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
26093 /* Expand conditional increment or decrement using adc/sbb instructions.
26094 The default case using setcc followed by the conditional move can be
26095 done by generic code. */
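/* Illustrative sketch (register assignments are hypothetical): for
   unsigned operands, "x += (a < b)" can be emitted as
       cmpl %esi, %edi   # CF = (a < b), with a in %edi and b in %esi
       adcl $0, %eax     # x += CF
   which is what the adc/sbb construction below produces instead of a
   setcc/movzbl/add sequence.  */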
26096 bool
26097 ix86_expand_int_addcc (rtx operands[])
26099 enum rtx_code code = GET_CODE (operands[1]);
26100 rtx flags;
26101 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
26102 rtx compare_op;
26103 rtx val = const0_rtx;
26104 bool fpcmp = false;
26105 machine_mode mode;
26106 rtx op0 = XEXP (operands[1], 0);
26107 rtx op1 = XEXP (operands[1], 1);
26109 if (operands[3] != const1_rtx
26110 && operands[3] != constm1_rtx)
26111 return false;
26112 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
26113 return false;
26114 code = GET_CODE (compare_op);
26116 flags = XEXP (compare_op, 0);
26118 if (GET_MODE (flags) == CCFPmode
26119 || GET_MODE (flags) == CCFPUmode)
26121 fpcmp = true;
26122 code = ix86_fp_compare_code_to_integer (code);
26125 if (code != LTU)
26127 val = constm1_rtx;
26128 if (fpcmp)
26129 PUT_CODE (compare_op,
26130 reverse_condition_maybe_unordered
26131 (GET_CODE (compare_op)));
26132 else
26133 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
26136 mode = GET_MODE (operands[0]);
26138 /* Construct either adc or sbb insn. */
26139 if ((code == LTU) == (operands[3] == constm1_rtx))
26141 switch (mode)
26143 case QImode:
26144 insn = gen_subqi3_carry;
26145 break;
26146 case HImode:
26147 insn = gen_subhi3_carry;
26148 break;
26149 case SImode:
26150 insn = gen_subsi3_carry;
26151 break;
26152 case DImode:
26153 insn = gen_subdi3_carry;
26154 break;
26155 default:
26156 gcc_unreachable ();
26159 else
26161 switch (mode)
26163 case QImode:
26164 insn = gen_addqi3_carry;
26165 break;
26166 case HImode:
26167 insn = gen_addhi3_carry;
26168 break;
26169 case SImode:
26170 insn = gen_addsi3_carry;
26171 break;
26172 case DImode:
26173 insn = gen_adddi3_carry;
26174 break;
26175 default:
26176 gcc_unreachable ();
26179 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
26181 return true;
26185 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
26186 but works for floating point parameters and non-offsettable memories.
26187 For pushes, it returns just stack offsets; the values will be saved
26188 in the right order. Maximally four parts are generated. */
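/* For example, on a 32-bit target a DFmode operand is returned as two
   SImode parts, an XFmode operand as three and a TFmode operand as four;
   on a 64-bit target XFmode and TFmode are returned as two parts, the
   second of mode SImode or DImode respectively.  */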
26190 static int
26191 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
26193 int size;
26195 if (!TARGET_64BIT)
26196 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
26197 else
26198 size = (GET_MODE_SIZE (mode) + 4) / 8;
26200 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
26201 gcc_assert (size >= 2 && size <= 4);
26203 /* Optimize constant pool reference to immediates. This is used by fp
26204 moves, that force all constants to memory to allow combining. */
26205 if (MEM_P (operand) && MEM_READONLY_P (operand))
26207 rtx tmp = maybe_get_pool_constant (operand);
26208 if (tmp)
26209 operand = tmp;
26212 if (MEM_P (operand) && !offsettable_memref_p (operand))
26214 /* The only non-offsettable memories we handle are pushes. */
26215 int ok = push_operand (operand, VOIDmode);
26217 gcc_assert (ok);
26219 operand = copy_rtx (operand);
26220 PUT_MODE (operand, word_mode);
26221 parts[0] = parts[1] = parts[2] = parts[3] = operand;
26222 return size;
26225 if (GET_CODE (operand) == CONST_VECTOR)
26227 machine_mode imode = int_mode_for_mode (mode);
26228 /* Caution: if we looked through a constant pool memory above,
26229 the operand may actually have a different mode now. That's
26230 ok, since we want to pun this all the way back to an integer. */
26231 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
26232 gcc_assert (operand != NULL);
26233 mode = imode;
26236 if (!TARGET_64BIT)
26238 if (mode == DImode)
26239 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
26240 else
26242 int i;
26244 if (REG_P (operand))
26246 gcc_assert (reload_completed);
26247 for (i = 0; i < size; i++)
26248 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
26250 else if (offsettable_memref_p (operand))
26252 operand = adjust_address (operand, SImode, 0);
26253 parts[0] = operand;
26254 for (i = 1; i < size; i++)
26255 parts[i] = adjust_address (operand, SImode, 4 * i);
26257 else if (CONST_DOUBLE_P (operand))
26259 const REAL_VALUE_TYPE *r;
26260 long l[4];
26262 r = CONST_DOUBLE_REAL_VALUE (operand);
26263 switch (mode)
26265 case TFmode:
26266 real_to_target (l, r, mode);
26267 parts[3] = gen_int_mode (l[3], SImode);
26268 parts[2] = gen_int_mode (l[2], SImode);
26269 break;
26270 case XFmode:
26271 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
26272 long double may not be 80-bit. */
26273 real_to_target (l, r, mode);
26274 parts[2] = gen_int_mode (l[2], SImode);
26275 break;
26276 case DFmode:
26277 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
26278 break;
26279 default:
26280 gcc_unreachable ();
26282 parts[1] = gen_int_mode (l[1], SImode);
26283 parts[0] = gen_int_mode (l[0], SImode);
26285 else
26286 gcc_unreachable ();
26289 else
26291 if (mode == TImode)
26292 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
26293 if (mode == XFmode || mode == TFmode)
26295 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
26296 if (REG_P (operand))
26298 gcc_assert (reload_completed);
26299 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
26300 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
26302 else if (offsettable_memref_p (operand))
26304 operand = adjust_address (operand, DImode, 0);
26305 parts[0] = operand;
26306 parts[1] = adjust_address (operand, upper_mode, 8);
26308 else if (CONST_DOUBLE_P (operand))
26310 long l[4];
26312 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
26314 /* real_to_target puts 32-bit pieces in each long. */
26315 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
26316 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
26317 << 32), DImode);
26319 if (upper_mode == SImode)
26320 parts[1] = gen_int_mode (l[2], SImode);
26321 else
26322 parts[1]
26323 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
26324 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
26325 << 32), DImode);
26327 else
26328 gcc_unreachable ();
26332 return size;
26335 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
26336 All required insns are emitted here rather than left for the caller.
26337 Operands 2 onward receive the destination parts and operands 6 onward
26338 the corresponding source parts, in the correct order. */
26340 void
26341 ix86_split_long_move (rtx operands[])
26343 rtx part[2][4];
26344 int nparts, i, j;
26345 int push = 0;
26346 int collisions = 0;
26347 machine_mode mode = GET_MODE (operands[0]);
26348 bool collisionparts[4];
26350 /* The DFmode expanders may ask us to move a double.
26351 For a 64-bit target this is a single move. By hiding the fact
26352 here we simplify the i386.md splitters. */
26353 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
26355 /* Optimize constant pool reference to immediates. This is used by
26356 fp moves, that force all constants to memory to allow combining. */
26358 if (MEM_P (operands[1])
26359 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
26360 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
26361 operands[1] = get_pool_constant (XEXP (operands[1], 0));
26362 if (push_operand (operands[0], VOIDmode))
26364 operands[0] = copy_rtx (operands[0]);
26365 PUT_MODE (operands[0], word_mode);
26367 else
26368 operands[0] = gen_lowpart (DImode, operands[0]);
26369 operands[1] = gen_lowpart (DImode, operands[1]);
26370 emit_move_insn (operands[0], operands[1]);
26371 return;
26374 /* The only non-offsettable memory we handle is push. */
26375 if (push_operand (operands[0], VOIDmode))
26376 push = 1;
26377 else
26378 gcc_assert (!MEM_P (operands[0])
26379 || offsettable_memref_p (operands[0]));
26381 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
26382 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
26384 /* When emitting a push, take care of source operands on the stack. */
26385 if (push && MEM_P (operands[1])
26386 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
26388 rtx src_base = XEXP (part[1][nparts - 1], 0);
26390 /* Compensate for the stack decrement by 4. */
26391 if (!TARGET_64BIT && nparts == 3
26392 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
26393 src_base = plus_constant (Pmode, src_base, 4);
26395 /* src_base refers to the stack pointer and is
26396 automatically decreased by emitted push. */
26397 for (i = 0; i < nparts; i++)
26398 part[1][i] = change_address (part[1][i],
26399 GET_MODE (part[1][i]), src_base);
26402 /* We need to do the copy in the right order in case an address register
26403 of the source overlaps the destination. */
26404 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
26406 rtx tmp;
26408 for (i = 0; i < nparts; i++)
26410 collisionparts[i]
26411 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
26412 if (collisionparts[i])
26413 collisions++;
26416 /* Collision in the middle part can be handled by reordering. */
26417 if (collisions == 1 && nparts == 3 && collisionparts [1])
26419 std::swap (part[0][1], part[0][2]);
26420 std::swap (part[1][1], part[1][2]);
26422 else if (collisions == 1
26423 && nparts == 4
26424 && (collisionparts [1] || collisionparts [2]))
26426 if (collisionparts [1])
26428 std::swap (part[0][1], part[0][2]);
26429 std::swap (part[1][1], part[1][2]);
26431 else
26433 std::swap (part[0][2], part[0][3]);
26434 std::swap (part[1][2], part[1][3]);
26438 /* If there are more collisions, we can't handle it by reordering.
26439 Do an lea to the last part and use only one colliding move. */
26440 else if (collisions > 1)
26442 rtx base, addr, tls_base = NULL_RTX;
26444 collisions = 1;
26446 base = part[0][nparts - 1];
26448 /* Handle the case when the last part isn't valid for lea.
26449 Happens in 64-bit mode storing the 12-byte XFmode. */
26450 if (GET_MODE (base) != Pmode)
26451 base = gen_rtx_REG (Pmode, REGNO (base));
26453 addr = XEXP (part[1][0], 0);
26454 if (TARGET_TLS_DIRECT_SEG_REFS)
26456 struct ix86_address parts;
26457 int ok = ix86_decompose_address (addr, &parts);
26458 gcc_assert (ok);
26459 if (parts.seg == DEFAULT_TLS_SEG_REG)
26461 /* It is not valid to use %gs: or %fs: in
26462 lea though, so we need to remove it from the
26463 address used for lea and add it to each individual
26464 memory load instead. */
26465 addr = copy_rtx (addr);
26466 rtx *x = &addr;
26467 while (GET_CODE (*x) == PLUS)
26469 for (i = 0; i < 2; i++)
26471 rtx u = XEXP (*x, i);
26472 if (GET_CODE (u) == ZERO_EXTEND)
26473 u = XEXP (u, 0);
26474 if (GET_CODE (u) == UNSPEC
26475 && XINT (u, 1) == UNSPEC_TP)
26477 tls_base = XEXP (*x, i);
26478 *x = XEXP (*x, 1 - i);
26479 break;
26482 if (tls_base)
26483 break;
26484 x = &XEXP (*x, 0);
26486 gcc_assert (tls_base);
26489 emit_insn (gen_rtx_SET (base, addr));
26490 if (tls_base)
26491 base = gen_rtx_PLUS (GET_MODE (base), base, tls_base);
26492 part[1][0] = replace_equiv_address (part[1][0], base);
26493 for (i = 1; i < nparts; i++)
26495 if (tls_base)
26496 base = copy_rtx (base);
26497 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
26498 part[1][i] = replace_equiv_address (part[1][i], tmp);
26503 if (push)
26505 if (!TARGET_64BIT)
26507 if (nparts == 3)
26509 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
26510 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
26511 stack_pointer_rtx, GEN_INT (-4)));
26512 emit_move_insn (part[0][2], part[1][2]);
26514 else if (nparts == 4)
26516 emit_move_insn (part[0][3], part[1][3]);
26517 emit_move_insn (part[0][2], part[1][2]);
26520 else
26522 /* In 64-bit mode we don't have a 32-bit push available. In case this is
26523 a register, it is OK - we will just use the larger counterpart. We also
26524 retype the memory - this comes from an attempt to avoid the REX prefix
26525 on moving the second half of a TFmode value. */
26526 if (GET_MODE (part[1][1]) == SImode)
26528 switch (GET_CODE (part[1][1]))
26530 case MEM:
26531 part[1][1] = adjust_address (part[1][1], DImode, 0);
26532 break;
26534 case REG:
26535 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
26536 break;
26538 default:
26539 gcc_unreachable ();
26542 if (GET_MODE (part[1][0]) == SImode)
26543 part[1][0] = part[1][1];
26546 emit_move_insn (part[0][1], part[1][1]);
26547 emit_move_insn (part[0][0], part[1][0]);
26548 return;
26551 /* Choose correct order to not overwrite the source before it is copied. */
26552 if ((REG_P (part[0][0])
26553 && REG_P (part[1][1])
26554 && (REGNO (part[0][0]) == REGNO (part[1][1])
26555 || (nparts == 3
26556 && REGNO (part[0][0]) == REGNO (part[1][2]))
26557 || (nparts == 4
26558 && REGNO (part[0][0]) == REGNO (part[1][3]))))
26559 || (collisions > 0
26560 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
26562 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
26564 operands[2 + i] = part[0][j];
26565 operands[6 + i] = part[1][j];
26568 else
26570 for (i = 0; i < nparts; i++)
26572 operands[2 + i] = part[0][i];
26573 operands[6 + i] = part[1][i];
26577 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
26578 if (optimize_insn_for_size_p ())
26580 for (j = 0; j < nparts - 1; j++)
26581 if (CONST_INT_P (operands[6 + j])
26582 && operands[6 + j] != const0_rtx
26583 && REG_P (operands[2 + j]))
26584 for (i = j; i < nparts - 1; i++)
26585 if (CONST_INT_P (operands[7 + i])
26586 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
26587 operands[7 + i] = operands[2 + j];
26590 for (i = 0; i < nparts; i++)
26591 emit_move_insn (operands[2 + i], operands[6 + i]);
26593 return;
26596 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
26597 left shift by a constant, either using a single shift or
26598 a sequence of add instructions. */
26600 static void
26601 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
26603 rtx (*insn)(rtx, rtx, rtx);
26605 if (count == 1
26606 || (count * ix86_cost->add <= ix86_cost->shift_const
26607 && !optimize_insn_for_size_p ()))
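/* Adding the operand to itself doubles it, i.e. shifts it left by one; a
   run of COUNT additions is used when that is cheaper than a single shift
   by a constant and we are not optimizing for size.  */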
26609 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
26610 while (count-- > 0)
26611 emit_insn (insn (operand, operand, operand));
26613 else
26615 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
26616 emit_insn (insn (operand, operand, GEN_INT (count)));
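/* Split a double-word left shift OPERANDS[0] = OPERANDS[1] << OPERANDS[2].
   MODE is the full double-word mode: DImode when splitting into SImode
   halves, TImode when splitting into DImode halves.  SCRATCH, when
   available together with CMOV, is used to adjust the result for variable
   shift counts.  */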
26620 void
26621 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
26623 rtx (*gen_ashl3)(rtx, rtx, rtx);
26624 rtx (*gen_shld)(rtx, rtx, rtx);
26625 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26627 rtx low[2], high[2];
26628 int count;
26630 if (CONST_INT_P (operands[2]))
26632 split_double_mode (mode, operands, 2, low, high);
26633 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26635 if (count >= half_width)
26637 emit_move_insn (high[0], low[1]);
26638 emit_move_insn (low[0], const0_rtx);
26640 if (count > half_width)
26641 ix86_expand_ashl_const (high[0], count - half_width, mode);
26643 else
26645 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
26647 if (!rtx_equal_p (operands[0], operands[1]))
26648 emit_move_insn (operands[0], operands[1]);
26650 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
26651 ix86_expand_ashl_const (low[0], count, mode);
26653 return;
26656 split_double_mode (mode, operands, 1, low, high);
26658 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
26660 if (operands[1] == const1_rtx)
26662 /* Assuming we've chosen QImode-capable registers, 1 << N
26663 can be done with two 32/64-bit shifts, no branches, no cmoves. */
26664 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
26666 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
26668 ix86_expand_clear (low[0]);
26669 ix86_expand_clear (high[0]);
26670 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
26672 d = gen_lowpart (QImode, low[0]);
26673 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
26674 s = gen_rtx_EQ (QImode, flags, const0_rtx);
26675 emit_insn (gen_rtx_SET (d, s));
26677 d = gen_lowpart (QImode, high[0]);
26678 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
26679 s = gen_rtx_NE (QImode, flags, const0_rtx);
26680 emit_insn (gen_rtx_SET (d, s));
26683 /* Otherwise, we can get the same results by manually performing
26684 a bit extract operation on bit 5/6, and then performing the two
26685 shifts. The two methods of getting 0/1 into low/high are exactly
26686 the same size. Avoiding the shift in the bit extract case helps
26687 pentium4 a bit; no one else seems to care much either way. */
26688 else
26690 machine_mode half_mode;
26691 rtx (*gen_lshr3)(rtx, rtx, rtx);
26692 rtx (*gen_and3)(rtx, rtx, rtx);
26693 rtx (*gen_xor3)(rtx, rtx, rtx);
26694 HOST_WIDE_INT bits;
26695 rtx x;
26697 if (mode == DImode)
26699 half_mode = SImode;
26700 gen_lshr3 = gen_lshrsi3;
26701 gen_and3 = gen_andsi3;
26702 gen_xor3 = gen_xorsi3;
26703 bits = 5;
26705 else
26707 half_mode = DImode;
26708 gen_lshr3 = gen_lshrdi3;
26709 gen_and3 = gen_anddi3;
26710 gen_xor3 = gen_xordi3;
26711 bits = 6;
26714 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
26715 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
26716 else
26717 x = gen_lowpart (half_mode, operands[2]);
26718 emit_insn (gen_rtx_SET (high[0], x));
26720 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
26721 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
26722 emit_move_insn (low[0], high[0]);
26723 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
26726 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
26727 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
26728 return;
26731 if (operands[1] == constm1_rtx)
26733 /* For -1 << N, we can avoid the shld instruction, because we
26734 know that we're shifting 0...31/63 ones into a -1. */
26735 emit_move_insn (low[0], constm1_rtx);
26736 if (optimize_insn_for_size_p ())
26737 emit_move_insn (high[0], low[0]);
26738 else
26739 emit_move_insn (high[0], constm1_rtx);
26741 else
26743 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
26745 if (!rtx_equal_p (operands[0], operands[1]))
26746 emit_move_insn (operands[0], operands[1]);
26748 split_double_mode (mode, operands, 1, low, high);
26749 emit_insn (gen_shld (high[0], low[0], operands[2]));
26752 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
26754 if (TARGET_CMOVE && scratch)
26756 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26757 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26759 ix86_expand_clear (scratch);
26760 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
26762 else
26764 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
26765 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
26767 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
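/* Split a double-word arithmetic right shift, analogous to ix86_split_ashl
   above; the vacated high half is filled with copies of the sign bit.  */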
26771 void
26772 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
26774 rtx (*gen_ashr3)(rtx, rtx, rtx)
26775 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
26776 rtx (*gen_shrd)(rtx, rtx, rtx);
26777 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26779 rtx low[2], high[2];
26780 int count;
26782 if (CONST_INT_P (operands[2]))
26784 split_double_mode (mode, operands, 2, low, high);
26785 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26787 if (count == GET_MODE_BITSIZE (mode) - 1)
26789 emit_move_insn (high[0], high[1]);
26790 emit_insn (gen_ashr3 (high[0], high[0],
26791 GEN_INT (half_width - 1)));
26792 emit_move_insn (low[0], high[0]);
26795 else if (count >= half_width)
26797 emit_move_insn (low[0], high[1]);
26798 emit_move_insn (high[0], low[0]);
26799 emit_insn (gen_ashr3 (high[0], high[0],
26800 GEN_INT (half_width - 1)));
26802 if (count > half_width)
26803 emit_insn (gen_ashr3 (low[0], low[0],
26804 GEN_INT (count - half_width)));
26806 else
26808 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26810 if (!rtx_equal_p (operands[0], operands[1]))
26811 emit_move_insn (operands[0], operands[1]);
26813 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
26814 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
26817 else
26819 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26821 if (!rtx_equal_p (operands[0], operands[1]))
26822 emit_move_insn (operands[0], operands[1]);
26824 split_double_mode (mode, operands, 1, low, high);
26826 emit_insn (gen_shrd (low[0], high[0], operands[2]));
26827 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
26829 if (TARGET_CMOVE && scratch)
26831 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26832 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26834 emit_move_insn (scratch, high[0]);
26835 emit_insn (gen_ashr3 (scratch, scratch,
26836 GEN_INT (half_width - 1)));
26837 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26838 scratch));
26840 else
26842 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
26843 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
26845 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
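/* Split a double-word logical right shift, like ix86_split_ashr above but
   clearing the vacated high half instead of sign-filling it.  */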
26850 void
26851 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
26853 rtx (*gen_lshr3)(rtx, rtx, rtx)
26854 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
26855 rtx (*gen_shrd)(rtx, rtx, rtx);
26856 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26858 rtx low[2], high[2];
26859 int count;
26861 if (CONST_INT_P (operands[2]))
26863 split_double_mode (mode, operands, 2, low, high);
26864 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26866 if (count >= half_width)
26868 emit_move_insn (low[0], high[1]);
26869 ix86_expand_clear (high[0]);
26871 if (count > half_width)
26872 emit_insn (gen_lshr3 (low[0], low[0],
26873 GEN_INT (count - half_width)));
26875 else
26877 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26879 if (!rtx_equal_p (operands[0], operands[1]))
26880 emit_move_insn (operands[0], operands[1]);
26882 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
26883 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
26886 else
26888 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26890 if (!rtx_equal_p (operands[0], operands[1]))
26891 emit_move_insn (operands[0], operands[1]);
26893 split_double_mode (mode, operands, 1, low, high);
26895 emit_insn (gen_shrd (low[0], high[0], operands[2]));
26896 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
26898 if (TARGET_CMOVE && scratch)
26900 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26901 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26903 ix86_expand_clear (scratch);
26904 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26905 scratch));
26907 else
26909 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
26910 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
26912 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
26917 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
26918 static void
26919 predict_jump (int prob)
26921 rtx_insn *insn = get_last_insn ();
26922 gcc_assert (JUMP_P (insn));
26923 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
26926 /* Helper function for the string operations below. Test whether VARIABLE
26927 is aligned to VALUE bytes; if so, jump to the label. */
26928 static rtx_code_label *
26929 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
26931 rtx_code_label *label = gen_label_rtx ();
26932 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
26933 if (GET_MODE (variable) == DImode)
26934 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
26935 else
26936 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
26937 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
26938 1, label);
26939 if (epilogue)
26940 predict_jump (REG_BR_PROB_BASE * 50 / 100);
26941 else
26942 predict_jump (REG_BR_PROB_BASE * 90 / 100);
26943 return label;
26946 /* Decrease COUNTREG by VALUE. */
26947 static void
26948 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
26950 rtx (*gen_add)(rtx, rtx, rtx)
26951 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
26953 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
26956 /* Zero extend possibly SImode EXP to Pmode register. */
26958 ix86_zero_extend_to_Pmode (rtx exp)
26960 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
26963 /* Divide COUNTREG by SCALE. */
26964 static rtx
26965 scale_counter (rtx countreg, int scale)
26967 rtx sc;
26969 if (scale == 1)
26970 return countreg;
26971 if (CONST_INT_P (countreg))
26972 return GEN_INT (INTVAL (countreg) / scale);
26973 gcc_assert (REG_P (countreg));
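/* SCALE is expected to be a power of two here; the register case divides
   by shifting right by exact_log2 (SCALE).  */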
26975 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
26976 GEN_INT (exact_log2 (scale)),
26977 NULL, 1, OPTAB_DIRECT);
26978 return sc;
26981 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
26982 DImode for constant loop counts. */
26984 static machine_mode
26985 counter_mode (rtx count_exp)
26987 if (GET_MODE (count_exp) != VOIDmode)
26988 return GET_MODE (count_exp);
26989 if (!CONST_INT_P (count_exp))
26990 return Pmode;
26991 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
26992 return DImode;
26993 return SImode;
26996 /* Copy the address to a Pmode register. This is used for x32 to
26997 truncate a DImode TLS address to a SImode register. */
26999 static rtx
27000 ix86_copy_addr_to_reg (rtx addr)
27002 rtx reg;
27003 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
27005 reg = copy_addr_to_reg (addr);
27006 REG_POINTER (reg) = 1;
27007 return reg;
27009 else
27011 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
27012 reg = copy_to_mode_reg (DImode, addr);
27013 REG_POINTER (reg) = 1;
27014 return gen_rtx_SUBREG (SImode, reg, 0);
27018 /* When ISSETMEM is FALSE, output a simple loop to move memory pointed to by
27019 SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times; the overall size
27020 is COUNT, specified in bytes. When ISSETMEM is TRUE, output the equivalent
27021 loop to set memory by VALUE (supposed to be in MODE).
27023 The size is rounded down to a whole number of chunks moved at once.
27024 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info. */
27027 static void
27028 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
27029 rtx destptr, rtx srcptr, rtx value,
27030 rtx count, machine_mode mode, int unroll,
27031 int expected_size, bool issetmem)
27033 rtx_code_label *out_label, *top_label;
27034 rtx iter, tmp;
27035 machine_mode iter_mode = counter_mode (count);
27036 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
27037 rtx piece_size = GEN_INT (piece_size_n);
27038 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
27039 rtx size;
27040 int i;
27042 top_label = gen_label_rtx ();
27043 out_label = gen_label_rtx ();
27044 iter = gen_reg_rtx (iter_mode);
27046 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
27047 NULL, 1, OPTAB_DIRECT);
27048 /* Those two should combine. */
27049 if (piece_size == const1_rtx)
27051 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
27052 true, out_label);
27053 predict_jump (REG_BR_PROB_BASE * 10 / 100);
27055 emit_move_insn (iter, const0_rtx);
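/* ITER counts the bytes processed so far; the loop below runs while
   ITER < SIZE, where SIZE is COUNT rounded down to a multiple of the
   unrolled chunk size.  */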
27057 emit_label (top_label);
27059 tmp = convert_modes (Pmode, iter_mode, iter, true);
27061 /* This assert could be relaxed - in that case we'll need to compute
27062 the smallest power of two containing PIECE_SIZE_N and pass it to
27063 offset_address. */
27064 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
27065 destmem = offset_address (destmem, tmp, piece_size_n);
27066 destmem = adjust_address (destmem, mode, 0);
27068 if (!issetmem)
27070 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
27071 srcmem = adjust_address (srcmem, mode, 0);
27073 /* When unrolling for chips that reorder memory reads and writes,
27074 we can save registers by using a single temporary.
27075 Also, using 4 temporaries is overkill in 32-bit mode. */
27076 if (!TARGET_64BIT && 0)
27078 for (i = 0; i < unroll; i++)
27080 if (i)
27082 destmem =
27083 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
27084 srcmem =
27085 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
27087 emit_move_insn (destmem, srcmem);
27090 else
27092 rtx tmpreg[4];
27093 gcc_assert (unroll <= 4);
27094 for (i = 0; i < unroll; i++)
27096 tmpreg[i] = gen_reg_rtx (mode);
27097 if (i)
27099 srcmem =
27100 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
27102 emit_move_insn (tmpreg[i], srcmem);
27104 for (i = 0; i < unroll; i++)
27106 if (i)
27108 destmem =
27109 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
27111 emit_move_insn (destmem, tmpreg[i]);
27115 else
27116 for (i = 0; i < unroll; i++)
27118 if (i)
27119 destmem =
27120 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
27121 emit_move_insn (destmem, value);
27124 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
27125 true, OPTAB_LIB_WIDEN);
27126 if (tmp != iter)
27127 emit_move_insn (iter, tmp);
27129 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
27130 true, top_label);
27131 if (expected_size != -1)
27133 expected_size /= GET_MODE_SIZE (mode) * unroll;
27134 if (expected_size == 0)
27135 predict_jump (0);
27136 else if (expected_size > REG_BR_PROB_BASE)
27137 predict_jump (REG_BR_PROB_BASE - 1);
27138 else
27139 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
27141 else
27142 predict_jump (REG_BR_PROB_BASE * 80 / 100);
27143 iter = ix86_zero_extend_to_Pmode (iter);
27144 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
27145 true, OPTAB_LIB_WIDEN);
27146 if (tmp != destptr)
27147 emit_move_insn (destptr, tmp);
27148 if (!issetmem)
27150 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
27151 true, OPTAB_LIB_WIDEN);
27152 if (tmp != srcptr)
27153 emit_move_insn (srcptr, tmp);
27155 emit_label (out_label);
27158 /* Output a "rep; mov" or "rep; stos" instruction depending on the ISSETMEM argument.
27159 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
27160 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
27161 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
27162 ORIG_VALUE is the original value passed to memset to fill the memory with.
27163 Other arguments have the same meaning as for the previous function. */
27165 static void
27166 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
27167 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
27168 rtx count,
27169 machine_mode mode, bool issetmem)
27171 rtx destexp;
27172 rtx srcexp;
27173 rtx countreg;
27174 HOST_WIDE_INT rounded_count;
27176 /* If possible, it is shorter to use rep movs.
27177 TODO: Maybe it is better to move this logic to decide_alg. */
27178 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
27179 && (!issetmem || orig_value == const0_rtx))
27180 mode = SImode;
27182 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
27183 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
27185 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
27186 GET_MODE_SIZE (mode)));
27187 if (mode != QImode)
27189 destexp = gen_rtx_ASHIFT (Pmode, countreg,
27190 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
27191 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
27193 else
27194 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
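/* DESTEXP is the destination pointer advanced by the number of bytes
   processed; it is passed to the rep_stos/rep_mov patterns emitted below.  */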
27195 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
27197 rounded_count
27198 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
27199 destmem = shallow_copy_rtx (destmem);
27200 set_mem_size (destmem, rounded_count);
27202 else if (MEM_SIZE_KNOWN_P (destmem))
27203 clear_mem_size (destmem);
27205 if (issetmem)
27207 value = force_reg (mode, gen_lowpart (mode, value));
27208 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
27210 else
27212 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
27213 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
27214 if (mode != QImode)
27216 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
27217 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
27218 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
27220 else
27221 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
27222 if (CONST_INT_P (count))
27224 rounded_count
27225 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
27226 srcmem = shallow_copy_rtx (srcmem);
27227 set_mem_size (srcmem, rounded_count);
27229 else
27231 if (MEM_SIZE_KNOWN_P (srcmem))
27232 clear_mem_size (srcmem);
27234 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
27235 destexp, srcexp));
27239 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
27240 DESTMEM.
27241 SRCMEM is passed by pointer so it can be updated on return.
27242 The return value is the updated DESTMEM. */
27243 static rtx
27244 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
27245 HOST_WIDE_INT size_to_move)
27247 rtx dst = destmem, src = *srcmem, adjust, tempreg;
27248 enum insn_code code;
27249 machine_mode move_mode;
27250 int piece_size, i;
27252 /* Find the widest mode in which we could perform moves.
27253 Start with the biggest power of 2 not exceeding SIZE_TO_MOVE and halve
27254 it until a move of that size is supported. */
27255 piece_size = 1 << floor_log2 (size_to_move);
27256 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
27257 code = optab_handler (mov_optab, move_mode);
27258 while (code == CODE_FOR_nothing && piece_size > 1)
27260 piece_size >>= 1;
27261 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
27262 code = optab_handler (mov_optab, move_mode);
27265 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27266 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27267 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27269 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27270 move_mode = mode_for_vector (word_mode, nunits);
27271 code = optab_handler (mov_optab, move_mode);
27272 if (code == CODE_FOR_nothing)
27274 move_mode = word_mode;
27275 piece_size = GET_MODE_SIZE (move_mode);
27276 code = optab_handler (mov_optab, move_mode);
27279 gcc_assert (code != CODE_FOR_nothing);
27281 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
27282 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
27284 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
27285 gcc_assert (size_to_move % piece_size == 0);
27286 adjust = GEN_INT (piece_size);
27287 for (i = 0; i < size_to_move; i += piece_size)
27289 /* We move from memory to memory, so we'll need to do it via
27290 a temporary register. */
27291 tempreg = gen_reg_rtx (move_mode);
27292 emit_insn (GEN_FCN (code) (tempreg, src));
27293 emit_insn (GEN_FCN (code) (dst, tempreg));
27295 emit_move_insn (destptr,
27296 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
27297 emit_move_insn (srcptr,
27298 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
27300 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
27301 piece_size);
27302 src = adjust_automodify_address_nv (src, move_mode, srcptr,
27303 piece_size);
27306 /* Update DST and SRC rtx. */
27307 *srcmem = src;
27308 return dst;
27311 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
27312 static void
27313 expand_movmem_epilogue (rtx destmem, rtx srcmem,
27314 rtx destptr, rtx srcptr, rtx count, int max_size)
27316 rtx src, dest;
27317 if (CONST_INT_P (count))
27319 HOST_WIDE_INT countval = INTVAL (count);
27320 HOST_WIDE_INT epilogue_size = countval % max_size;
27321 int i;
27323 /* For now MAX_SIZE should be a power of 2. This assert could be
27324 relaxed, but it would require a somewhat more complicated epilogue
27325 expansion. */
27326 gcc_assert ((max_size & (max_size - 1)) == 0);
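/* EPILOGUE_SIZE is smaller than MAX_SIZE, so its set bits select which
   power-of-two sized pieces to copy, largest first.  */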
27327 for (i = max_size; i >= 1; i >>= 1)
27329 if (epilogue_size & i)
27330 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
27332 return;
27334 if (max_size > 8)
27336 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
27337 count, 1, OPTAB_DIRECT);
27338 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
27339 count, QImode, 1, 4, false);
27340 return;
27343 /* When single string operations are available, we can cheaply advance the
27344 dest and src pointers. Otherwise we save code size by maintaining an offset
27345 (zero is readily available from the preceding rep operation) and using x86 addressing modes. */
27347 if (TARGET_SINGLE_STRINGOP)
27349 if (max_size > 4)
27351 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
27352 src = change_address (srcmem, SImode, srcptr);
27353 dest = change_address (destmem, SImode, destptr);
27354 emit_insn (gen_strmov (destptr, dest, srcptr, src));
27355 emit_label (label);
27356 LABEL_NUSES (label) = 1;
27358 if (max_size > 2)
27360 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
27361 src = change_address (srcmem, HImode, srcptr);
27362 dest = change_address (destmem, HImode, destptr);
27363 emit_insn (gen_strmov (destptr, dest, srcptr, src));
27364 emit_label (label);
27365 LABEL_NUSES (label) = 1;
27367 if (max_size > 1)
27369 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
27370 src = change_address (srcmem, QImode, srcptr);
27371 dest = change_address (destmem, QImode, destptr);
27372 emit_insn (gen_strmov (destptr, dest, srcptr, src));
27373 emit_label (label);
27374 LABEL_NUSES (label) = 1;
27377 else
27379 rtx offset = force_reg (Pmode, const0_rtx);
27380 rtx tmp;
27382 if (max_size > 4)
27384 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
27385 src = change_address (srcmem, SImode, srcptr);
27386 dest = change_address (destmem, SImode, destptr);
27387 emit_move_insn (dest, src);
27388 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
27389 true, OPTAB_LIB_WIDEN);
27390 if (tmp != offset)
27391 emit_move_insn (offset, tmp);
27392 emit_label (label);
27393 LABEL_NUSES (label) = 1;
27395 if (max_size > 2)
27397 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
27398 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
27399 src = change_address (srcmem, HImode, tmp);
27400 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
27401 dest = change_address (destmem, HImode, tmp);
27402 emit_move_insn (dest, src);
27403 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
27404 true, OPTAB_LIB_WIDEN);
27405 if (tmp != offset)
27406 emit_move_insn (offset, tmp);
27407 emit_label (label);
27408 LABEL_NUSES (label) = 1;
27410 if (max_size > 1)
27412 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
27413 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
27414 src = change_address (srcmem, QImode, tmp);
27415 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
27416 dest = change_address (destmem, QImode, tmp);
27417 emit_move_insn (dest, src);
27418 emit_label (label);
27419 LABEL_NUSES (label) = 1;
27424 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
27425 with the value PROMOTED_VAL.
27426 DESTPTR is the pointer to the destination block.
27427 The return value is the updated DESTMEM. */
27428 static rtx
27429 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
27430 HOST_WIDE_INT size_to_move)
27432 rtx dst = destmem, adjust;
27433 enum insn_code code;
27434 machine_mode move_mode;
27435 int piece_size, i;
27437 /* Find the widest mode in which we can perform the stores.
27438 Start with the mode of PROMOTED_VAL and, if SIZE_TO_MOVE is smaller
27439 than that mode's size, switch to a narrower integer mode. */
27440 move_mode = GET_MODE (promoted_val);
27441 if (move_mode == VOIDmode)
27442 move_mode = QImode;
27443 if (size_to_move < GET_MODE_SIZE (move_mode))
27445 move_mode = mode_for_size (size_to_move * BITS_PER_UNIT, MODE_INT, 0);
27446 promoted_val = gen_lowpart (move_mode, promoted_val);
27448 piece_size = GET_MODE_SIZE (move_mode);
27449 code = optab_handler (mov_optab, move_mode);
27450 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
27452 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
27454 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
27455 gcc_assert (size_to_move % piece_size == 0);
27456 adjust = GEN_INT (piece_size);
27457 for (i = 0; i < size_to_move; i += piece_size)
27459 if (piece_size <= GET_MODE_SIZE (word_mode))
27461 emit_insn (gen_strset (destptr, dst, promoted_val));
27462 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
27463 piece_size);
27464 continue;
27467 emit_insn (GEN_FCN (code) (dst, promoted_val));
27469 emit_move_insn (destptr,
27470 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
27472 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
27473 piece_size);
27476 /* Update DST rtx. */
27477 return dst;
27479 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
27480 static void
27481 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
27482 rtx count, int max_size)
27484 count =
27485 expand_simple_binop (counter_mode (count), AND, count,
27486 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
27487 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
27488 gen_lowpart (QImode, value), count, QImode,
27489 1, max_size / 2, true);
27492 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
27493 static void
27494 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
27495 rtx count, int max_size)
27497 rtx dest;
27499 if (CONST_INT_P (count))
27501 HOST_WIDE_INT countval = INTVAL (count);
27502 HOST_WIDE_INT epilogue_size = countval % max_size;
27503 int i;
27505 /* For now MAX_SIZE should be a power of 2. This assert could be
27506 relaxed, but it would require a somewhat more complicated epilogue
27507 expansion. */
27508 gcc_assert ((max_size & (max_size - 1)) == 0);
27509 for (i = max_size; i >= 1; i >>= 1)
27511 if (epilogue_size & i)
27513 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
27514 destmem = emit_memset (destmem, destptr, vec_value, i);
27515 else
27516 destmem = emit_memset (destmem, destptr, value, i);
27519 return;
27521 if (max_size > 32)
27523 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
27524 return;
27526 if (max_size > 16)
27528 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
27529 if (TARGET_64BIT)
27531 dest = change_address (destmem, DImode, destptr);
27532 emit_insn (gen_strset (destptr, dest, value));
27533 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
27534 emit_insn (gen_strset (destptr, dest, value));
27536 else
27538 dest = change_address (destmem, SImode, destptr);
27539 emit_insn (gen_strset (destptr, dest, value));
27540 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
27541 emit_insn (gen_strset (destptr, dest, value));
27542 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
27543 emit_insn (gen_strset (destptr, dest, value));
27544 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
27545 emit_insn (gen_strset (destptr, dest, value));
27547 emit_label (label);
27548 LABEL_NUSES (label) = 1;
27550 if (max_size > 8)
27552 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
27553 if (TARGET_64BIT)
27555 dest = change_address (destmem, DImode, destptr);
27556 emit_insn (gen_strset (destptr, dest, value));
27558 else
27560 dest = change_address (destmem, SImode, destptr);
27561 emit_insn (gen_strset (destptr, dest, value));
27562 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
27563 emit_insn (gen_strset (destptr, dest, value));
27565 emit_label (label);
27566 LABEL_NUSES (label) = 1;
27568 if (max_size > 4)
27570 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
27571 dest = change_address (destmem, SImode, destptr);
27572 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
27573 emit_label (label);
27574 LABEL_NUSES (label) = 1;
27576 if (max_size > 2)
27578 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
27579 dest = change_address (destmem, HImode, destptr);
27580 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
27581 emit_label (label);
27582 LABEL_NUSES (label) = 1;
27584 if (max_size > 1)
27586 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
27587 dest = change_address (destmem, QImode, destptr);
27588 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
27589 emit_label (label);
27590 LABEL_NUSES (label) = 1;
27594 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough of
27595 DESTMEM to align it to DESIRED_ALIGNMENT. The original alignment is ALIGN.
27596 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
27597 ignored.
27598 Return value is updated DESTMEM. */
27599 static rtx
27600 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
27601 rtx destptr, rtx srcptr, rtx value,
27602 rtx vec_value, rtx count, int align,
27603 int desired_alignment, bool issetmem)
27605 int i;
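/* For each power of two between ALIGN and DESIRED_ALIGNMENT, conditionally
   copy or set that many bytes (guarded by an alignment test on DESTPTR) so
   that the destination ends up aligned to DESIRED_ALIGNMENT; COUNT is
   decreased accordingly.  */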
27606 for (i = 1; i < desired_alignment; i <<= 1)
27608 if (align <= i)
27610 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
27611 if (issetmem)
27613 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
27614 destmem = emit_memset (destmem, destptr, vec_value, i);
27615 else
27616 destmem = emit_memset (destmem, destptr, value, i);
27618 else
27619 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
27620 ix86_adjust_counter (count, i);
27621 emit_label (label);
27622 LABEL_NUSES (label) = 1;
27623 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
27626 return destmem;
27629 /* Test if COUNT & SIZE is nonzero and, if so, expand a movmem
27630 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
27631 and jump to DONE_LABEL. */
27632 static void
27633 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
27634 rtx destptr, rtx srcptr,
27635 rtx value, rtx vec_value,
27636 rtx count, int size,
27637 rtx done_label, bool issetmem)
27639 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
27640 machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
27641 rtx modesize;
27642 int n;
27644 /* If we do not have vector value to copy, we must reduce size. */
27645 if (issetmem)
27647 if (!vec_value)
27649 if (GET_MODE (value) == VOIDmode && size > 8)
27650 mode = Pmode;
27651 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
27652 mode = GET_MODE (value);
27654 else
27655 mode = GET_MODE (vec_value), value = vec_value;
27657 else
27659 /* Choose appropriate vector mode. */
27660 if (size >= 32)
27661 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
27662 else if (size >= 16)
27663 mode = TARGET_SSE ? V16QImode : DImode;
27664 srcmem = change_address (srcmem, mode, srcptr);
27666 destmem = change_address (destmem, mode, destptr);
27667 modesize = GEN_INT (GET_MODE_SIZE (mode));
27668 gcc_assert (GET_MODE_SIZE (mode) <= size);
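/* Handle the first SIZE bytes, then the last SIZE bytes; for the
   SIZE..2*SIZE-1 byte range this helper covers, the two (possibly
   overlapping) regions span the whole block.  */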
27669 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
27671 if (issetmem)
27672 emit_move_insn (destmem, gen_lowpart (mode, value));
27673 else
27675 emit_move_insn (destmem, srcmem);
27676 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27678 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27681 destmem = offset_address (destmem, count, 1);
27682 destmem = offset_address (destmem, GEN_INT (-2 * size),
27683 GET_MODE_SIZE (mode));
27684 if (!issetmem)
27686 srcmem = offset_address (srcmem, count, 1);
27687 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
27688 GET_MODE_SIZE (mode));
27690 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
27692 if (issetmem)
27693 emit_move_insn (destmem, gen_lowpart (mode, value));
27694 else
27696 emit_move_insn (destmem, srcmem);
27697 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27699 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27701 emit_jump_insn (gen_jump (done_label));
27702 emit_barrier ();
27704 emit_label (label);
27705 LABEL_NUSES (label) = 1;
27708 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
27709 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
27710 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT so that we can
27711 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
27712 DONE_LABEL is a label after the whole copying sequence. The label is created
27713 on demand if *DONE_LABEL is NULL.
27714 MIN_SIZE is the minimal size of the block copied. This value gets adjusted for new
27715 bounds after the initial copies.
27717 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
27718 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
27719 we will dispatch to a library call for large blocks.
27721 In pseudocode we do:
27723 if (COUNT < SIZE)
27725 Assume that SIZE is 4. Bigger sizes are handled analogously
27726 if (COUNT & 4)
27728 copy 4 bytes from SRCPTR to DESTPTR
27729 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
27730 goto done_label
27732 if (!COUNT)
27733 goto done_label;
27734 copy 1 byte from SRCPTR to DESTPTR
27735 if (COUNT & 2)
27737 copy 2 bytes from SRCPTR to DESTPTR
27738 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
27741 else
27743 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
27744 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE
27746 OLD_DESTPTR = DESTPTR;
27747 Align DESTPTR up to DESIRED_ALIGN
27748 SRCPTR += DESTPTR - OLD_DESTPTR
27749 COUNT -= DESTPTR - OLD_DESTPTR
27750 if (DYNAMIC_CHECK)
27751 Round COUNT down to multiple of SIZE
27752 << optional caller supplied zero size guard is here >>
27753 << optional caller supplied dynamic check is here >>
27754 << caller supplied main copy loop is here >>
27756 done_label:  */
27758 static void
27759 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
27760 rtx *destptr, rtx *srcptr,
27761 machine_mode mode,
27762 rtx value, rtx vec_value,
27763 rtx *count,
27764 rtx_code_label **done_label,
27765 int size,
27766 int desired_align,
27767 int align,
27768 unsigned HOST_WIDE_INT *min_size,
27769 bool dynamic_check,
27770 bool issetmem)
27772 rtx_code_label *loop_label = NULL, *label;
27773 int n;
27774 rtx modesize;
27775 int prolog_size = 0;
27776 rtx mode_value;
27778 /* Choose the proper value to copy. */
27779 if (issetmem && VECTOR_MODE_P (mode))
27780 mode_value = vec_value;
27781 else
27782 mode_value = value;
27783 gcc_assert (GET_MODE_SIZE (mode) <= size);
27785 /* See if block is big or small, handle small blocks. */
27786 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
27788 int size2 = size;
27789 loop_label = gen_label_rtx ();
27791 if (!*done_label)
27792 *done_label = gen_label_rtx ();
27794 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
27795 1, loop_label);
27796 size2 >>= 1;
27798 /* Handle sizes > 3. */
27799 for (;size2 > 2; size2 >>= 1)
27800 expand_small_movmem_or_setmem (destmem, srcmem,
27801 *destptr, *srcptr,
27802 value, vec_value,
27803 *count,
27804 size2, *done_label, issetmem);
27805 /* Nothing to copy? Jump to DONE_LABEL if so */
27806 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
27807 1, *done_label);
27809 /* Do a byte copy. */
27810 destmem = change_address (destmem, QImode, *destptr);
27811 if (issetmem)
27812 emit_move_insn (destmem, gen_lowpart (QImode, value));
27813 else
27815 srcmem = change_address (srcmem, QImode, *srcptr);
27816 emit_move_insn (destmem, srcmem);
27819 /* Handle sizes 2 and 3. */
27820 label = ix86_expand_aligntest (*count, 2, false);
27821 destmem = change_address (destmem, HImode, *destptr);
27822 destmem = offset_address (destmem, *count, 1);
27823 destmem = offset_address (destmem, GEN_INT (-2), 2);
27824 if (issetmem)
27825 emit_move_insn (destmem, gen_lowpart (HImode, value));
27826 else
27828 srcmem = change_address (srcmem, HImode, *srcptr);
27829 srcmem = offset_address (srcmem, *count, 1);
27830 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
27831 emit_move_insn (destmem, srcmem);
27834 emit_label (label);
27835 LABEL_NUSES (label) = 1;
27836 emit_jump_insn (gen_jump (*done_label));
27837 emit_barrier ();
27839 else
27840 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
27841 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
27843 /* Start memcpy for COUNT >= SIZE. */
27844 if (loop_label)
27846 emit_label (loop_label);
27847 LABEL_NUSES (loop_label) = 1;
27850 /* Copy first desired_align bytes. */
27851 if (!issetmem)
27852 srcmem = change_address (srcmem, mode, *srcptr);
27853 destmem = change_address (destmem, mode, *destptr);
27854 modesize = GEN_INT (GET_MODE_SIZE (mode));
27855 for (n = 0; prolog_size < desired_align - align; n++)
27857 if (issetmem)
27858 emit_move_insn (destmem, mode_value);
27859 else
27861 emit_move_insn (destmem, srcmem);
27862 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27864 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27865 prolog_size += GET_MODE_SIZE (mode);
27869 /* Copy last SIZE bytes. */
27870 destmem = offset_address (destmem, *count, 1);
27871 destmem = offset_address (destmem,
27872 GEN_INT (-size - prolog_size),
27874 if (issetmem)
27875 emit_move_insn (destmem, mode_value);
27876 else
27878 srcmem = offset_address (srcmem, *count, 1);
27879 srcmem = offset_address (srcmem,
27880 GEN_INT (-size - prolog_size),
27882 emit_move_insn (destmem, srcmem);
27884 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
27886 destmem = offset_address (destmem, modesize, 1);
27887 if (issetmem)
27888 emit_move_insn (destmem, mode_value);
27889 else
27891 srcmem = offset_address (srcmem, modesize, 1);
27892 emit_move_insn (destmem, srcmem);
27896 /* Align destination. */
27897 if (desired_align > 1 && desired_align > align)
27899 rtx saveddest = *destptr;
27901 gcc_assert (desired_align <= size);
27902 /* Align destptr up, place it to new register. */
27903 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
27904 GEN_INT (prolog_size),
27905 NULL_RTX, 1, OPTAB_DIRECT);
27906 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
27907 REG_POINTER (*destptr) = 1;
27908 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
27909 GEN_INT (-desired_align),
27910 *destptr, 1, OPTAB_DIRECT);
27911 /* See how many bytes we skipped. */
27912 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
27913 *destptr,
27914 saveddest, 1, OPTAB_DIRECT);
27915 /* Adjust srcptr and count. */
27916 if (!issetmem)
27917 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
27918 saveddest, *srcptr, 1, OPTAB_DIRECT);
27919 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27920 saveddest, *count, 1, OPTAB_DIRECT);
27921 /* We copied at most size + prolog_size. */
27922 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
27923 *min_size
27924 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
27925 else
27926 *min_size = 0;
27928 /* Our loops always round down the block size, but for dispatch to a
27929 library call we need the precise value. */
27930 if (dynamic_check)
27931 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
27932 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
27934 else
27936 gcc_assert (prolog_size == 0);
27937 /* Decrease count, so we won't end up copying last word twice. */
27938 if (!CONST_INT_P (*count))
27939 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27940 constm1_rtx, *count, 1, OPTAB_DIRECT);
27941 else
27942 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
27943 (unsigned HOST_WIDE_INT)size));
27944 if (*min_size)
27945 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
27950 /* This function is like the previous one, except here we know how many bytes
27951 need to be copied. That allows us to update alignment not only of DST, which
27952 is returned, but also of SRC, which is passed as a pointer for that
27953 reason. */
27954 static rtx
27955 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
27956 rtx srcreg, rtx value, rtx vec_value,
27957 int desired_align, int align_bytes,
27958 bool issetmem)
27960 rtx src = NULL;
27961 rtx orig_dst = dst;
27962 rtx orig_src = NULL;
27963 int piece_size = 1;
27964 int copied_bytes = 0;
27966 if (!issetmem)
27968 gcc_assert (srcp != NULL);
27969 src = *srcp;
27970 orig_src = src;
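/* Copy or set ALIGN_BYTES bytes in power-of-two pieces selected by the bits
   of ALIGN_BYTES; afterwards the destination is known to be aligned to
   DESIRED_ALIGN.  */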
27973 for (piece_size = 1;
27974 piece_size <= desired_align && copied_bytes < align_bytes;
27975 piece_size <<= 1)
27977 if (align_bytes & piece_size)
27979 if (issetmem)
27981 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
27982 dst = emit_memset (dst, destreg, vec_value, piece_size);
27983 else
27984 dst = emit_memset (dst, destreg, value, piece_size);
27986 else
27987 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
27988 copied_bytes += piece_size;
27991 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
27992 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27993 if (MEM_SIZE_KNOWN_P (orig_dst))
27994 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
27996 if (!issetmem)
27998 int src_align_bytes = get_mem_align_offset (src, desired_align
27999 * BITS_PER_UNIT);
28000 if (src_align_bytes >= 0)
28001 src_align_bytes = desired_align - src_align_bytes;
28002 if (src_align_bytes >= 0)
28004 unsigned int src_align;
28005 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
28007 if ((src_align_bytes & (src_align - 1))
28008 == (align_bytes & (src_align - 1)))
28009 break;
28011 if (src_align > (unsigned int) desired_align)
28012 src_align = desired_align;
28013 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
28014 set_mem_align (src, src_align * BITS_PER_UNIT);
28016 if (MEM_SIZE_KNOWN_P (orig_src))
28017 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
28018 *srcp = src;
28021 return dst;
28024 /* Return true if ALG can be used in current context.
28025 Assume we expand memset if MEMSET is true. */
28026 static bool
28027 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
28029 if (alg == no_stringop)
28030 return false;
28031 if (alg == vector_loop)
28032 return TARGET_SSE || TARGET_AVX;
28033 /* Algorithms using the rep prefix want at least edi and ecx;
28034 additionally, memset wants eax and memcpy wants esi. Don't
28035 consider such algorithms if the user has appropriated those
28036 registers for their own purposes, or if we have a non-default
28037 address space, since some string insns cannot override the segment. */
28038 if (alg == rep_prefix_1_byte
28039 || alg == rep_prefix_4_byte
28040 || alg == rep_prefix_8_byte)
28042 if (have_as)
28043 return false;
28044 if (fixed_regs[CX_REG]
28045 || fixed_regs[DI_REG]
28046 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
28047 return false;
28049 return true;
28052 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
28053 static enum stringop_alg
28054 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
28055 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
28056 bool memset, bool zero_memset, bool have_as,
28057 int *dynamic_check, bool *noalign, bool recur)
28059 const struct stringop_algs *algs;
28060 bool optimize_for_speed;
28061 int max = 0;
28062 const struct processor_costs *cost;
28063 int i;
28064 bool any_alg_usable_p = false;
28066 *noalign = false;
28067 *dynamic_check = -1;
28069 /* Even if the string operation call is cold, we still might spend a lot
28070 of time processing large blocks. */
28071 if (optimize_function_for_size_p (cfun)
28072 || (optimize_insn_for_size_p ()
28073 && (max_size < 256
28074 || (expected_size != -1 && expected_size < 256))))
28075 optimize_for_speed = false;
28076 else
28077 optimize_for_speed = true;
28079 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
28080 if (memset)
28081 algs = &cost->memset[TARGET_64BIT != 0];
28082 else
28083 algs = &cost->memcpy[TARGET_64BIT != 0];
28085 /* See maximal size for user defined algorithm. */
28086 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
28088 enum stringop_alg candidate = algs->size[i].alg;
28089 bool usable = alg_usable_p (candidate, memset, have_as);
28090 any_alg_usable_p |= usable;
28092 if (candidate != libcall && candidate && usable)
28093 max = algs->size[i].max;
28096 /* If the expected size is not known but the max size is small enough
28097 that the inline version is a win, set the expected size into
28098 the range. */
28099 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
28100 && expected_size == -1)
28101 expected_size = min_size / 2 + max_size / 2;
28103 /* If user specified the algorithm, honor it if possible. */
28104 if (ix86_stringop_alg != no_stringop
28105 && alg_usable_p (ix86_stringop_alg, memset, have_as))
28106 return ix86_stringop_alg;
28107 /* rep; movq or rep; movl is the smallest variant. */
28108 else if (!optimize_for_speed)
28110 *noalign = true;
28111 if (!count || (count & 3) || (memset && !zero_memset))
28112 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
28113 ? rep_prefix_1_byte : loop_1_byte;
28114 else
28115 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
28116 ? rep_prefix_4_byte : loop;
28118 /* Very tiny blocks are best handled via the loop; REP is expensive to
28119 set up. */
28120 else if (expected_size != -1 && expected_size < 4)
28121 return loop_1_byte;
28122 else if (expected_size != -1)
28124 enum stringop_alg alg = libcall;
28125 bool alg_noalign = false;
28126 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
28128 /* We get here if the algorithms that were not libcall-based
28129 were rep-prefix based and we are unable to use rep prefixes
28130 based on global register usage. Break out of the loop and
28131 use the heuristic below. */
28132 if (algs->size[i].max == 0)
28133 break;
28134 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
28136 enum stringop_alg candidate = algs->size[i].alg;
28138 if (candidate != libcall
28139 && alg_usable_p (candidate, memset, have_as))
28141 alg = candidate;
28142 alg_noalign = algs->size[i].noalign;
28144 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
28145 last non-libcall inline algorithm. */
28146 if (TARGET_INLINE_ALL_STRINGOPS)
28148 /* When the current size is best to be copied by a libcall,
28149 but we are still forced to inline, run the heuristic below
28150 that will pick code for medium sized blocks. */
28151 if (alg != libcall)
28153 *noalign = alg_noalign;
28154 return alg;
28156 else if (!any_alg_usable_p)
28157 break;
28159 else if (alg_usable_p (candidate, memset, have_as))
28161 *noalign = algs->size[i].noalign;
28162 return candidate;
28167 /* When asked to inline the call anyway, try to pick a meaningful choice.
28168 We look for the maximal size of block that is faster to copy by hand and
28169 take blocks of at most that size, guessing that the average size will
28170 be roughly half of the block.
28172 If this turns out to be bad, we might simply specify the preferred
28173 choice in ix86_costs. */
28174 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
28175 && (algs->unknown_size == libcall
28176 || !alg_usable_p (algs->unknown_size, memset, have_as)))
28178 enum stringop_alg alg;
28179 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
28181 /* If there aren't any usable algorithms or if recursing already,
28182 then recursing on smaller sizes or same size isn't going to
28183 find anything. Just return the simple byte-at-a-time copy loop. */
28184 if (!any_alg_usable_p || recur)
28186 /* Pick something reasonable. */
28187 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
28188 *dynamic_check = 128;
28189 return loop_1_byte;
28191 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
28192 zero_memset, have_as, dynamic_check, noalign, true);
28193 gcc_assert (*dynamic_check == -1);
28194 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
28195 *dynamic_check = max;
28196 else
28197 gcc_assert (alg != libcall);
28198 return alg;
28200 return (alg_usable_p (algs->unknown_size, memset, have_as)
28201 ? algs->unknown_size : libcall);
28204 /* Decide on alignment. We know that the operand is already aligned to ALIGN
28205 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
28206 static int
28207 decide_alignment (int align,
28208 enum stringop_alg alg,
28209 int expected_size,
28210 machine_mode move_mode)
28212 int desired_align = 0;
28214 gcc_assert (alg != no_stringop);
28216 if (alg == libcall)
28217 return 0;
28218 if (move_mode == VOIDmode)
28219 return 0;
28221 desired_align = GET_MODE_SIZE (move_mode);
28222 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
28223 copying a whole cache line at once. */
28224 if (TARGET_PENTIUMPRO
28225 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
28226 desired_align = 8;
28228 if (optimize_size)
28229 desired_align = 1;
28230 if (desired_align < align)
28231 desired_align = align;
28232 if (expected_size != -1 && expected_size < 4)
28233 desired_align = align;
28235 return desired_align;
28239 /* Helper function for memset. For the QImode value 0xXY produce
28240 0xXYXYXYXY of the width specified by MODE. This is essentially
28241 a * 0x10101010, but we can do slightly better than
28242 synth_mult by unwinding the sequence by hand on CPUs with
28243 slow multiply. */
28244 static rtx
28245 promote_duplicated_reg (machine_mode mode, rtx val)
28247 machine_mode valmode = GET_MODE (val);
28248 rtx tmp;
28249 int nops = mode == DImode ? 3 : 2;
28251 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
28252 if (val == const0_rtx)
28253 return copy_to_mode_reg (mode, CONST0_RTX (mode));
28254 if (CONST_INT_P (val))
28256 HOST_WIDE_INT v = INTVAL (val) & 255;
28258 v |= v << 8;
28259 v |= v << 16;
28260 if (mode == DImode)
28261 v |= (v << 16) << 16;
28262 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
28265 if (valmode == VOIDmode)
28266 valmode = QImode;
28267 if (valmode != QImode)
28268 val = gen_lowpart (QImode, val);
28269 if (mode == QImode)
28270 return val;
28271 if (!TARGET_PARTIAL_REG_STALL)
28272 nops--;
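/* Compare the cost of multiplying by the 0x0101...01 constant (built by the
   recursive call below) against the explicit shift/or sequence and pick the
   cheaper one.  */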
28273 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
28274 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
28275 <= (ix86_cost->shift_const + ix86_cost->add) * nops
28276 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
28278 rtx reg = convert_modes (mode, QImode, val, true);
28279 tmp = promote_duplicated_reg (mode, const1_rtx);
28280 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
28281 OPTAB_DIRECT);
28283 else
28285 rtx reg = convert_modes (mode, QImode, val, true);
28287 if (!TARGET_PARTIAL_REG_STALL)
28288 if (mode == SImode)
28289 emit_insn (gen_insvsi_1 (reg, reg));
28290 else
28291 emit_insn (gen_insvdi_1 (reg, reg));
28292 else
28294 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
28295 NULL, 1, OPTAB_DIRECT);
28296 reg =
28297 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
28299 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
28300 NULL, 1, OPTAB_DIRECT);
28301 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
28302 if (mode == SImode)
28303 return reg;
28304 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
28305 NULL, 1, OPTAB_DIRECT);
28306 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
28307 return reg;
28311 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
28312 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
28313 getting alignment from ALIGN to DESIRED_ALIGN. */
28314 static rtx
28315 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
28316 int align)
28318 rtx promoted_val;
28320 if (TARGET_64BIT
28321 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
28322 promoted_val = promote_duplicated_reg (DImode, val);
28323 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
28324 promoted_val = promote_duplicated_reg (SImode, val);
28325 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
28326 promoted_val = promote_duplicated_reg (HImode, val);
28327 else
28328 promoted_val = val;
28330 return promoted_val;
28333 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
28334 operations when profitable. The code depends upon architecture, block size
28335 and alignment, but always has one of the following overall structures:
28337 Aligned move sequence:
28339 1) Prologue guard: Conditional that jumps up to epilogues for small
28340 blocks that can be handled by the epilogue alone. This is faster
28341 but also needed for correctness, since the prologue assumes the block
28342 is larger than the desired alignment.
28344 Optional dynamic check for size and libcall for large
28345 blocks is emitted here too, with -minline-stringops-dynamically.
28347 2) Prologue: copy first few bytes in order to get destination
28348 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
28349 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
28350 copied. We emit either a jump tree on power of two sized
28351 blocks, or a byte loop.
28353 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
28354 with specified algorithm.
28356 4) Epilogue: code copying tail of the block that is too small to be
28357 handled by main body (or up to size guarded by prologue guard).
28359 Misaligned move sequence
28361 1) missaligned move prologue/epilogue containing:
28362 a) Prologue handling small memory blocks and jumping to done_label
28363 (skipped if blocks are known to be large enough)
28364 b) Single possibly misaligned move copying the first DESIRED_ALIGN-ALIGN
28365 bytes if alignment is needed
28366 (skipped if alignment is not needed)
28367 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
28369 2) Zero size guard dispatching to done_label, if needed
28371 3) Dispatch to a library call, if needed,
28373 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
28374 with the specified algorithm. */
28375 bool
28376 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
28377 rtx align_exp, rtx expected_align_exp,
28378 rtx expected_size_exp, rtx min_size_exp,
28379 rtx max_size_exp, rtx probable_max_size_exp,
28380 bool issetmem)
28382 rtx destreg;
28383 rtx srcreg = NULL;
28384 rtx_code_label *label = NULL;
28385 rtx tmp;
28386 rtx_code_label *jump_around_label = NULL;
28387 HOST_WIDE_INT align = 1;
28388 unsigned HOST_WIDE_INT count = 0;
28389 HOST_WIDE_INT expected_size = -1;
28390 int size_needed = 0, epilogue_size_needed;
28391 int desired_align = 0, align_bytes = 0;
28392 enum stringop_alg alg;
28393 rtx promoted_val = NULL;
28394 rtx vec_promoted_val = NULL;
28395 bool force_loopy_epilogue = false;
28396 int dynamic_check;
28397 bool need_zero_guard = false;
28398 bool noalign;
28399 machine_mode move_mode = VOIDmode;
28400 int unroll_factor = 1;
28401 /* TODO: Once value ranges are available, fill in proper data. */
28402 unsigned HOST_WIDE_INT min_size = 0;
28403 unsigned HOST_WIDE_INT max_size = -1;
28404 unsigned HOST_WIDE_INT probable_max_size = -1;
28405 bool misaligned_prologue_used = false;
28406 bool have_as;
28408 if (CONST_INT_P (align_exp))
28409 align = INTVAL (align_exp);
28410 /* i386 can do misaligned access at a reasonably increased cost. */
28411 if (CONST_INT_P (expected_align_exp)
28412 && INTVAL (expected_align_exp) > align)
28413 align = INTVAL (expected_align_exp);
28414 /* ALIGN is the minimum of destination and source alignment, but we care here
28415 just about destination alignment. */
28416 else if (!issetmem
28417 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
28418 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
28420 if (CONST_INT_P (count_exp))
28422 min_size = max_size = probable_max_size = count = expected_size
28423 = INTVAL (count_exp);
28424 /* When COUNT is 0, there is nothing to do. */
28425 if (!count)
28426 return true;
28428 else
28430 if (min_size_exp)
28431 min_size = INTVAL (min_size_exp);
28432 if (max_size_exp)
28433 max_size = INTVAL (max_size_exp);
28434 if (probable_max_size_exp)
28435 probable_max_size = INTVAL (probable_max_size_exp);
28436 if (CONST_INT_P (expected_size_exp))
28437 expected_size = INTVAL (expected_size_exp);
28440 /* Make sure we don't need to care about overflow later on. */
28441 if (count > (HOST_WIDE_INT_1U << 30))
28442 return false;
28444 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
28445 if (!issetmem)
28446 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
28448 /* Step 0: Decide on preferred algorithm, desired alignment and
28449 size of chunks to be copied by main loop. */
28450 alg = decide_alg (count, expected_size, min_size, probable_max_size,
28451 issetmem,
28452 issetmem && val_exp == const0_rtx, have_as,
28453 &dynamic_check, &noalign, false);
28454 if (alg == libcall)
28455 return false;
28456 gcc_assert (alg != no_stringop);
28458 /* For now the vector version of memset is generated only for memory zeroing,
28459 as creating the promoted vector value is very cheap in this case. */
28460 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
28461 alg = unrolled_loop;
28463 if (!count)
28464 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
28465 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
28466 if (!issetmem)
28467 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
28469 unroll_factor = 1;
28470 move_mode = word_mode;
28471 switch (alg)
28473 case libcall:
28474 case no_stringop:
28475 case last_alg:
28476 gcc_unreachable ();
28477 case loop_1_byte:
28478 need_zero_guard = true;
28479 move_mode = QImode;
28480 break;
28481 case loop:
28482 need_zero_guard = true;
28483 break;
28484 case unrolled_loop:
28485 need_zero_guard = true;
28486 unroll_factor = (TARGET_64BIT ? 4 : 2);
28487 break;
28488 case vector_loop:
28489 need_zero_guard = true;
28490 unroll_factor = 4;
28491 /* Find the widest supported mode. */
28492 move_mode = word_mode;
28493 while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
28494 != CODE_FOR_nothing)
28495 move_mode = GET_MODE_WIDER_MODE (move_mode);
28497 /* Find the corresponding vector mode with the same size as MOVE_MODE.
28498 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
28499 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
28501 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
28502 move_mode = mode_for_vector (word_mode, nunits);
28503 if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
28504 move_mode = word_mode;
28506 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
28507 break;
28508 case rep_prefix_8_byte:
28509 move_mode = DImode;
28510 break;
28511 case rep_prefix_4_byte:
28512 move_mode = SImode;
28513 break;
28514 case rep_prefix_1_byte:
28515 move_mode = QImode;
28516 break;
28518 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
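/* E.g. a 64-bit unrolled_loop copy uses DImode moves with unroll factor 4,
   so SIZE_NEEDED is 32 here (illustrative). */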
28519 epilogue_size_needed = size_needed;
28521 /* If we are going to make any library calls conditionally, make sure any
28522 pending stack adjustments happen before the first conditional branch;
28523 otherwise they will be emitted only before the library call and won't
28524 happen on the other branches. */
28525 if (dynamic_check != -1)
28526 do_pending_stack_adjust ();
28528 desired_align = decide_alignment (align, alg, expected_size, move_mode);
28529 if (!TARGET_ALIGN_STRINGOPS || noalign)
28530 align = desired_align;
28532 /* Step 1: Prologue guard. */
28534 /* Alignment code needs count to be in register. */
28535 if (CONST_INT_P (count_exp) && desired_align > align)
28537 if (INTVAL (count_exp) > desired_align
28538 && INTVAL (count_exp) > size_needed)
28540 align_bytes
28541 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
28542 if (align_bytes <= 0)
28543 align_bytes = 0;
28544 else
28545 align_bytes = desired_align - align_bytes;
28547 if (align_bytes == 0)
28548 count_exp = force_reg (counter_mode (count_exp), count_exp);
28550 gcc_assert (desired_align >= 1 && align >= 1);
28552 /* Misaligned move sequences handle both prologue and epilogue at once.
28553 Default code generation results in smaller code for large alignments
28554 and also avoids redundant work when sizes are known precisely. */
28555 misaligned_prologue_used
28556 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
28557 && MAX (desired_align, epilogue_size_needed) <= 32
28558 && desired_align <= epilogue_size_needed
28559 && ((desired_align > align && !align_bytes)
28560 || (!count && epilogue_size_needed > 1)));
28562 /* Do the cheap promotion to allow better CSE across the
28563 main loop and epilogue (i.e. one load of the big constant in
28564 front of all the code).
28565 For now the misaligned move sequences do not have a fast path
28566 without broadcasting. */
28567 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
28569 if (alg == vector_loop)
28571 gcc_assert (val_exp == const0_rtx);
28572 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
28573 promoted_val = promote_duplicated_reg_to_size (val_exp,
28574 GET_MODE_SIZE (word_mode),
28575 desired_align, align);
28577 else
28579 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
28580 desired_align, align);
28583 /* Misaligned move sequences handle both prologues and epilogues at once.
28584 Default code generation results in smaller code for large alignments and
28585 also avoids redundant work when sizes are known precisely. */
28586 if (misaligned_prologue_used)
28588 /* The misaligned move prologue handles small blocks by itself. */
28589 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
28590 (dst, src, &destreg, &srcreg,
28591 move_mode, promoted_val, vec_promoted_val,
28592 &count_exp,
28593 &jump_around_label,
28594 desired_align < align
28595 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
28596 desired_align, align, &min_size, dynamic_check, issetmem);
28597 if (!issetmem)
28598 src = change_address (src, BLKmode, srcreg);
28599 dst = change_address (dst, BLKmode, destreg);
28600 set_mem_align (dst, desired_align * BITS_PER_UNIT);
28601 epilogue_size_needed = 0;
28602 if (need_zero_guard
28603 && min_size < (unsigned HOST_WIDE_INT) size_needed)
28605 /* It is possible that we copied enough so the main loop will not
28606 execute. */
28607 gcc_assert (size_needed > 1);
28608 if (jump_around_label == NULL_RTX)
28609 jump_around_label = gen_label_rtx ();
28610 emit_cmp_and_jump_insns (count_exp,
28611 GEN_INT (size_needed),
28612 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
28613 if (expected_size == -1
28614 || expected_size < (desired_align - align) / 2 + size_needed)
28615 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28616 else
28617 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28620 /* Ensure that alignment prologue won't copy past end of block. */
28621 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
28623 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
28624 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
28625 Make sure it is a power of 2. */
28626 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
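/* E.g. assuming SIZE_NEEDED == 32 and DESIRED_ALIGN - ALIGN <= 31, the
   value 31 computed above is rounded up to the power of two 32
   (illustrative). */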
28628 /* To improve performance of small blocks, we jump around the VAL
28629 promoting code. This means that if the promoted VAL is not constant,
28630 we might not use it in the epilogue and have to use the byte
28631 loop variant. */
28632 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
28633 force_loopy_epilogue = true;
28634 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28635 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28637 /* If main algorithm works on QImode, no epilogue is needed.
28638 For small sizes just don't align anything. */
28639 if (size_needed == 1)
28640 desired_align = align;
28641 else
28642 goto epilogue;
28644 else if (!count
28645 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28647 label = gen_label_rtx ();
28648 emit_cmp_and_jump_insns (count_exp,
28649 GEN_INT (epilogue_size_needed),
28650 LTU, 0, counter_mode (count_exp), 1, label);
28651 if (expected_size == -1 || expected_size < epilogue_size_needed)
28652 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28653 else
28654 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28658 /* Emit code to decide on runtime whether library call or inline should be
28659 used. */
28660 if (dynamic_check != -1)
28662 if (!issetmem && CONST_INT_P (count_exp))
28664 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
28666 emit_block_copy_via_libcall (dst, src, count_exp);
28667 count_exp = const0_rtx;
28668 goto epilogue;
28671 else
28673 rtx_code_label *hot_label = gen_label_rtx ();
28674 if (jump_around_label == NULL_RTX)
28675 jump_around_label = gen_label_rtx ();
28676 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
28677 LEU, 0, counter_mode (count_exp),
28678 1, hot_label);
28679 predict_jump (REG_BR_PROB_BASE * 90 / 100);
28680 if (issetmem)
28681 set_storage_via_libcall (dst, count_exp, val_exp);
28682 else
28683 emit_block_copy_via_libcall (dst, src, count_exp);
28684 emit_jump (jump_around_label);
28685 emit_label (hot_label);
28689 /* Step 2: Alignment prologue. */
28690 /* Do the expensive promotion once we branched off the small blocks. */
28691 if (issetmem && !promoted_val)
28692 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
28693 desired_align, align);
28695 if (desired_align > align && !misaligned_prologue_used)
28697 if (align_bytes == 0)
28699 /* Except for the first move in the prologue, we no longer know
28700 the constant offset in aliasing info. It doesn't seem worth
28701 the pain to maintain it for the first move, so throw away
28702 the info early. */
28703 dst = change_address (dst, BLKmode, destreg);
28704 if (!issetmem)
28705 src = change_address (src, BLKmode, srcreg);
28706 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
28707 promoted_val, vec_promoted_val,
28708 count_exp, align, desired_align,
28709 issetmem);
28710 /* At most desired_align - align bytes are copied. */
28711 if (min_size < (unsigned)(desired_align - align))
28712 min_size = 0;
28713 else
28714 min_size -= desired_align - align;
28716 else
28718 /* If we know how many bytes need to be stored before dst is
28719 sufficiently aligned, maintain aliasing info accurately. */
28720 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
28721 srcreg,
28722 promoted_val,
28723 vec_promoted_val,
28724 desired_align,
28725 align_bytes,
28726 issetmem);
28728 count_exp = plus_constant (counter_mode (count_exp),
28729 count_exp, -align_bytes);
28730 count -= align_bytes;
28731 min_size -= align_bytes;
28732 max_size -= align_bytes;
28734 if (need_zero_guard
28735 && min_size < (unsigned HOST_WIDE_INT) size_needed
28736 && (count < (unsigned HOST_WIDE_INT) size_needed
28737 || (align_bytes == 0
28738 && count < ((unsigned HOST_WIDE_INT) size_needed
28739 + desired_align - align))))
28741 /* It is possible that we copied enough so the main loop will not
28742 execute. */
28743 gcc_assert (size_needed > 1);
28744 if (label == NULL_RTX)
28745 label = gen_label_rtx ();
28746 emit_cmp_and_jump_insns (count_exp,
28747 GEN_INT (size_needed),
28748 LTU, 0, counter_mode (count_exp), 1, label);
28749 if (expected_size == -1
28750 || expected_size < (desired_align - align) / 2 + size_needed)
28751 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28752 else
28753 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28756 if (label && size_needed == 1)
28758 emit_label (label);
28759 LABEL_NUSES (label) = 1;
28760 label = NULL;
28761 epilogue_size_needed = 1;
28762 if (issetmem)
28763 promoted_val = val_exp;
28765 else if (label == NULL_RTX && !misaligned_prologue_used)
28766 epilogue_size_needed = size_needed;
28768 /* Step 3: Main loop. */
28770 switch (alg)
28772 case libcall:
28773 case no_stringop:
28774 case last_alg:
28775 gcc_unreachable ();
28776 case loop_1_byte:
28777 case loop:
28778 case unrolled_loop:
28779 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
28780 count_exp, move_mode, unroll_factor,
28781 expected_size, issetmem);
28782 break;
28783 case vector_loop:
28784 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
28785 vec_promoted_val, count_exp, move_mode,
28786 unroll_factor, expected_size, issetmem);
28787 break;
28788 case rep_prefix_8_byte:
28789 case rep_prefix_4_byte:
28790 case rep_prefix_1_byte:
28791 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
28792 val_exp, count_exp, move_mode, issetmem);
28793 break;
28795 /* Properly adjust the offset of src and dest memory for aliasing. */
28796 if (CONST_INT_P (count_exp))
28798 if (!issetmem)
28799 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
28800 (count / size_needed) * size_needed);
28801 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
28802 (count / size_needed) * size_needed);
28804 else
28806 if (!issetmem)
28807 src = change_address (src, BLKmode, srcreg);
28808 dst = change_address (dst, BLKmode, destreg);
28811 /* Step 4: Epilogue to copy the remaining bytes. */
28812 epilogue:
28813 if (label)
28815 /* When the main loop is done, COUNT_EXP might hold original count,
28816 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
28817 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
28818 bytes. Compensate if needed. */
28820 if (size_needed < epilogue_size_needed)
28822 tmp =
28823 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
28824 GEN_INT (size_needed - 1), count_exp, 1,
28825 OPTAB_DIRECT);
28826 if (tmp != count_exp)
28827 emit_move_insn (count_exp, tmp);
28829 emit_label (label);
28830 LABEL_NUSES (label) = 1;
28833 if (count_exp != const0_rtx && epilogue_size_needed > 1)
28835 if (force_loopy_epilogue)
28836 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
28837 epilogue_size_needed);
28838 else
28840 if (issetmem)
28841 expand_setmem_epilogue (dst, destreg, promoted_val,
28842 vec_promoted_val, count_exp,
28843 epilogue_size_needed);
28844 else
28845 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
28846 epilogue_size_needed);
28849 if (jump_around_label)
28850 emit_label (jump_around_label);
28851 return true;
28855 /* Expand the appropriate insns for doing strlen if not just doing
28856 repnz; scasb
28858 out = result, initialized with the start address
28859 align_rtx = alignment of the address.
28860 scratch = scratch register, initialized with the start address when
28861 not aligned, otherwise undefined
28863 This is just the body. It needs the initializations mentioned above and
28864 some address computing at the end. These things are done in i386.md. */
28866 static void
28867 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
28869 int align;
28870 rtx tmp;
28871 rtx_code_label *align_2_label = NULL;
28872 rtx_code_label *align_3_label = NULL;
28873 rtx_code_label *align_4_label = gen_label_rtx ();
28874 rtx_code_label *end_0_label = gen_label_rtx ();
28875 rtx mem;
28876 rtx tmpreg = gen_reg_rtx (SImode);
28877 rtx scratch = gen_reg_rtx (SImode);
28878 rtx cmp;
28880 align = 0;
28881 if (CONST_INT_P (align_rtx))
28882 align = INTVAL (align_rtx);
28884 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
28886 /* Is there a known alignment and is it less than 4? */
28887 if (align < 4)
28889 rtx scratch1 = gen_reg_rtx (Pmode);
28890 emit_move_insn (scratch1, out);
28891 /* Is there a known alignment and is it not 2? */
28892 if (align != 2)
28894 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
28895 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
28897 /* Leave just the 3 lower bits. */
28898 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
28899 NULL_RTX, 0, OPTAB_WIDEN);
28901 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28902 Pmode, 1, align_4_label);
28903 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
28904 Pmode, 1, align_2_label);
28905 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
28906 Pmode, 1, align_3_label);
28908 else
28910 /* Since the alignment is 2, we have to check 2 or 0 bytes;
28911 check whether it is aligned to a 4-byte boundary. */
28913 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
28914 NULL_RTX, 0, OPTAB_WIDEN);
28916 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28917 Pmode, 1, align_4_label);
28920 mem = change_address (src, QImode, out);
28922 /* Now compare the bytes. */
28924 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
28925 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
28926 QImode, 1, end_0_label);
28928 /* Increment the address. */
28929 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28931 /* Not needed with an alignment of 2 */
28932 if (align != 2)
28934 emit_label (align_2_label);
28936 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28937 end_0_label);
28939 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28941 emit_label (align_3_label);
28944 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28945 end_0_label);
28947 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28950 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
28951 align this loop; it only makes programs bigger and does not help
28952 speed them up. */
28953 emit_label (align_4_label);
28955 mem = change_address (src, SImode, out);
28956 emit_move_insn (scratch, mem);
28957 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
28959 /* This formula yields a nonzero result iff one of the bytes is zero.
28960 This saves three branches inside the loop and many cycles. */
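/* In C terms the computation below is
   (x - 0x01010101) & ~x & 0x80808080,
   which is nonzero iff some byte of x is zero; e.g. x == 0x11003344
   gives 0x00800000, while x == 0x11223344 gives 0 (illustrative). */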
28962 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
28963 emit_insn (gen_one_cmplsi2 (scratch, scratch));
28964 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
28965 emit_insn (gen_andsi3 (tmpreg, tmpreg,
28966 gen_int_mode (0x80808080, SImode)));
28967 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
28968 align_4_label);
28970 if (TARGET_CMOVE)
28972 rtx reg = gen_reg_rtx (SImode);
28973 rtx reg2 = gen_reg_rtx (Pmode);
28974 emit_move_insn (reg, tmpreg);
28975 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
28977 /* If zero is not in the first two bytes, move two bytes forward. */
28978 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28979 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28980 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28981 emit_insn (gen_rtx_SET (tmpreg,
28982 gen_rtx_IF_THEN_ELSE (SImode, tmp,
28983 reg,
28984 tmpreg)));
28985 /* Emit lea manually to avoid clobbering of flags. */
28986 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
28988 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28989 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28990 emit_insn (gen_rtx_SET (out,
28991 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
28992 reg2,
28993 out)));
28995 else
28997 rtx_code_label *end_2_label = gen_label_rtx ();
28998 /* Is zero in the first two bytes? */
29000 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
29001 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
29002 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
29003 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
29004 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
29005 pc_rtx);
29006 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
29007 JUMP_LABEL (tmp) = end_2_label;
29009 /* Not in the first two. Move two bytes forward. */
29010 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
29011 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
29013 emit_label (end_2_label);
29017 /* Avoid branch in fixing the byte. */
29018 tmpreg = gen_lowpart (QImode, tmpreg);
29019 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
29020 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
29021 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
29022 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
29024 emit_label (end_0_label);
29027 /* Expand strlen. */
29029 bool
29030 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
29032 rtx addr, scratch1, scratch2, scratch3, scratch4;
29034 /* The generic case of the strlen expander is long. Avoid
29035 expanding it unless TARGET_INLINE_ALL_STRINGOPS. */
29037 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
29038 && !TARGET_INLINE_ALL_STRINGOPS
29039 && !optimize_insn_for_size_p ()
29040 && (!CONST_INT_P (align) || INTVAL (align) < 4))
29041 return false;
29043 addr = force_reg (Pmode, XEXP (src, 0));
29044 scratch1 = gen_reg_rtx (Pmode);
29046 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
29047 && !optimize_insn_for_size_p ())
29049 /* Well it seems that some optimizer does not combine a call like
29050 foo(strlen(bar), strlen(bar));
29051 when the move and the subtraction are done here. It does calculate
29052 the length just once when these instructions are done inside
29053 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
29054 often used and I use one fewer register for the lifetime of
29055 output_strlen_unroll(), this is better. */
29057 emit_move_insn (out, addr);
29059 ix86_expand_strlensi_unroll_1 (out, src, align);
29061 /* strlensi_unroll_1 returns the address of the zero at the end of
29062 the string, like memchr(), so compute the length by subtracting
29063 the start address. */
29064 emit_insn (ix86_gen_sub3 (out, out, addr));
29066 else
29068 rtx unspec;
29070 /* Can't use this if the user has appropriated eax, ecx, or edi. */
29071 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
29072 return false;
29073 /* Can't use this for non-default address spaces. */
29074 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
29075 return false;
29077 scratch2 = gen_reg_rtx (Pmode);
29078 scratch3 = gen_reg_rtx (Pmode);
29079 scratch4 = force_reg (Pmode, constm1_rtx);
29081 emit_move_insn (scratch3, addr);
29082 eoschar = force_reg (QImode, eoschar);
29084 src = replace_equiv_address_nv (src, scratch3);
29086 /* If .md starts supporting :P, this can be done in .md. */
29087 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
29088 scratch4), UNSPEC_SCAS);
29089 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
29090 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
29091 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
29093 return true;
29096 /* For a given symbol (function) construct code to compute the address of
29097 its PLT entry in the large x86-64 PIC model. */
29098 static rtx
29099 construct_plt_address (rtx symbol)
29101 rtx tmp, unspec;
29103 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
29104 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
29105 gcc_assert (Pmode == DImode);
29107 tmp = gen_reg_rtx (Pmode);
29108 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
29110 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
29111 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
29112 return tmp;
29116 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
29117 rtx callarg2,
29118 rtx pop, bool sibcall)
29120 rtx vec[3];
29121 rtx use = NULL, call;
29122 unsigned int vec_len = 0;
29123 tree fndecl;
29125 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
29127 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
29128 if (fndecl
29129 && (lookup_attribute ("interrupt",
29130 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
29131 error ("interrupt service routine can't be called directly");
29133 else
29134 fndecl = NULL_TREE;
29136 if (pop == const0_rtx)
29137 pop = NULL;
29138 gcc_assert (!TARGET_64BIT || !pop);
29140 if (TARGET_MACHO && !TARGET_64BIT)
29142 #if TARGET_MACHO
29143 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
29144 fnaddr = machopic_indirect_call_target (fnaddr);
29145 #endif
29147 else
29149 /* Static functions and indirect calls don't need the pic register. Also,
29150 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
29151 it an indirect call. */
29152 rtx addr = XEXP (fnaddr, 0);
29153 if (flag_pic
29154 && GET_CODE (addr) == SYMBOL_REF
29155 && !SYMBOL_REF_LOCAL_P (addr))
29157 if (flag_plt
29158 && (SYMBOL_REF_DECL (addr) == NULL_TREE
29159 || !lookup_attribute ("noplt",
29160 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
29162 if (!TARGET_64BIT
29163 || (ix86_cmodel == CM_LARGE_PIC
29164 && DEFAULT_ABI != MS_ABI))
29166 use_reg (&use, gen_rtx_REG (Pmode,
29167 REAL_PIC_OFFSET_TABLE_REGNUM));
29168 if (ix86_use_pseudo_pic_reg ())
29169 emit_move_insn (gen_rtx_REG (Pmode,
29170 REAL_PIC_OFFSET_TABLE_REGNUM),
29171 pic_offset_table_rtx);
29174 else if (!TARGET_PECOFF && !TARGET_MACHO)
29176 if (TARGET_64BIT)
29178 fnaddr = gen_rtx_UNSPEC (Pmode,
29179 gen_rtvec (1, addr),
29180 UNSPEC_GOTPCREL);
29181 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
29183 else
29185 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
29186 UNSPEC_GOT);
29187 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
29188 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
29189 fnaddr);
29191 fnaddr = gen_const_mem (Pmode, fnaddr);
29192 /* Pmode may not be the same as word_mode for x32, which
29193 doesn't support indirect branch via 32-bit memory slot.
29194 Since x32 GOT slot is 64 bit with zero upper 32 bits,
29195 indirect branch via x32 GOT slot is OK. */
29196 if (GET_MODE (fnaddr) != word_mode)
29197 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
29198 fnaddr = gen_rtx_MEM (QImode, fnaddr);
29203 /* Skip setting up RAX register for -mskip-rax-setup when there are no
29204 parameters passed in vector registers. */
29205 if (TARGET_64BIT
29206 && (INTVAL (callarg2) > 0
29207 || (INTVAL (callarg2) == 0
29208 && (TARGET_SSE || !flag_skip_rax_setup))))
29210 rtx al = gen_rtx_REG (QImode, AX_REG);
29211 emit_move_insn (al, callarg2);
29212 use_reg (&use, al);
29215 if (ix86_cmodel == CM_LARGE_PIC
29216 && !TARGET_PECOFF
29217 && MEM_P (fnaddr)
29218 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
29219 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
29220 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
29221 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
29222 branch via x32 GOT slot is OK. */
29223 else if (!(TARGET_X32
29224 && MEM_P (fnaddr)
29225 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
29226 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
29227 && (sibcall
29228 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
29229 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
29231 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
29232 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
29235 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
29237 if (retval)
29239 /* We should add bounds as a destination register in case a
29240 pointer with bounds may be returned. */
29241 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
29243 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
29244 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
29245 if (GET_CODE (retval) == PARALLEL)
29247 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
29248 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
29249 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
29250 retval = chkp_join_splitted_slot (retval, par);
29252 else
29254 retval = gen_rtx_PARALLEL (VOIDmode,
29255 gen_rtvec (3, retval, b0, b1));
29256 chkp_put_regs_to_expr_list (retval);
29260 call = gen_rtx_SET (retval, call);
29262 vec[vec_len++] = call;
29264 if (pop)
29266 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
29267 pop = gen_rtx_SET (stack_pointer_rtx, pop);
29268 vec[vec_len++] = pop;
29271 if (cfun->machine->no_caller_saved_registers
29272 && (!fndecl
29273 || (!TREE_THIS_VOLATILE (fndecl)
29274 && !lookup_attribute ("no_caller_saved_registers",
29275 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
29277 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
29278 bool is_64bit_ms_abi = (TARGET_64BIT
29279 && ix86_function_abi (fndecl) == MS_ABI);
29280 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
29282 /* If there are no caller-saved registers, add all registers
29283 that are clobbered by the call which returns. */
29284 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
29285 if (!fixed_regs[i]
29286 && (ix86_call_used_regs[i] == 1
29287 || (ix86_call_used_regs[i] & c_mask))
29288 && !STACK_REGNO_P (i)
29289 && !MMX_REGNO_P (i))
29290 clobber_reg (&use,
29291 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
29293 else if (TARGET_64BIT_MS_ABI
29294 && (!callarg2 || INTVAL (callarg2) != -2))
29296 unsigned i;
29298 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
29300 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
29301 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
29303 clobber_reg (&use, gen_rtx_REG (mode, regno));
29306 /* Set here, but it may get cleared later. */
29307 if (TARGET_CALL_MS2SYSV_XLOGUES)
29309 if (!TARGET_SSE)
29312 /* Don't break hot-patched functions. */
29313 else if (ix86_function_ms_hook_prologue (current_function_decl))
29316 /* TODO: Cases not yet examined. */
29317 else if (flag_split_stack)
29318 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
29320 else
29322 gcc_assert (!reload_completed);
29323 cfun->machine->call_ms2sysv = true;
29328 if (vec_len > 1)
29329 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
29330 call = emit_call_insn (call);
29331 if (use)
29332 CALL_INSN_FUNCTION_USAGE (call) = use;
29334 return call;
29337 /* Return true if the function being called was marked with attribute
29338 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
29339 to handle the non-PIC case in the backend because there is no easy
29340 interface for the front-end to force non-PLT calls to use the GOT.
29341 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
29342 to call the function marked "noplt" indirectly. */
29344 static bool
29345 ix86_nopic_noplt_attribute_p (rtx call_op)
29347 if (flag_pic || ix86_cmodel == CM_LARGE
29348 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
29349 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
29350 || SYMBOL_REF_LOCAL_P (call_op))
29351 return false;
29353 tree symbol_decl = SYMBOL_REF_DECL (call_op);
29355 if (!flag_plt
29356 || (symbol_decl != NULL_TREE
29357 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
29358 return true;
29360 return false;
29363 /* Output the assembly for a call instruction. */
29365 const char *
29366 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
29368 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
29369 bool seh_nop_p = false;
29370 const char *xasm;
29372 if (SIBLING_CALL_P (insn))
29374 if (direct_p)
29376 if (ix86_nopic_noplt_attribute_p (call_op))
29378 if (TARGET_64BIT)
29379 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29380 else
29381 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29383 else
29384 xasm = "%!jmp\t%P0";
29386 /* SEH epilogue detection requires the indirect branch case
29387 to include REX.W. */
29388 else if (TARGET_SEH)
29389 xasm = "%!rex.W jmp\t%A0";
29390 else
29391 xasm = "%!jmp\t%A0";
29393 output_asm_insn (xasm, &call_op);
29394 return "";
29397 /* SEH unwinding can require an extra nop to be emitted in several
29398 circumstances. Determine if we have one of those. */
29399 if (TARGET_SEH)
29401 rtx_insn *i;
29403 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
29405 /* If we get to another real insn, we don't need the nop. */
29406 if (INSN_P (i))
29407 break;
29409 /* If we get to the epilogue note, prevent a catch region from
29410 being adjacent to the standard epilogue sequence. If non-call
29411 exceptions are enabled, we'll have done this during epilogue emission. */
29412 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
29413 && !flag_non_call_exceptions
29414 && !can_throw_internal (insn))
29416 seh_nop_p = true;
29417 break;
29421 /* If we didn't find a real insn following the call, prevent the
29422 unwinder from looking into the next function. */
29423 if (i == NULL)
29424 seh_nop_p = true;
29427 if (direct_p)
29429 if (ix86_nopic_noplt_attribute_p (call_op))
29431 if (TARGET_64BIT)
29432 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29433 else
29434 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29436 else
29437 xasm = "%!call\t%P0";
29439 else
29440 xasm = "%!call\t%A0";
29442 output_asm_insn (xasm, &call_op);
29444 if (seh_nop_p)
29445 return "nop";
29447 return "";
29450 /* Clear stack slot assignments remembered from previous functions.
29451 This is called from INIT_EXPANDERS once before RTL is emitted for each
29452 function. */
29454 static struct machine_function *
29455 ix86_init_machine_status (void)
29457 struct machine_function *f;
29459 f = ggc_cleared_alloc<machine_function> ();
29460 f->call_abi = ix86_abi;
29462 return f;
29465 /* Return a MEM corresponding to a stack slot with mode MODE.
29466 Allocate a new slot if necessary.
29468 The RTL for a function can have several slots available: N is
29469 which slot to use. */
29472 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
29474 struct stack_local_entry *s;
29476 gcc_assert (n < MAX_386_STACK_LOCALS);
29478 for (s = ix86_stack_locals; s; s = s->next)
29479 if (s->mode == mode && s->n == n)
29480 return validize_mem (copy_rtx (s->rtl));
29482 s = ggc_alloc<stack_local_entry> ();
29483 s->n = n;
29484 s->mode = mode;
29485 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
29487 s->next = ix86_stack_locals;
29488 ix86_stack_locals = s;
29489 return validize_mem (copy_rtx (s->rtl));
29492 static void
29493 ix86_instantiate_decls (void)
29495 struct stack_local_entry *s;
29497 for (s = ix86_stack_locals; s; s = s->next)
29498 if (s->rtl != NULL_RTX)
29499 instantiate_decl_rtl (s->rtl);
29502 /* Return the number used for encoding REG, in the range 0..7. */
29504 static int
29505 reg_encoded_number (rtx reg)
29507 unsigned regno = REGNO (reg);
29508 switch (regno)
29510 case AX_REG:
29511 return 0;
29512 case CX_REG:
29513 return 1;
29514 case DX_REG:
29515 return 2;
29516 case BX_REG:
29517 return 3;
29518 case SP_REG:
29519 return 4;
29520 case BP_REG:
29521 return 5;
29522 case SI_REG:
29523 return 6;
29524 case DI_REG:
29525 return 7;
29526 default:
29527 break;
29529 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
29530 return regno - FIRST_STACK_REG;
29531 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
29532 return regno - FIRST_SSE_REG;
29533 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
29534 return regno - FIRST_MMX_REG;
29535 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
29536 return regno - FIRST_REX_SSE_REG;
29537 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
29538 return regno - FIRST_REX_INT_REG;
29539 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
29540 return regno - FIRST_MASK_REG;
29541 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
29542 return regno - FIRST_BND_REG;
29543 return -1;
29546 /* Given an insn INSN with NOPERANDS OPERANDS, return the modr/m byte used
29547 in its encoding if it could be relevant for ROP mitigation, otherwise
29548 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
29549 used for calculating it into them. */
29551 static int
29552 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
29553 int *popno0 = 0, int *popno1 = 0)
29555 if (asm_noperands (PATTERN (insn)) >= 0)
29556 return -1;
29557 int has_modrm = get_attr_modrm (insn);
29558 if (!has_modrm)
29559 return -1;
29560 enum attr_modrm_class cls = get_attr_modrm_class (insn);
29561 rtx op0, op1;
29562 switch (cls)
29564 case MODRM_CLASS_OP02:
29565 gcc_assert (noperands >= 3);
29566 if (popno0)
29568 *popno0 = 0;
29569 *popno1 = 2;
29571 op0 = operands[0];
29572 op1 = operands[2];
29573 break;
29574 case MODRM_CLASS_OP01:
29575 gcc_assert (noperands >= 2);
29576 if (popno0)
29578 *popno0 = 0;
29579 *popno1 = 1;
29581 op0 = operands[0];
29582 op1 = operands[1];
29583 break;
29584 default:
29585 return -1;
29587 if (REG_P (op0) && REG_P (op1))
29589 int enc0 = reg_encoded_number (op0);
29590 int enc1 = reg_encoded_number (op1);
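/* Register-to-register operands use mod == 11, so e.g. op1 == %eax and
   op0 == %ecx yield the modrm byte 0xc1 (illustrative). */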
29591 return 0xc0 + (enc1 << 3) + enc0;
29593 return -1;
29596 /* Check whether x86 address PARTS is a pc-relative address. */
29598 static bool
29599 rip_relative_addr_p (struct ix86_address *parts)
29601 rtx base, index, disp;
29603 base = parts->base;
29604 index = parts->index;
29605 disp = parts->disp;
29607 if (disp && !base && !index)
29609 if (TARGET_64BIT)
29611 rtx symbol = disp;
29613 if (GET_CODE (disp) == CONST)
29614 symbol = XEXP (disp, 0);
29615 if (GET_CODE (symbol) == PLUS
29616 && CONST_INT_P (XEXP (symbol, 1)))
29617 symbol = XEXP (symbol, 0);
29619 if (GET_CODE (symbol) == LABEL_REF
29620 || (GET_CODE (symbol) == SYMBOL_REF
29621 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
29622 || (GET_CODE (symbol) == UNSPEC
29623 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
29624 || XINT (symbol, 1) == UNSPEC_PCREL
29625 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
29626 return true;
29629 return false;
29632 /* Calculate the length of the memory address in the instruction encoding.
29633 Includes addr32 prefix, does not include the one-byte modrm, opcode,
29634 or other prefixes. We never generate addr32 prefix for LEA insn. */
29637 memory_address_length (rtx addr, bool lea)
29639 struct ix86_address parts;
29640 rtx base, index, disp;
29641 int len;
29642 int ok;
29644 if (GET_CODE (addr) == PRE_DEC
29645 || GET_CODE (addr) == POST_INC
29646 || GET_CODE (addr) == PRE_MODIFY
29647 || GET_CODE (addr) == POST_MODIFY)
29648 return 0;
29650 ok = ix86_decompose_address (addr, &parts);
29651 gcc_assert (ok);
29653 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
29655 /* If this is not LEA instruction, add the length of addr32 prefix. */
29656 if (TARGET_64BIT && !lea
29657 && (SImode_address_operand (addr, VOIDmode)
29658 || (parts.base && GET_MODE (parts.base) == SImode)
29659 || (parts.index && GET_MODE (parts.index) == SImode)))
29660 len++;
29662 base = parts.base;
29663 index = parts.index;
29664 disp = parts.disp;
29666 if (base && SUBREG_P (base))
29667 base = SUBREG_REG (base);
29668 if (index && SUBREG_P (index))
29669 index = SUBREG_REG (index);
29671 gcc_assert (base == NULL_RTX || REG_P (base));
29672 gcc_assert (index == NULL_RTX || REG_P (index));
29674 /* Rule of thumb:
29675 - esp as the base always wants an index,
29676 - ebp as the base always wants a displacement,
29677 - r12 as the base always wants an index,
29678 - r13 as the base always wants a displacement. */
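/* For example, (%esp) needs a SIB byte and (%ebp) needs a zero
   displacement byte, so both cost one byte more than (%eax)
   (illustrative). */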
29680 /* Register Indirect. */
29681 if (base && !index && !disp)
29683 /* esp (for its index) and ebp (for its displacement) need
29684 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
29685 code. */
29686 if (base == arg_pointer_rtx
29687 || base == frame_pointer_rtx
29688 || REGNO (base) == SP_REG
29689 || REGNO (base) == BP_REG
29690 || REGNO (base) == R12_REG
29691 || REGNO (base) == R13_REG)
29692 len++;
29695 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
29696 is not disp32, but disp32(%rip), so for disp32
29697 SIB byte is needed, unless print_operand_address
29698 optimizes it into disp32(%rip) or (%rip) is implied
29699 by UNSPEC. */
29700 else if (disp && !base && !index)
29702 len += 4;
29703 if (!rip_relative_addr_p (&parts))
29704 len++;
29706 else
29708 /* Find the length of the displacement constant. */
29709 if (disp)
29711 if (base && satisfies_constraint_K (disp))
29712 len += 1;
29713 else
29714 len += 4;
29716 /* ebp always wants a displacement. Similarly r13. */
29717 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
29718 len++;
29720 /* An index requires the two-byte modrm form.... */
29721 if (index
29722 /* ...like esp (or r12), which always wants an index. */
29723 || base == arg_pointer_rtx
29724 || base == frame_pointer_rtx
29725 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
29726 len++;
29729 return len;
29732 /* Compute the default value for the "length_immediate" attribute. When
29733 SHORTFORM is set, expect that the insn has an 8-bit immediate alternative. */
29735 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
29737 int len = 0;
29738 int i;
29739 extract_insn_cached (insn);
29740 for (i = recog_data.n_operands - 1; i >= 0; --i)
29741 if (CONSTANT_P (recog_data.operand[i]))
29743 enum attr_mode mode = get_attr_mode (insn);
29745 gcc_assert (!len);
29746 if (shortform && CONST_INT_P (recog_data.operand[i]))
29748 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
29749 switch (mode)
29751 case MODE_QI:
29752 len = 1;
29753 continue;
29754 case MODE_HI:
29755 ival = trunc_int_for_mode (ival, HImode);
29756 break;
29757 case MODE_SI:
29758 ival = trunc_int_for_mode (ival, SImode);
29759 break;
29760 default:
29761 break;
29763 if (IN_RANGE (ival, -128, 127))
29765 len = 1;
29766 continue;
29769 switch (mode)
29771 case MODE_QI:
29772 len = 1;
29773 break;
29774 case MODE_HI:
29775 len = 2;
29776 break;
29777 case MODE_SI:
29778 len = 4;
29779 break;
29780 /* Immediates for DImode instructions are encoded
29781 as 32-bit sign-extended values. */
29782 case MODE_DI:
29783 len = 4;
29784 break;
29785 default:
29786 fatal_insn ("unknown insn mode", insn);
29789 return len;
29792 /* Compute default value for "length_address" attribute. */
29794 ix86_attr_length_address_default (rtx_insn *insn)
29796 int i;
29798 if (get_attr_type (insn) == TYPE_LEA)
29800 rtx set = PATTERN (insn), addr;
29802 if (GET_CODE (set) == PARALLEL)
29803 set = XVECEXP (set, 0, 0);
29805 gcc_assert (GET_CODE (set) == SET);
29807 addr = SET_SRC (set);
29809 return memory_address_length (addr, true);
29812 extract_insn_cached (insn);
29813 for (i = recog_data.n_operands - 1; i >= 0; --i)
29815 rtx op = recog_data.operand[i];
29816 if (MEM_P (op))
29818 constrain_operands_cached (insn, reload_completed);
29819 if (which_alternative != -1)
29821 const char *constraints = recog_data.constraints[i];
29822 int alt = which_alternative;
29824 while (*constraints == '=' || *constraints == '+')
29825 constraints++;
29826 while (alt-- > 0)
29827 while (*constraints++ != ',')
29829 /* Skip ignored operands. */
29830 if (*constraints == 'X')
29831 continue;
29834 int len = memory_address_length (XEXP (op, 0), false);
29836 /* Account for segment prefix for non-default addr spaces. */
29837 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
29838 len++;
29840 return len;
29843 return 0;
29846 /* Compute default value for "length_vex" attribute. It includes
29847 2 or 3 byte VEX prefix and 1 opcode byte. */
29850 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
29851 bool has_vex_w)
29853 int i;
29855 /* Only the 0f opcode can use the 2-byte VEX prefix, and the VEX.W bit
29856 uses the 3-byte VEX prefix. */
29857 if (!has_0f_opcode || has_vex_w)
29858 return 3 + 1;
29860 /* We can always use 2 byte VEX prefix in 32bit. */
29861 if (!TARGET_64BIT)
29862 return 2 + 1;
29864 extract_insn_cached (insn);
29866 for (i = recog_data.n_operands - 1; i >= 0; --i)
29867 if (REG_P (recog_data.operand[i]))
29869 /* REX.W bit uses 3 byte VEX prefix. */
29870 if (GET_MODE (recog_data.operand[i]) == DImode
29871 && GENERAL_REG_P (recog_data.operand[i]))
29872 return 3 + 1;
29874 else
29876 /* REX.X or REX.B bits use 3 byte VEX prefix. */
29877 if (MEM_P (recog_data.operand[i])
29878 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
29879 return 3 + 1;
29882 return 2 + 1;
29885 /* Return the maximum number of instructions a cpu can issue. */
29887 static int
29888 ix86_issue_rate (void)
29890 switch (ix86_tune)
29892 case PROCESSOR_PENTIUM:
29893 case PROCESSOR_LAKEMONT:
29894 case PROCESSOR_BONNELL:
29895 case PROCESSOR_SILVERMONT:
29896 case PROCESSOR_KNL:
29897 case PROCESSOR_INTEL:
29898 case PROCESSOR_K6:
29899 case PROCESSOR_BTVER2:
29900 case PROCESSOR_PENTIUM4:
29901 case PROCESSOR_NOCONA:
29902 return 2;
29904 case PROCESSOR_PENTIUMPRO:
29905 case PROCESSOR_ATHLON:
29906 case PROCESSOR_K8:
29907 case PROCESSOR_AMDFAM10:
29908 case PROCESSOR_GENERIC:
29909 case PROCESSOR_BTVER1:
29910 return 3;
29912 case PROCESSOR_BDVER1:
29913 case PROCESSOR_BDVER2:
29914 case PROCESSOR_BDVER3:
29915 case PROCESSOR_BDVER4:
29916 case PROCESSOR_ZNVER1:
29917 case PROCESSOR_CORE2:
29918 case PROCESSOR_NEHALEM:
29919 case PROCESSOR_SANDYBRIDGE:
29920 case PROCESSOR_HASWELL:
29921 return 4;
29923 default:
29924 return 1;
29928 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags
29929 set by DEP_INSN and nothing else set by DEP_INSN. */
29931 static bool
29932 ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
29934 rtx set, set2;
29936 /* Simplify the test for uninteresting insns. */
29937 if (insn_type != TYPE_SETCC
29938 && insn_type != TYPE_ICMOV
29939 && insn_type != TYPE_FCMOV
29940 && insn_type != TYPE_IBR)
29941 return false;
29943 if ((set = single_set (dep_insn)) != 0)
29945 set = SET_DEST (set);
29946 set2 = NULL_RTX;
29948 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
29949 && XVECLEN (PATTERN (dep_insn), 0) == 2
29950 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
29951 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
29953 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
29954 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
29956 else
29957 return false;
29959 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
29960 return false;
29962 /* This test is true if the dependent insn reads the flags but
29963 not any other potentially set register. */
29964 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
29965 return false;
29967 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
29968 return false;
29970 return true;
29973 /* Return true iff USE_INSN has a memory address with operands set by
29974 SET_INSN. */
29976 bool
29977 ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
29979 int i;
29980 extract_insn_cached (use_insn);
29981 for (i = recog_data.n_operands - 1; i >= 0; --i)
29982 if (MEM_P (recog_data.operand[i]))
29984 rtx addr = XEXP (recog_data.operand[i], 0);
29985 if (modified_in_p (addr, set_insn) != 0)
29987 /* No AGI stall if SET_INSN is a push or pop and USE_INSN
29988 has SP based memory (unless index reg is modified in a pop). */
29989 rtx set = single_set (set_insn);
29990 if (set
29991 && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
29992 || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
29994 struct ix86_address parts;
29995 if (ix86_decompose_address (addr, &parts)
29996 && parts.base == stack_pointer_rtx
29997 && (parts.index == NULL_RTX
29998 || MEM_P (SET_DEST (set))
29999 || !modified_in_p (parts.index, set_insn)))
30000 return false;
30002 return true;
30004 return false;
30006 return false;
30009 /* Helper function for exact_store_load_dependency.
30010 Return true if addr is found in insn. */
30011 static bool
30012 exact_dependency_1 (rtx addr, rtx insn)
30014 enum rtx_code code;
30015 const char *format_ptr;
30016 int i, j;
30018 code = GET_CODE (insn);
30019 switch (code)
30021 case MEM:
30022 if (rtx_equal_p (addr, insn))
30023 return true;
30024 break;
30025 case REG:
30026 CASE_CONST_ANY:
30027 case SYMBOL_REF:
30028 case CODE_LABEL:
30029 case PC:
30030 case CC0:
30031 case EXPR_LIST:
30032 return false;
30033 default:
30034 break;
30037 format_ptr = GET_RTX_FORMAT (code);
30038 for (i = 0; i < GET_RTX_LENGTH (code); i++)
30040 switch (*format_ptr++)
30042 case 'e':
30043 if (exact_dependency_1 (addr, XEXP (insn, i)))
30044 return true;
30045 break;
30046 case 'E':
30047 for (j = 0; j < XVECLEN (insn, i); j++)
30048 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
30049 return true;
30050 break;
30053 return false;
30056 /* Return true if there exists exact dependency for store & load, i.e.
30057 the same memory address is used in them. */
30058 static bool
30059 exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
30061 rtx set1, set2;
30063 set1 = single_set (store);
30064 if (!set1)
30065 return false;
30066 if (!MEM_P (SET_DEST (set1)))
30067 return false;
30068 set2 = single_set (load);
30069 if (!set2)
30070 return false;
30071 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
30072 return true;
30073 return false;
30076 static int
30077 ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
30078 unsigned int)
30080 enum attr_type insn_type, dep_insn_type;
30081 enum attr_memory memory;
30082 rtx set, set2;
30083 int dep_insn_code_number;
30085 /* Anti and output dependencies have zero cost on all CPUs. */
30086 if (dep_type != 0)
30087 return 0;
30089 dep_insn_code_number = recog_memoized (dep_insn);
30091 /* If we can't recognize the insns, we can't really do anything. */
30092 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
30093 return cost;
30095 insn_type = get_attr_type (insn);
30096 dep_insn_type = get_attr_type (dep_insn);
30098 switch (ix86_tune)
30100 case PROCESSOR_PENTIUM:
30101 case PROCESSOR_LAKEMONT:
30102 /* Address Generation Interlock adds a cycle of latency. */
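/* E.g. a load from 4(%eax) issued right after an instruction that writes
   %eax incurs an extra cycle on these cores (illustrative). */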
30103 if (insn_type == TYPE_LEA)
30105 rtx addr = PATTERN (insn);
30107 if (GET_CODE (addr) == PARALLEL)
30108 addr = XVECEXP (addr, 0, 0);
30110 gcc_assert (GET_CODE (addr) == SET);
30112 addr = SET_SRC (addr);
30113 if (modified_in_p (addr, dep_insn))
30114 cost += 1;
30116 else if (ix86_agi_dependent (dep_insn, insn))
30117 cost += 1;
30119 /* ??? Compares pair with jump/setcc. */
30120 if (ix86_flags_dependent (insn, dep_insn, insn_type))
30121 cost = 0;
30123 /* Floating point stores require value to be ready one cycle earlier. */
30124 if (insn_type == TYPE_FMOV
30125 && get_attr_memory (insn) == MEMORY_STORE
30126 && !ix86_agi_dependent (dep_insn, insn))
30127 cost += 1;
30128 break;
30130 case PROCESSOR_PENTIUMPRO:
30131 /* INT->FP conversion is expensive. */
30132 if (get_attr_fp_int_src (dep_insn))
30133 cost += 5;
30135 /* There is one cycle extra latency between an FP op and a store. */
30136 if (insn_type == TYPE_FMOV
30137 && (set = single_set (dep_insn)) != NULL_RTX
30138 && (set2 = single_set (insn)) != NULL_RTX
30139 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
30140 && MEM_P (SET_DEST (set2)))
30141 cost += 1;
30143 memory = get_attr_memory (insn);
30145 /* Show the ability of the reorder buffer to hide the latency of a load
30146 by executing it in parallel with the previous instruction when the
30147 previous instruction is not needed to compute the address. */
30148 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30149 && !ix86_agi_dependent (dep_insn, insn))
30151 /* Claim moves to take one cycle, as the core can issue one load
30152 at a time and the next load can start a cycle later. */
30153 if (dep_insn_type == TYPE_IMOV
30154 || dep_insn_type == TYPE_FMOV)
30155 cost = 1;
30156 else if (cost > 1)
30157 cost--;
30159 break;
30161 case PROCESSOR_K6:
30162 /* The esp dependency is resolved before
30163 the instruction is really finished. */
30164 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
30165 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
30166 return 1;
30168 /* INT->FP conversion is expensive. */
30169 if (get_attr_fp_int_src (dep_insn))
30170 cost += 5;
30172 memory = get_attr_memory (insn);
30174 /* Show the ability of the reorder buffer to hide the latency of a load
30175 by executing it in parallel with the previous instruction when the
30176 previous instruction is not needed to compute the address. */
30177 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30178 && !ix86_agi_dependent (dep_insn, insn))
30181 /* Claim moves to take one cycle, as the core can issue one load
30182 at a time and the next load can start a cycle later. */
30182 if (dep_insn_type == TYPE_IMOV
30183 || dep_insn_type == TYPE_FMOV)
30184 cost = 1;
30185 else if (cost > 2)
30186 cost -= 2;
30187 else
30188 cost = 1;
30190 break;
30192 case PROCESSOR_AMDFAM10:
30193 case PROCESSOR_BDVER1:
30194 case PROCESSOR_BDVER2:
30195 case PROCESSOR_BDVER3:
30196 case PROCESSOR_BDVER4:
30197 case PROCESSOR_ZNVER1:
30198 case PROCESSOR_BTVER1:
30199 case PROCESSOR_BTVER2:
30200 case PROCESSOR_GENERIC:
30201 /* The stack engine allows push&pop instructions to execute in parallel. */
30202 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
30203 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
30204 return 0;
30205 /* FALLTHRU */
30207 case PROCESSOR_ATHLON:
30208 case PROCESSOR_K8:
30209 memory = get_attr_memory (insn);
30211 /* Show the ability of the reorder buffer to hide the latency of a load
30212 by executing it in parallel with the previous instruction when the
30213 previous instruction is not needed to compute the address. */
30214 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30215 && !ix86_agi_dependent (dep_insn, insn))
30217 enum attr_unit unit = get_attr_unit (insn);
30218 int loadcost = 3;
30220 /* Because of the difference between the length of integer and
30221 floating unit pipeline preparation stages, the memory operands
30222 for floating point are cheaper.
30224 ??? For Athlon the difference is most probably 2. */
30225 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
30226 loadcost = 3;
30227 else
30228 loadcost = TARGET_ATHLON ? 2 : 0;
30230 if (cost >= loadcost)
30231 cost -= loadcost;
30232 else
30233 cost = 0;
30235 break;
30237 case PROCESSOR_CORE2:
30238 case PROCESSOR_NEHALEM:
30239 case PROCESSOR_SANDYBRIDGE:
30240 case PROCESSOR_HASWELL:
30241 /* The stack engine allows push&pop instructions to execute in parallel. */
30242 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
30243 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
30244 return 0;
30246 memory = get_attr_memory (insn);
30248 /* Show the ability of the reorder buffer to hide the latency of a load
30249 by executing it in parallel with the previous instruction when the
30250 previous instruction is not needed to compute the address. */
30251 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30252 && !ix86_agi_dependent (dep_insn, insn))
30254 if (cost >= 4)
30255 cost -= 4;
30256 else
30257 cost = 0;
30259 break;
30261 case PROCESSOR_SILVERMONT:
30262 case PROCESSOR_KNL:
30263 case PROCESSOR_INTEL:
30264 if (!reload_completed)
30265 return cost;
30267 /* Increase cost of integer loads. */
30268 memory = get_attr_memory (dep_insn);
30269 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30271 enum attr_unit unit = get_attr_unit (dep_insn);
30272 if (unit == UNIT_INTEGER && cost == 1)
30274 if (memory == MEMORY_LOAD)
30275 cost = 3;
30276 else
30278 /* Increase cost of ld/st for short int types only
30279 because of the store-forwarding issue. */
30280 rtx set = single_set (dep_insn);
30281 if (set && (GET_MODE (SET_DEST (set)) == QImode
30282 || GET_MODE (SET_DEST (set)) == HImode))
30284 /* Increase cost of store/load insn if exact
30285 dependence exists and it is a load insn. */
30286 enum attr_memory insn_memory = get_attr_memory (insn);
30287 if (insn_memory == MEMORY_LOAD
30288 && exact_store_load_dependency (dep_insn, insn))
30289 cost = 3;
30295 default:
30296 break;
30299 return cost;
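/* A rough worked example of the Athlon/K8 adjustment above (illustrative
   only): assume the dependence cost between the producer and an integer
   consumer that also reads memory is 5, and the producer does not feed
   the address computation.  The reorder buffer can overlap the load with
   the producer, so the cost is reduced by loadcost (3 for the integer
   unit), leaving 2; for an FP consumer the reduction would instead be 2
   on Athlon and 0 on K8.  */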
30302 /* How many alternative schedules to try. This should be as wide as the
30303 scheduling freedom in the DFA, but no wider. Making this value too
30304 large results in extra work for the scheduler. */
30306 static int
30307 ia32_multipass_dfa_lookahead (void)
30309 switch (ix86_tune)
30311 case PROCESSOR_PENTIUM:
30312 case PROCESSOR_LAKEMONT:
30313 return 2;
30315 case PROCESSOR_PENTIUMPRO:
30316 case PROCESSOR_K6:
30317 return 1;
30319 case PROCESSOR_BDVER1:
30320 case PROCESSOR_BDVER2:
30321 case PROCESSOR_BDVER3:
30322 case PROCESSOR_BDVER4:
30323 /* We use lookahead value 4 for BD both before and after reload
30324 schedules. The plan is to include value 8 for -O3. */
30325 return 4;
30327 case PROCESSOR_CORE2:
30328 case PROCESSOR_NEHALEM:
30329 case PROCESSOR_SANDYBRIDGE:
30330 case PROCESSOR_HASWELL:
30331 case PROCESSOR_BONNELL:
30332 case PROCESSOR_SILVERMONT:
30333 case PROCESSOR_KNL:
30334 case PROCESSOR_INTEL:
30335 /* Generally, we want haifa-sched:max_issue() to look ahead as far
30336 as the number of instructions that can be executed in a cycle, i.e.,
30337 issue_rate. I wonder why tuning for many CPUs does not do this. */
30338 if (reload_completed)
30339 return ix86_issue_rate ();
30340 /* Don't use lookahead for pre-reload schedule to save compile time. */
30341 return 0;
30343 default:
30344 return 0;
30348 /* Return true if the target platform supports macro-fusion. */
30350 static bool
30351 ix86_macro_fusion_p ()
30353 return TARGET_FUSE_CMP_AND_BRANCH;
30356 /* Check whether the current microarchitecture supports macro fusion
30357 for the insn pair "CONDGEN + CONDJMP". Refer to
30358 "Intel Architectures Optimization Reference Manual". */
30360 static bool
30361 ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
30363 rtx src, dest;
30364 enum rtx_code ccode;
30365 rtx compare_set = NULL_RTX, test_if, cond;
30366 rtx alu_set = NULL_RTX, addr = NULL_RTX;
30368 if (!any_condjump_p (condjmp))
30369 return false;
30371 unsigned int condreg1, condreg2;
30372 rtx cc_reg_1;
30373 ix86_fixed_condition_code_regs (&condreg1, &condreg2);
30374 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
30375 if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
30376 || !condgen
30377 || !modified_in_p (cc_reg_1, condgen))
30378 return false;
30380 if (get_attr_type (condgen) != TYPE_TEST
30381 && get_attr_type (condgen) != TYPE_ICMP
30382 && get_attr_type (condgen) != TYPE_INCDEC
30383 && get_attr_type (condgen) != TYPE_ALU)
30384 return false;
30386 compare_set = single_set (condgen);
30387 if (compare_set == NULL_RTX
30388 && !TARGET_FUSE_ALU_AND_BRANCH)
30389 return false;
30391 if (compare_set == NULL_RTX)
30393 int i;
30394 rtx pat = PATTERN (condgen);
30395 for (i = 0; i < XVECLEN (pat, 0); i++)
30396 if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
30398 rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
30399 if (GET_CODE (set_src) == COMPARE)
30400 compare_set = XVECEXP (pat, 0, i);
30401 else
30402 alu_set = XVECEXP (pat, 0, i);
30405 if (compare_set == NULL_RTX)
30406 return false;
30407 src = SET_SRC (compare_set);
30408 if (GET_CODE (src) != COMPARE)
30409 return false;
30411 /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
30412 supported. */
30413 if ((MEM_P (XEXP (src, 0))
30414 && CONST_INT_P (XEXP (src, 1)))
30415 || (MEM_P (XEXP (src, 1))
30416 && CONST_INT_P (XEXP (src, 0))))
30417 return false;
30419 /* No fusion for RIP-relative addresses. */
30420 if (MEM_P (XEXP (src, 0)))
30421 addr = XEXP (XEXP (src, 0), 0);
30422 else if (MEM_P (XEXP (src, 1)))
30423 addr = XEXP (XEXP (src, 1), 0);
30425 if (addr) {
30426 ix86_address parts;
30427 int ok = ix86_decompose_address (addr, &parts);
30428 gcc_assert (ok);
30430 if (rip_relative_addr_p (&parts))
30431 return false;
30434 test_if = SET_SRC (pc_set (condjmp));
30435 cond = XEXP (test_if, 0);
30436 ccode = GET_CODE (cond);
30437 /* Check whether the conditional jump uses the Sign or Overflow flags. */
30438 if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
30439 && (ccode == GE
30440 || ccode == GT
30441 || ccode == LE
30442 || ccode == LT))
30443 return false;
30445 /* Return true for TYPE_TEST and TYPE_ICMP. */
30446 if (get_attr_type (condgen) == TYPE_TEST
30447 || get_attr_type (condgen) == TYPE_ICMP)
30448 return true;
30450 /* The following handles the case of macro-fusion for alu + jmp. */
30451 if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
30452 return false;
30454 /* No fusion for an alu op with a memory destination operand. */
30455 dest = SET_DEST (alu_set);
30456 if (MEM_P (dest))
30457 return false;
30459 /* Macro-fusion for inc/dec + unsigned conditional jump is not
30460 supported. */
30461 if (get_attr_type (condgen) == TYPE_INCDEC
30462 && (ccode == GEU
30463 || ccode == GTU
30464 || ccode == LEU
30465 || ccode == LTU))
30466 return false;
30468 return true;
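/* A sketch of the pairs accepted above, assuming AT&T syntax and a core
   with TARGET_FUSE_CMP_AND_BRANCH:

       cmpl  %esi, %edi        # TYPE_ICMP with register operands
       jne   .L3               # fuses with the cmp into one macro-op

   whereas "cmpl $0, 16(%rsp); jne .L3" (a MEM-IMM compare), a compare of a
   RIP-relative memory operand, or an inc/dec followed by an unsigned jump
   such as jb, is rejected by the checks above.  */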
30471 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
30472 execution. It is applied if
30473 (1) an IMUL instruction is on the top of the list;
30474 (2) there is exactly one producer of an independent IMUL instruction in
30475 the ready list.
30476 Return the index of the IMUL producer if it was found and -1 otherwise. */
30477 static int
30478 do_reorder_for_imul (rtx_insn **ready, int n_ready)
30480 rtx_insn *insn;
30481 rtx set, insn1, insn2;
30482 sd_iterator_def sd_it;
30483 dep_t dep;
30484 int index = -1;
30485 int i;
30487 if (!TARGET_BONNELL)
30488 return index;
30490 /* Check that an IMUL instruction is on the top of the ready list. */
30491 insn = ready[n_ready - 1];
30492 set = single_set (insn);
30493 if (!set)
30494 return index;
30495 if (!(GET_CODE (SET_SRC (set)) == MULT
30496 && GET_MODE (SET_SRC (set)) == SImode))
30497 return index;
30499 /* Search for producer of independent IMUL instruction. */
30500 for (i = n_ready - 2; i >= 0; i--)
30502 insn = ready[i];
30503 if (!NONDEBUG_INSN_P (insn))
30504 continue;
30505 /* Skip IMUL instruction. */
30506 insn2 = PATTERN (insn);
30507 if (GET_CODE (insn2) == PARALLEL)
30508 insn2 = XVECEXP (insn2, 0, 0);
30509 if (GET_CODE (insn2) == SET
30510 && GET_CODE (SET_SRC (insn2)) == MULT
30511 && GET_MODE (SET_SRC (insn2)) == SImode)
30512 continue;
30514 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
30516 rtx con;
30517 con = DEP_CON (dep);
30518 if (!NONDEBUG_INSN_P (con))
30519 continue;
30520 insn1 = PATTERN (con);
30521 if (GET_CODE (insn1) == PARALLEL)
30522 insn1 = XVECEXP (insn1, 0, 0);
30524 if (GET_CODE (insn1) == SET
30525 && GET_CODE (SET_SRC (insn1)) == MULT
30526 && GET_MODE (SET_SRC (insn1)) == SImode)
30528 sd_iterator_def sd_it1;
30529 dep_t dep1;
30530 /* Check that the IMUL has no producer other than INSN. */
30531 index = i;
30532 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
30534 rtx pro;
30535 pro = DEP_PRO (dep1);
30536 if (!NONDEBUG_INSN_P (pro))
30537 continue;
30538 if (pro != insn)
30539 index = -1;
30541 if (index >= 0)
30542 break;
30545 if (index >= 0)
30546 break;
30548 return index;
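/* A minimal sketch of the effect, assuming Bonnell: if the ready list is
   { ..., P, ..., imul1 } with the SImode multiply imul1 on top, and P is
   the sole producer of a second, independent SImode multiply, the caller
   moves P to the top of the list so that P issues first and the two
   multiplies can overlap in the pipelined IMUL unit.  */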
30551 /* Try to find the best candidate at the top of the ready list if two insns
30552 have the same priority - the candidate is best if its producers were
30553 scheduled earlier. Applied for Silvermont only.
30554 Return true if the top 2 insns must be interchanged. */
30555 static bool
30556 swap_top_of_ready_list (rtx_insn **ready, int n_ready)
30558 rtx_insn *top = ready[n_ready - 1];
30559 rtx_insn *next = ready[n_ready - 2];
30560 rtx set;
30561 sd_iterator_def sd_it;
30562 dep_t dep;
30563 int clock1 = -1;
30564 int clock2 = -1;
30565 #define INSN_TICK(INSN) (HID (INSN)->tick)
30567 if (!TARGET_SILVERMONT && !TARGET_INTEL)
30568 return false;
30570 if (!NONDEBUG_INSN_P (top))
30571 return false;
30572 if (!NONJUMP_INSN_P (top))
30573 return false;
30574 if (!NONDEBUG_INSN_P (next))
30575 return false;
30576 if (!NONJUMP_INSN_P (next))
30577 return false;
30578 set = single_set (top);
30579 if (!set)
30580 return false;
30581 set = single_set (next);
30582 if (!set)
30583 return false;
30585 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
30587 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
30588 return false;
30589 /* Determine the winner more precisely. */
30590 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
30592 rtx pro;
30593 pro = DEP_PRO (dep);
30594 if (!NONDEBUG_INSN_P (pro))
30595 continue;
30596 if (INSN_TICK (pro) > clock1)
30597 clock1 = INSN_TICK (pro);
30599 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
30601 rtx pro;
30602 pro = DEP_PRO (dep);
30603 if (!NONDEBUG_INSN_P (pro))
30604 continue;
30605 if (INSN_TICK (pro) > clock2)
30606 clock2 = INSN_TICK (pro);
30609 if (clock1 == clock2)
30611 /* Determine the winner - a load must win. */
30612 enum attr_memory memory1, memory2;
30613 memory1 = get_attr_memory (top);
30614 memory2 = get_attr_memory (next);
30615 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
30616 return true;
30618 return (bool) (clock2 < clock1);
30620 return false;
30621 #undef INSN_TICK
30624 /* Perform possible reordering of the ready list for Atom/Silvermont only.
30625 Return issue rate. */
30626 static int
30627 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx_insn **ready,
30628 int *pn_ready, int clock_var)
30630 int issue_rate = -1;
30631 int n_ready = *pn_ready;
30632 int i;
30633 rtx_insn *insn;
30634 int index = -1;
30636 /* Set up issue rate. */
30637 issue_rate = ix86_issue_rate ();
30639 /* Do reordering for BONNELL/SILVERMONT only. */
30640 if (!TARGET_BONNELL && !TARGET_SILVERMONT && !TARGET_INTEL)
30641 return issue_rate;
30643 /* Nothing to do if ready list contains only 1 instruction. */
30644 if (n_ready <= 1)
30645 return issue_rate;
30647 /* Do reordering for the post-reload scheduler only. */
30648 if (!reload_completed)
30649 return issue_rate;
30651 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
30653 if (sched_verbose > 1)
30654 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
30655 INSN_UID (ready[index]));
30657 /* Put IMUL producer (ready[index]) at the top of ready list. */
30658 insn = ready[index];
30659 for (i = index; i < n_ready - 1; i++)
30660 ready[i] = ready[i + 1];
30661 ready[n_ready - 1] = insn;
30662 return issue_rate;
30665 /* Skip selective scheduling since HID is not populated in it. */
30666 if (clock_var != 0
30667 && !sel_sched_p ()
30668 && swap_top_of_ready_list (ready, n_ready))
30670 if (sched_verbose > 1)
30671 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
30672 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
30673 /* Swap 2 top elements of ready list. */
30674 insn = ready[n_ready - 1];
30675 ready[n_ready - 1] = ready[n_ready - 2];
30676 ready[n_ready - 2] = insn;
30678 return issue_rate;
30681 static bool
30682 ix86_class_likely_spilled_p (reg_class_t);
30684 /* Return true if the lhs of INSN is a HW function argument register and set
30685 IS_SPILLED to true if it is a likely spilled HW register. */
30686 static bool
30687 insn_is_function_arg (rtx insn, bool* is_spilled)
30689 rtx dst;
30691 if (!NONDEBUG_INSN_P (insn))
30692 return false;
30693 /* Call instructions are not movable, ignore them. */
30694 if (CALL_P (insn))
30695 return false;
30696 insn = PATTERN (insn);
30697 if (GET_CODE (insn) == PARALLEL)
30698 insn = XVECEXP (insn, 0, 0);
30699 if (GET_CODE (insn) != SET)
30700 return false;
30701 dst = SET_DEST (insn);
30702 if (REG_P (dst) && HARD_REGISTER_P (dst)
30703 && ix86_function_arg_regno_p (REGNO (dst)))
30705 /* Is it a likely spilled HW register? */
30706 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
30707 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
30708 *is_spilled = true;
30709 return true;
30711 return false;
30714 /* Add output dependencies for the chain of adjacent function arguments, but
30715 only if there is a move to a likely spilled HW register. Return the first
30716 argument if at least one dependence was added or NULL otherwise. */
30717 static rtx_insn *
30718 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
30720 rtx_insn *insn;
30721 rtx_insn *last = call;
30722 rtx_insn *first_arg = NULL;
30723 bool is_spilled = false;
30725 head = PREV_INSN (head);
30727 /* Find the argument-passing instruction nearest to the call. */
30728 while (true)
30730 last = PREV_INSN (last);
30731 if (last == head)
30732 return NULL;
30733 if (!NONDEBUG_INSN_P (last))
30734 continue;
30735 if (insn_is_function_arg (last, &is_spilled))
30736 break;
30737 return NULL;
30740 first_arg = last;
30741 while (true)
30743 insn = PREV_INSN (last);
30744 if (!INSN_P (insn))
30745 break;
30746 if (insn == head)
30747 break;
30748 if (!NONDEBUG_INSN_P (insn))
30750 last = insn;
30751 continue;
30753 if (insn_is_function_arg (insn, &is_spilled))
30755 /* Add an output dependence between two function arguments if the chain
30756 of output arguments contains likely spilled HW registers. */
30757 if (is_spilled)
30758 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
30759 first_arg = last = insn;
30761 else
30762 break;
30764 if (!is_spilled)
30765 return NULL;
30766 return first_arg;
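/* A rough example of the chains this targets (illustrative, shown as the
   equivalent assembly):

       movl  $1, %edi          # argument moves into likely spilled
       movl  $2, %esi          # hard registers just before the call
       call  foo

   When such a chain is found, output dependencies are recorded between the
   argument moves so the pre-reload scheduler keeps the chain intact next
   to the call rather than moving the moves apart.  */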
30769 /* Add output or anti dependency from insn to first_arg to restrict its code
30770 motion. */
30771 static void
30772 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
30774 rtx set;
30775 rtx tmp;
30777 /* Add anti dependencies for bounds stores. */
30778 if (INSN_P (insn)
30779 && GET_CODE (PATTERN (insn)) == PARALLEL
30780 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
30781 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
30783 add_dependence (first_arg, insn, REG_DEP_ANTI);
30784 return;
30787 set = single_set (insn);
30788 if (!set)
30789 return;
30790 tmp = SET_DEST (set);
30791 if (REG_P (tmp))
30793 /* Add output dependency to the first function argument. */
30794 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
30795 return;
30797 /* Add anti dependency. */
30798 add_dependence (first_arg, insn, REG_DEP_ANTI);
30801 /* Avoid cross-block motion of a function argument by adding a dependency
30802 from the first non-jump instruction in bb. */
30803 static void
30804 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
30806 rtx_insn *insn = BB_END (bb);
30808 while (insn)
30810 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
30812 rtx set = single_set (insn);
30813 if (set)
30815 avoid_func_arg_motion (arg, insn);
30816 return;
30819 if (insn == BB_HEAD (bb))
30820 return;
30821 insn = PREV_INSN (insn);
30825 /* Hook for pre-reload schedule - avoid motion of function arguments
30826 passed in likely spilled HW registers. */
30827 static void
30828 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
30830 rtx_insn *insn;
30831 rtx_insn *first_arg = NULL;
30832 if (reload_completed)
30833 return;
30834 while (head != tail && DEBUG_INSN_P (head))
30835 head = NEXT_INSN (head);
30836 for (insn = tail; insn != head; insn = PREV_INSN (insn))
30837 if (INSN_P (insn) && CALL_P (insn))
30839 first_arg = add_parameter_dependencies (insn, head);
30840 if (first_arg)
30842 /* Add a dependee for the first argument to predecessors, but only if the
30843 region contains more than one block. */
30844 basic_block bb = BLOCK_FOR_INSN (insn);
30845 int rgn = CONTAINING_RGN (bb->index);
30846 int nr_blks = RGN_NR_BLOCKS (rgn);
30847 /* Skip trivial regions and region head blocks that can have
30848 predecessors outside of region. */
30849 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
30851 edge e;
30852 edge_iterator ei;
30854 /* Regions are SCCs with the exception of selective
30855 scheduling with pipelining of outer blocks enabled.
30856 So also check that immediate predecessors of a non-head
30857 block are in the same region. */
30858 FOR_EACH_EDGE (e, ei, bb->preds)
30860 /* Avoid creating loop-carried dependencies by using the
30861 topological ordering in the region. */
30862 if (rgn == CONTAINING_RGN (e->src->index)
30863 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
30864 add_dependee_for_func_arg (first_arg, e->src);
30867 insn = first_arg;
30868 if (insn == head)
30869 break;
30872 else if (first_arg)
30873 avoid_func_arg_motion (first_arg, insn);
30876 /* Hook for pre-reload schedule - set priority of moves from likely spilled
30877 HW registers to maximum, to schedule them as soon as possible. These are
30878 moves from function argument registers at the top of the function entry
30879 and moves from function return value registers after a call. */
30880 static int
30881 ix86_adjust_priority (rtx_insn *insn, int priority)
30883 rtx set;
30885 if (reload_completed)
30886 return priority;
30888 if (!NONDEBUG_INSN_P (insn))
30889 return priority;
30891 set = single_set (insn);
30892 if (set)
30894 rtx tmp = SET_SRC (set);
30895 if (REG_P (tmp)
30896 && HARD_REGISTER_P (tmp)
30897 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
30898 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
30899 return current_sched_info->sched_max_insns_priority;
30902 return priority;
30905 /* Model decoder of Core 2/i7.
30906 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
30907 track the instruction fetch block boundaries and make sure that long
30908 (9+ bytes) instructions are assigned to D0. */
30910 /* Maximum length of an insn that can be handled by
30911 a secondary decoder unit. '8' for Core 2/i7. */
30912 static int core2i7_secondary_decoder_max_insn_size;
30914 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
30915 '16' for Core 2/i7. */
30916 static int core2i7_ifetch_block_size;
30918 /* Maximum number of instructions decoder can handle per cycle.
30919 '6' for Core 2/i7. */
30920 static int core2i7_ifetch_block_max_insns;
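/* A worked example under the Core 2/i7 defaults (8, 16, 6): if 12 bytes of
   the current 16-byte fetch block are already consumed, a 7-byte insn is
   filtered out for this cycle because 12 + 7 > 16; likewise a 9-byte insn
   is only accepted as the first (D0) insn of a cycle, since it exceeds the
   8-byte limit of the secondary decoders.  */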
30922 typedef struct ix86_first_cycle_multipass_data_ *
30923 ix86_first_cycle_multipass_data_t;
30924 typedef const struct ix86_first_cycle_multipass_data_ *
30925 const_ix86_first_cycle_multipass_data_t;
30927 /* A variable to store target state across calls to max_issue within
30928 one cycle. */
30929 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
30930 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
30932 /* Initialize DATA. */
30933 static void
30934 core2i7_first_cycle_multipass_init (void *_data)
30936 ix86_first_cycle_multipass_data_t data
30937 = (ix86_first_cycle_multipass_data_t) _data;
30939 data->ifetch_block_len = 0;
30940 data->ifetch_block_n_insns = 0;
30941 data->ready_try_change = NULL;
30942 data->ready_try_change_size = 0;
30945 /* Advancing the cycle; reset ifetch block counts. */
30946 static void
30947 core2i7_dfa_post_advance_cycle (void)
30949 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
30951 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
30953 data->ifetch_block_len = 0;
30954 data->ifetch_block_n_insns = 0;
30957 static int min_insn_size (rtx_insn *);
30959 /* Filter out insns from ready_try that the core will not be able to issue
30960 on the current cycle due to decoder restrictions. */
30961 static void
30962 core2i7_first_cycle_multipass_filter_ready_try
30963 (const_ix86_first_cycle_multipass_data_t data,
30964 signed char *ready_try, int n_ready, bool first_cycle_insn_p)
30966 while (n_ready--)
30968 rtx_insn *insn;
30969 int insn_size;
30971 if (ready_try[n_ready])
30972 continue;
30974 insn = get_ready_element (n_ready);
30975 insn_size = min_insn_size (insn);
30977 if (/* If this is too long an insn for a secondary decoder ... */
30978 (!first_cycle_insn_p
30979 && insn_size > core2i7_secondary_decoder_max_insn_size)
30980 /* ... or it would not fit into the ifetch block ... */
30981 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
30982 /* ... or the decoder is full already ... */
30983 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
30984 /* ... mask the insn out. */
30986 ready_try[n_ready] = 1;
30988 if (data->ready_try_change)
30989 bitmap_set_bit (data->ready_try_change, n_ready);
30994 /* Prepare for a new round of multipass lookahead scheduling. */
30995 static void
30996 core2i7_first_cycle_multipass_begin (void *_data,
30997 signed char *ready_try, int n_ready,
30998 bool first_cycle_insn_p)
31000 ix86_first_cycle_multipass_data_t data
31001 = (ix86_first_cycle_multipass_data_t) _data;
31002 const_ix86_first_cycle_multipass_data_t prev_data
31003 = ix86_first_cycle_multipass_data;
31005 /* Restore the state from the end of the previous round. */
31006 data->ifetch_block_len = prev_data->ifetch_block_len;
31007 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
31009 /* Filter instructions that cannot be issued on current cycle due to
31010 decoder restrictions. */
31011 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
31012 first_cycle_insn_p);
31015 /* INSN is being issued in current solution. Account for its impact on
31016 the decoder model. */
31017 static void
31018 core2i7_first_cycle_multipass_issue (void *_data,
31019 signed char *ready_try, int n_ready,
31020 rtx_insn *insn, const void *_prev_data)
31022 ix86_first_cycle_multipass_data_t data
31023 = (ix86_first_cycle_multipass_data_t) _data;
31024 const_ix86_first_cycle_multipass_data_t prev_data
31025 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
31027 int insn_size = min_insn_size (insn);
31029 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
31030 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
31031 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
31032 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
31034 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
31035 if (!data->ready_try_change)
31037 data->ready_try_change = sbitmap_alloc (n_ready);
31038 data->ready_try_change_size = n_ready;
31040 else if (data->ready_try_change_size < n_ready)
31042 data->ready_try_change = sbitmap_resize (data->ready_try_change,
31043 n_ready, 0);
31044 data->ready_try_change_size = n_ready;
31046 bitmap_clear (data->ready_try_change);
31048 /* Filter out insns from ready_try that the core will not be able to issue
31049 on the current cycle due to decoder restrictions. */
31050 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
31051 false);
31054 /* Revert the effect on ready_try. */
31055 static void
31056 core2i7_first_cycle_multipass_backtrack (const void *_data,
31057 signed char *ready_try,
31058 int n_ready ATTRIBUTE_UNUSED)
31060 const_ix86_first_cycle_multipass_data_t data
31061 = (const_ix86_first_cycle_multipass_data_t) _data;
31062 unsigned int i = 0;
31063 sbitmap_iterator sbi;
31065 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
31066 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
31068 ready_try[i] = 0;
31072 /* Save the result of multipass lookahead scheduling for the next round. */
31073 static void
31074 core2i7_first_cycle_multipass_end (const void *_data)
31076 const_ix86_first_cycle_multipass_data_t data
31077 = (const_ix86_first_cycle_multipass_data_t) _data;
31078 ix86_first_cycle_multipass_data_t next_data
31079 = ix86_first_cycle_multipass_data;
31081 if (data != NULL)
31083 next_data->ifetch_block_len = data->ifetch_block_len;
31084 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
31088 /* Deallocate target data. */
31089 static void
31090 core2i7_first_cycle_multipass_fini (void *_data)
31092 ix86_first_cycle_multipass_data_t data
31093 = (ix86_first_cycle_multipass_data_t) _data;
31095 if (data->ready_try_change)
31097 sbitmap_free (data->ready_try_change);
31098 data->ready_try_change = NULL;
31099 data->ready_try_change_size = 0;
31103 /* Prepare for scheduling pass. */
31104 static void
31105 ix86_sched_init_global (FILE *, int, int)
31107 /* Install scheduling hooks for current CPU. Some of these hooks are used
31108 in time-critical parts of the scheduler, so we only set them up when
31109 they are actually used. */
31110 switch (ix86_tune)
31112 case PROCESSOR_CORE2:
31113 case PROCESSOR_NEHALEM:
31114 case PROCESSOR_SANDYBRIDGE:
31115 case PROCESSOR_HASWELL:
31116 /* Do not perform multipass scheduling for pre-reload schedule
31117 to save compile time. */
31118 if (reload_completed)
31120 targetm.sched.dfa_post_advance_cycle
31121 = core2i7_dfa_post_advance_cycle;
31122 targetm.sched.first_cycle_multipass_init
31123 = core2i7_first_cycle_multipass_init;
31124 targetm.sched.first_cycle_multipass_begin
31125 = core2i7_first_cycle_multipass_begin;
31126 targetm.sched.first_cycle_multipass_issue
31127 = core2i7_first_cycle_multipass_issue;
31128 targetm.sched.first_cycle_multipass_backtrack
31129 = core2i7_first_cycle_multipass_backtrack;
31130 targetm.sched.first_cycle_multipass_end
31131 = core2i7_first_cycle_multipass_end;
31132 targetm.sched.first_cycle_multipass_fini
31133 = core2i7_first_cycle_multipass_fini;
31135 /* Set decoder parameters. */
31136 core2i7_secondary_decoder_max_insn_size = 8;
31137 core2i7_ifetch_block_size = 16;
31138 core2i7_ifetch_block_max_insns = 6;
31139 break;
31141 /* Fall through. */
31142 default:
31143 targetm.sched.dfa_post_advance_cycle = NULL;
31144 targetm.sched.first_cycle_multipass_init = NULL;
31145 targetm.sched.first_cycle_multipass_begin = NULL;
31146 targetm.sched.first_cycle_multipass_issue = NULL;
31147 targetm.sched.first_cycle_multipass_backtrack = NULL;
31148 targetm.sched.first_cycle_multipass_end = NULL;
31149 targetm.sched.first_cycle_multipass_fini = NULL;
31150 break;
31155 /* Compute the alignment given to a constant that is being placed in memory.
31156 EXP is the constant and ALIGN is the alignment that the object would
31157 ordinarily have.
31158 The value of this function is used instead of that alignment to align
31159 the object. */
31162 ix86_constant_alignment (tree exp, int align)
31164 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
31165 || TREE_CODE (exp) == INTEGER_CST)
31167 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
31168 return 64;
31169 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
31170 return 128;
31172 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
31173 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
31174 return BITS_PER_WORD;
31176 return align;
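/* For instance, under this function a double (DFmode) constant placed in
   memory with a requested alignment below 64 bits is bumped to 64-bit
   alignment, a 128-bit vector constant to 128 bits, and (unless optimizing
   for size) a string constant whose length is at least 31 is raised to
   BITS_PER_WORD.  */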
31179 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
31180 the data type, and ALIGN is the alignment that the object would
31181 ordinarily have. */
31183 static int
31184 iamcu_alignment (tree type, int align)
31186 machine_mode mode;
31188 if (align < 32 || TYPE_USER_ALIGN (type))
31189 return align;
31191 /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4
31192 bytes. */
31193 mode = TYPE_MODE (strip_array_types (type));
31194 switch (GET_MODE_CLASS (mode))
31196 case MODE_INT:
31197 case MODE_COMPLEX_INT:
31198 case MODE_COMPLEX_FLOAT:
31199 case MODE_FLOAT:
31200 case MODE_DECIMAL_FLOAT:
31201 return 32;
31202 default:
31203 return align;
31207 /* Compute the alignment for a static variable.
31208 TYPE is the data type, and ALIGN is the alignment that
31209 the object would ordinarily have. The value of this function is used
31210 instead of that alignment to align the object. */
31213 ix86_data_alignment (tree type, int align, bool opt)
31215 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
31216 for symbols from other compilation units or symbols that don't need
31217 to bind locally. In order to preserve some ABI compatibility with
31218 those compilers, ensure we don't decrease alignment from what we
31219 used to assume. */
31221 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
31223 /* A data structure, equal or greater than the size of a cache line
31224 (64 bytes in the Pentium 4 and other recent Intel processors, including
31225 processors based on Intel Core microarchitecture) should be aligned
31226 so that its base address is a multiple of a cache line size. */
31228 int max_align
31229 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
31231 if (max_align < BITS_PER_WORD)
31232 max_align = BITS_PER_WORD;
31234 switch (ix86_align_data_type)
31236 case ix86_align_data_type_abi: opt = false; break;
31237 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
31238 case ix86_align_data_type_cacheline: break;
31241 if (TARGET_IAMCU)
31242 align = iamcu_alignment (type, align);
31244 if (opt
31245 && AGGREGATE_TYPE_P (type)
31246 && TYPE_SIZE (type)
31247 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
31249 if (wi::geu_p (TYPE_SIZE (type), max_align_compat)
31250 && align < max_align_compat)
31251 align = max_align_compat;
31252 if (wi::geu_p (TYPE_SIZE (type), max_align)
31253 && align < max_align)
31254 align = max_align;
31257 /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
31258 to a 16-byte boundary. */
31259 if (TARGET_64BIT)
31261 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
31262 && TYPE_SIZE (type)
31263 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
31264 && wi::geu_p (TYPE_SIZE (type), 128)
31265 && align < 128)
31266 return 128;
31269 if (!opt)
31270 return align;
31272 if (TREE_CODE (type) == ARRAY_TYPE)
31274 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
31275 return 64;
31276 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
31277 return 128;
31279 else if (TREE_CODE (type) == COMPLEX_TYPE)
31282 if (TYPE_MODE (type) == DCmode && align < 64)
31283 return 64;
31284 if ((TYPE_MODE (type) == XCmode
31285 || TYPE_MODE (type) == TCmode) && align < 128)
31286 return 128;
31288 else if ((TREE_CODE (type) == RECORD_TYPE
31289 || TREE_CODE (type) == UNION_TYPE
31290 || TREE_CODE (type) == QUAL_UNION_TYPE)
31291 && TYPE_FIELDS (type))
31293 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
31294 return 64;
31295 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
31296 return 128;
31298 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
31299 || TREE_CODE (type) == INTEGER_TYPE)
31301 if (TYPE_MODE (type) == DFmode && align < 64)
31302 return 64;
31303 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
31304 return 128;
31307 return align;
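/* A worked example, assuming a 64-byte prefetch block (so max_align is 512
   bits) and a sufficiently large MAX_OFILE_ALIGNMENT: with OPT set, a
   100-byte static struct gets at least 512-bit (cache line) alignment and a
   40-byte struct at least 256 bits for GCC 4.8 ABI compatibility, while on
   x86-64 an array of at least 16 bytes gets at least 128-bit alignment even
   when OPT is false.  */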
31310 /* Compute the alignment for a local variable or a stack slot. EXP is
31311 the data type or decl itself, MODE is the widest mode available and
31312 ALIGN is the alignment that the object would ordinarily have. The
31313 value of this macro is used instead of that alignment to align the
31314 object. */
31316 unsigned int
31317 ix86_local_alignment (tree exp, machine_mode mode,
31318 unsigned int align)
31320 tree type, decl;
31322 if (exp && DECL_P (exp))
31324 type = TREE_TYPE (exp);
31325 decl = exp;
31327 else
31329 type = exp;
31330 decl = NULL;
31333 /* Don't do dynamic stack realignment for long long objects with
31334 -mpreferred-stack-boundary=2. */
31335 if (!TARGET_64BIT
31336 && align == 64
31337 && ix86_preferred_stack_boundary < 64
31338 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
31339 && (!type || !TYPE_USER_ALIGN (type))
31340 && (!decl || !DECL_USER_ALIGN (decl)))
31341 align = 32;
31343 /* If TYPE is NULL, we are allocating a stack slot for a caller-save
31344 register in MODE. We will return the largest alignment of XF
31345 and DF. */
31346 if (!type)
31348 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
31349 align = GET_MODE_ALIGNMENT (DFmode);
31350 return align;
31353 /* Don't increase alignment for Intel MCU psABI. */
31354 if (TARGET_IAMCU)
31355 return align;
31357 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
31358 to a 16-byte boundary. The exact wording is:
31360 An array uses the same alignment as its elements, except that a local or
31361 global array variable of length at least 16 bytes or
31362 a C99 variable-length array variable always has alignment of at least 16 bytes.
31364 This was added to allow the use of aligned SSE instructions on arrays. The
31365 rule is meant for static storage (where the compiler cannot do the analysis
31366 by itself). We follow it for automatic variables only when it is convenient.
31367 We fully control everything in the function being compiled, and functions
31368 from other units cannot rely on the alignment.
31370 Exclude the va_list type. It is the common case of a local array where
31371 we cannot benefit from the alignment.
31373 TODO: Probably one should optimize for size only when the variable does not escape. */
31374 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
31375 && TARGET_SSE)
31377 if (AGGREGATE_TYPE_P (type)
31378 && (va_list_type_node == NULL_TREE
31379 || (TYPE_MAIN_VARIANT (type)
31380 != TYPE_MAIN_VARIANT (va_list_type_node)))
31381 && TYPE_SIZE (type)
31382 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
31383 && wi::geu_p (TYPE_SIZE (type), 128)
31384 && align < 128)
31385 return 128;
31387 if (TREE_CODE (type) == ARRAY_TYPE)
31389 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
31390 return 64;
31391 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
31392 return 128;
31394 else if (TREE_CODE (type) == COMPLEX_TYPE)
31396 if (TYPE_MODE (type) == DCmode && align < 64)
31397 return 64;
31398 if ((TYPE_MODE (type) == XCmode
31399 || TYPE_MODE (type) == TCmode) && align < 128)
31400 return 128;
31402 else if ((TREE_CODE (type) == RECORD_TYPE
31403 || TREE_CODE (type) == UNION_TYPE
31404 || TREE_CODE (type) == QUAL_UNION_TYPE)
31405 && TYPE_FIELDS (type))
31407 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
31408 return 64;
31409 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
31410 return 128;
31412 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
31413 || TREE_CODE (type) == INTEGER_TYPE)
31416 if (TYPE_MODE (type) == DFmode && align < 64)
31417 return 64;
31418 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
31419 return 128;
31421 return align;
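/* For example, with TARGET_64BIT, SSE enabled and optimizing for speed, a
   local "char buf[32]" is given 128-bit stack alignment by the aggregate
   rule above, while a va_list object of the same size keeps its natural
   alignment; a DFmode local is raised to 64-bit alignment, as for static
   data.  */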
31424 /* Compute the minimum required alignment for dynamic stack realignment
31425 purposes for a local variable, parameter or a stack slot. EXP is
31426 the data type or decl itself, MODE is its mode and ALIGN is the
31427 alignment that the object would ordinarily have. */
31429 unsigned int
31430 ix86_minimum_alignment (tree exp, machine_mode mode,
31431 unsigned int align)
31433 tree type, decl;
31435 if (exp && DECL_P (exp))
31437 type = TREE_TYPE (exp);
31438 decl = exp;
31440 else
31442 type = exp;
31443 decl = NULL;
31446 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
31447 return align;
31449 /* Don't do dynamic stack realignment for long long objects with
31450 -mpreferred-stack-boundary=2. */
31451 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
31452 && (!type || !TYPE_USER_ALIGN (type))
31453 && (!decl || !DECL_USER_ALIGN (decl)))
31455 gcc_checking_assert (!TARGET_STV);
31456 return 32;
31459 return align;
31462 /* Find a location for the static chain incoming to a nested function.
31463 This is a register, unless all free registers are used by arguments. */
31465 static rtx
31466 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
31468 unsigned regno;
31470 /* While this function won't be called by the middle-end when a static
31471 chain isn't needed, it's also used throughout the backend so it's
31472 easiest to keep this check centralized. */
31473 if (DECL_P (fndecl_or_type) && !DECL_STATIC_CHAIN (fndecl_or_type))
31474 return NULL;
31476 if (TARGET_64BIT)
31478 /* We always use R10 in 64-bit mode. */
31479 regno = R10_REG;
31481 else
31483 const_tree fntype, fndecl;
31484 unsigned int ccvt;
31486 /* By default in 32-bit mode we use ECX to pass the static chain. */
31487 regno = CX_REG;
31489 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
31491 fntype = TREE_TYPE (fndecl_or_type);
31492 fndecl = fndecl_or_type;
31494 else
31496 fntype = fndecl_or_type;
31497 fndecl = NULL;
31500 ccvt = ix86_get_callcvt (fntype);
31501 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
31503 /* Fastcall functions use ecx/edx for arguments, which leaves
31504 us with EAX for the static chain.
31505 Thiscall functions use ecx for arguments, which also
31506 leaves us with EAX for the static chain. */
31507 regno = AX_REG;
31509 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
31511 /* Thiscall functions use ecx for arguments, which leaves
31512 us with EAX and EDX for the static chain.
31513 We use EAX for ABI compatibility. */
31514 regno = AX_REG;
31516 else if (ix86_function_regparm (fntype, fndecl) == 3)
31518 /* For regparm 3, we have no free call-clobbered registers in
31519 which to store the static chain. In order to implement this,
31520 we have the trampoline push the static chain to the stack.
31521 However, we can't push a value below the return address when
31522 we call the nested function directly, so we have to use an
31523 alternate entry point. For this we use ESI, and have the
31524 alternate entry point push ESI, so that things appear the
31525 same once we're executing the nested function. */
31526 if (incoming_p)
31528 if (fndecl == current_function_decl
31529 && !ix86_static_chain_on_stack)
31531 gcc_assert (!reload_completed);
31532 ix86_static_chain_on_stack = true;
31534 return gen_frame_mem (SImode,
31535 plus_constant (Pmode,
31536 arg_pointer_rtx, -8));
31538 regno = SI_REG;
31542 return gen_rtx_REG (Pmode, regno);
31545 /* Emit RTL insns to initialize the variable parts of a trampoline.
31546 FNDECL is the decl of the target address; M_TRAMP is a MEM for
31547 the trampoline, and CHAIN_VALUE is an RTX for the static chain
31548 to be passed to the target function. */
31550 static void
31551 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
31553 rtx mem, fnaddr;
31554 int opcode;
31555 int offset = 0;
31557 fnaddr = XEXP (DECL_RTL (fndecl), 0);
31559 if (TARGET_64BIT)
31561 int size;
31563 /* Load the function address into r11. Try to load the address using
31564 the shorter movl instead of movabs. We may want to support
31565 movq for kernel mode, but the kernel does not use trampolines at
31566 the moment. FNADDR is a 32-bit address and may not be in
31567 DImode when ptr_mode == SImode. Always use movl in this
31568 case. */
31569 if (ptr_mode == SImode
31570 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
31572 fnaddr = copy_addr_to_reg (fnaddr);
31574 mem = adjust_address (m_tramp, HImode, offset);
31575 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
31577 mem = adjust_address (m_tramp, SImode, offset + 2);
31578 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
31579 offset += 6;
31581 else
31583 mem = adjust_address (m_tramp, HImode, offset);
31584 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
31586 mem = adjust_address (m_tramp, DImode, offset + 2);
31587 emit_move_insn (mem, fnaddr);
31588 offset += 10;
31591 /* Load static chain using movabs to r10. Use the shorter movl
31592 instead of movabs when ptr_mode == SImode. */
31593 if (ptr_mode == SImode)
31595 opcode = 0xba41;
31596 size = 6;
31598 else
31600 opcode = 0xba49;
31601 size = 10;
31604 mem = adjust_address (m_tramp, HImode, offset);
31605 emit_move_insn (mem, gen_int_mode (opcode, HImode));
31607 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
31608 emit_move_insn (mem, chain_value);
31609 offset += size;
31611 /* Jump to r11; the last (unused) byte is a nop, only there to
31612 pad the write out to a single 32-bit store. */
31613 mem = adjust_address (m_tramp, SImode, offset);
31614 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
31615 offset += 4;
31617 else
31619 rtx disp, chain;
31621 /* Depending on the static chain location, either load a register
31622 with a constant, or push the constant to the stack. All of the
31623 instructions are the same size. */
31624 chain = ix86_static_chain (fndecl, true);
31625 if (REG_P (chain))
31627 switch (REGNO (chain))
31629 case AX_REG:
31630 opcode = 0xb8; break;
31631 case CX_REG:
31632 opcode = 0xb9; break;
31633 default:
31634 gcc_unreachable ();
31637 else
31638 opcode = 0x68;
31640 mem = adjust_address (m_tramp, QImode, offset);
31641 emit_move_insn (mem, gen_int_mode (opcode, QImode));
31643 mem = adjust_address (m_tramp, SImode, offset + 1);
31644 emit_move_insn (mem, chain_value);
31645 offset += 5;
31647 mem = adjust_address (m_tramp, QImode, offset);
31648 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
31650 mem = adjust_address (m_tramp, SImode, offset + 1);
31652 /* Compute offset from the end of the jmp to the target function.
31653 In the case in which the trampoline stores the static chain on
31654 the stack, we need to skip the first insn which pushes the
31655 (call-saved) register static chain; this push is 1 byte. */
31656 offset += 5;
31657 disp = expand_binop (SImode, sub_optab, fnaddr,
31658 plus_constant (Pmode, XEXP (m_tramp, 0),
31659 offset - (MEM_P (chain) ? 1 : 0)),
31660 NULL_RTX, 1, OPTAB_DIRECT);
31661 emit_move_insn (mem, disp);
31664 gcc_assert (offset <= TRAMPOLINE_SIZE);
31666 #ifdef HAVE_ENABLE_EXECUTE_STACK
31667 #ifdef CHECK_EXECUTE_STACK_ENABLED
31668 if (CHECK_EXECUTE_STACK_ENABLED)
31669 #endif
31670 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
31671 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
31672 #endif
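/* Sketch of the bytes emitted above for the common 64-bit case
   (ptr_mode == DImode), with the static chain passed in r10:

       49 bb <8-byte fnaddr>   movabsq $fnaddr, %r11
       49 ba <8-byte chain>    movabsq $chain,  %r10
       49 ff e3                jmp     *%r11
       90                      nop (pads the final 32-bit store)

   The 32-bit variant instead emits a mov or push of the static chain
   (opcode 0xb8/0xb9/0x68) followed by a relative jmp (0xe9).  */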
31675 static bool
31676 ix86_allocate_stack_slots_for_args (void)
31678 /* Naked functions should not allocate stack slots for arguments. */
31679 return !ix86_function_naked (current_function_decl);
31682 static bool
31683 ix86_warn_func_return (tree decl)
31685 /* Naked functions are implemented entirely in assembly, including the
31686 return sequence, so suppress warnings about this. */
31687 return !ix86_function_naked (decl);
31690 /* The following file contains several enumerations and data structures
31691 built from the definitions in i386-builtin-types.def. */
31693 #include "i386-builtin-types.inc"
31695 /* Table for the ix86 builtin non-function types. */
31696 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
31698 /* Retrieve an element from the above table, building some of
31699 the types lazily. */
31701 static tree
31702 ix86_get_builtin_type (enum ix86_builtin_type tcode)
31704 unsigned int index;
31705 tree type, itype;
31707 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
31709 type = ix86_builtin_type_tab[(int) tcode];
31710 if (type != NULL)
31711 return type;
31713 gcc_assert (tcode > IX86_BT_LAST_PRIM);
31714 if (tcode <= IX86_BT_LAST_VECT)
31716 machine_mode mode;
31718 index = tcode - IX86_BT_LAST_PRIM - 1;
31719 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
31720 mode = ix86_builtin_type_vect_mode[index];
31722 type = build_vector_type_for_mode (itype, mode);
31724 else
31726 int quals;
31728 index = tcode - IX86_BT_LAST_VECT - 1;
31729 if (tcode <= IX86_BT_LAST_PTR)
31730 quals = TYPE_UNQUALIFIED;
31731 else
31732 quals = TYPE_QUAL_CONST;
31734 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
31735 if (quals != TYPE_UNQUALIFIED)
31736 itype = build_qualified_type (itype, quals);
31738 type = build_pointer_type (itype);
31741 ix86_builtin_type_tab[(int) tcode] = type;
31742 return type;
31745 /* Table for the ix86 builtin function types. */
31746 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
31748 /* Retrieve an element from the above table, building some of
31749 the types lazily. */
31751 static tree
31752 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
31754 tree type;
31756 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
31758 type = ix86_builtin_func_type_tab[(int) tcode];
31759 if (type != NULL)
31760 return type;
31762 if (tcode <= IX86_BT_LAST_FUNC)
31764 unsigned start = ix86_builtin_func_start[(int) tcode];
31765 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
31766 tree rtype, atype, args = void_list_node;
31767 unsigned i;
31769 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
31770 for (i = after - 1; i > start; --i)
31772 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
31773 args = tree_cons (NULL, atype, args);
31776 type = build_function_type (rtype, args);
31778 else
31780 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
31781 enum ix86_builtin_func_type icode;
31783 icode = ix86_builtin_func_alias_base[index];
31784 type = ix86_get_builtin_func_type (icode);
31787 ix86_builtin_func_type_tab[(int) tcode] = type;
31788 return type;
31792 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
31793 bdesc_* arrays below should come first, then builtins for each bdesc_*
31794 array in ascending order, so that we can use direct array accesses. */
31795 enum ix86_builtins
31797 IX86_BUILTIN_MASKMOVQ,
31798 IX86_BUILTIN_LDMXCSR,
31799 IX86_BUILTIN_STMXCSR,
31800 IX86_BUILTIN_MASKMOVDQU,
31801 IX86_BUILTIN_PSLLDQ128,
31802 IX86_BUILTIN_CLFLUSH,
31803 IX86_BUILTIN_MONITOR,
31804 IX86_BUILTIN_MWAIT,
31805 IX86_BUILTIN_CLZERO,
31806 IX86_BUILTIN_VEC_INIT_V2SI,
31807 IX86_BUILTIN_VEC_INIT_V4HI,
31808 IX86_BUILTIN_VEC_INIT_V8QI,
31809 IX86_BUILTIN_VEC_EXT_V2DF,
31810 IX86_BUILTIN_VEC_EXT_V2DI,
31811 IX86_BUILTIN_VEC_EXT_V4SF,
31812 IX86_BUILTIN_VEC_EXT_V4SI,
31813 IX86_BUILTIN_VEC_EXT_V8HI,
31814 IX86_BUILTIN_VEC_EXT_V2SI,
31815 IX86_BUILTIN_VEC_EXT_V4HI,
31816 IX86_BUILTIN_VEC_EXT_V16QI,
31817 IX86_BUILTIN_VEC_SET_V2DI,
31818 IX86_BUILTIN_VEC_SET_V4SF,
31819 IX86_BUILTIN_VEC_SET_V4SI,
31820 IX86_BUILTIN_VEC_SET_V8HI,
31821 IX86_BUILTIN_VEC_SET_V4HI,
31822 IX86_BUILTIN_VEC_SET_V16QI,
31823 IX86_BUILTIN_GATHERSIV2DF,
31824 IX86_BUILTIN_GATHERSIV4DF,
31825 IX86_BUILTIN_GATHERDIV2DF,
31826 IX86_BUILTIN_GATHERDIV4DF,
31827 IX86_BUILTIN_GATHERSIV4SF,
31828 IX86_BUILTIN_GATHERSIV8SF,
31829 IX86_BUILTIN_GATHERDIV4SF,
31830 IX86_BUILTIN_GATHERDIV8SF,
31831 IX86_BUILTIN_GATHERSIV2DI,
31832 IX86_BUILTIN_GATHERSIV4DI,
31833 IX86_BUILTIN_GATHERDIV2DI,
31834 IX86_BUILTIN_GATHERDIV4DI,
31835 IX86_BUILTIN_GATHERSIV4SI,
31836 IX86_BUILTIN_GATHERSIV8SI,
31837 IX86_BUILTIN_GATHERDIV4SI,
31838 IX86_BUILTIN_GATHERDIV8SI,
31839 IX86_BUILTIN_VFMSUBSD3_MASK3,
31840 IX86_BUILTIN_VFMSUBSS3_MASK3,
31841 IX86_BUILTIN_GATHER3SIV8SF,
31842 IX86_BUILTIN_GATHER3SIV4SF,
31843 IX86_BUILTIN_GATHER3SIV4DF,
31844 IX86_BUILTIN_GATHER3SIV2DF,
31845 IX86_BUILTIN_GATHER3DIV8SF,
31846 IX86_BUILTIN_GATHER3DIV4SF,
31847 IX86_BUILTIN_GATHER3DIV4DF,
31848 IX86_BUILTIN_GATHER3DIV2DF,
31849 IX86_BUILTIN_GATHER3SIV8SI,
31850 IX86_BUILTIN_GATHER3SIV4SI,
31851 IX86_BUILTIN_GATHER3SIV4DI,
31852 IX86_BUILTIN_GATHER3SIV2DI,
31853 IX86_BUILTIN_GATHER3DIV8SI,
31854 IX86_BUILTIN_GATHER3DIV4SI,
31855 IX86_BUILTIN_GATHER3DIV4DI,
31856 IX86_BUILTIN_GATHER3DIV2DI,
31857 IX86_BUILTIN_SCATTERSIV8SF,
31858 IX86_BUILTIN_SCATTERSIV4SF,
31859 IX86_BUILTIN_SCATTERSIV4DF,
31860 IX86_BUILTIN_SCATTERSIV2DF,
31861 IX86_BUILTIN_SCATTERDIV8SF,
31862 IX86_BUILTIN_SCATTERDIV4SF,
31863 IX86_BUILTIN_SCATTERDIV4DF,
31864 IX86_BUILTIN_SCATTERDIV2DF,
31865 IX86_BUILTIN_SCATTERSIV8SI,
31866 IX86_BUILTIN_SCATTERSIV4SI,
31867 IX86_BUILTIN_SCATTERSIV4DI,
31868 IX86_BUILTIN_SCATTERSIV2DI,
31869 IX86_BUILTIN_SCATTERDIV8SI,
31870 IX86_BUILTIN_SCATTERDIV4SI,
31871 IX86_BUILTIN_SCATTERDIV4DI,
31872 IX86_BUILTIN_SCATTERDIV2DI,
31873 /* Alternate 4 and 8 element gather/scatter for the vectorizer
31874 where all operands are 32-byte or 64-byte wide respectively. */
31875 IX86_BUILTIN_GATHERALTSIV4DF,
31876 IX86_BUILTIN_GATHERALTDIV8SF,
31877 IX86_BUILTIN_GATHERALTSIV4DI,
31878 IX86_BUILTIN_GATHERALTDIV8SI,
31879 IX86_BUILTIN_GATHER3ALTDIV16SF,
31880 IX86_BUILTIN_GATHER3ALTDIV16SI,
31881 IX86_BUILTIN_GATHER3ALTSIV4DF,
31882 IX86_BUILTIN_GATHER3ALTDIV8SF,
31883 IX86_BUILTIN_GATHER3ALTSIV4DI,
31884 IX86_BUILTIN_GATHER3ALTDIV8SI,
31885 IX86_BUILTIN_GATHER3ALTSIV8DF,
31886 IX86_BUILTIN_GATHER3ALTSIV8DI,
31887 IX86_BUILTIN_GATHER3DIV16SF,
31888 IX86_BUILTIN_GATHER3DIV16SI,
31889 IX86_BUILTIN_GATHER3DIV8DF,
31890 IX86_BUILTIN_GATHER3DIV8DI,
31891 IX86_BUILTIN_GATHER3SIV16SF,
31892 IX86_BUILTIN_GATHER3SIV16SI,
31893 IX86_BUILTIN_GATHER3SIV8DF,
31894 IX86_BUILTIN_GATHER3SIV8DI,
31895 IX86_BUILTIN_SCATTERALTSIV8DF,
31896 IX86_BUILTIN_SCATTERALTDIV16SF,
31897 IX86_BUILTIN_SCATTERALTSIV8DI,
31898 IX86_BUILTIN_SCATTERALTDIV16SI,
31899 IX86_BUILTIN_SCATTERDIV16SF,
31900 IX86_BUILTIN_SCATTERDIV16SI,
31901 IX86_BUILTIN_SCATTERDIV8DF,
31902 IX86_BUILTIN_SCATTERDIV8DI,
31903 IX86_BUILTIN_SCATTERSIV16SF,
31904 IX86_BUILTIN_SCATTERSIV16SI,
31905 IX86_BUILTIN_SCATTERSIV8DF,
31906 IX86_BUILTIN_SCATTERSIV8DI,
31907 IX86_BUILTIN_GATHERPFQPD,
31908 IX86_BUILTIN_GATHERPFDPS,
31909 IX86_BUILTIN_GATHERPFDPD,
31910 IX86_BUILTIN_GATHERPFQPS,
31911 IX86_BUILTIN_SCATTERPFDPD,
31912 IX86_BUILTIN_SCATTERPFDPS,
31913 IX86_BUILTIN_SCATTERPFQPD,
31914 IX86_BUILTIN_SCATTERPFQPS,
31915 IX86_BUILTIN_CLWB,
31916 IX86_BUILTIN_CLFLUSHOPT,
31917 IX86_BUILTIN_INFQ,
31918 IX86_BUILTIN_HUGE_VALQ,
31919 IX86_BUILTIN_NANQ,
31920 IX86_BUILTIN_NANSQ,
31921 IX86_BUILTIN_XABORT,
31922 IX86_BUILTIN_ADDCARRYX32,
31923 IX86_BUILTIN_ADDCARRYX64,
31924 IX86_BUILTIN_SBB32,
31925 IX86_BUILTIN_SBB64,
31926 IX86_BUILTIN_RDRAND16_STEP,
31927 IX86_BUILTIN_RDRAND32_STEP,
31928 IX86_BUILTIN_RDRAND64_STEP,
31929 IX86_BUILTIN_RDSEED16_STEP,
31930 IX86_BUILTIN_RDSEED32_STEP,
31931 IX86_BUILTIN_RDSEED64_STEP,
31932 IX86_BUILTIN_MONITORX,
31933 IX86_BUILTIN_MWAITX,
31934 IX86_BUILTIN_CFSTRING,
31935 IX86_BUILTIN_CPU_INIT,
31936 IX86_BUILTIN_CPU_IS,
31937 IX86_BUILTIN_CPU_SUPPORTS,
31938 IX86_BUILTIN_READ_FLAGS,
31939 IX86_BUILTIN_WRITE_FLAGS,
31941 /* All the remaining builtins are tracked in bdesc_* arrays in
31942 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
31943 this point. */
31944 #define BDESC(mask, icode, name, code, comparison, flag) \
31945 code,
31946 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
31947 code, \
31948 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
31949 #define BDESC_END(kind, next_kind)
31951 #include "i386-builtin.def"
31953 #undef BDESC
31954 #undef BDESC_FIRST
31955 #undef BDESC_END
31957 IX86_BUILTIN_MAX,
31959 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
31961 /* Now just the aliases for bdesc_* start/end. */
31962 #define BDESC(mask, icode, name, code, comparison, flag)
31963 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
31964 #define BDESC_END(kind, next_kind) \
31965 IX86_BUILTIN__BDESC_##kind##_LAST \
31966 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
31968 #include "i386-builtin.def"
31970 #undef BDESC
31971 #undef BDESC_FIRST
31972 #undef BDESC_END
31974 /* Just to make sure there is no comma after the last enumerator. */
31975 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
31978 /* Table for the ix86 builtin decls. */
31979 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
31981 /* Table of all of the builtin functions that are possible with different ISAs
31982 but are waiting to be built until a function is declared to use that
31983 ISA. */
31984 struct builtin_isa {
31985 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
31986 HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
31987 const char *name; /* function name */
31988 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
31989 unsigned char const_p:1; /* true if the declaration is constant */
31990 unsigned char pure_p:1; /* true if the declaration has pure attribute */
31991 bool leaf_p; /* true if the declaration has leaf attribute */
31992 bool nothrow_p; /* true if the declaration has nothrow attribute */
31993 bool set_and_not_built_p;
31996 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
31998 /* Bits that can still enable any inclusion of a builtin. */
31999 static HOST_WIDE_INT deferred_isa_values = 0;
32000 static HOST_WIDE_INT deferred_isa_values2 = 0;
32002 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
32003 of which isa_flags to use in the ix86_builtins_isa array. Store the
32004 function decl in the ix86_builtins array. Return the function decl or
32005 NULL_TREE if the builtin was not added.
32007 If the front end has a special hook for builtin functions, delay adding
32008 builtin functions that aren't in the current ISA until the ISA is changed
32009 with function specific optimization. Doing so can save about 300K for the
32010 default compiler. When the builtin is expanded, check at that time whether
32011 it is valid.
32013 If the front end doesn't have a special hook, record all builtins, even
32014 those that aren't in the current ISA, in case the user uses
32015 function specific options for a different ISA, so that we don't get scope
32016 errors if a builtin is added in the middle of a function scope. */
32018 static inline tree
32019 def_builtin (HOST_WIDE_INT mask, const char *name,
32020 enum ix86_builtin_func_type tcode,
32021 enum ix86_builtins code)
32023 tree decl = NULL_TREE;
32025 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
32027 ix86_builtins_isa[(int) code].isa = mask;
32029 /* OPTION_MASK_ISA_AVX512VL has a special meaning. Unlike the generic case,
32030 where any set bit means the built-in is enabled, this bit must be *and-ed*
32031 with another one. E.g.: OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL
32032 means that *both* cpuid bits must be set for the built-in to be available.
32033 Handle this here. */
32034 if (mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
32035 mask &= ~OPTION_MASK_ISA_AVX512VL;
32037 mask &= ~OPTION_MASK_ISA_64BIT;
32038 if (mask == 0
32039 || (mask & ix86_isa_flags) != 0
32040 || (lang_hooks.builtin_function
32041 == lang_hooks.builtin_function_ext_scope))
32044 tree type = ix86_get_builtin_func_type (tcode);
32045 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
32046 NULL, NULL_TREE);
32047 ix86_builtins[(int) code] = decl;
32048 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
32050 else
32052 /* Just a MASK where set_and_not_built_p == true can potentially
32053 include a builtin. */
32054 deferred_isa_values |= mask;
32055 ix86_builtins[(int) code] = NULL_TREE;
32056 ix86_builtins_isa[(int) code].tcode = tcode;
32057 ix86_builtins_isa[(int) code].name = name;
32058 ix86_builtins_isa[(int) code].leaf_p = false;
32059 ix86_builtins_isa[(int) code].nothrow_p = false;
32060 ix86_builtins_isa[(int) code].const_p = false;
32061 ix86_builtins_isa[(int) code].pure_p = false;
32062 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
32066 return decl;
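/* For example, a built-in registered with
   OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL is intended to
   require both CPUID features; when AVX512VL is already enabled the VL bit
   is stripped from MASK above, so the remaining AVX512DQ bit alone decides
   whether the declaration is created now or deferred.  */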
32069 /* Like def_builtin, but also marks the function decl "const". */
32071 static inline tree
32072 def_builtin_const (HOST_WIDE_INT mask, const char *name,
32073 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32075 tree decl = def_builtin (mask, name, tcode, code);
32076 if (decl)
32077 TREE_READONLY (decl) = 1;
32078 else
32079 ix86_builtins_isa[(int) code].const_p = true;
32081 return decl;
32084 /* Like def_builtin, but also marks the function decl "pure". */
32086 static inline tree
32087 def_builtin_pure (HOST_WIDE_INT mask, const char *name,
32088 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32090 tree decl = def_builtin (mask, name, tcode, code);
32091 if (decl)
32092 DECL_PURE_P (decl) = 1;
32093 else
32094 ix86_builtins_isa[(int) code].pure_p = true;
32096 return decl;
32099 /* Like def_builtin, but for additional isa2 flags. */
32101 static inline tree
32102 def_builtin2 (HOST_WIDE_INT mask, const char *name,
32103 enum ix86_builtin_func_type tcode,
32104 enum ix86_builtins code)
32106 tree decl = NULL_TREE;
32108 ix86_builtins_isa[(int) code].isa2 = mask;
32110 if (mask == 0
32111 || (mask & ix86_isa_flags2) != 0
32112 || (lang_hooks.builtin_function
32113 == lang_hooks.builtin_function_ext_scope))
32116 tree type = ix86_get_builtin_func_type (tcode);
32117 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
32118 NULL, NULL_TREE);
32119 ix86_builtins[(int) code] = decl;
32120 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
32122 else
32124 /* Only a MASK where set_and_not_built_p == true can potentially
32125 include a builtin. */
32126 deferred_isa_values2 |= mask;
32127 ix86_builtins[(int) code] = NULL_TREE;
32128 ix86_builtins_isa[(int) code].tcode = tcode;
32129 ix86_builtins_isa[(int) code].name = name;
32130 ix86_builtins_isa[(int) code].leaf_p = false;
32131 ix86_builtins_isa[(int) code].nothrow_p = false;
32132 ix86_builtins_isa[(int) code].const_p = false;
32133 ix86_builtins_isa[(int) code].pure_p = false;
32134 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
32137 return decl;
32140 /* Like def_builtin2, but also marks the function decl "const". */
32142 static inline tree
32143 def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
32144 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32146 tree decl = def_builtin2 (mask, name, tcode, code);
32147 if (decl)
32148 TREE_READONLY (decl) = 1;
32149 else
32150 ix86_builtins_isa[(int) code].const_p = true;
32152 return decl;
32155 /* Like def_builtin2, but also marks the function decl "pure". */
32157 static inline tree
32158 def_builtin_pure2 (HOST_WIDE_INT mask, const char *name,
32159 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32161 tree decl = def_builtin2 (mask, name, tcode, code);
32162 if (decl)
32163 DECL_PURE_P (decl) = 1;
32164 else
32165 ix86_builtins_isa[(int) code].pure_p = true;
32167 return decl;
32170 /* Add any new builtin functions for a given ISA that may not have been
32171 declared yet. This saves a bit of space compared to adding all of the
32172 declarations to the tree up front, even when they are not used. */
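/* Illustrative sketch (not part of the original source) of how the deferral
   works: compiling with, say, -mno-avx2 makes def_builtin record an AVX2
   built-in in ix86_builtins_isa with set_and_not_built_p == true instead of
   declaring it; if the source later enables that ISA, e.g. with

     #pragma GCC target ("avx2")

   this function is reached with the AVX2 bit in ISA, finds the deferred
   entry, and only then calls add_builtin_function_ext_scope to create the
   decl, re-applying any const/pure/leaf/nothrow flags recorded earlier.  */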
32174 static void
32175 ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
32177 if ((isa & deferred_isa_values) == 0
32178 && (isa2 & deferred_isa_values2) == 0)
32179 return;
32181 /* Bits in ISA value can be removed from potential isa values. */
32182 deferred_isa_values &= ~isa;
32183 deferred_isa_values2 &= ~isa2;
32185 int i;
32186 tree saved_current_target_pragma = current_target_pragma;
32187 current_target_pragma = NULL_TREE;
32189 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
32191 if (((ix86_builtins_isa[i].isa & isa) != 0
32192 || (ix86_builtins_isa[i].isa2 & isa2) != 0)
32193 && ix86_builtins_isa[i].set_and_not_built_p)
32195 tree decl, type;
32197 /* Don't define the builtin again. */
32198 ix86_builtins_isa[i].set_and_not_built_p = false;
32200 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
32201 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
32202 type, i, BUILT_IN_MD, NULL,
32203 NULL_TREE);
32205 ix86_builtins[i] = decl;
32206 if (ix86_builtins_isa[i].const_p)
32207 TREE_READONLY (decl) = 1;
32208 if (ix86_builtins_isa[i].pure_p)
32209 DECL_PURE_P (decl) = 1;
32210 if (ix86_builtins_isa[i].leaf_p)
32211 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
32212 NULL_TREE);
32213 if (ix86_builtins_isa[i].nothrow_p)
32214 TREE_NOTHROW (decl) = 1;
32218 current_target_pragma = saved_current_target_pragma;
32221 /* Bits for builtin_description.flag. */
32223 /* Set when we don't support the comparison natively, and should
32224 swap_comparison in order to support it. */
32225 #define BUILTIN_DESC_SWAP_OPERANDS 1
32227 struct builtin_description
32229 const HOST_WIDE_INT mask;
32230 const enum insn_code icode;
32231 const char *const name;
32232 const enum ix86_builtins code;
32233 const enum rtx_code comparison;
32234 const int flag;
32237 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
32238 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
32239 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
32240 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
32241 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
32242 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
32243 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
32244 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
32245 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
32246 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
32247 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
32248 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
32249 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
32250 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
32251 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
32252 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
32253 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
32254 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
32255 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
32256 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
32257 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
32258 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
32259 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
32260 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
32261 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
32262 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
32263 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
32264 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
32265 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
32266 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
32267 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
32268 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
32269 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
32270 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
32271 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
32272 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
32273 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
32274 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
32275 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
32276 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
32277 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
32278 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
32279 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
32280 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
32281 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
32282 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
32283 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
32284 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
32285 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
32286 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
32287 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
32288 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
32290 #define BDESC(mask, icode, name, code, comparison, flag) \
32291 { mask, icode, name, code, comparison, flag },
32292 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
32293 static const struct builtin_description bdesc_##kind[] = \
32295 BDESC (mask, icode, name, code, comparison, flag)
32296 #define BDESC_END(kind, next_kind) \
32299 #include "i386-builtin.def"
32301 #undef BDESC
32302 #undef BDESC_FIRST
32303 #undef BDESC_END
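/* Illustrative expansion (not part of the original source): a hypothetical
   entry in i386-builtin.def along the lines of

     BDESC (OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_example",
            IX86_BUILTIN_EXAMPLE, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI)

   expands, via the BDESC/BDESC_FIRST/BDESC_END definitions above, into one
   initializer of the corresponding static bdesc_* array; the .def file is
   included once, and BDESC_FIRST/BDESC_END open and close each table.  */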
32305 /* TM vector builtins. */
32307 /* Reuse the existing x86-specific `struct builtin_description' because
32308 we're lazy. Add casts to make them fit. */
32309 static const struct builtin_description bdesc_tm[] =
32311 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
32312 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
32313 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
32314 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32315 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32316 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32317 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32319 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
32320 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
32321 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
32322 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32323 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32324 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32325 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32327 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
32328 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
32329 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
32330 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32331 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32332 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32333 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32335 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
32336 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
32337 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
32340 /* Initialize the transactional memory vector load/store builtins. */
32342 static void
32343 ix86_init_tm_builtins (void)
32345 enum ix86_builtin_func_type ftype;
32346 const struct builtin_description *d;
32347 size_t i;
32348 tree decl;
32349 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
32350 tree attrs_log, attrs_type_log;
32352 if (!flag_tm)
32353 return;
32355 /* If there are no builtins defined, we must be compiling in a
32356 language without trans-mem support. */
32357 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
32358 return;
32360 /* Use whatever attributes a normal TM load has. */
32361 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
32362 attrs_load = DECL_ATTRIBUTES (decl);
32363 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
32364 /* Use whatever attributes a normal TM store has. */
32365 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
32366 attrs_store = DECL_ATTRIBUTES (decl);
32367 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
32368 /* Use whatever attributes a normal TM log has. */
32369 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
32370 attrs_log = DECL_ATTRIBUTES (decl);
32371 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
32373 for (i = 0, d = bdesc_tm;
32374 i < ARRAY_SIZE (bdesc_tm);
32375 i++, d++)
32377 if ((d->mask & ix86_isa_flags) != 0
32378 || (lang_hooks.builtin_function
32379 == lang_hooks.builtin_function_ext_scope))
32381 tree type, attrs, attrs_type;
32382 enum built_in_function code = (enum built_in_function) d->code;
32384 ftype = (enum ix86_builtin_func_type) d->flag;
32385 type = ix86_get_builtin_func_type (ftype);
32387 if (BUILTIN_TM_LOAD_P (code))
32389 attrs = attrs_load;
32390 attrs_type = attrs_type_load;
32392 else if (BUILTIN_TM_STORE_P (code))
32394 attrs = attrs_store;
32395 attrs_type = attrs_type_store;
32397 else
32399 attrs = attrs_log;
32400 attrs_type = attrs_type_log;
32402 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
32403 /* The builtin without the prefix for
32404 calling it directly. */
32405 d->name + strlen ("__builtin_"),
32406 attrs);
32407 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
32408 set the TYPE_ATTRIBUTES. */
32409 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
32411 set_builtin_decl (code, decl, false);
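/* Illustrative note (not part of the original source): for the first table
   entry above, d->name is "__builtin__ITM_WM64", so the second name passed
   to add_builtin_function is d->name + strlen ("__builtin_"), i.e.
   "_ITM_WM64", which lets the transactional-memory entry point also be
   called directly under that name.  */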
32416 /* Macros for verification of enum ix86_builtins order. */
32417 #define BDESC_VERIFY(x, y, z) \
32418 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
32419 #define BDESC_VERIFYS(x, y, z) \
32420 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
32422 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
32423 IX86_BUILTIN__BDESC_COMI_LAST, 1);
32424 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
32425 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
32426 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
32427 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
32428 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
32429 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
32430 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
32431 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
32432 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
32433 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
32434 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
32435 IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
32436 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
32437 IX86_BUILTIN__BDESC_MPX_LAST, 1);
32438 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
32439 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
32440 BDESC_VERIFYS (IX86_BUILTIN_MAX,
32441 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
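/* Illustrative note (not part of the original source): each STATIC_ASSERT
   above pins down that one *_FIRST enumerator immediately follows the
   previous table's *_LAST enumerator, e.g.

     IX86_BUILTIN__BDESC_PCMPESTR_FIRST == IX86_BUILTIN__BDESC_COMI_LAST + 1

   so the per-table loops below can compute a descriptor's enum value as
   FIRST + index, and BDESC_VERIFY can cheaply check it at runtime.  */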
32443 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
32444 not in the current target ISA, so that the user can compile particular
32445 modules with target-specific options that differ from the command-line
32446 options. */
32447 static void
32448 ix86_init_mmx_sse_builtins (void)
32450 const struct builtin_description * d;
32451 enum ix86_builtin_func_type ftype;
32452 size_t i;
32454 /* Add all special builtins with a variable number of operands. */
32455 for (i = 0, d = bdesc_special_args;
32456 i < ARRAY_SIZE (bdesc_special_args);
32457 i++, d++)
32459 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
32460 if (d->name == 0)
32461 continue;
32463 ftype = (enum ix86_builtin_func_type) d->flag;
32464 def_builtin (d->mask, d->name, ftype, d->code);
32466 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
32467 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
32468 ARRAY_SIZE (bdesc_special_args) - 1);
32470 /* Add all builtins with a variable number of operands. */
32471 for (i = 0, d = bdesc_args;
32472 i < ARRAY_SIZE (bdesc_args);
32473 i++, d++)
32475 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
32476 if (d->name == 0)
32477 continue;
32479 ftype = (enum ix86_builtin_func_type) d->flag;
32480 def_builtin_const (d->mask, d->name, ftype, d->code);
32482 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
32483 IX86_BUILTIN__BDESC_ARGS_FIRST,
32484 ARRAY_SIZE (bdesc_args) - 1);
32486 /* Add all builtins with a variable number of operands gated on isa2 flags. */
32487 for (i = 0, d = bdesc_args2;
32488 i < ARRAY_SIZE (bdesc_args2);
32489 i++, d++)
32491 if (d->name == 0)
32492 continue;
32494 ftype = (enum ix86_builtin_func_type) d->flag;
32495 def_builtin_const2 (d->mask, d->name, ftype, d->code);
32498 /* Add all builtins with rounding. */
32499 for (i = 0, d = bdesc_round_args;
32500 i < ARRAY_SIZE (bdesc_round_args);
32501 i++, d++)
32503 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
32504 if (d->name == 0)
32505 continue;
32507 ftype = (enum ix86_builtin_func_type) d->flag;
32508 def_builtin_const (d->mask, d->name, ftype, d->code);
32510 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
32511 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
32512 ARRAY_SIZE (bdesc_round_args) - 1);
32514 /* pcmpestr[im] insns. */
32515 for (i = 0, d = bdesc_pcmpestr;
32516 i < ARRAY_SIZE (bdesc_pcmpestr);
32517 i++, d++)
32519 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
32520 if (d->code == IX86_BUILTIN_PCMPESTRM128)
32521 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
32522 else
32523 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
32524 def_builtin_const (d->mask, d->name, ftype, d->code);
32526 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
32527 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
32528 ARRAY_SIZE (bdesc_pcmpestr) - 1);
32530 /* pcmpistr[im] insns. */
32531 for (i = 0, d = bdesc_pcmpistr;
32532 i < ARRAY_SIZE (bdesc_pcmpistr);
32533 i++, d++)
32535 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
32536 if (d->code == IX86_BUILTIN_PCMPISTRM128)
32537 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
32538 else
32539 ftype = INT_FTYPE_V16QI_V16QI_INT;
32540 def_builtin_const (d->mask, d->name, ftype, d->code);
32542 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
32543 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
32544 ARRAY_SIZE (bdesc_pcmpistr) - 1);
32546 /* comi/ucomi insns. */
32547 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
32549 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
32550 if (d->mask == OPTION_MASK_ISA_SSE2)
32551 ftype = INT_FTYPE_V2DF_V2DF;
32552 else
32553 ftype = INT_FTYPE_V4SF_V4SF;
32554 def_builtin_const (d->mask, d->name, ftype, d->code);
32556 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
32557 IX86_BUILTIN__BDESC_COMI_FIRST,
32558 ARRAY_SIZE (bdesc_comi) - 1);
32560 /* SSE */
32561 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
32562 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
32563 def_builtin_pure (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
32564 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
32566 /* SSE or 3DNow!A */
32567 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
32568 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
32569 IX86_BUILTIN_MASKMOVQ);
32571 /* SSE2 */
32572 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
32573 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
32575 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
32576 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
32577 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
32578 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
32580 /* SSE3. */
32581 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
32582 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
32583 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
32584 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
32586 /* AES */
32587 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
32588 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
32589 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
32590 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
32591 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
32592 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
32593 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
32594 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
32595 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
32596 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
32597 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
32598 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
32600 /* PCLMUL */
32601 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
32602 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
32604 /* RDRND */
32605 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
32606 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
32607 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
32608 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
32609 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
32610 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
32611 IX86_BUILTIN_RDRAND64_STEP);
32613 /* AVX2 */
32614 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
32615 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
32616 IX86_BUILTIN_GATHERSIV2DF);
32618 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
32619 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
32620 IX86_BUILTIN_GATHERSIV4DF);
32622 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
32623 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
32624 IX86_BUILTIN_GATHERDIV2DF);
32626 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
32627 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
32628 IX86_BUILTIN_GATHERDIV4DF);
32630 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
32631 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
32632 IX86_BUILTIN_GATHERSIV4SF);
32634 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
32635 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
32636 IX86_BUILTIN_GATHERSIV8SF);
32638 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
32639 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
32640 IX86_BUILTIN_GATHERDIV4SF);
32642 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
32643 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
32644 IX86_BUILTIN_GATHERDIV8SF);
32646 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
32647 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
32648 IX86_BUILTIN_GATHERSIV2DI);
32650 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
32651 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
32652 IX86_BUILTIN_GATHERSIV4DI);
32654 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
32655 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
32656 IX86_BUILTIN_GATHERDIV2DI);
32658 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
32659 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
32660 IX86_BUILTIN_GATHERDIV4DI);
32662 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
32663 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
32664 IX86_BUILTIN_GATHERSIV4SI);
32666 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
32667 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
32668 IX86_BUILTIN_GATHERSIV8SI);
32670 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
32671 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
32672 IX86_BUILTIN_GATHERDIV4SI);
32674 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
32675 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
32676 IX86_BUILTIN_GATHERDIV8SI);
32678 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
32679 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
32680 IX86_BUILTIN_GATHERALTSIV4DF);
32682 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
32683 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
32684 IX86_BUILTIN_GATHERALTDIV8SF);
32686 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
32687 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
32688 IX86_BUILTIN_GATHERALTSIV4DI);
32690 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
32691 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
32692 IX86_BUILTIN_GATHERALTDIV8SI);
32694 /* AVX512F */
32695 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
32696 V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT,
32697 IX86_BUILTIN_GATHER3SIV16SF);
32699 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
32700 V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT,
32701 IX86_BUILTIN_GATHER3SIV8DF);
32703 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
32704 V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT,
32705 IX86_BUILTIN_GATHER3DIV16SF);
32707 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
32708 V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT,
32709 IX86_BUILTIN_GATHER3DIV8DF);
32711 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
32712 V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT,
32713 IX86_BUILTIN_GATHER3SIV16SI);
32715 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
32716 V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT,
32717 IX86_BUILTIN_GATHER3SIV8DI);
32719 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
32720 V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT,
32721 IX86_BUILTIN_GATHER3DIV16SI);
32723 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
32724 V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT,
32725 IX86_BUILTIN_GATHER3DIV8DI);
32727 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
32728 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
32729 IX86_BUILTIN_GATHER3ALTSIV8DF);
32731 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
32732 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
32733 IX86_BUILTIN_GATHER3ALTDIV16SF);
32735 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
32736 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
32737 IX86_BUILTIN_GATHER3ALTSIV8DI);
32739 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
32740 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
32741 IX86_BUILTIN_GATHER3ALTDIV16SI);
32743 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
32744 VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT,
32745 IX86_BUILTIN_SCATTERSIV16SF);
32747 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
32748 VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT,
32749 IX86_BUILTIN_SCATTERSIV8DF);
32751 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
32752 VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT,
32753 IX86_BUILTIN_SCATTERDIV16SF);
32755 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
32756 VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT,
32757 IX86_BUILTIN_SCATTERDIV8DF);
32759 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
32760 VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT,
32761 IX86_BUILTIN_SCATTERSIV16SI);
32763 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
32764 VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT,
32765 IX86_BUILTIN_SCATTERSIV8DI);
32767 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
32768 VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT,
32769 IX86_BUILTIN_SCATTERDIV16SI);
32771 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
32772 VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT,
32773 IX86_BUILTIN_SCATTERDIV8DI);
32775 /* AVX512VL */
32776 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
32777 V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT,
32778 IX86_BUILTIN_GATHER3SIV2DF);
32780 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
32781 V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT,
32782 IX86_BUILTIN_GATHER3SIV4DF);
32784 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
32785 V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT,
32786 IX86_BUILTIN_GATHER3DIV2DF);
32788 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
32789 V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT,
32790 IX86_BUILTIN_GATHER3DIV4DF);
32792 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
32793 V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT,
32794 IX86_BUILTIN_GATHER3SIV4SF);
32796 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
32797 V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT,
32798 IX86_BUILTIN_GATHER3SIV8SF);
32800 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
32801 V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT,
32802 IX86_BUILTIN_GATHER3DIV4SF);
32804 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
32805 V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT,
32806 IX86_BUILTIN_GATHER3DIV8SF);
32808 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
32809 V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT,
32810 IX86_BUILTIN_GATHER3SIV2DI);
32812 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
32813 V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT,
32814 IX86_BUILTIN_GATHER3SIV4DI);
32816 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
32817 V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT,
32818 IX86_BUILTIN_GATHER3DIV2DI);
32820 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
32821 V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT,
32822 IX86_BUILTIN_GATHER3DIV4DI);
32824 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
32825 V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT,
32826 IX86_BUILTIN_GATHER3SIV4SI);
32828 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
32829 V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT,
32830 IX86_BUILTIN_GATHER3SIV8SI);
32832 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
32833 V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT,
32834 IX86_BUILTIN_GATHER3DIV4SI);
32836 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
32837 V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT,
32838 IX86_BUILTIN_GATHER3DIV8SI);
32840 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
32841 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
32842 IX86_BUILTIN_GATHER3ALTSIV4DF);
32844 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
32845 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
32846 IX86_BUILTIN_GATHER3ALTDIV8SF);
32848 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
32849 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
32850 IX86_BUILTIN_GATHER3ALTSIV4DI);
32852 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
32853 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
32854 IX86_BUILTIN_GATHER3ALTDIV8SI);
32856 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
32857 VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT,
32858 IX86_BUILTIN_SCATTERSIV8SF);
32860 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
32861 VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT,
32862 IX86_BUILTIN_SCATTERSIV4SF);
32864 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
32865 VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT,
32866 IX86_BUILTIN_SCATTERSIV4DF);
32868 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
32869 VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT,
32870 IX86_BUILTIN_SCATTERSIV2DF);
32872 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
32873 VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT,
32874 IX86_BUILTIN_SCATTERDIV8SF);
32876 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
32877 VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT,
32878 IX86_BUILTIN_SCATTERDIV4SF);
32880 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
32881 VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT,
32882 IX86_BUILTIN_SCATTERDIV4DF);
32884 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
32885 VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT,
32886 IX86_BUILTIN_SCATTERDIV2DF);
32888 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
32889 VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT,
32890 IX86_BUILTIN_SCATTERSIV8SI);
32892 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
32893 VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT,
32894 IX86_BUILTIN_SCATTERSIV4SI);
32896 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
32897 VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT,
32898 IX86_BUILTIN_SCATTERSIV4DI);
32900 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
32901 VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT,
32902 IX86_BUILTIN_SCATTERSIV2DI);
32904 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
32905 VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT,
32906 IX86_BUILTIN_SCATTERDIV8SI);
32908 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
32909 VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT,
32910 IX86_BUILTIN_SCATTERDIV4SI);
32912 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
32913 VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT,
32914 IX86_BUILTIN_SCATTERDIV4DI);
32916 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
32917 VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT,
32918 IX86_BUILTIN_SCATTERDIV2DI);
32919 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
32920 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
32921 IX86_BUILTIN_SCATTERALTSIV8DF);
32923 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
32924 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
32925 IX86_BUILTIN_SCATTERALTDIV16SF);
32927 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
32928 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
32929 IX86_BUILTIN_SCATTERALTSIV8DI);
32931 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
32932 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
32933 IX86_BUILTIN_SCATTERALTDIV16SI);
32935 /* AVX512PF */
32936 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
32937 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
32938 IX86_BUILTIN_GATHERPFDPD);
32939 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
32940 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
32941 IX86_BUILTIN_GATHERPFDPS);
32942 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
32943 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
32944 IX86_BUILTIN_GATHERPFQPD);
32945 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
32946 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
32947 IX86_BUILTIN_GATHERPFQPS);
32948 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
32949 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
32950 IX86_BUILTIN_SCATTERPFDPD);
32951 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
32952 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
32953 IX86_BUILTIN_SCATTERPFDPS);
32954 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
32955 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
32956 IX86_BUILTIN_SCATTERPFQPD);
32957 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
32958 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
32959 IX86_BUILTIN_SCATTERPFQPS);
32961 /* SHA */
32962 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
32963 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
32964 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
32965 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
32966 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
32967 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
32968 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
32969 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
32970 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
32971 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
32972 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
32973 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
32974 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
32975 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
32977 /* RTM. */
32978 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
32979 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
32981 /* MMX access to the vec_init patterns. */
32982 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
32983 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
32985 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
32986 V4HI_FTYPE_HI_HI_HI_HI,
32987 IX86_BUILTIN_VEC_INIT_V4HI);
32989 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
32990 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
32991 IX86_BUILTIN_VEC_INIT_V8QI);
32993 /* Access to the vec_extract patterns. */
32994 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
32995 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
32996 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
32997 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
32998 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
32999 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
33000 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
33001 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
33002 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
33003 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
33005 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
33006 "__builtin_ia32_vec_ext_v4hi",
33007 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
33009 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
33010 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
33012 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
33013 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
33015 /* Access to the vec_set patterns. */
33016 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
33017 "__builtin_ia32_vec_set_v2di",
33018 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
33020 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
33021 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
33023 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
33024 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
33026 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
33027 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
33029 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
33030 "__builtin_ia32_vec_set_v4hi",
33031 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
33033 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
33034 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
33036 /* RDSEED */
33037 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
33038 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
33039 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
33040 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
33041 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
33042 "__builtin_ia32_rdseed_di_step",
33043 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
33045 /* ADCX */
33046 def_builtin (0, "__builtin_ia32_addcarryx_u32",
33047 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
33048 def_builtin (OPTION_MASK_ISA_64BIT,
33049 "__builtin_ia32_addcarryx_u64",
33050 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
33051 IX86_BUILTIN_ADDCARRYX64);
33053 /* SBB */
33054 def_builtin (0, "__builtin_ia32_sbb_u32",
33055 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
33056 def_builtin (OPTION_MASK_ISA_64BIT,
33057 "__builtin_ia32_sbb_u64",
33058 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
33059 IX86_BUILTIN_SBB64);
33061 /* Read/write FLAGS. */
33062 def_builtin (0, "__builtin_ia32_readeflags_u32",
33063 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
33064 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
33065 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
33066 def_builtin (0, "__builtin_ia32_writeeflags_u32",
33067 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
33068 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
33069 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
33071 /* CLFLUSHOPT. */
33072 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
33073 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
33075 /* CLWB. */
33076 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
33077 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
33079 /* MONITORX and MWAITX. */
33080 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
33081 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
33082 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
33083 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
33085 /* CLZERO. */
33086 def_builtin (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
33087 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
33089 /* Add FMA4 multi-arg argument instructions */
33090 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
33092 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
33093 if (d->name == 0)
33094 continue;
33096 ftype = (enum ix86_builtin_func_type) d->flag;
33097 def_builtin_const (d->mask, d->name, ftype, d->code);
33099 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
33100 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
33101 ARRAY_SIZE (bdesc_multi_arg) - 1);
33104 static void
33105 ix86_init_mpx_builtins ()
33107 const struct builtin_description * d;
33108 enum ix86_builtin_func_type ftype;
33109 tree decl;
33110 size_t i;
33112 for (i = 0, d = bdesc_mpx;
33113 i < ARRAY_SIZE (bdesc_mpx);
33114 i++, d++)
33116 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
33117 if (d->name == 0)
33118 continue;
33120 ftype = (enum ix86_builtin_func_type) d->flag;
33121 decl = def_builtin (d->mask, d->name, ftype, d->code);
33123 /* Without the leaf and nothrow flags for MPX builtins,
33124 abnormal edges may follow their calls when setjmp
33125 is present in the function. Since we may have a lot
33126 of MPX builtin calls, this causes lots of useless
33127 edges and enormous PHI nodes. To avoid this we mark
33128 MPX builtins as leaf and nothrow. */
33129 if (decl)
33131 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
33132 NULL_TREE);
33133 TREE_NOTHROW (decl) = 1;
33135 else
33137 ix86_builtins_isa[(int)d->code].leaf_p = true;
33138 ix86_builtins_isa[(int)d->code].nothrow_p = true;
33141 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
33142 IX86_BUILTIN__BDESC_MPX_FIRST,
33143 ARRAY_SIZE (bdesc_mpx) - 1);
33145 for (i = 0, d = bdesc_mpx_const;
33146 i < ARRAY_SIZE (bdesc_mpx_const);
33147 i++, d++)
33149 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
33150 if (d->name == 0)
33151 continue;
33153 ftype = (enum ix86_builtin_func_type) d->flag;
33154 decl = def_builtin_const (d->mask, d->name, ftype, d->code);
33156 if (decl)
33158 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
33159 NULL_TREE);
33160 TREE_NOTHROW (decl) = 1;
33162 else
33164 ix86_builtins_isa[(int)d->code].leaf_p = true;
33165 ix86_builtins_isa[(int)d->code].nothrow_p = true;
33168 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
33169 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
33170 ARRAY_SIZE (bdesc_mpx_const) - 1);
33172 #undef BDESC_VERIFY
33173 #undef BDESC_VERIFYS
33175 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
33176 to return a pointer to VERSION_DECL if the outcome of the expression
33177 formed by PREDICATE_CHAIN is true. This function will be called during
33178 version dispatch to decide which function version to execute. It returns
33179 the basic block at the end, to which more conditions can be added. */
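/* Illustrative sketch (not part of the original source) of the GIMPLE this
   builds for one version, assuming a single predicate in the chain:

     cond_var = predicate_decl (predicate_arg);
     if (cond_var > 0)
       {
         result_var = (void *) &version_decl;
         return result_var;
       }
     // otherwise control continues in the returned block, where the next
     // version's condition is appended.

   With several predicates the cond_var values are combined with MIN_EXPR
   (see the comment further down) before the single comparison.  */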
33181 static basic_block
33182 add_condition_to_bb (tree function_decl, tree version_decl,
33183 tree predicate_chain, basic_block new_bb)
33185 gimple *return_stmt;
33186 tree convert_expr, result_var;
33187 gimple *convert_stmt;
33188 gimple *call_cond_stmt;
33189 gimple *if_else_stmt;
33191 basic_block bb1, bb2, bb3;
33192 edge e12, e23;
33194 tree cond_var, and_expr_var = NULL_TREE;
33195 gimple_seq gseq;
33197 tree predicate_decl, predicate_arg;
33199 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
33201 gcc_assert (new_bb != NULL);
33202 gseq = bb_seq (new_bb);
33205 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
33206 build_fold_addr_expr (version_decl));
33207 result_var = create_tmp_var (ptr_type_node);
33208 convert_stmt = gimple_build_assign (result_var, convert_expr);
33209 return_stmt = gimple_build_return (result_var);
33211 if (predicate_chain == NULL_TREE)
33213 gimple_seq_add_stmt (&gseq, convert_stmt);
33214 gimple_seq_add_stmt (&gseq, return_stmt);
33215 set_bb_seq (new_bb, gseq);
33216 gimple_set_bb (convert_stmt, new_bb);
33217 gimple_set_bb (return_stmt, new_bb);
33218 pop_cfun ();
33219 return new_bb;
33222 while (predicate_chain != NULL)
33224 cond_var = create_tmp_var (integer_type_node);
33225 predicate_decl = TREE_PURPOSE (predicate_chain);
33226 predicate_arg = TREE_VALUE (predicate_chain);
33227 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
33228 gimple_call_set_lhs (call_cond_stmt, cond_var);
33230 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
33231 gimple_set_bb (call_cond_stmt, new_bb);
33232 gimple_seq_add_stmt (&gseq, call_cond_stmt);
33234 predicate_chain = TREE_CHAIN (predicate_chain);
33236 if (and_expr_var == NULL)
33237 and_expr_var = cond_var;
33238 else
33240 gimple *assign_stmt;
33241 /* Use MIN_EXPR to check whether any integer is zero:
33242 and_expr_var = MIN_EXPR <cond_var, and_expr_var>. */
33243 assign_stmt = gimple_build_assign (and_expr_var,
33244 build2 (MIN_EXPR, integer_type_node,
33245 cond_var, and_expr_var));
33247 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
33248 gimple_set_bb (assign_stmt, new_bb);
33249 gimple_seq_add_stmt (&gseq, assign_stmt);
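/* Illustrative note (not part of the original source): the predicate
   built-ins return nonzero for "feature present", so for values restricted
   to {0, nonzero} MIN behaves like a logical AND, e.g. MIN (1, 1) == 1
   (both hold) while MIN (1, 0) == 0 (one fails); the single GT_EXPR test
   below then covers the whole chain.  */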
33253 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
33254 integer_zero_node,
33255 NULL_TREE, NULL_TREE);
33256 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
33257 gimple_set_bb (if_else_stmt, new_bb);
33258 gimple_seq_add_stmt (&gseq, if_else_stmt);
33260 gimple_seq_add_stmt (&gseq, convert_stmt);
33261 gimple_seq_add_stmt (&gseq, return_stmt);
33262 set_bb_seq (new_bb, gseq);
33264 bb1 = new_bb;
33265 e12 = split_block (bb1, if_else_stmt);
33266 bb2 = e12->dest;
33267 e12->flags &= ~EDGE_FALLTHRU;
33268 e12->flags |= EDGE_TRUE_VALUE;
33270 e23 = split_block (bb2, return_stmt);
33272 gimple_set_bb (convert_stmt, bb2);
33273 gimple_set_bb (return_stmt, bb2);
33275 bb3 = e23->dest;
33276 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
33278 remove_edge (e23);
33279 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
33281 pop_cfun ();
33283 return bb3;
33286 /* This parses the attribute arguments to target in DECL and determines
33287 the right builtin to use to match the platform specification.
33288 It returns the priority value for this version decl. If PREDICATE_LIST
33289 is not NULL, it stores the list of cpu features that need to be checked
33290 before dispatching this function. */
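/* Illustrative example (not part of the original source): for a version
   declared with, say, __attribute__ ((target ("arch=core2,popcnt"))), this
   routine would return the larger of P_PROC_SSSE3 (from arch=core2) and
   P_POPCNT, and, when PREDICATE_LIST is non-NULL, build a chain that calls
   __builtin_cpu_is ("core2") and __builtin_cpu_supports ("popcnt").  The
   attribute string here is only a sample.  */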
33292 static unsigned int
33293 get_builtin_code_for_version (tree decl, tree *predicate_list)
33295 tree attrs;
33296 struct cl_target_option cur_target;
33297 tree target_node;
33298 struct cl_target_option *new_target;
33299 const char *arg_str = NULL;
33300 const char *attrs_str = NULL;
33301 char *tok_str = NULL;
33302 char *token;
33304 /* Priority of i386 features: a greater value means a higher priority. This is
33305 used to decide the order in which function dispatch must happen. For
33306 instance, a version specialized for SSE4.2 should be checked for dispatch
33307 before a version for SSE3, as SSE4.2 implies SSE3. */
33308 enum feature_priority
33310 P_ZERO = 0,
33311 P_MMX,
33312 P_SSE,
33313 P_SSE2,
33314 P_SSE3,
33315 P_SSSE3,
33316 P_PROC_SSSE3,
33317 P_SSE4_A,
33318 P_PROC_SSE4_A,
33319 P_SSE4_1,
33320 P_SSE4_2,
33321 P_PROC_SSE4_2,
33322 P_POPCNT,
33323 P_AES,
33324 P_PCLMUL,
33325 P_AVX,
33326 P_PROC_AVX,
33327 P_BMI,
33328 P_PROC_BMI,
33329 P_FMA4,
33330 P_XOP,
33331 P_PROC_XOP,
33332 P_FMA,
33333 P_PROC_FMA,
33334 P_BMI2,
33335 P_AVX2,
33336 P_PROC_AVX2,
33337 P_AVX512F,
33338 P_PROC_AVX512F
33341 enum feature_priority priority = P_ZERO;
33343 /* These are the target attribute strings for which a dispatcher is
33344 available, from fold_builtin_cpu. */
33346 static struct _feature_list
33348 const char *const name;
33349 const enum feature_priority priority;
33351 const feature_list[] =
33353 {"mmx", P_MMX},
33354 {"sse", P_SSE},
33355 {"sse2", P_SSE2},
33356 {"sse3", P_SSE3},
33357 {"sse4a", P_SSE4_A},
33358 {"ssse3", P_SSSE3},
33359 {"sse4.1", P_SSE4_1},
33360 {"sse4.2", P_SSE4_2},
33361 {"popcnt", P_POPCNT},
33362 {"aes", P_AES},
33363 {"pclmul", P_PCLMUL},
33364 {"avx", P_AVX},
33365 {"bmi", P_BMI},
33366 {"fma4", P_FMA4},
33367 {"xop", P_XOP},
33368 {"fma", P_FMA},
33369 {"bmi2", P_BMI2},
33370 {"avx2", P_AVX2},
33371 {"avx512f", P_AVX512F}
33375 static unsigned int NUM_FEATURES
33376 = sizeof (feature_list) / sizeof (struct _feature_list);
33378 unsigned int i;
33380 tree predicate_chain = NULL_TREE;
33381 tree predicate_decl, predicate_arg;
33383 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
33384 gcc_assert (attrs != NULL);
33386 attrs = TREE_VALUE (TREE_VALUE (attrs));
33388 gcc_assert (TREE_CODE (attrs) == STRING_CST);
33389 attrs_str = TREE_STRING_POINTER (attrs);
33391 /* Return priority zero for the default function. */
33392 if (strcmp (attrs_str, "default") == 0)
33393 return 0;
33395 /* Handle arch= if specified. For priority, set it to be 1 more than
33396 the best instruction set the processor can handle. For instance, if
33397 there is a version for atom and a version for ssse3 (the highest ISA
33398 priority for atom), the atom version must be checked for dispatch
33399 before the ssse3 version. */
33400 if (strstr (attrs_str, "arch=") != NULL)
33402 cl_target_option_save (&cur_target, &global_options);
33403 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
33404 &global_options_set);
33406 gcc_assert (target_node);
33407 new_target = TREE_TARGET_OPTION (target_node);
33408 gcc_assert (new_target);
33410 if (new_target->arch_specified && new_target->arch > 0)
33412 switch (new_target->arch)
33414 case PROCESSOR_CORE2:
33415 arg_str = "core2";
33416 priority = P_PROC_SSSE3;
33417 break;
33418 case PROCESSOR_NEHALEM:
33419 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
33420 arg_str = "westmere";
33421 else
33422 /* We translate "arch=corei7" and "arch=nehalem" to
33423 "corei7" so that it will be mapped to M_INTEL_COREI7
33424 as cpu type to cover all M_INTEL_COREI7_XXXs. */
33425 arg_str = "corei7";
33426 priority = P_PROC_SSE4_2;
33427 break;
33428 case PROCESSOR_SANDYBRIDGE:
33429 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
33430 arg_str = "ivybridge";
33431 else
33432 arg_str = "sandybridge";
33433 priority = P_PROC_AVX;
33434 break;
33435 case PROCESSOR_HASWELL:
33436 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
33437 arg_str = "skylake-avx512";
33438 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
33439 arg_str = "skylake";
33440 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
33441 arg_str = "broadwell";
33442 else
33443 arg_str = "haswell";
33444 priority = P_PROC_AVX2;
33445 break;
33446 case PROCESSOR_BONNELL:
33447 arg_str = "bonnell";
33448 priority = P_PROC_SSSE3;
33449 break;
33450 case PROCESSOR_KNL:
33451 arg_str = "knl";
33452 priority = P_PROC_AVX512F;
33453 break;
33454 case PROCESSOR_SILVERMONT:
33455 arg_str = "silvermont";
33456 priority = P_PROC_SSE4_2;
33457 break;
33458 case PROCESSOR_AMDFAM10:
33459 arg_str = "amdfam10h";
33460 priority = P_PROC_SSE4_A;
33461 break;
33462 case PROCESSOR_BTVER1:
33463 arg_str = "btver1";
33464 priority = P_PROC_SSE4_A;
33465 break;
33466 case PROCESSOR_BTVER2:
33467 arg_str = "btver2";
33468 priority = P_PROC_BMI;
33469 break;
33470 case PROCESSOR_BDVER1:
33471 arg_str = "bdver1";
33472 priority = P_PROC_XOP;
33473 break;
33474 case PROCESSOR_BDVER2:
33475 arg_str = "bdver2";
33476 priority = P_PROC_FMA;
33477 break;
33478 case PROCESSOR_BDVER3:
33479 arg_str = "bdver3";
33480 priority = P_PROC_FMA;
33481 break;
33482 case PROCESSOR_BDVER4:
33483 arg_str = "bdver4";
33484 priority = P_PROC_AVX2;
33485 break;
33486 case PROCESSOR_ZNVER1:
33487 arg_str = "znver1";
33488 priority = P_PROC_AVX2;
33489 break;
33493 cl_target_option_restore (&global_options, &cur_target);
33495 if (predicate_list && arg_str == NULL)
33497 error_at (DECL_SOURCE_LOCATION (decl),
33498 "No dispatcher found for the versioning attributes");
33499 return 0;
33502 if (predicate_list)
33504 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
33505 /* For a C string literal the length includes the trailing NULL. */
33506 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
33507 predicate_chain = tree_cons (predicate_decl, predicate_arg,
33508 predicate_chain);
33512 /* Process feature name. */
33513 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
33514 strcpy (tok_str, attrs_str);
33515 token = strtok (tok_str, ",");
33516 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
33518 while (token != NULL)
33520 /* Do not process "arch=" */
33521 if (strncmp (token, "arch=", 5) == 0)
33523 token = strtok (NULL, ",");
33524 continue;
33526 for (i = 0; i < NUM_FEATURES; ++i)
33528 if (strcmp (token, feature_list[i].name) == 0)
33530 if (predicate_list)
33532 predicate_arg = build_string_literal (
33533 strlen (feature_list[i].name) + 1,
33534 feature_list[i].name);
33535 predicate_chain = tree_cons (predicate_decl, predicate_arg,
33536 predicate_chain);
33538 /* Find the maximum priority feature. */
33539 if (feature_list[i].priority > priority)
33540 priority = feature_list[i].priority;
33542 break;
33545 if (predicate_list && i == NUM_FEATURES)
33547 error_at (DECL_SOURCE_LOCATION (decl),
33548 "No dispatcher found for %s", token);
33549 return 0;
33551 token = strtok (NULL, ",");
33553 free (tok_str);
33555 if (predicate_list && predicate_chain == NULL_TREE)
33557 error_at (DECL_SOURCE_LOCATION (decl),
33558 "No dispatcher found for the versioning attributes : %s",
33559 attrs_str);
33560 return 0;
33562 else if (predicate_list)
33564 predicate_chain = nreverse (predicate_chain);
33565 *predicate_list = predicate_chain;
33568 return priority;
33571 /* This compares the priority of target features in function DECL1
33572 and DECL2. It returns positive value if DECL1 is higher priority,
33573 negative value if DECL2 is higher priority and 0 if they are the
33574 same. */
33576 static int
33577 ix86_compare_version_priority (tree decl1, tree decl2)
33579 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
33580 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
33582 return (int)priority1 - (int)priority2;
33585 /* V1 and V2 point to function versions with different priorities
33586 based on the target ISA. This function compares their priorities. */
33588 static int
33589 feature_compare (const void *v1, const void *v2)
33591 typedef struct _function_version_info
33593 tree version_decl;
33594 tree predicate_chain;
33595 unsigned int dispatch_priority;
33596 } function_version_info;
33598 const function_version_info c1 = *(const function_version_info *)v1;
33599 const function_version_info c2 = *(const function_version_info *)v2;
33600 return (c2.dispatch_priority - c1.dispatch_priority);
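/* Illustrative note (not part of the original source): returning c2 - c1
   makes qsort order the array by *descending* priority, so e.g. a
   P_PROC_AVX2 version sorts ahead of a P_PROC_SSSE3 one and is therefore
   tested first by the dispatcher.  */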
33603 /* This function generates the dispatch function for
33604 multi-versioned functions. DISPATCH_DECL is the function which will
33605 contain the dispatch logic. FNDECLS are the function choices for
33606 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
33607 in DISPATCH_DECL in which the dispatch code is generated. */
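/* Illustrative sketch (not part of the original source) of the resolver
   body this produces, in rough C terms:

     __builtin_cpu_init ();
     if (<highest-priority predicate chain holds>)
       return (void *) &that_version;
     ...
     if (<lowest-priority predicate chain holds>)
       return (void *) &that_version;
     return (void *) &default_version;

   The per-version conditions are emitted by add_condition_to_bb above,
   highest dispatch priority first.  */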
33609 static int
33610 dispatch_function_versions (tree dispatch_decl,
33611 void *fndecls_p,
33612 basic_block *empty_bb)
33614 tree default_decl;
33615 gimple *ifunc_cpu_init_stmt;
33616 gimple_seq gseq;
33617 int ix;
33618 tree ele;
33619 vec<tree> *fndecls;
33620 unsigned int num_versions = 0;
33621 unsigned int actual_versions = 0;
33622 unsigned int i;
33624 struct _function_version_info
33626 tree version_decl;
33627 tree predicate_chain;
33628 unsigned int dispatch_priority;
33629 }*function_version_info;
33631 gcc_assert (dispatch_decl != NULL
33632 && fndecls_p != NULL
33633 && empty_bb != NULL);
33635 /* fndecls_p is actually a vector. */
33636 fndecls = static_cast<vec<tree> *> (fndecls_p);
33638 /* At least one more version other than the default. */
33639 num_versions = fndecls->length ();
33640 gcc_assert (num_versions >= 2);
33642 function_version_info = (struct _function_version_info *)
33643 XNEWVEC (struct _function_version_info, (num_versions - 1));
33645 /* The first version in the vector is the default decl. */
33646 default_decl = (*fndecls)[0];
33648 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
33650 gseq = bb_seq (*empty_bb);
33651 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
33652 constructors, so explicitly call __builtin_cpu_init here. */
33653 ifunc_cpu_init_stmt = gimple_build_call_vec (
33654 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
33655 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
33656 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
33657 set_bb_seq (*empty_bb, gseq);
33659 pop_cfun ();
33662 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
33664 tree version_decl = ele;
33665 tree predicate_chain = NULL_TREE;
33666 unsigned int priority;
33667 /* Get attribute string, parse it and find the right predicate decl.
33668 The predicate function could be a lengthy combination of many
33669 features, like arch-type and various isa-variants. */
33670 priority = get_builtin_code_for_version (version_decl,
33671 &predicate_chain);
33673 if (predicate_chain == NULL_TREE)
33674 continue;
33676 function_version_info [actual_versions].version_decl = version_decl;
33677 function_version_info [actual_versions].predicate_chain
33678 = predicate_chain;
33679 function_version_info [actual_versions].dispatch_priority = priority;
33680 actual_versions++;
33683 /* Sort the versions according to descending order of dispatch priority. The
33684 priority is based on the ISA. This is not a perfect solution. There
33685 could still be ambiguity. If more than one function version is suitable
33686 to execute, which one should be dispatched? In the future, allow the user
33687 to specify a dispatch priority next to the version. */
33688 qsort (function_version_info, actual_versions,
33689 sizeof (struct _function_version_info), feature_compare);
33691 for (i = 0; i < actual_versions; ++i)
33692 *empty_bb = add_condition_to_bb (dispatch_decl,
33693 function_version_info[i].version_decl,
33694 function_version_info[i].predicate_chain,
33695 *empty_bb);
33697 /* Dispatch the default version at the end. */
33698 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
33699 NULL, *empty_bb);
33701 free (function_version_info);
33702 return 0;
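/* Conceptually, the resolver body produced here and by add_condition_to_bb
   amounts to the following sketch (illustrative only; the real output is
   GIMPLE, the predicates come from the predicate chains built above, and
   the mangled names are those produced by
   ix86_mangle_function_version_assembler_name below):

     __builtin_cpu_init ();
     if (<predicate of highest-priority version>)   e.g. an AVX2 check
       return foo.avx2;
     if (<predicate of next version>)
       return foo.arch_atom;
     return foo;                                    the default, last  */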
33705 /* This function changes the assembler name for functions that are
33706 versions. If DECL is a function version and has a "target"
33707 attribute, it appends the attribute string to its assembler name. */
33709 static tree
33710 ix86_mangle_function_version_assembler_name (tree decl, tree id)
33712 tree version_attr;
33713 const char *orig_name, *version_string;
33714 char *attr_str, *assembler_name;
33716 if (DECL_DECLARED_INLINE_P (decl)
33717 && lookup_attribute ("gnu_inline",
33718 DECL_ATTRIBUTES (decl)))
33719 error_at (DECL_SOURCE_LOCATION (decl),
33720 "Function versions cannot be marked as gnu_inline,"
33721 " bodies have to be generated");
33723 if (DECL_VIRTUAL_P (decl)
33724 || DECL_VINDEX (decl))
33725 sorry ("Virtual function multiversioning not supported");
33727 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
33729 /* target attribute string cannot be NULL. */
33730 gcc_assert (version_attr != NULL_TREE);
33732 orig_name = IDENTIFIER_POINTER (id);
33733 version_string
33734 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
33736 if (strcmp (version_string, "default") == 0)
33737 return id;
33739 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
33740 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
33742 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
33744 /* Allow assembler name to be modified if already set. */
33745 if (DECL_ASSEMBLER_NAME_SET_P (decl))
33746 SET_DECL_RTL (decl, NULL);
33748 tree ret = get_identifier (assembler_name);
33749 XDELETEVEC (attr_str);
33750 XDELETEVEC (assembler_name);
33751 return ret;
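/* For example (illustrative): a version declared as

     __attribute__ ((target ("avx2"))) int foo (void);

   keeps "foo" as its source-level name but gets the assembler name
   "foo.avx2", while the version declared with target ("default") keeps the
   plain assembler name "foo".  The suffix is the attribute string as
   normalized by sorted_attr_string (e.g. "arch=atom" comes out as
   "arch_atom").  */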
33755 static tree
33756 ix86_mangle_decl_assembler_name (tree decl, tree id)
33758 /* For function version, add the target suffix to the assembler name. */
33759 if (TREE_CODE (decl) == FUNCTION_DECL
33760 && DECL_FUNCTION_VERSIONED (decl))
33761 id = ix86_mangle_function_version_assembler_name (decl, id);
33762 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
33763 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
33764 #endif
33766 return id;
33769 /* Make a dispatcher declaration for the multi-versioned function DECL.
33770 Calls to the function DECL will be replaced with calls to the dispatcher
33771 by the front-end. Returns the decl of the dispatcher function. */
33773 static tree
33774 ix86_get_function_versions_dispatcher (void *decl)
33776 tree fn = (tree) decl;
33777 struct cgraph_node *node = NULL;
33778 struct cgraph_node *default_node = NULL;
33779 struct cgraph_function_version_info *node_v = NULL;
33780 struct cgraph_function_version_info *first_v = NULL;
33782 tree dispatch_decl = NULL;
33784 struct cgraph_function_version_info *default_version_info = NULL;
33786 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
33788 node = cgraph_node::get (fn);
33789 gcc_assert (node != NULL);
33791 node_v = node->function_version ();
33792 gcc_assert (node_v != NULL);
33794 if (node_v->dispatcher_resolver != NULL)
33795 return node_v->dispatcher_resolver;
33797 /* Find the default version and make it the first node. */
33798 first_v = node_v;
33799 /* Go to the beginning of the chain. */
33800 while (first_v->prev != NULL)
33801 first_v = first_v->prev;
33802 default_version_info = first_v;
33803 while (default_version_info != NULL)
33805 if (is_function_default_version
33806 (default_version_info->this_node->decl))
33807 break;
33808 default_version_info = default_version_info->next;
33811 /* If there is no default node, just return NULL. */
33812 if (default_version_info == NULL)
33813 return NULL;
33815 /* Make default info the first node. */
33816 if (first_v != default_version_info)
33818 default_version_info->prev->next = default_version_info->next;
33819 if (default_version_info->next)
33820 default_version_info->next->prev = default_version_info->prev;
33821 first_v->prev = default_version_info;
33822 default_version_info->next = first_v;
33823 default_version_info->prev = NULL;
33826 default_node = default_version_info->this_node;
33828 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
33829 if (targetm.has_ifunc_p ())
33831 struct cgraph_function_version_info *it_v = NULL;
33832 struct cgraph_node *dispatcher_node = NULL;
33833 struct cgraph_function_version_info *dispatcher_version_info = NULL;
33835 /* Right now, the dispatching is done via ifunc. */
33836 dispatch_decl = make_dispatcher_decl (default_node->decl);
33838 dispatcher_node = cgraph_node::get_create (dispatch_decl);
33839 gcc_assert (dispatcher_node != NULL);
33840 dispatcher_node->dispatcher_function = 1;
33841 dispatcher_version_info
33842 = dispatcher_node->insert_new_function_version ();
33843 dispatcher_version_info->next = default_version_info;
33844 dispatcher_node->definition = 1;
33846 /* Set the dispatcher for all the versions. */
33847 it_v = default_version_info;
33848 while (it_v != NULL)
33850 it_v->dispatcher_resolver = dispatch_decl;
33851 it_v = it_v->next;
33854 else
33855 #endif
33857 error_at (DECL_SOURCE_LOCATION (default_node->decl),
33858 "multiversioning needs ifunc which is not supported "
33859 "on this target");
33862 return dispatch_decl;
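/* A minimal source-level example of what reaches this hook (illustrative):

     __attribute__ ((target ("default"))) int foo (void) { return 0; }
     __attribute__ ((target ("avx2")))    int foo (void) { return 1; }

     int call_foo (void) { return foo (); }

   The front end replaces the call to foo with a call to the dispatcher
   returned here; the dispatcher itself is resolved through an IFUNC at
   load time.  */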
33865 /* Make the resolver function decl to dispatch the versions of
33866 a multi-versioned function, DEFAULT_DECL. DISPATCH_DECL is the decl that
33867 will carry the "ifunc" attribute pointing at the resolver. Create an empty
33868 basic block in the resolver, store the pointer in EMPTY_BB, and return the decl of the resolver function. */
33870 static tree
33871 make_resolver_func (const tree default_decl,
33872 const tree dispatch_decl,
33873 basic_block *empty_bb)
33875 char *resolver_name;
33876 tree decl, type, decl_name, t;
33877 bool is_uniq = false;
33879 /* IFUNCs have to be globally visible. So, if the default_decl is
33880 not, then the name of the IFUNC should be made unique. */
33881 if (TREE_PUBLIC (default_decl) == 0)
33882 is_uniq = true;
33884 /* Append the filename to the resolver function if the versions are
33885 not externally visible. This is because the resolver function has
33886 to be externally visible for the loader to find it. So, appending
33887 the filename will prevent conflicts with a resolver function from
33888 another module which is based on the same version name. */
33889 resolver_name = make_unique_name (default_decl, "resolver", is_uniq);
33891 /* The resolver function should return a (void *). */
33892 type = build_function_type_list (ptr_type_node, NULL_TREE);
33894 decl = build_fn_decl (resolver_name, type);
33895 decl_name = get_identifier (resolver_name);
33896 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
33898 DECL_NAME (decl) = decl_name;
33899 TREE_USED (decl) = 1;
33900 DECL_ARTIFICIAL (decl) = 1;
33901 DECL_IGNORED_P (decl) = 0;
33902 /* IFUNC resolvers have to be externally visible. */
33903 TREE_PUBLIC (decl) = 1;
33904 DECL_UNINLINABLE (decl) = 1;
33906 /* Resolver is not external, body is generated. */
33907 DECL_EXTERNAL (decl) = 0;
33908 DECL_EXTERNAL (dispatch_decl) = 0;
33910 DECL_CONTEXT (decl) = NULL_TREE;
33911 DECL_INITIAL (decl) = make_node (BLOCK);
33912 DECL_STATIC_CONSTRUCTOR (decl) = 0;
33914 if (DECL_COMDAT_GROUP (default_decl)
33915 || TREE_PUBLIC (default_decl))
33917 /* In this case, each translation unit with a call to this
33918 versioned function will put out a resolver. Ensure it
33919 is comdat to keep just one copy. */
33920 DECL_COMDAT (decl) = 1;
33921 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
33923 /* Build result decl and add to function_decl. */
33924 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
33925 DECL_ARTIFICIAL (t) = 1;
33926 DECL_IGNORED_P (t) = 1;
33927 DECL_RESULT (decl) = t;
33929 gimplify_function_tree (decl);
33930 push_cfun (DECL_STRUCT_FUNCTION (decl));
33931 *empty_bb = init_lowered_empty_function (decl, false,
33932 profile_count::uninitialized ());
33934 cgraph_node::add_new_function (decl, true);
33935 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
33937 pop_cfun ();
33939 gcc_assert (dispatch_decl != NULL);
33940 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
33941 DECL_ATTRIBUTES (dispatch_decl)
33942 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
33944 /* Create the alias for dispatch to resolver here. */
33945 /*cgraph_create_function_alias (dispatch_decl, decl);*/
33946 cgraph_node::create_same_body_alias (dispatch_decl, decl);
33947 XDELETEVEC (resolver_name);
33948 return decl;
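/* Roughly, the decls set up above correspond to this pseudo-C sketch
   (illustrative; the resolver name is normally "<default>.resolver", made
   unique with the file name when the versions are not externally visible):

     void *foo.resolver (void);   <- body filled in later by
                                     dispatch_function_versions
     int foo (void) __attribute__ ((ifunc ("foo.resolver")));  */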
33951 /* Generate the dispatching code body to dispatch multi-versioned function
33952 DECL. The target hook is called to process the "target" attributes and
33953 provide the code to dispatch the right function at run-time. NODE points
33954 to the dispatcher decl whose body will be created. */
33956 static tree
33957 ix86_generate_version_dispatcher_body (void *node_p)
33959 tree resolver_decl;
33960 basic_block empty_bb;
33961 tree default_ver_decl;
33962 struct cgraph_node *versn;
33963 struct cgraph_node *node;
33965 struct cgraph_function_version_info *node_version_info = NULL;
33966 struct cgraph_function_version_info *versn_info = NULL;
33968 node = (cgraph_node *)node_p;
33970 node_version_info = node->function_version ();
33971 gcc_assert (node->dispatcher_function
33972 && node_version_info != NULL);
33974 if (node_version_info->dispatcher_resolver)
33975 return node_version_info->dispatcher_resolver;
33977 /* The first version in the chain corresponds to the default version. */
33978 default_ver_decl = node_version_info->next->this_node->decl;
33980 /* node is going to be an alias, so remove the finalized bit. */
33981 node->definition = false;
33983 resolver_decl = make_resolver_func (default_ver_decl,
33984 node->decl, &empty_bb);
33986 node_version_info->dispatcher_resolver = resolver_decl;
33988 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
33990 auto_vec<tree, 2> fn_ver_vec;
33992 for (versn_info = node_version_info->next; versn_info;
33993 versn_info = versn_info->next)
33995 versn = versn_info->this_node;
33996 /* Check for virtual functions here again, as by this time it should
33997 have been determined if this function needs a vtable index or
33998 not. This happens for methods in derived classes that override
33999 virtual methods in base classes but are not explicitly marked as
34000 virtual. */
34001 if (DECL_VINDEX (versn->decl))
34002 sorry ("Virtual function multiversioning not supported");
34004 fn_ver_vec.safe_push (versn->decl);
34007 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
34008 cgraph_edge::rebuild_edges ();
34009 pop_cfun ();
34010 return resolver_decl;
34012 /* This builds the processor_model struct type defined in
34013 libgcc/config/i386/cpuinfo.c */
34015 static tree
34016 build_processor_model_struct (void)
34018 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
34019 "__cpu_features"};
34020 tree field = NULL_TREE, field_chain = NULL_TREE;
34021 int i;
34022 tree type = make_node (RECORD_TYPE);
34024 /* The first 3 fields are unsigned int. */
34025 for (i = 0; i < 3; ++i)
34027 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
34028 get_identifier (field_name[i]), unsigned_type_node);
34029 if (field_chain != NULL_TREE)
34030 DECL_CHAIN (field) = field_chain;
34031 field_chain = field;
34034 /* The last field is an array of unsigned integers of size one. */
34035 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
34036 get_identifier (field_name[3]),
34037 build_array_type (unsigned_type_node,
34038 build_index_type (size_one_node)));
34039 if (field_chain != NULL_TREE)
34040 DECL_CHAIN (field) = field_chain;
34041 field_chain = field;
34043 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
34044 return type;
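/* The layout built above is intended to match the definition in
   libgcc/config/i386/cpuinfo.c, roughly:

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };  */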
34047 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
34049 static tree
34050 make_var_decl (tree type, const char *name)
34052 tree new_decl;
34054 new_decl = build_decl (UNKNOWN_LOCATION,
34055 VAR_DECL,
34056 get_identifier (name),
34057 type);
34059 DECL_EXTERNAL (new_decl) = 1;
34060 TREE_STATIC (new_decl) = 1;
34061 TREE_PUBLIC (new_decl) = 1;
34062 DECL_INITIAL (new_decl) = 0;
34063 DECL_ARTIFICIAL (new_decl) = 0;
34064 DECL_PRESERVE_P (new_decl) = 1;
34066 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
34067 assemble_variable (new_decl, 0, 0, 0);
34069 return new_decl;
34072 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
34073 into a check against the __cpu_model data defined in libgcc/config/i386/cpuinfo.c. */
34075 static tree
34076 fold_builtin_cpu (tree fndecl, tree *args)
34078 unsigned int i;
34079 enum ix86_builtins fn_code = (enum ix86_builtins)
34080 DECL_FUNCTION_CODE (fndecl);
34081 tree param_string_cst = NULL;
34083 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
34084 enum processor_features
34086 F_CMOV = 0,
34087 F_MMX,
34088 F_POPCNT,
34089 F_SSE,
34090 F_SSE2,
34091 F_SSE3,
34092 F_SSSE3,
34093 F_SSE4_1,
34094 F_SSE4_2,
34095 F_AVX,
34096 F_AVX2,
34097 F_SSE4_A,
34098 F_FMA4,
34099 F_XOP,
34100 F_FMA,
34101 F_AVX512F,
34102 F_BMI,
34103 F_BMI2,
34104 F_AES,
34105 F_PCLMUL,
34106 F_AVX512VL,
34107 F_AVX512BW,
34108 F_AVX512DQ,
34109 F_AVX512CD,
34110 F_AVX512ER,
34111 F_AVX512PF,
34112 F_AVX512VBMI,
34113 F_AVX512IFMA,
34114 F_AVX5124VNNIW,
34115 F_AVX5124FMAPS,
34116 F_AVX512VPOPCNTDQ,
34117 F_MAX
34120 /* These are the values for vendor types, CPU types and subtypes used
34121 in cpuinfo.c. CPU type and subtype values have the corresponding
34122 start value subtracted before they are compared against the fields. */
34123 enum processor_model
34125 M_INTEL = 1,
34126 M_AMD,
34127 M_CPU_TYPE_START,
34128 M_INTEL_BONNELL,
34129 M_INTEL_CORE2,
34130 M_INTEL_COREI7,
34131 M_AMDFAM10H,
34132 M_AMDFAM15H,
34133 M_INTEL_SILVERMONT,
34134 M_INTEL_KNL,
34135 M_AMD_BTVER1,
34136 M_AMD_BTVER2,
34137 M_CPU_SUBTYPE_START,
34138 M_INTEL_COREI7_NEHALEM,
34139 M_INTEL_COREI7_WESTMERE,
34140 M_INTEL_COREI7_SANDYBRIDGE,
34141 M_AMDFAM10H_BARCELONA,
34142 M_AMDFAM10H_SHANGHAI,
34143 M_AMDFAM10H_ISTANBUL,
34144 M_AMDFAM15H_BDVER1,
34145 M_AMDFAM15H_BDVER2,
34146 M_AMDFAM15H_BDVER3,
34147 M_AMDFAM15H_BDVER4,
34148 M_AMDFAM17H_ZNVER1,
34149 M_INTEL_COREI7_IVYBRIDGE,
34150 M_INTEL_COREI7_HASWELL,
34151 M_INTEL_COREI7_BROADWELL,
34152 M_INTEL_COREI7_SKYLAKE,
34153 M_INTEL_COREI7_SKYLAKE_AVX512
34156 static struct _arch_names_table
34158 const char *const name;
34159 const enum processor_model model;
34161 const arch_names_table[] =
34163 {"amd", M_AMD},
34164 {"intel", M_INTEL},
34165 {"atom", M_INTEL_BONNELL},
34166 {"slm", M_INTEL_SILVERMONT},
34167 {"core2", M_INTEL_CORE2},
34168 {"corei7", M_INTEL_COREI7},
34169 {"nehalem", M_INTEL_COREI7_NEHALEM},
34170 {"westmere", M_INTEL_COREI7_WESTMERE},
34171 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
34172 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
34173 {"haswell", M_INTEL_COREI7_HASWELL},
34174 {"broadwell", M_INTEL_COREI7_BROADWELL},
34175 {"skylake", M_INTEL_COREI7_SKYLAKE},
34176 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
34177 {"bonnell", M_INTEL_BONNELL},
34178 {"silvermont", M_INTEL_SILVERMONT},
34179 {"knl", M_INTEL_KNL},
34180 {"amdfam10h", M_AMDFAM10H},
34181 {"barcelona", M_AMDFAM10H_BARCELONA},
34182 {"shanghai", M_AMDFAM10H_SHANGHAI},
34183 {"istanbul", M_AMDFAM10H_ISTANBUL},
34184 {"btver1", M_AMD_BTVER1},
34185 {"amdfam15h", M_AMDFAM15H},
34186 {"bdver1", M_AMDFAM15H_BDVER1},
34187 {"bdver2", M_AMDFAM15H_BDVER2},
34188 {"bdver3", M_AMDFAM15H_BDVER3},
34189 {"bdver4", M_AMDFAM15H_BDVER4},
34190 {"btver2", M_AMD_BTVER2},
34191 {"znver1", M_AMDFAM17H_ZNVER1},
34194 static struct _isa_names_table
34196 const char *const name;
34197 const enum processor_features feature;
34199 const isa_names_table[] =
34201 {"cmov", F_CMOV},
34202 {"mmx", F_MMX},
34203 {"popcnt", F_POPCNT},
34204 {"sse", F_SSE},
34205 {"sse2", F_SSE2},
34206 {"sse3", F_SSE3},
34207 {"ssse3", F_SSSE3},
34208 {"sse4a", F_SSE4_A},
34209 {"sse4.1", F_SSE4_1},
34210 {"sse4.2", F_SSE4_2},
34211 {"avx", F_AVX},
34212 {"fma4", F_FMA4},
34213 {"xop", F_XOP},
34214 {"fma", F_FMA},
34215 {"avx2", F_AVX2},
34216 {"avx512f", F_AVX512F},
34217 {"bmi", F_BMI},
34218 {"bmi2", F_BMI2},
34219 {"aes", F_AES},
34220 {"pclmul", F_PCLMUL},
34221 {"avx512vl",F_AVX512VL},
34222 {"avx512bw",F_AVX512BW},
34223 {"avx512dq",F_AVX512DQ},
34224 {"avx512cd",F_AVX512CD},
34225 {"avx512er",F_AVX512ER},
34226 {"avx512pf",F_AVX512PF},
34227 {"avx512vbmi",F_AVX512VBMI},
34228 {"avx512ifma",F_AVX512IFMA},
34229 {"avx5124vnniw",F_AVX5124VNNIW},
34230 {"avx5124fmaps",F_AVX5124FMAPS},
34231 {"avx512vpopcntdq",F_AVX512VPOPCNTDQ}
34234 tree __processor_model_type = build_processor_model_struct ();
34235 tree __cpu_model_var = make_var_decl (__processor_model_type,
34236 "__cpu_model");
34239 varpool_node::add (__cpu_model_var);
34241 gcc_assert ((args != NULL) && (*args != NULL));
34243 param_string_cst = *args;
34244 while (param_string_cst
34245 && TREE_CODE (param_string_cst) != STRING_CST)
34247 /* *args must be an expr that can contain other EXPRS leading to a
34248 STRING_CST. */
34249 if (!EXPR_P (param_string_cst))
34251 error ("Parameter to builtin must be a string constant or literal");
34252 return integer_zero_node;
34254 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
34257 gcc_assert (param_string_cst);
34259 if (fn_code == IX86_BUILTIN_CPU_IS)
34261 tree ref;
34262 tree field;
34263 tree final;
34265 unsigned int field_val = 0;
34266 unsigned int NUM_ARCH_NAMES
34267 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
34269 for (i = 0; i < NUM_ARCH_NAMES; i++)
34270 if (strcmp (arch_names_table[i].name,
34271 TREE_STRING_POINTER (param_string_cst)) == 0)
34272 break;
34274 if (i == NUM_ARCH_NAMES)
34276 error ("Parameter to builtin not valid: %s",
34277 TREE_STRING_POINTER (param_string_cst));
34278 return integer_zero_node;
34281 field = TYPE_FIELDS (__processor_model_type);
34282 field_val = arch_names_table[i].model;
34284 /* CPU types are stored in the next field. */
34285 if (field_val > M_CPU_TYPE_START
34286 && field_val < M_CPU_SUBTYPE_START)
34288 field = DECL_CHAIN (field);
34289 field_val -= M_CPU_TYPE_START;
34292 /* CPU subtypes are stored in the next field. */
34293 if (field_val > M_CPU_SUBTYPE_START)
34295 field = DECL_CHAIN (DECL_CHAIN (field));
34296 field_val -= M_CPU_SUBTYPE_START;
34299 /* Get the appropriate field in __cpu_model. */
34300 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
34301 field, NULL_TREE);
34303 /* Check the value. */
34304 final = build2 (EQ_EXPR, unsigned_type_node, ref,
34305 build_int_cstu (unsigned_type_node, field_val));
34306 return build1 (CONVERT_EXPR, integer_type_node, final);
34308 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
34310 tree ref;
34311 tree array_elt;
34312 tree field;
34313 tree final;
34315 unsigned int field_val = 0;
34316 unsigned int NUM_ISA_NAMES
34317 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
34319 for (i = 0; i < NUM_ISA_NAMES; i++)
34320 if (strcmp (isa_names_table[i].name,
34321 TREE_STRING_POINTER (param_string_cst)) == 0)
34322 break;
34324 if (i == NUM_ISA_NAMES)
34326 error ("Parameter to builtin not valid: %s",
34327 TREE_STRING_POINTER (param_string_cst));
34328 return integer_zero_node;
34331 field = TYPE_FIELDS (__processor_model_type);
34332 /* Get the last field, which is __cpu_features. */
34333 while (DECL_CHAIN (field))
34334 field = DECL_CHAIN (field);
34336 /* Get the appropriate field: __cpu_model.__cpu_features */
34337 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
34338 field, NULL_TREE);
34340 /* Access the 0th element of __cpu_features array. */
34341 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
34342 integer_zero_node, NULL_TREE, NULL_TREE);
34344 field_val = (1 << isa_names_table[i].feature);
34345 /* Return __cpu_model.__cpu_features[0] & field_val */
34346 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
34347 build_int_cstu (unsigned_type_node, field_val));
34348 return build1 (CONVERT_EXPR, integer_type_node, final);
34350 gcc_unreachable ();
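/* In other words, the folds above are equivalent to the following sketches
   (illustrative):

     __builtin_cpu_is ("amd")
       -> (int) (__cpu_model.__cpu_vendor == M_AMD)
     __builtin_cpu_is ("haswell")
       -> (int) (__cpu_model.__cpu_subtype
                 == M_INTEL_COREI7_HASWELL - M_CPU_SUBTYPE_START)
     __builtin_cpu_supports ("avx2")
       -> (int) (__cpu_model.__cpu_features[0] & (1 << F_AVX2))  */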
34353 static tree
34354 ix86_fold_builtin (tree fndecl, int n_args,
34355 tree *args, bool ignore ATTRIBUTE_UNUSED)
34357 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
34359 enum ix86_builtins fn_code = (enum ix86_builtins)
34360 DECL_FUNCTION_CODE (fndecl);
34361 switch (fn_code)
34363 case IX86_BUILTIN_CPU_IS:
34364 case IX86_BUILTIN_CPU_SUPPORTS:
34365 gcc_assert (n_args == 1);
34366 return fold_builtin_cpu (fndecl, args);
34368 case IX86_BUILTIN_NANQ:
34369 case IX86_BUILTIN_NANSQ:
34371 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34372 const char *str = c_getstr (*args);
34373 int quiet = fn_code == IX86_BUILTIN_NANQ;
34374 REAL_VALUE_TYPE real;
34376 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
34377 return build_real (type, real);
34378 return NULL_TREE;
34381 case IX86_BUILTIN_INFQ:
34382 case IX86_BUILTIN_HUGE_VALQ:
34384 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34385 REAL_VALUE_TYPE inf;
34386 real_inf (&inf);
34387 return build_real (type, inf);
34390 case IX86_BUILTIN_TZCNT16:
34391 case IX86_BUILTIN_CTZS:
34392 case IX86_BUILTIN_TZCNT32:
34393 case IX86_BUILTIN_TZCNT64:
34394 gcc_assert (n_args == 1);
34395 if (TREE_CODE (args[0]) == INTEGER_CST)
34397 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34398 tree arg = args[0];
34399 if (fn_code == IX86_BUILTIN_TZCNT16
34400 || fn_code == IX86_BUILTIN_CTZS)
34401 arg = fold_convert (short_unsigned_type_node, arg);
34402 if (integer_zerop (arg))
34403 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
34404 else
34405 return fold_const_call (CFN_CTZ, type, arg);
34407 break;
34409 case IX86_BUILTIN_LZCNT16:
34410 case IX86_BUILTIN_CLZS:
34411 case IX86_BUILTIN_LZCNT32:
34412 case IX86_BUILTIN_LZCNT64:
34413 gcc_assert (n_args == 1);
34414 if (TREE_CODE (args[0]) == INTEGER_CST)
34416 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34417 tree arg = args[0];
34418 if (fn_code == IX86_BUILTIN_LZCNT16
34419 || fn_code == IX86_BUILTIN_CLZS)
34420 arg = fold_convert (short_unsigned_type_node, arg);
34421 if (integer_zerop (arg))
34422 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
34423 else
34424 return fold_const_call (CFN_CLZ, type, arg);
34426 break;
34428 case IX86_BUILTIN_BEXTR32:
34429 case IX86_BUILTIN_BEXTR64:
34430 case IX86_BUILTIN_BEXTRI32:
34431 case IX86_BUILTIN_BEXTRI64:
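/* The second argument is a control word: bits 7:0 hold the start bit and
   bits 15:8 hold the field length.  E.g. __builtin_ia32_bextr_u32
   (0x12345678, 0x0804) extracts 8 bits starting at bit 4, giving
   (0x12345678 >> 4) & 0xff == 0x67.  */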
34432 gcc_assert (n_args == 2);
34433 if (tree_fits_uhwi_p (args[1]))
34435 unsigned HOST_WIDE_INT res = 0;
34436 unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
34437 unsigned int start = tree_to_uhwi (args[1]);
34438 unsigned int len = (start & 0xff00) >> 8;
34439 start &= 0xff;
34440 if (start >= prec || len == 0)
34441 res = 0;
34442 else if (!tree_fits_uhwi_p (args[0]))
34443 break;
34444 else
34445 res = tree_to_uhwi (args[0]) >> start;
34446 if (len > prec)
34447 len = prec;
34448 if (len < HOST_BITS_PER_WIDE_INT)
34449 res &= (HOST_WIDE_INT_1U << len) - 1;
34450 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34452 break;
34454 case IX86_BUILTIN_BZHI32:
34455 case IX86_BUILTIN_BZHI64:
34456 gcc_assert (n_args == 2);
34457 if (tree_fits_uhwi_p (args[1]))
34459 unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
34460 if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
34461 return args[0];
34462 if (!tree_fits_uhwi_p (args[0]))
34463 break;
34464 unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
34465 res &= ~(HOST_WIDE_INT_M1U << idx);
34466 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34468 break;
34470 case IX86_BUILTIN_PDEP32:
34471 case IX86_BUILTIN_PDEP64:
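/* PDEP scatters the low bits of the first argument into the bit positions
   selected by the mask.  E.g. with src == 0b101 and mask == 0b11010 the
   mask selects bits 1, 3 and 4, so the result is 0b10010 (bit 1 <- src
   bit 0, bit 3 <- src bit 1, bit 4 <- src bit 2).  */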
34472 gcc_assert (n_args == 2);
34473 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
34475 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
34476 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
34477 unsigned HOST_WIDE_INT res = 0;
34478 unsigned HOST_WIDE_INT m, k = 1;
34479 for (m = 1; m; m <<= 1)
34480 if ((mask & m) != 0)
34482 if ((src & k) != 0)
34483 res |= m;
34484 k <<= 1;
34486 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34488 break;
34490 case IX86_BUILTIN_PEXT32:
34491 case IX86_BUILTIN_PEXT64:
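/* PEXT is the inverse gather: the bits of the first argument at the
   positions selected by the mask are packed into the low bits of the
   result.  E.g. with src == 0b10010 and mask == 0b11010 the selected bits
   (positions 1, 3 and 4) are 1, 0 and 1, so the result is 0b101.  */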
34492 gcc_assert (n_args == 2);
34493 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
34495 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
34496 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
34497 unsigned HOST_WIDE_INT res = 0;
34498 unsigned HOST_WIDE_INT m, k = 1;
34499 for (m = 1; m; m <<= 1)
34500 if ((mask & m) != 0)
34502 if ((src & m) != 0)
34503 res |= k;
34504 k <<= 1;
34506 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34508 break;
34510 default:
34511 break;
34515 #ifdef SUBTARGET_FOLD_BUILTIN
34516 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
34517 #endif
34519 return NULL_TREE;
34522 /* Fold a MD builtin (use ix86_fold_builtin for folding into
34523 constant) in GIMPLE. */
34525 bool
34526 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
34528 gimple *stmt = gsi_stmt (*gsi);
34529 tree fndecl = gimple_call_fndecl (stmt);
34530 gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
34531 int n_args = gimple_call_num_args (stmt);
34532 enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
34533 tree decl = NULL_TREE;
34534 tree arg0, arg1;
34536 switch (fn_code)
34538 case IX86_BUILTIN_TZCNT32:
34539 decl = builtin_decl_implicit (BUILT_IN_CTZ);
34540 goto fold_tzcnt_lzcnt;
34542 case IX86_BUILTIN_TZCNT64:
34543 decl = builtin_decl_implicit (BUILT_IN_CTZLL);
34544 goto fold_tzcnt_lzcnt;
34546 case IX86_BUILTIN_LZCNT32:
34547 decl = builtin_decl_implicit (BUILT_IN_CLZ);
34548 goto fold_tzcnt_lzcnt;
34550 case IX86_BUILTIN_LZCNT64:
34551 decl = builtin_decl_implicit (BUILT_IN_CLZLL);
34552 goto fold_tzcnt_lzcnt;
34554 fold_tzcnt_lzcnt:
34555 gcc_assert (n_args == 1);
34556 arg0 = gimple_call_arg (stmt, 0);
34557 if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
34559 int prec = TYPE_PRECISION (TREE_TYPE (arg0));
34560 /* If arg0 is provably non-zero, optimize into generic
34561 __builtin_c[tl]z{,ll} function the middle-end handles
34562 better. */
34563 if (!expr_not_equal_to (arg0, wi::zero (prec)))
34564 return false;
34566 location_t loc = gimple_location (stmt);
34567 gimple *g = gimple_build_call (decl, 1, arg0);
34568 gimple_set_location (g, loc);
34569 tree lhs = make_ssa_name (integer_type_node);
34570 gimple_call_set_lhs (g, lhs);
34571 gsi_insert_before (gsi, g, GSI_SAME_STMT);
34572 g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
34573 gimple_set_location (g, loc);
34574 gsi_replace (gsi, g, false);
34575 return true;
34577 break;
34579 case IX86_BUILTIN_BZHI32:
34580 case IX86_BUILTIN_BZHI64:
34581 gcc_assert (n_args == 2);
34582 arg1 = gimple_call_arg (stmt, 1);
34583 if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
34585 unsigned int idx = tree_to_uhwi (arg1) & 0xff;
34586 arg0 = gimple_call_arg (stmt, 0);
34587 if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
34588 break;
34589 location_t loc = gimple_location (stmt);
34590 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
34591 gimple_set_location (g, loc);
34592 gsi_replace (gsi, g, false);
34593 return true;
34595 break;
34597 case IX86_BUILTIN_PDEP32:
34598 case IX86_BUILTIN_PDEP64:
34599 case IX86_BUILTIN_PEXT32:
34600 case IX86_BUILTIN_PEXT64:
34601 gcc_assert (n_args == 2);
34602 arg1 = gimple_call_arg (stmt, 1);
34603 if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
34605 location_t loc = gimple_location (stmt);
34606 arg0 = gimple_call_arg (stmt, 0);
34607 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
34608 gimple_set_location (g, loc);
34609 gsi_replace (gsi, g, false);
34610 return true;
34612 break;
34614 default:
34615 break;
34618 return false;
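/* As a sketch of the tzcnt/lzcnt transformation above (illustrative GIMPLE,
   assuming x_2 has been proved non-zero):

     _1 = __builtin_ia32_tzcnt_u32 (x_2);

   becomes

     _3 = __builtin_ctz (x_2);
     _1 = (unsigned int) _3;  */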
34621 /* Make builtins to detect cpu type and features supported. NAME is
34622 the builtin name, CODE is the builtin code, and FTYPE is the function
34623 type of the builtin. */
34625 static void
34626 make_cpu_type_builtin (const char* name, int code,
34627 enum ix86_builtin_func_type ftype, bool is_const)
34629 tree decl;
34630 tree type;
34632 type = ix86_get_builtin_func_type (ftype);
34633 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
34634 NULL, NULL_TREE);
34635 gcc_assert (decl != NULL_TREE);
34636 ix86_builtins[(int) code] = decl;
34637 TREE_READONLY (decl) = is_const;
34640 /* Make builtins to get CPU type and features supported. The created
34641 builtins are:
34643 __builtin_cpu_init (), to detect cpu type and features,
34644 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
34645 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
34648 static void
34649 ix86_init_platform_type_builtins (void)
34651 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
34652 INT_FTYPE_VOID, false);
34653 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
34654 INT_FTYPE_PCCHAR, true);
34655 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
34656 INT_FTYPE_PCCHAR, true);
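/* Typical user-level usage of these builtins (illustrative):

     int
     use_avx2_path (void)
     {
       __builtin_cpu_init ();
       if (__builtin_cpu_is ("intel") && __builtin_cpu_supports ("avx2"))
         return 1;
       return 0;
     }

   The explicit __builtin_cpu_init call is only needed from code that can
   run before static constructors, such as other constructors or IFUNC
   resolvers; otherwise __cpu_model has already been initialized by a
   constructor in libgcc.  */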
34659 /* Internal method for ix86_init_builtins. */
34661 static void
34662 ix86_init_builtins_va_builtins_abi (void)
34664 tree ms_va_ref, sysv_va_ref;
34665 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
34666 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
34667 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
34668 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
34670 if (!TARGET_64BIT)
34671 return;
34672 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
34673 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
34674 ms_va_ref = build_reference_type (ms_va_list_type_node);
34675 sysv_va_ref =
34676 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
34678 fnvoid_va_end_ms =
34679 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
34680 fnvoid_va_start_ms =
34681 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
34682 fnvoid_va_end_sysv =
34683 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
34684 fnvoid_va_start_sysv =
34685 build_varargs_function_type_list (void_type_node, sysv_va_ref,
34686 NULL_TREE);
34687 fnvoid_va_copy_ms =
34688 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
34689 NULL_TREE);
34690 fnvoid_va_copy_sysv =
34691 build_function_type_list (void_type_node, sysv_va_ref,
34692 sysv_va_ref, NULL_TREE);
34694 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
34695 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
34696 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
34697 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
34698 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
34699 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
34700 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
34701 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
34702 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
34703 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
34704 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
34705 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
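/* These let 64-bit code manipulate the other ABI's variable argument list
   explicitly, e.g. (illustrative sketch):

     void __attribute__ ((ms_abi))
     f (int n, ...)
     {
       __builtin_ms_va_list ap;
       __builtin_ms_va_start (ap, n);
       int first = __builtin_va_arg (ap, int);
       (void) first;
       __builtin_ms_va_end (ap);
     }  */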
34708 static void
34709 ix86_init_builtin_types (void)
34711 tree float80_type_node, const_string_type_node;
34713 /* The __float80 type. */
34714 float80_type_node = long_double_type_node;
34715 if (TYPE_MODE (float80_type_node) != XFmode)
34717 if (float64x_type_node != NULL_TREE
34718 && TYPE_MODE (float64x_type_node) == XFmode)
34719 float80_type_node = float64x_type_node;
34720 else
34722 /* The __float80 type. */
34723 float80_type_node = make_node (REAL_TYPE);
34725 TYPE_PRECISION (float80_type_node) = 80;
34726 layout_type (float80_type_node);
34729 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
34731 /* The __float128 type. The node has already been created as
34732 _Float128, so we only need to register the __float128 name for
34733 it. */
34734 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
34736 const_string_type_node
34737 = build_pointer_type (build_qualified_type
34738 (char_type_node, TYPE_QUAL_CONST));
34740 /* This macro is built by i386-builtin-types.awk. */
34741 DEFINE_BUILTIN_PRIMITIVE_TYPES;
34744 static void
34745 ix86_init_builtins (void)
34747 tree ftype, decl;
34749 ix86_init_builtin_types ();
34751 /* Builtins to get CPU type and features. */
34752 ix86_init_platform_type_builtins ();
34754 /* TFmode support builtins. */
34755 def_builtin_const (0, "__builtin_infq",
34756 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
34757 def_builtin_const (0, "__builtin_huge_valq",
34758 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
34760 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
34761 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
34762 BUILT_IN_MD, "nanq", NULL_TREE);
34763 TREE_READONLY (decl) = 1;
34764 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
34766 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
34767 BUILT_IN_MD, "nansq", NULL_TREE);
34768 TREE_READONLY (decl) = 1;
34769 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
34771 /* We will expand them to a normal call if SSE isn't available since
34772 they are used by libgcc. */
34773 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
34774 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
34775 BUILT_IN_MD, "__fabstf2", NULL_TREE);
34776 TREE_READONLY (decl) = 1;
34777 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
34779 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
34780 decl = add_builtin_function ("__builtin_copysignq", ftype,
34781 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
34782 "__copysigntf3", NULL_TREE);
34783 TREE_READONLY (decl) = 1;
34784 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
34786 ix86_init_tm_builtins ();
34787 ix86_init_mmx_sse_builtins ();
34788 ix86_init_mpx_builtins ();
34790 if (TARGET_LP64)
34791 ix86_init_builtins_va_builtins_abi ();
34793 #ifdef SUBTARGET_INIT_BUILTINS
34794 SUBTARGET_INIT_BUILTINS;
34795 #endif
34798 /* Return the ix86 builtin for CODE. */
34800 static tree
34801 ix86_builtin_decl (unsigned code, bool)
34803 if (code >= IX86_BUILTIN_MAX)
34804 return error_mark_node;
34806 return ix86_builtins[code];
34809 /* Errors in the source file can cause expand_expr to return const0_rtx
34810 where we expect a vector. To avoid crashing, use one of the vector
34811 clear instructions. */
34812 static rtx
34813 safe_vector_operand (rtx x, machine_mode mode)
34815 if (x == const0_rtx)
34816 x = CONST0_RTX (mode);
34817 return x;
34820 /* Fixup modeless constants to fit required mode. */
34821 static rtx
34822 fixup_modeless_constant (rtx x, machine_mode mode)
34824 if (GET_MODE (x) == VOIDmode)
34825 x = convert_to_mode (mode, x, 1);
34826 return x;
34829 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
34831 static rtx
34832 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
34834 rtx pat;
34835 tree arg0 = CALL_EXPR_ARG (exp, 0);
34836 tree arg1 = CALL_EXPR_ARG (exp, 1);
34837 rtx op0 = expand_normal (arg0);
34838 rtx op1 = expand_normal (arg1);
34839 machine_mode tmode = insn_data[icode].operand[0].mode;
34840 machine_mode mode0 = insn_data[icode].operand[1].mode;
34841 machine_mode mode1 = insn_data[icode].operand[2].mode;
34843 if (VECTOR_MODE_P (mode0))
34844 op0 = safe_vector_operand (op0, mode0);
34845 if (VECTOR_MODE_P (mode1))
34846 op1 = safe_vector_operand (op1, mode1);
34848 if (optimize || !target
34849 || GET_MODE (target) != tmode
34850 || !insn_data[icode].operand[0].predicate (target, tmode))
34851 target = gen_reg_rtx (tmode);
34853 if (GET_MODE (op1) == SImode && mode1 == TImode)
34855 rtx x = gen_reg_rtx (V4SImode);
34856 emit_insn (gen_sse2_loadd (x, op1));
34857 op1 = gen_lowpart (TImode, x);
34860 if (!insn_data[icode].operand[1].predicate (op0, mode0))
34861 op0 = copy_to_mode_reg (mode0, op0);
34862 if (!insn_data[icode].operand[2].predicate (op1, mode1))
34863 op1 = copy_to_mode_reg (mode1, op1);
34865 pat = GEN_FCN (icode) (target, op0, op1);
34866 if (! pat)
34867 return 0;
34869 emit_insn (pat);
34871 return target;
34874 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
34876 static rtx
34877 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
34878 enum ix86_builtin_func_type m_type,
34879 enum rtx_code sub_code)
34881 rtx pat;
34882 int i;
34883 int nargs;
34884 bool comparison_p = false;
34885 bool tf_p = false;
34886 bool last_arg_constant = false;
34887 int num_memory = 0;
34888 struct {
34889 rtx op;
34890 machine_mode mode;
34891 } args[4];
34893 machine_mode tmode = insn_data[icode].operand[0].mode;
34895 switch (m_type)
34897 case MULTI_ARG_4_DF2_DI_I:
34898 case MULTI_ARG_4_DF2_DI_I1:
34899 case MULTI_ARG_4_SF2_SI_I:
34900 case MULTI_ARG_4_SF2_SI_I1:
34901 nargs = 4;
34902 last_arg_constant = true;
34903 break;
34905 case MULTI_ARG_3_SF:
34906 case MULTI_ARG_3_DF:
34907 case MULTI_ARG_3_SF2:
34908 case MULTI_ARG_3_DF2:
34909 case MULTI_ARG_3_DI:
34910 case MULTI_ARG_3_SI:
34911 case MULTI_ARG_3_SI_DI:
34912 case MULTI_ARG_3_HI:
34913 case MULTI_ARG_3_HI_SI:
34914 case MULTI_ARG_3_QI:
34915 case MULTI_ARG_3_DI2:
34916 case MULTI_ARG_3_SI2:
34917 case MULTI_ARG_3_HI2:
34918 case MULTI_ARG_3_QI2:
34919 nargs = 3;
34920 break;
34922 case MULTI_ARG_2_SF:
34923 case MULTI_ARG_2_DF:
34924 case MULTI_ARG_2_DI:
34925 case MULTI_ARG_2_SI:
34926 case MULTI_ARG_2_HI:
34927 case MULTI_ARG_2_QI:
34928 nargs = 2;
34929 break;
34931 case MULTI_ARG_2_DI_IMM:
34932 case MULTI_ARG_2_SI_IMM:
34933 case MULTI_ARG_2_HI_IMM:
34934 case MULTI_ARG_2_QI_IMM:
34935 nargs = 2;
34936 last_arg_constant = true;
34937 break;
34939 case MULTI_ARG_1_SF:
34940 case MULTI_ARG_1_DF:
34941 case MULTI_ARG_1_SF2:
34942 case MULTI_ARG_1_DF2:
34943 case MULTI_ARG_1_DI:
34944 case MULTI_ARG_1_SI:
34945 case MULTI_ARG_1_HI:
34946 case MULTI_ARG_1_QI:
34947 case MULTI_ARG_1_SI_DI:
34948 case MULTI_ARG_1_HI_DI:
34949 case MULTI_ARG_1_HI_SI:
34950 case MULTI_ARG_1_QI_DI:
34951 case MULTI_ARG_1_QI_SI:
34952 case MULTI_ARG_1_QI_HI:
34953 nargs = 1;
34954 break;
34956 case MULTI_ARG_2_DI_CMP:
34957 case MULTI_ARG_2_SI_CMP:
34958 case MULTI_ARG_2_HI_CMP:
34959 case MULTI_ARG_2_QI_CMP:
34960 nargs = 2;
34961 comparison_p = true;
34962 break;
34964 case MULTI_ARG_2_SF_TF:
34965 case MULTI_ARG_2_DF_TF:
34966 case MULTI_ARG_2_DI_TF:
34967 case MULTI_ARG_2_SI_TF:
34968 case MULTI_ARG_2_HI_TF:
34969 case MULTI_ARG_2_QI_TF:
34970 nargs = 2;
34971 tf_p = true;
34972 break;
34974 default:
34975 gcc_unreachable ();
34978 if (optimize || !target
34979 || GET_MODE (target) != tmode
34980 || !insn_data[icode].operand[0].predicate (target, tmode))
34981 target = gen_reg_rtx (tmode);
34982 else if (memory_operand (target, tmode))
34983 num_memory++;
34985 gcc_assert (nargs <= 4);
34987 for (i = 0; i < nargs; i++)
34989 tree arg = CALL_EXPR_ARG (exp, i);
34990 rtx op = expand_normal (arg);
34991 int adjust = (comparison_p) ? 1 : 0;
34992 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
34994 if (last_arg_constant && i == nargs - 1)
34996 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
34998 enum insn_code new_icode = icode;
34999 switch (icode)
35001 case CODE_FOR_xop_vpermil2v2df3:
35002 case CODE_FOR_xop_vpermil2v4sf3:
35003 case CODE_FOR_xop_vpermil2v4df3:
35004 case CODE_FOR_xop_vpermil2v8sf3:
35005 error ("the last argument must be a 2-bit immediate");
35006 return gen_reg_rtx (tmode);
35007 case CODE_FOR_xop_rotlv2di3:
35008 new_icode = CODE_FOR_rotlv2di3;
35009 goto xop_rotl;
35010 case CODE_FOR_xop_rotlv4si3:
35011 new_icode = CODE_FOR_rotlv4si3;
35012 goto xop_rotl;
35013 case CODE_FOR_xop_rotlv8hi3:
35014 new_icode = CODE_FOR_rotlv8hi3;
35015 goto xop_rotl;
35016 case CODE_FOR_xop_rotlv16qi3:
35017 new_icode = CODE_FOR_rotlv16qi3;
35018 xop_rotl:
35019 if (CONST_INT_P (op))
35021 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
35022 op = GEN_INT (INTVAL (op) & mask);
35023 gcc_checking_assert
35024 (insn_data[icode].operand[i + 1].predicate (op, mode));
35026 else
35028 gcc_checking_assert
35029 (nargs == 2
35030 && insn_data[new_icode].operand[0].mode == tmode
35031 && insn_data[new_icode].operand[1].mode == tmode
35032 && insn_data[new_icode].operand[2].mode == mode
35033 && insn_data[new_icode].operand[0].predicate
35034 == insn_data[icode].operand[0].predicate
35035 && insn_data[new_icode].operand[1].predicate
35036 == insn_data[icode].operand[1].predicate);
35037 icode = new_icode;
35038 goto non_constant;
35040 break;
35041 default:
35042 gcc_unreachable ();
35046 else
35048 non_constant:
35049 if (VECTOR_MODE_P (mode))
35050 op = safe_vector_operand (op, mode);
35052 /* If we aren't optimizing, only allow one memory operand to be
35053 generated. */
35054 if (memory_operand (op, mode))
35055 num_memory++;
35057 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
35059 if (optimize
35060 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
35061 || num_memory > 1)
35062 op = force_reg (mode, op);
35065 args[i].op = op;
35066 args[i].mode = mode;
35069 switch (nargs)
35071 case 1:
35072 pat = GEN_FCN (icode) (target, args[0].op);
35073 break;
35075 case 2:
35076 if (tf_p)
35077 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35078 GEN_INT ((int)sub_code));
35079 else if (! comparison_p)
35080 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35081 else
35083 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
35084 args[0].op,
35085 args[1].op);
35087 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
35089 break;
35091 case 3:
35092 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
35093 break;
35095 case 4:
35096 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
35097 break;
35099 default:
35100 gcc_unreachable ();
35103 if (! pat)
35104 return 0;
35106 emit_insn (pat);
35107 return target;
35110 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
35111 insns with vec_merge. */
35113 static rtx
35114 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
35115 rtx target)
35117 rtx pat;
35118 tree arg0 = CALL_EXPR_ARG (exp, 0);
35119 rtx op1, op0 = expand_normal (arg0);
35120 machine_mode tmode = insn_data[icode].operand[0].mode;
35121 machine_mode mode0 = insn_data[icode].operand[1].mode;
35123 if (optimize || !target
35124 || GET_MODE (target) != tmode
35125 || !insn_data[icode].operand[0].predicate (target, tmode))
35126 target = gen_reg_rtx (tmode);
35128 if (VECTOR_MODE_P (mode0))
35129 op0 = safe_vector_operand (op0, mode0);
35131 if ((optimize && !register_operand (op0, mode0))
35132 || !insn_data[icode].operand[1].predicate (op0, mode0))
35133 op0 = copy_to_mode_reg (mode0, op0);
35135 op1 = op0;
35136 if (!insn_data[icode].operand[2].predicate (op1, mode0))
35137 op1 = copy_to_mode_reg (mode0, op1);
35139 pat = GEN_FCN (icode) (target, op0, op1);
35140 if (! pat)
35141 return 0;
35142 emit_insn (pat);
35143 return target;
35146 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
35148 static rtx
35149 ix86_expand_sse_compare (const struct builtin_description *d,
35150 tree exp, rtx target, bool swap)
35152 rtx pat;
35153 tree arg0 = CALL_EXPR_ARG (exp, 0);
35154 tree arg1 = CALL_EXPR_ARG (exp, 1);
35155 rtx op0 = expand_normal (arg0);
35156 rtx op1 = expand_normal (arg1);
35157 rtx op2;
35158 machine_mode tmode = insn_data[d->icode].operand[0].mode;
35159 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
35160 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
35161 enum rtx_code comparison = d->comparison;
35163 if (VECTOR_MODE_P (mode0))
35164 op0 = safe_vector_operand (op0, mode0);
35165 if (VECTOR_MODE_P (mode1))
35166 op1 = safe_vector_operand (op1, mode1);
35168 /* Swap operands if we have a comparison that isn't available in
35169 hardware. */
35170 if (swap)
35171 std::swap (op0, op1);
35173 if (optimize || !target
35174 || GET_MODE (target) != tmode
35175 || !insn_data[d->icode].operand[0].predicate (target, tmode))
35176 target = gen_reg_rtx (tmode);
35178 if ((optimize && !register_operand (op0, mode0))
35179 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
35180 op0 = copy_to_mode_reg (mode0, op0);
35181 if ((optimize && !register_operand (op1, mode1))
35182 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
35183 op1 = copy_to_mode_reg (mode1, op1);
35185 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
35186 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
35187 if (! pat)
35188 return 0;
35189 emit_insn (pat);
35190 return target;
35193 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
35195 static rtx
35196 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
35197 rtx target)
35199 rtx pat;
35200 tree arg0 = CALL_EXPR_ARG (exp, 0);
35201 tree arg1 = CALL_EXPR_ARG (exp, 1);
35202 rtx op0 = expand_normal (arg0);
35203 rtx op1 = expand_normal (arg1);
35204 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
35205 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
35206 enum rtx_code comparison = d->comparison;
35208 if (VECTOR_MODE_P (mode0))
35209 op0 = safe_vector_operand (op0, mode0);
35210 if (VECTOR_MODE_P (mode1))
35211 op1 = safe_vector_operand (op1, mode1);
35213 /* Swap operands if we have a comparison that isn't available in
35214 hardware. */
35215 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
35216 std::swap (op0, op1);
35218 target = gen_reg_rtx (SImode);
35219 emit_move_insn (target, const0_rtx);
35220 target = gen_rtx_SUBREG (QImode, target, 0);
35222 if ((optimize && !register_operand (op0, mode0))
35223 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35224 op0 = copy_to_mode_reg (mode0, op0);
35225 if ((optimize && !register_operand (op1, mode1))
35226 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
35227 op1 = copy_to_mode_reg (mode1, op1);
35229 pat = GEN_FCN (d->icode) (op0, op1);
35230 if (! pat)
35231 return 0;
35232 emit_insn (pat);
35233 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35234 gen_rtx_fmt_ee (comparison, QImode,
35235 SET_DEST (pat),
35236 const0_rtx)));
35238 return SUBREG_REG (target);
35241 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
35243 static rtx
35244 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
35245 rtx target)
35247 rtx pat;
35248 tree arg0 = CALL_EXPR_ARG (exp, 0);
35249 rtx op1, op0 = expand_normal (arg0);
35250 machine_mode tmode = insn_data[d->icode].operand[0].mode;
35251 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
35253 if (optimize || target == 0
35254 || GET_MODE (target) != tmode
35255 || !insn_data[d->icode].operand[0].predicate (target, tmode))
35256 target = gen_reg_rtx (tmode);
35258 if (VECTOR_MODE_P (mode0))
35259 op0 = safe_vector_operand (op0, mode0);
35261 if ((optimize && !register_operand (op0, mode0))
35262 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35263 op0 = copy_to_mode_reg (mode0, op0);
35265 op1 = GEN_INT (d->comparison);
35267 pat = GEN_FCN (d->icode) (target, op0, op1);
35268 if (! pat)
35269 return 0;
35270 emit_insn (pat);
35271 return target;
35274 static rtx
35275 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
35276 tree exp, rtx target)
35278 rtx pat;
35279 tree arg0 = CALL_EXPR_ARG (exp, 0);
35280 tree arg1 = CALL_EXPR_ARG (exp, 1);
35281 rtx op0 = expand_normal (arg0);
35282 rtx op1 = expand_normal (arg1);
35283 rtx op2;
35284 machine_mode tmode = insn_data[d->icode].operand[0].mode;
35285 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
35286 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
35288 if (optimize || target == 0
35289 || GET_MODE (target) != tmode
35290 || !insn_data[d->icode].operand[0].predicate (target, tmode))
35291 target = gen_reg_rtx (tmode);
35293 op0 = safe_vector_operand (op0, mode0);
35294 op1 = safe_vector_operand (op1, mode1);
35296 if ((optimize && !register_operand (op0, mode0))
35297 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35298 op0 = copy_to_mode_reg (mode0, op0);
35299 if ((optimize && !register_operand (op1, mode1))
35300 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
35301 op1 = copy_to_mode_reg (mode1, op1);
35303 op2 = GEN_INT (d->comparison);
35305 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
35306 if (! pat)
35307 return 0;
35308 emit_insn (pat);
35309 return target;
35312 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
35314 static rtx
35315 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
35316 rtx target)
35318 rtx pat;
35319 tree arg0 = CALL_EXPR_ARG (exp, 0);
35320 tree arg1 = CALL_EXPR_ARG (exp, 1);
35321 rtx op0 = expand_normal (arg0);
35322 rtx op1 = expand_normal (arg1);
35323 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
35324 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
35325 enum rtx_code comparison = d->comparison;
35327 if (VECTOR_MODE_P (mode0))
35328 op0 = safe_vector_operand (op0, mode0);
35329 if (VECTOR_MODE_P (mode1))
35330 op1 = safe_vector_operand (op1, mode1);
35332 target = gen_reg_rtx (SImode);
35333 emit_move_insn (target, const0_rtx);
35334 target = gen_rtx_SUBREG (QImode, target, 0);
35336 if ((optimize && !register_operand (op0, mode0))
35337 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35338 op0 = copy_to_mode_reg (mode0, op0);
35339 if ((optimize && !register_operand (op1, mode1))
35340 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
35341 op1 = copy_to_mode_reg (mode1, op1);
35343 pat = GEN_FCN (d->icode) (op0, op1);
35344 if (! pat)
35345 return 0;
35346 emit_insn (pat);
35347 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35348 gen_rtx_fmt_ee (comparison, QImode,
35349 SET_DEST (pat),
35350 const0_rtx)));
35352 return SUBREG_REG (target);
35355 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
35357 static rtx
35358 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
35359 tree exp, rtx target)
35361 rtx pat;
35362 tree arg0 = CALL_EXPR_ARG (exp, 0);
35363 tree arg1 = CALL_EXPR_ARG (exp, 1);
35364 tree arg2 = CALL_EXPR_ARG (exp, 2);
35365 tree arg3 = CALL_EXPR_ARG (exp, 3);
35366 tree arg4 = CALL_EXPR_ARG (exp, 4);
35367 rtx scratch0, scratch1;
35368 rtx op0 = expand_normal (arg0);
35369 rtx op1 = expand_normal (arg1);
35370 rtx op2 = expand_normal (arg2);
35371 rtx op3 = expand_normal (arg3);
35372 rtx op4 = expand_normal (arg4);
35373 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
35375 tmode0 = insn_data[d->icode].operand[0].mode;
35376 tmode1 = insn_data[d->icode].operand[1].mode;
35377 modev2 = insn_data[d->icode].operand[2].mode;
35378 modei3 = insn_data[d->icode].operand[3].mode;
35379 modev4 = insn_data[d->icode].operand[4].mode;
35380 modei5 = insn_data[d->icode].operand[5].mode;
35381 modeimm = insn_data[d->icode].operand[6].mode;
35383 if (VECTOR_MODE_P (modev2))
35384 op0 = safe_vector_operand (op0, modev2);
35385 if (VECTOR_MODE_P (modev4))
35386 op2 = safe_vector_operand (op2, modev4);
35388 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
35389 op0 = copy_to_mode_reg (modev2, op0);
35390 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
35391 op1 = copy_to_mode_reg (modei3, op1);
35392 if ((optimize && !register_operand (op2, modev4))
35393 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
35394 op2 = copy_to_mode_reg (modev4, op2);
35395 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
35396 op3 = copy_to_mode_reg (modei5, op3);
35398 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
35400 error ("the fifth argument must be an 8-bit immediate");
35401 return const0_rtx;
35404 if (d->code == IX86_BUILTIN_PCMPESTRI128)
35406 if (optimize || !target
35407 || GET_MODE (target) != tmode0
35408 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
35409 target = gen_reg_rtx (tmode0);
35411 scratch1 = gen_reg_rtx (tmode1);
35413 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
35415 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
35417 if (optimize || !target
35418 || GET_MODE (target) != tmode1
35419 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
35420 target = gen_reg_rtx (tmode1);
35422 scratch0 = gen_reg_rtx (tmode0);
35424 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
35426 else
35428 gcc_assert (d->flag);
35430 scratch0 = gen_reg_rtx (tmode0);
35431 scratch1 = gen_reg_rtx (tmode1);
35433 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
35436 if (! pat)
35437 return 0;
35439 emit_insn (pat);
35441 if (d->flag)
35443 target = gen_reg_rtx (SImode);
35444 emit_move_insn (target, const0_rtx);
35445 target = gen_rtx_SUBREG (QImode, target, 0);
35447 emit_insn
35448 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35449 gen_rtx_fmt_ee (EQ, QImode,
35450 gen_rtx_REG ((machine_mode) d->flag,
35451 FLAGS_REG),
35452 const0_rtx)));
35453 return SUBREG_REG (target);
35455 else
35456 return target;
35460 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
35462 static rtx
35463 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
35464 tree exp, rtx target)
35466 rtx pat;
35467 tree arg0 = CALL_EXPR_ARG (exp, 0);
35468 tree arg1 = CALL_EXPR_ARG (exp, 1);
35469 tree arg2 = CALL_EXPR_ARG (exp, 2);
35470 rtx scratch0, scratch1;
35471 rtx op0 = expand_normal (arg0);
35472 rtx op1 = expand_normal (arg1);
35473 rtx op2 = expand_normal (arg2);
35474 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
35476 tmode0 = insn_data[d->icode].operand[0].mode;
35477 tmode1 = insn_data[d->icode].operand[1].mode;
35478 modev2 = insn_data[d->icode].operand[2].mode;
35479 modev3 = insn_data[d->icode].operand[3].mode;
35480 modeimm = insn_data[d->icode].operand[4].mode;
35482 if (VECTOR_MODE_P (modev2))
35483 op0 = safe_vector_operand (op0, modev2);
35484 if (VECTOR_MODE_P (modev3))
35485 op1 = safe_vector_operand (op1, modev3);
35487 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
35488 op0 = copy_to_mode_reg (modev2, op0);
35489 if ((optimize && !register_operand (op1, modev3))
35490 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
35491 op1 = copy_to_mode_reg (modev3, op1);
35493 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
35495 error ("the third argument must be an 8-bit immediate");
35496 return const0_rtx;
35499 if (d->code == IX86_BUILTIN_PCMPISTRI128)
35501 if (optimize || !target
35502 || GET_MODE (target) != tmode0
35503 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
35504 target = gen_reg_rtx (tmode0);
35506 scratch1 = gen_reg_rtx (tmode1);
35508 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
35510 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
35512 if (optimize || !target
35513 || GET_MODE (target) != tmode1
35514 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
35515 target = gen_reg_rtx (tmode1);
35517 scratch0 = gen_reg_rtx (tmode0);
35519 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
35521 else
35523 gcc_assert (d->flag);
35525 scratch0 = gen_reg_rtx (tmode0);
35526 scratch1 = gen_reg_rtx (tmode1);
35528 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
35531 if (! pat)
35532 return 0;
35534 emit_insn (pat);
35536 if (d->flag)
35538 target = gen_reg_rtx (SImode);
35539 emit_move_insn (target, const0_rtx);
35540 target = gen_rtx_SUBREG (QImode, target, 0);
35542 emit_insn
35543 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35544 gen_rtx_fmt_ee (EQ, QImode,
35545 gen_rtx_REG ((machine_mode) d->flag,
35546 FLAGS_REG),
35547 const0_rtx)));
35548 return SUBREG_REG (target);
35550 else
35551 return target;
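/* Both string-compare expanders above are reached from the SSE4.2
   string intrinsics.  As an illustration (assuming the usual
   smmintrin.h wrappers), a call such as

     int idx = _mm_cmpistri (a, b, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH);

   maps to IX86_BUILTIN_PCMPISTRI128 and lands in
   ix86_expand_sse_pcmpistr with the operand and immediate checks shown
   above.  */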
35554 /* Subroutine of ix86_expand_builtin to take care of insns with
35555 a variable number of operands. */
35557 static rtx
35558 ix86_expand_args_builtin (const struct builtin_description *d,
35559 tree exp, rtx target)
35561 rtx pat, real_target;
35562 unsigned int i, nargs;
35563 unsigned int nargs_constant = 0;
35564 unsigned int mask_pos = 0;
35565 int num_memory = 0;
35566 struct
35568 rtx op;
35569 machine_mode mode;
35570 } args[6];
35571 bool second_arg_count = false;
35572 enum insn_code icode = d->icode;
35573 const struct insn_data_d *insn_p = &insn_data[icode];
35574 machine_mode tmode = insn_p->operand[0].mode;
35575 machine_mode rmode = VOIDmode;
35576 bool swap = false;
35577 enum rtx_code comparison = d->comparison;
35579 switch ((enum ix86_builtin_func_type) d->flag)
35581 case V2DF_FTYPE_V2DF_ROUND:
35582 case V4DF_FTYPE_V4DF_ROUND:
35583 case V8DF_FTYPE_V8DF_ROUND:
35584 case V4SF_FTYPE_V4SF_ROUND:
35585 case V8SF_FTYPE_V8SF_ROUND:
35586 case V16SF_FTYPE_V16SF_ROUND:
35587 case V4SI_FTYPE_V4SF_ROUND:
35588 case V8SI_FTYPE_V8SF_ROUND:
35589 case V16SI_FTYPE_V16SF_ROUND:
35590 return ix86_expand_sse_round (d, exp, target);
35591 case V4SI_FTYPE_V2DF_V2DF_ROUND:
35592 case V8SI_FTYPE_V4DF_V4DF_ROUND:
35593 case V16SI_FTYPE_V8DF_V8DF_ROUND:
35594 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
35595 case INT_FTYPE_V8SF_V8SF_PTEST:
35596 case INT_FTYPE_V4DI_V4DI_PTEST:
35597 case INT_FTYPE_V4DF_V4DF_PTEST:
35598 case INT_FTYPE_V4SF_V4SF_PTEST:
35599 case INT_FTYPE_V2DI_V2DI_PTEST:
35600 case INT_FTYPE_V2DF_V2DF_PTEST:
35601 return ix86_expand_sse_ptest (d, exp, target);
35602 case FLOAT128_FTYPE_FLOAT128:
35603 case FLOAT_FTYPE_FLOAT:
35604 case INT_FTYPE_INT:
35605 case UINT_FTYPE_UINT:
35606 case UINT16_FTYPE_UINT16:
35607 case UINT64_FTYPE_INT:
35608 case UINT64_FTYPE_UINT64:
35609 case INT64_FTYPE_INT64:
35610 case INT64_FTYPE_V4SF:
35611 case INT64_FTYPE_V2DF:
35612 case INT_FTYPE_V16QI:
35613 case INT_FTYPE_V8QI:
35614 case INT_FTYPE_V8SF:
35615 case INT_FTYPE_V4DF:
35616 case INT_FTYPE_V4SF:
35617 case INT_FTYPE_V2DF:
35618 case INT_FTYPE_V32QI:
35619 case V16QI_FTYPE_V16QI:
35620 case V8SI_FTYPE_V8SF:
35621 case V8SI_FTYPE_V4SI:
35622 case V8HI_FTYPE_V8HI:
35623 case V8HI_FTYPE_V16QI:
35624 case V8QI_FTYPE_V8QI:
35625 case V8SF_FTYPE_V8SF:
35626 case V8SF_FTYPE_V8SI:
35627 case V8SF_FTYPE_V4SF:
35628 case V8SF_FTYPE_V8HI:
35629 case V4SI_FTYPE_V4SI:
35630 case V4SI_FTYPE_V16QI:
35631 case V4SI_FTYPE_V4SF:
35632 case V4SI_FTYPE_V8SI:
35633 case V4SI_FTYPE_V8HI:
35634 case V4SI_FTYPE_V4DF:
35635 case V4SI_FTYPE_V2DF:
35636 case V4HI_FTYPE_V4HI:
35637 case V4DF_FTYPE_V4DF:
35638 case V4DF_FTYPE_V4SI:
35639 case V4DF_FTYPE_V4SF:
35640 case V4DF_FTYPE_V2DF:
35641 case V4SF_FTYPE_V4SF:
35642 case V4SF_FTYPE_V4SI:
35643 case V4SF_FTYPE_V8SF:
35644 case V4SF_FTYPE_V4DF:
35645 case V4SF_FTYPE_V8HI:
35646 case V4SF_FTYPE_V2DF:
35647 case V2DI_FTYPE_V2DI:
35648 case V2DI_FTYPE_V16QI:
35649 case V2DI_FTYPE_V8HI:
35650 case V2DI_FTYPE_V4SI:
35651 case V2DF_FTYPE_V2DF:
35652 case V2DF_FTYPE_V4SI:
35653 case V2DF_FTYPE_V4DF:
35654 case V2DF_FTYPE_V4SF:
35655 case V2DF_FTYPE_V2SI:
35656 case V2SI_FTYPE_V2SI:
35657 case V2SI_FTYPE_V4SF:
35658 case V2SI_FTYPE_V2SF:
35659 case V2SI_FTYPE_V2DF:
35660 case V2SF_FTYPE_V2SF:
35661 case V2SF_FTYPE_V2SI:
35662 case V32QI_FTYPE_V32QI:
35663 case V32QI_FTYPE_V16QI:
35664 case V16HI_FTYPE_V16HI:
35665 case V16HI_FTYPE_V8HI:
35666 case V8SI_FTYPE_V8SI:
35667 case V16HI_FTYPE_V16QI:
35668 case V8SI_FTYPE_V16QI:
35669 case V4DI_FTYPE_V16QI:
35670 case V8SI_FTYPE_V8HI:
35671 case V4DI_FTYPE_V8HI:
35672 case V4DI_FTYPE_V4SI:
35673 case V4DI_FTYPE_V2DI:
35674 case UQI_FTYPE_UQI:
35675 case UHI_FTYPE_UHI:
35676 case USI_FTYPE_USI:
35677 case USI_FTYPE_UQI:
35678 case USI_FTYPE_UHI:
35679 case UDI_FTYPE_UDI:
35680 case UHI_FTYPE_V16QI:
35681 case USI_FTYPE_V32QI:
35682 case UDI_FTYPE_V64QI:
35683 case V16QI_FTYPE_UHI:
35684 case V32QI_FTYPE_USI:
35685 case V64QI_FTYPE_UDI:
35686 case V8HI_FTYPE_UQI:
35687 case V16HI_FTYPE_UHI:
35688 case V32HI_FTYPE_USI:
35689 case V4SI_FTYPE_UQI:
35690 case V8SI_FTYPE_UQI:
35691 case V4SI_FTYPE_UHI:
35692 case V8SI_FTYPE_UHI:
35693 case UQI_FTYPE_V8HI:
35694 case UHI_FTYPE_V16HI:
35695 case USI_FTYPE_V32HI:
35696 case UQI_FTYPE_V4SI:
35697 case UQI_FTYPE_V8SI:
35698 case UHI_FTYPE_V16SI:
35699 case UQI_FTYPE_V2DI:
35700 case UQI_FTYPE_V4DI:
35701 case UQI_FTYPE_V8DI:
35702 case V16SI_FTYPE_UHI:
35703 case V2DI_FTYPE_UQI:
35704 case V4DI_FTYPE_UQI:
35705 case V16SI_FTYPE_INT:
35706 case V16SF_FTYPE_V8SF:
35707 case V16SI_FTYPE_V8SI:
35708 case V16SF_FTYPE_V4SF:
35709 case V16SI_FTYPE_V4SI:
35710 case V16SI_FTYPE_V16SF:
35711 case V16SI_FTYPE_V16SI:
35712 case V16SF_FTYPE_V16SF:
35713 case V8DI_FTYPE_UQI:
35714 case V8DI_FTYPE_V8DI:
35715 case V8DF_FTYPE_V4DF:
35716 case V8DF_FTYPE_V2DF:
35717 case V8DF_FTYPE_V8DF:
35718 nargs = 1;
35719 break;
35720 case V4SF_FTYPE_V4SF_VEC_MERGE:
35721 case V2DF_FTYPE_V2DF_VEC_MERGE:
35722 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
35723 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
35724 case V16QI_FTYPE_V16QI_V16QI:
35725 case V16QI_FTYPE_V8HI_V8HI:
35726 case V16SF_FTYPE_V16SF_V16SF:
35727 case V8QI_FTYPE_V8QI_V8QI:
35728 case V8QI_FTYPE_V4HI_V4HI:
35729 case V8HI_FTYPE_V8HI_V8HI:
35730 case V8HI_FTYPE_V16QI_V16QI:
35731 case V8HI_FTYPE_V4SI_V4SI:
35732 case V8SF_FTYPE_V8SF_V8SF:
35733 case V8SF_FTYPE_V8SF_V8SI:
35734 case V8DF_FTYPE_V8DF_V8DF:
35735 case V4SI_FTYPE_V4SI_V4SI:
35736 case V4SI_FTYPE_V8HI_V8HI:
35737 case V4SI_FTYPE_V2DF_V2DF:
35738 case V4HI_FTYPE_V4HI_V4HI:
35739 case V4HI_FTYPE_V8QI_V8QI:
35740 case V4HI_FTYPE_V2SI_V2SI:
35741 case V4DF_FTYPE_V4DF_V4DF:
35742 case V4DF_FTYPE_V4DF_V4DI:
35743 case V4SF_FTYPE_V4SF_V4SF:
35744 case V4SF_FTYPE_V4SF_V4SI:
35745 case V4SF_FTYPE_V4SF_V2SI:
35746 case V4SF_FTYPE_V4SF_V2DF:
35747 case V4SF_FTYPE_V4SF_UINT:
35748 case V4SF_FTYPE_V4SF_DI:
35749 case V4SF_FTYPE_V4SF_SI:
35750 case V2DI_FTYPE_V2DI_V2DI:
35751 case V2DI_FTYPE_V16QI_V16QI:
35752 case V2DI_FTYPE_V4SI_V4SI:
35753 case V2DI_FTYPE_V2DI_V16QI:
35754 case V2SI_FTYPE_V2SI_V2SI:
35755 case V2SI_FTYPE_V4HI_V4HI:
35756 case V2SI_FTYPE_V2SF_V2SF:
35757 case V2DF_FTYPE_V2DF_V2DF:
35758 case V2DF_FTYPE_V2DF_V4SF:
35759 case V2DF_FTYPE_V2DF_V2DI:
35760 case V2DF_FTYPE_V2DF_DI:
35761 case V2DF_FTYPE_V2DF_SI:
35762 case V2DF_FTYPE_V2DF_UINT:
35763 case V2SF_FTYPE_V2SF_V2SF:
35764 case V1DI_FTYPE_V1DI_V1DI:
35765 case V1DI_FTYPE_V8QI_V8QI:
35766 case V1DI_FTYPE_V2SI_V2SI:
35767 case V32QI_FTYPE_V16HI_V16HI:
35768 case V16HI_FTYPE_V8SI_V8SI:
35769 case V32QI_FTYPE_V32QI_V32QI:
35770 case V16HI_FTYPE_V32QI_V32QI:
35771 case V16HI_FTYPE_V16HI_V16HI:
35772 case V8SI_FTYPE_V4DF_V4DF:
35773 case V8SI_FTYPE_V8SI_V8SI:
35774 case V8SI_FTYPE_V16HI_V16HI:
35775 case V4DI_FTYPE_V4DI_V4DI:
35776 case V4DI_FTYPE_V8SI_V8SI:
35777 case V8DI_FTYPE_V64QI_V64QI:
35778 if (comparison == UNKNOWN)
35779 return ix86_expand_binop_builtin (icode, exp, target);
35780 nargs = 2;
35781 break;
35782 case V4SF_FTYPE_V4SF_V4SF_SWAP:
35783 case V2DF_FTYPE_V2DF_V2DF_SWAP:
35784 gcc_assert (comparison != UNKNOWN);
35785 nargs = 2;
35786 swap = true;
35787 break;
35788 case V16HI_FTYPE_V16HI_V8HI_COUNT:
35789 case V16HI_FTYPE_V16HI_SI_COUNT:
35790 case V8SI_FTYPE_V8SI_V4SI_COUNT:
35791 case V8SI_FTYPE_V8SI_SI_COUNT:
35792 case V4DI_FTYPE_V4DI_V2DI_COUNT:
35793 case V4DI_FTYPE_V4DI_INT_COUNT:
35794 case V8HI_FTYPE_V8HI_V8HI_COUNT:
35795 case V8HI_FTYPE_V8HI_SI_COUNT:
35796 case V4SI_FTYPE_V4SI_V4SI_COUNT:
35797 case V4SI_FTYPE_V4SI_SI_COUNT:
35798 case V4HI_FTYPE_V4HI_V4HI_COUNT:
35799 case V4HI_FTYPE_V4HI_SI_COUNT:
35800 case V2DI_FTYPE_V2DI_V2DI_COUNT:
35801 case V2DI_FTYPE_V2DI_SI_COUNT:
35802 case V2SI_FTYPE_V2SI_V2SI_COUNT:
35803 case V2SI_FTYPE_V2SI_SI_COUNT:
35804 case V1DI_FTYPE_V1DI_V1DI_COUNT:
35805 case V1DI_FTYPE_V1DI_SI_COUNT:
35806 nargs = 2;
35807 second_arg_count = true;
35808 break;
35809 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
35810 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
35811 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
35812 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
35813 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
35814 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
35815 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
35816 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
35817 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
35818 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
35819 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
35820 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
35821 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
35822 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
35823 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
35824 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
35825 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
35826 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
35827 nargs = 4;
35828 second_arg_count = true;
35829 break;
35830 case UINT64_FTYPE_UINT64_UINT64:
35831 case UINT_FTYPE_UINT_UINT:
35832 case UINT_FTYPE_UINT_USHORT:
35833 case UINT_FTYPE_UINT_UCHAR:
35834 case UINT16_FTYPE_UINT16_INT:
35835 case UINT8_FTYPE_UINT8_INT:
35836 case UQI_FTYPE_UQI_UQI:
35837 case UHI_FTYPE_UHI_UHI:
35838 case USI_FTYPE_USI_USI:
35839 case UDI_FTYPE_UDI_UDI:
35840 case V16SI_FTYPE_V8DF_V8DF:
35841 nargs = 2;
35842 break;
35843 case V2DI_FTYPE_V2DI_INT_CONVERT:
35844 nargs = 2;
35845 rmode = V1TImode;
35846 nargs_constant = 1;
35847 break;
35848 case V4DI_FTYPE_V4DI_INT_CONVERT:
35849 nargs = 2;
35850 rmode = V2TImode;
35851 nargs_constant = 1;
35852 break;
35853 case V8DI_FTYPE_V8DI_INT_CONVERT:
35854 nargs = 2;
35855 rmode = V4TImode;
35856 nargs_constant = 1;
35857 break;
35858 case V8HI_FTYPE_V8HI_INT:
35859 case V8HI_FTYPE_V8SF_INT:
35860 case V16HI_FTYPE_V16SF_INT:
35861 case V8HI_FTYPE_V4SF_INT:
35862 case V8SF_FTYPE_V8SF_INT:
35863 case V4SF_FTYPE_V16SF_INT:
35864 case V16SF_FTYPE_V16SF_INT:
35865 case V4SI_FTYPE_V4SI_INT:
35866 case V4SI_FTYPE_V8SI_INT:
35867 case V4HI_FTYPE_V4HI_INT:
35868 case V4DF_FTYPE_V4DF_INT:
35869 case V4DF_FTYPE_V8DF_INT:
35870 case V4SF_FTYPE_V4SF_INT:
35871 case V4SF_FTYPE_V8SF_INT:
35872 case V2DI_FTYPE_V2DI_INT:
35873 case V2DF_FTYPE_V2DF_INT:
35874 case V2DF_FTYPE_V4DF_INT:
35875 case V16HI_FTYPE_V16HI_INT:
35876 case V8SI_FTYPE_V8SI_INT:
35877 case V16SI_FTYPE_V16SI_INT:
35878 case V4SI_FTYPE_V16SI_INT:
35879 case V4DI_FTYPE_V4DI_INT:
35880 case V2DI_FTYPE_V4DI_INT:
35881 case V4DI_FTYPE_V8DI_INT:
35882 case QI_FTYPE_V4SF_INT:
35883 case QI_FTYPE_V2DF_INT:
35884 case UQI_FTYPE_UQI_UQI_CONST:
35885 case UHI_FTYPE_UHI_UQI:
35886 case USI_FTYPE_USI_UQI:
35887 case UDI_FTYPE_UDI_UQI:
35888 nargs = 2;
35889 nargs_constant = 1;
35890 break;
35891 case V16QI_FTYPE_V16QI_V16QI_V16QI:
35892 case V8SF_FTYPE_V8SF_V8SF_V8SF:
35893 case V4DF_FTYPE_V4DF_V4DF_V4DF:
35894 case V4SF_FTYPE_V4SF_V4SF_V4SF:
35895 case V2DF_FTYPE_V2DF_V2DF_V2DF:
35896 case V32QI_FTYPE_V32QI_V32QI_V32QI:
35897 case UHI_FTYPE_V16SI_V16SI_UHI:
35898 case UQI_FTYPE_V8DI_V8DI_UQI:
35899 case V16HI_FTYPE_V16SI_V16HI_UHI:
35900 case V16QI_FTYPE_V16SI_V16QI_UHI:
35901 case V16QI_FTYPE_V8DI_V16QI_UQI:
35902 case V16SF_FTYPE_V16SF_V16SF_UHI:
35903 case V16SF_FTYPE_V4SF_V16SF_UHI:
35904 case V16SI_FTYPE_SI_V16SI_UHI:
35905 case V16SI_FTYPE_V16HI_V16SI_UHI:
35906 case V16SI_FTYPE_V16QI_V16SI_UHI:
35907 case V8SF_FTYPE_V4SF_V8SF_UQI:
35908 case V4DF_FTYPE_V2DF_V4DF_UQI:
35909 case V8SI_FTYPE_V4SI_V8SI_UQI:
35910 case V8SI_FTYPE_SI_V8SI_UQI:
35911 case V4SI_FTYPE_V4SI_V4SI_UQI:
35912 case V4SI_FTYPE_SI_V4SI_UQI:
35913 case V4DI_FTYPE_V2DI_V4DI_UQI:
35914 case V4DI_FTYPE_DI_V4DI_UQI:
35915 case V2DI_FTYPE_V2DI_V2DI_UQI:
35916 case V2DI_FTYPE_DI_V2DI_UQI:
35917 case V64QI_FTYPE_V64QI_V64QI_UDI:
35918 case V64QI_FTYPE_V16QI_V64QI_UDI:
35919 case V64QI_FTYPE_QI_V64QI_UDI:
35920 case V32QI_FTYPE_V32QI_V32QI_USI:
35921 case V32QI_FTYPE_V16QI_V32QI_USI:
35922 case V32QI_FTYPE_QI_V32QI_USI:
35923 case V16QI_FTYPE_V16QI_V16QI_UHI:
35924 case V16QI_FTYPE_QI_V16QI_UHI:
35925 case V32HI_FTYPE_V8HI_V32HI_USI:
35926 case V32HI_FTYPE_HI_V32HI_USI:
35927 case V16HI_FTYPE_V8HI_V16HI_UHI:
35928 case V16HI_FTYPE_HI_V16HI_UHI:
35929 case V8HI_FTYPE_V8HI_V8HI_UQI:
35930 case V8HI_FTYPE_HI_V8HI_UQI:
35931 case V8SF_FTYPE_V8HI_V8SF_UQI:
35932 case V4SF_FTYPE_V8HI_V4SF_UQI:
35933 case V8SI_FTYPE_V8SF_V8SI_UQI:
35934 case V4SI_FTYPE_V4SF_V4SI_UQI:
35935 case V4DI_FTYPE_V4SF_V4DI_UQI:
35936 case V2DI_FTYPE_V4SF_V2DI_UQI:
35937 case V4SF_FTYPE_V4DI_V4SF_UQI:
35938 case V4SF_FTYPE_V2DI_V4SF_UQI:
35939 case V4DF_FTYPE_V4DI_V4DF_UQI:
35940 case V2DF_FTYPE_V2DI_V2DF_UQI:
35941 case V16QI_FTYPE_V8HI_V16QI_UQI:
35942 case V16QI_FTYPE_V16HI_V16QI_UHI:
35943 case V16QI_FTYPE_V4SI_V16QI_UQI:
35944 case V16QI_FTYPE_V8SI_V16QI_UQI:
35945 case V8HI_FTYPE_V4SI_V8HI_UQI:
35946 case V8HI_FTYPE_V8SI_V8HI_UQI:
35947 case V16QI_FTYPE_V2DI_V16QI_UQI:
35948 case V16QI_FTYPE_V4DI_V16QI_UQI:
35949 case V8HI_FTYPE_V2DI_V8HI_UQI:
35950 case V8HI_FTYPE_V4DI_V8HI_UQI:
35951 case V4SI_FTYPE_V2DI_V4SI_UQI:
35952 case V4SI_FTYPE_V4DI_V4SI_UQI:
35953 case V32QI_FTYPE_V32HI_V32QI_USI:
35954 case UHI_FTYPE_V16QI_V16QI_UHI:
35955 case USI_FTYPE_V32QI_V32QI_USI:
35956 case UDI_FTYPE_V64QI_V64QI_UDI:
35957 case UQI_FTYPE_V8HI_V8HI_UQI:
35958 case UHI_FTYPE_V16HI_V16HI_UHI:
35959 case USI_FTYPE_V32HI_V32HI_USI:
35960 case UQI_FTYPE_V4SI_V4SI_UQI:
35961 case UQI_FTYPE_V8SI_V8SI_UQI:
35962 case UQI_FTYPE_V2DI_V2DI_UQI:
35963 case UQI_FTYPE_V4DI_V4DI_UQI:
35964 case V4SF_FTYPE_V2DF_V4SF_UQI:
35965 case V4SF_FTYPE_V4DF_V4SF_UQI:
35966 case V16SI_FTYPE_V16SI_V16SI_UHI:
35967 case V16SI_FTYPE_V4SI_V16SI_UHI:
35968 case V2DI_FTYPE_V4SI_V2DI_UQI:
35969 case V2DI_FTYPE_V8HI_V2DI_UQI:
35970 case V2DI_FTYPE_V16QI_V2DI_UQI:
35971 case V4DI_FTYPE_V4DI_V4DI_UQI:
35972 case V4DI_FTYPE_V4SI_V4DI_UQI:
35973 case V4DI_FTYPE_V8HI_V4DI_UQI:
35974 case V4DI_FTYPE_V16QI_V4DI_UQI:
35975 case V4DI_FTYPE_V4DF_V4DI_UQI:
35976 case V2DI_FTYPE_V2DF_V2DI_UQI:
35977 case V4SI_FTYPE_V4DF_V4SI_UQI:
35978 case V4SI_FTYPE_V2DF_V4SI_UQI:
35979 case V4SI_FTYPE_V8HI_V4SI_UQI:
35980 case V4SI_FTYPE_V16QI_V4SI_UQI:
35981 case V4DI_FTYPE_V4DI_V4DI_V4DI:
35982 case V8DF_FTYPE_V2DF_V8DF_UQI:
35983 case V8DF_FTYPE_V4DF_V8DF_UQI:
35984 case V8DF_FTYPE_V8DF_V8DF_UQI:
35985 case V8SF_FTYPE_V8SF_V8SF_UQI:
35986 case V8SF_FTYPE_V8SI_V8SF_UQI:
35987 case V4DF_FTYPE_V4DF_V4DF_UQI:
35988 case V4SF_FTYPE_V4SF_V4SF_UQI:
35989 case V2DF_FTYPE_V2DF_V2DF_UQI:
35990 case V2DF_FTYPE_V4SF_V2DF_UQI:
35991 case V2DF_FTYPE_V4SI_V2DF_UQI:
35992 case V4SF_FTYPE_V4SI_V4SF_UQI:
35993 case V4DF_FTYPE_V4SF_V4DF_UQI:
35994 case V4DF_FTYPE_V4SI_V4DF_UQI:
35995 case V8SI_FTYPE_V8SI_V8SI_UQI:
35996 case V8SI_FTYPE_V8HI_V8SI_UQI:
35997 case V8SI_FTYPE_V16QI_V8SI_UQI:
35998 case V8DF_FTYPE_V8SI_V8DF_UQI:
35999 case V8DI_FTYPE_DI_V8DI_UQI:
36000 case V16SF_FTYPE_V8SF_V16SF_UHI:
36001 case V16SI_FTYPE_V8SI_V16SI_UHI:
36002 case V16HI_FTYPE_V16HI_V16HI_UHI:
36003 case V8HI_FTYPE_V16QI_V8HI_UQI:
36004 case V16HI_FTYPE_V16QI_V16HI_UHI:
36005 case V32HI_FTYPE_V32HI_V32HI_USI:
36006 case V32HI_FTYPE_V32QI_V32HI_USI:
36007 case V8DI_FTYPE_V16QI_V8DI_UQI:
36008 case V8DI_FTYPE_V2DI_V8DI_UQI:
36009 case V8DI_FTYPE_V4DI_V8DI_UQI:
36010 case V8DI_FTYPE_V8DI_V8DI_UQI:
36011 case V8DI_FTYPE_V8HI_V8DI_UQI:
36012 case V8DI_FTYPE_V8SI_V8DI_UQI:
36013 case V8HI_FTYPE_V8DI_V8HI_UQI:
36014 case V8SI_FTYPE_V8DI_V8SI_UQI:
36015 case V4SI_FTYPE_V4SI_V4SI_V4SI:
36016 nargs = 3;
36017 break;
36018 case V32QI_FTYPE_V32QI_V32QI_INT:
36019 case V16HI_FTYPE_V16HI_V16HI_INT:
36020 case V16QI_FTYPE_V16QI_V16QI_INT:
36021 case V4DI_FTYPE_V4DI_V4DI_INT:
36022 case V8HI_FTYPE_V8HI_V8HI_INT:
36023 case V8SI_FTYPE_V8SI_V8SI_INT:
36024 case V8SI_FTYPE_V8SI_V4SI_INT:
36025 case V8SF_FTYPE_V8SF_V8SF_INT:
36026 case V8SF_FTYPE_V8SF_V4SF_INT:
36027 case V4SI_FTYPE_V4SI_V4SI_INT:
36028 case V4DF_FTYPE_V4DF_V4DF_INT:
36029 case V16SF_FTYPE_V16SF_V16SF_INT:
36030 case V16SF_FTYPE_V16SF_V4SF_INT:
36031 case V16SI_FTYPE_V16SI_V4SI_INT:
36032 case V4DF_FTYPE_V4DF_V2DF_INT:
36033 case V4SF_FTYPE_V4SF_V4SF_INT:
36034 case V2DI_FTYPE_V2DI_V2DI_INT:
36035 case V4DI_FTYPE_V4DI_V2DI_INT:
36036 case V2DF_FTYPE_V2DF_V2DF_INT:
36037 case UQI_FTYPE_V8DI_V8UDI_INT:
36038 case UQI_FTYPE_V8DF_V8DF_INT:
36039 case UQI_FTYPE_V2DF_V2DF_INT:
36040 case UQI_FTYPE_V4SF_V4SF_INT:
36041 case UHI_FTYPE_V16SI_V16SI_INT:
36042 case UHI_FTYPE_V16SF_V16SF_INT:
36043 nargs = 3;
36044 nargs_constant = 1;
36045 break;
36046 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
36047 nargs = 3;
36048 rmode = V4DImode;
36049 nargs_constant = 1;
36050 break;
36051 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
36052 nargs = 3;
36053 rmode = V2DImode;
36054 nargs_constant = 1;
36055 break;
36056 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
36057 nargs = 3;
36058 rmode = DImode;
36059 nargs_constant = 1;
36060 break;
36061 case V2DI_FTYPE_V2DI_UINT_UINT:
36062 nargs = 3;
36063 nargs_constant = 2;
36064 break;
36065 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
36066 nargs = 3;
36067 rmode = V8DImode;
36068 nargs_constant = 1;
36069 break;
36070 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
36071 nargs = 5;
36072 rmode = V8DImode;
36073 mask_pos = 2;
36074 nargs_constant = 1;
36075 break;
36076 case QI_FTYPE_V8DF_INT_UQI:
36077 case QI_FTYPE_V4DF_INT_UQI:
36078 case QI_FTYPE_V2DF_INT_UQI:
36079 case HI_FTYPE_V16SF_INT_UHI:
36080 case QI_FTYPE_V8SF_INT_UQI:
36081 case QI_FTYPE_V4SF_INT_UQI:
36082 nargs = 3;
36083 mask_pos = 1;
36084 nargs_constant = 1;
36085 break;
36086 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
36087 nargs = 5;
36088 rmode = V4DImode;
36089 mask_pos = 2;
36090 nargs_constant = 1;
36091 break;
36092 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
36093 nargs = 5;
36094 rmode = V2DImode;
36095 mask_pos = 2;
36096 nargs_constant = 1;
36097 break;
36098 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
36099 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
36100 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
36101 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
36102 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
36103 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
36104 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
36105 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
36106 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
36107 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
36108 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
36109 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
36110 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
36111 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
36112 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
36113 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
36114 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
36115 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
36116 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
36117 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
36118 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
36119 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
36120 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
36121 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
36122 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
36123 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
36124 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
36125 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
36126 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
36127 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
36128 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
36129 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
36130 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
36131 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
36132 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
36133 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
36134 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
36135 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
36136 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
36137 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
36138 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
36139 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
36140 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
36141 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
36142 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
36143 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
36144 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
36145 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
36146 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
36147 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
36148 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
36149 nargs = 4;
36150 break;
36151 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
36152 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
36153 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
36154 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
36155 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
36156 nargs = 4;
36157 nargs_constant = 1;
36158 break;
36159 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
36160 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
36161 case QI_FTYPE_V4DF_V4DF_INT_UQI:
36162 case QI_FTYPE_V8SF_V8SF_INT_UQI:
36163 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
36164 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
36165 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
36166 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
36167 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
36168 case USI_FTYPE_V32QI_V32QI_INT_USI:
36169 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
36170 case USI_FTYPE_V32HI_V32HI_INT_USI:
36171 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
36172 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
36173 nargs = 4;
36174 mask_pos = 1;
36175 nargs_constant = 1;
36176 break;
36177 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
36178 nargs = 4;
36179 nargs_constant = 2;
36180 break;
36181 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
36182 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
36183 nargs = 4;
36184 break;
36185 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
36186 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
36187 mask_pos = 1;
36188 nargs = 4;
36189 nargs_constant = 1;
36190 break;
36191 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
36192 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
36193 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
36194 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
36195 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
36196 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
36197 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
36198 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
36199 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
36200 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
36201 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
36202 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
36203 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
36204 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
36205 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
36206 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
36207 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
36208 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
36209 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
36210 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
36211 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
36212 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
36213 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
36214 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
36215 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
36216 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
36217 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
36218 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
36219 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
36220 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
36221 nargs = 4;
36222 mask_pos = 2;
36223 nargs_constant = 1;
36224 break;
36225 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
36226 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
36227 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
36228 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
36229 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
36230 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
36231 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
36232 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
36233 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
36234 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
36235 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
36236 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
36237 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
36238 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
36239 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
36240 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
36241 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
36242 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
36243 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
36244 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
36245 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
36246 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
36247 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
36248 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
36249 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
36250 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
36251 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
36252 nargs = 5;
36253 mask_pos = 2;
36254 nargs_constant = 1;
36255 break;
36256 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
36257 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
36258 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
36259 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
36260 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
36261 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
36262 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
36263 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
36264 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
36265 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
36266 nargs = 5;
36267 mask_pos = 1;
36268 nargs_constant = 1;
36269 break;
36271 default:
36272 gcc_unreachable ();
36275 gcc_assert (nargs <= ARRAY_SIZE (args));
36277 if (comparison != UNKNOWN)
36279 gcc_assert (nargs == 2);
36280 return ix86_expand_sse_compare (d, exp, target, swap);
36283 if (rmode == VOIDmode || rmode == tmode)
36285 if (optimize
36286 || target == 0
36287 || GET_MODE (target) != tmode
36288 || !insn_p->operand[0].predicate (target, tmode))
36289 target = gen_reg_rtx (tmode);
36290 else if (memory_operand (target, tmode))
36291 num_memory++;
36292 real_target = target;
36294 else
36296 real_target = gen_reg_rtx (tmode);
36297 target = lowpart_subreg (rmode, real_target, tmode);
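/* In the operand loop below, the constant-operand test uses nargs,
   mask_pos and nargs_constant as set in the switch above.  For a case
   such as V8SF_FTYPE_V8SF_INT_V8SF_UQI (nargs = 4, mask_pos = 2,
   nargs_constant = 1), nargs - i - mask_pos == nargs_constant holds
   only for i == 1, so the INT argument in the middle of the signature
   is the one required to be an immediate.  */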
36300 for (i = 0; i < nargs; i++)
36302 tree arg = CALL_EXPR_ARG (exp, i);
36303 rtx op = expand_normal (arg);
36304 machine_mode mode = insn_p->operand[i + 1].mode;
36305 bool match = insn_p->operand[i + 1].predicate (op, mode);
36307 if (second_arg_count && i == 1)
36309 /* SIMD shift insns take either an 8-bit immediate or a
36310 register as the count, but the builtin functions take an
36311 int.  If the count doesn't match, put it in a register.
36312 The instructions use a 64-bit count; if op is only
36313 32-bit, zero-extend it, since negative shift counts
36314 are undefined behavior and zero-extension is more
36315 efficient. */
36316 if (!match)
36318 if (SCALAR_INT_MODE_P (GET_MODE (op)))
36319 op = convert_modes (mode, GET_MODE (op), op, 1);
36320 else
36321 op = lowpart_subreg (mode, op, GET_MODE (op));
36322 if (!insn_p->operand[i + 1].predicate (op, mode))
36323 op = copy_to_reg (op);
36326 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
36327 (!mask_pos && (nargs - i) <= nargs_constant))
36329 if (!match)
36330 switch (icode)
36332 case CODE_FOR_avx_vinsertf128v4di:
36333 case CODE_FOR_avx_vextractf128v4di:
36334 error ("the last argument must be a 1-bit immediate");
36335 return const0_rtx;
36337 case CODE_FOR_avx512f_cmpv8di3_mask:
36338 case CODE_FOR_avx512f_cmpv16si3_mask:
36339 case CODE_FOR_avx512f_ucmpv8di3_mask:
36340 case CODE_FOR_avx512f_ucmpv16si3_mask:
36341 case CODE_FOR_avx512vl_cmpv4di3_mask:
36342 case CODE_FOR_avx512vl_cmpv8si3_mask:
36343 case CODE_FOR_avx512vl_ucmpv4di3_mask:
36344 case CODE_FOR_avx512vl_ucmpv8si3_mask:
36345 case CODE_FOR_avx512vl_cmpv2di3_mask:
36346 case CODE_FOR_avx512vl_cmpv4si3_mask:
36347 case CODE_FOR_avx512vl_ucmpv2di3_mask:
36348 case CODE_FOR_avx512vl_ucmpv4si3_mask:
36349 error ("the last argument must be a 3-bit immediate");
36350 return const0_rtx;
36352 case CODE_FOR_sse4_1_roundsd:
36353 case CODE_FOR_sse4_1_roundss:
36355 case CODE_FOR_sse4_1_roundpd:
36356 case CODE_FOR_sse4_1_roundps:
36357 case CODE_FOR_avx_roundpd256:
36358 case CODE_FOR_avx_roundps256:
36360 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
36361 case CODE_FOR_sse4_1_roundps_sfix:
36362 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
36363 case CODE_FOR_avx_roundps_sfix256:
36365 case CODE_FOR_sse4_1_blendps:
36366 case CODE_FOR_avx_blendpd256:
36367 case CODE_FOR_avx_vpermilv4df:
36368 case CODE_FOR_avx_vpermilv4df_mask:
36369 case CODE_FOR_avx512f_getmantv8df_mask:
36370 case CODE_FOR_avx512f_getmantv16sf_mask:
36371 case CODE_FOR_avx512vl_getmantv8sf_mask:
36372 case CODE_FOR_avx512vl_getmantv4df_mask:
36373 case CODE_FOR_avx512vl_getmantv4sf_mask:
36374 case CODE_FOR_avx512vl_getmantv2df_mask:
36375 case CODE_FOR_avx512dq_rangepv8df_mask_round:
36376 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
36377 case CODE_FOR_avx512dq_rangepv4df_mask:
36378 case CODE_FOR_avx512dq_rangepv8sf_mask:
36379 case CODE_FOR_avx512dq_rangepv2df_mask:
36380 case CODE_FOR_avx512dq_rangepv4sf_mask:
36381 case CODE_FOR_avx_shufpd256_mask:
36382 error ("the last argument must be a 4-bit immediate");
36383 return const0_rtx;
36385 case CODE_FOR_sha1rnds4:
36386 case CODE_FOR_sse4_1_blendpd:
36387 case CODE_FOR_avx_vpermilv2df:
36388 case CODE_FOR_avx_vpermilv2df_mask:
36389 case CODE_FOR_xop_vpermil2v2df3:
36390 case CODE_FOR_xop_vpermil2v4sf3:
36391 case CODE_FOR_xop_vpermil2v4df3:
36392 case CODE_FOR_xop_vpermil2v8sf3:
36393 case CODE_FOR_avx512f_vinsertf32x4_mask:
36394 case CODE_FOR_avx512f_vinserti32x4_mask:
36395 case CODE_FOR_avx512f_vextractf32x4_mask:
36396 case CODE_FOR_avx512f_vextracti32x4_mask:
36397 case CODE_FOR_sse2_shufpd:
36398 case CODE_FOR_sse2_shufpd_mask:
36399 case CODE_FOR_avx512dq_shuf_f64x2_mask:
36400 case CODE_FOR_avx512dq_shuf_i64x2_mask:
36401 case CODE_FOR_avx512vl_shuf_i32x4_mask:
36402 case CODE_FOR_avx512vl_shuf_f32x4_mask:
36403 error ("the last argument must be a 2-bit immediate");
36404 return const0_rtx;
36406 case CODE_FOR_avx_vextractf128v4df:
36407 case CODE_FOR_avx_vextractf128v8sf:
36408 case CODE_FOR_avx_vextractf128v8si:
36409 case CODE_FOR_avx_vinsertf128v4df:
36410 case CODE_FOR_avx_vinsertf128v8sf:
36411 case CODE_FOR_avx_vinsertf128v8si:
36412 case CODE_FOR_avx512f_vinsertf64x4_mask:
36413 case CODE_FOR_avx512f_vinserti64x4_mask:
36414 case CODE_FOR_avx512f_vextractf64x4_mask:
36415 case CODE_FOR_avx512f_vextracti64x4_mask:
36416 case CODE_FOR_avx512dq_vinsertf32x8_mask:
36417 case CODE_FOR_avx512dq_vinserti32x8_mask:
36418 case CODE_FOR_avx512vl_vinsertv4df:
36419 case CODE_FOR_avx512vl_vinsertv4di:
36420 case CODE_FOR_avx512vl_vinsertv8sf:
36421 case CODE_FOR_avx512vl_vinsertv8si:
36422 error ("the last argument must be a 1-bit immediate");
36423 return const0_rtx;
36425 case CODE_FOR_avx_vmcmpv2df3:
36426 case CODE_FOR_avx_vmcmpv4sf3:
36427 case CODE_FOR_avx_cmpv2df3:
36428 case CODE_FOR_avx_cmpv4sf3:
36429 case CODE_FOR_avx_cmpv4df3:
36430 case CODE_FOR_avx_cmpv8sf3:
36431 case CODE_FOR_avx512f_cmpv8df3_mask:
36432 case CODE_FOR_avx512f_cmpv16sf3_mask:
36433 case CODE_FOR_avx512f_vmcmpv2df3_mask:
36434 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
36435 error ("the last argument must be a 5-bit immediate");
36436 return const0_rtx;
36438 default:
36439 switch (nargs_constant)
36441 case 2:
36442 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
36443 (!mask_pos && (nargs - i) == nargs_constant))
36445 error ("the next to last argument must be an 8-bit immediate");
36446 break;
36448 /* FALLTHRU */
36449 case 1:
36450 error ("the last argument must be an 8-bit immediate");
36451 break;
36452 default:
36453 gcc_unreachable ();
36455 return const0_rtx;
36458 else
36460 if (VECTOR_MODE_P (mode))
36461 op = safe_vector_operand (op, mode);
36463 /* If we aren't optimizing, only allow one memory operand to
36464 be generated. */
36465 if (memory_operand (op, mode))
36466 num_memory++;
36468 op = fixup_modeless_constant (op, mode);
36470 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36472 if (optimize || !match || num_memory > 1)
36473 op = copy_to_mode_reg (mode, op);
36475 else
36477 op = copy_to_reg (op);
36478 op = lowpart_subreg (mode, op, GET_MODE (op));
36482 args[i].op = op;
36483 args[i].mode = mode;
36486 switch (nargs)
36488 case 1:
36489 pat = GEN_FCN (icode) (real_target, args[0].op);
36490 break;
36491 case 2:
36492 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
36493 break;
36494 case 3:
36495 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36496 args[2].op);
36497 break;
36498 case 4:
36499 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36500 args[2].op, args[3].op);
36501 break;
36502 case 5:
36503 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36504 args[2].op, args[3].op, args[4].op);
36505 break;
36506 case 6:
36507 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36508 args[2].op, args[3].op, args[4].op,
36509 args[5].op);
36510 break;
36511 default:
36512 gcc_unreachable ();
36515 if (! pat)
36516 return 0;
36518 emit_insn (pat);
36519 return target;
36522 /* Transform a pattern of the following layout:
36523 (set A
36524 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
36526 into:
36527 (set A B) */
36529 static rtx
36530 ix86_erase_embedded_rounding (rtx pat)
36532 if (GET_CODE (pat) == INSN)
36533 pat = PATTERN (pat);
36535 gcc_assert (GET_CODE (pat) == SET);
36536 rtx src = SET_SRC (pat);
36537 gcc_assert (XVECLEN (src, 0) == 2);
36538 rtx p0 = XVECEXP (src, 0, 0);
36539 gcc_assert (GET_CODE (src) == UNSPEC
36540 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
36541 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
36542 return res;
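/* ix86_erase_embedded_rounding is used by the two expanders below
   (ix86_expand_sse_comi_round and ix86_expand_round_builtin) when the
   rounding immediate turns out to be NO_ROUND, in which case the
   UNSPEC_EMBEDDED_ROUNDING wrapper is stripped and the plain SET is
   emitted instead.  */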
36545 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
36546 with rounding. */
36547 static rtx
36548 ix86_expand_sse_comi_round (const struct builtin_description *d,
36549 tree exp, rtx target)
36551 rtx pat, set_dst;
36552 tree arg0 = CALL_EXPR_ARG (exp, 0);
36553 tree arg1 = CALL_EXPR_ARG (exp, 1);
36554 tree arg2 = CALL_EXPR_ARG (exp, 2);
36555 tree arg3 = CALL_EXPR_ARG (exp, 3);
36556 rtx op0 = expand_normal (arg0);
36557 rtx op1 = expand_normal (arg1);
36558 rtx op2 = expand_normal (arg2);
36559 rtx op3 = expand_normal (arg3);
36560 enum insn_code icode = d->icode;
36561 const struct insn_data_d *insn_p = &insn_data[icode];
36562 machine_mode mode0 = insn_p->operand[0].mode;
36563 machine_mode mode1 = insn_p->operand[1].mode;
36564 enum rtx_code comparison = UNEQ;
36565 bool need_ucomi = false;
36567 /* See avxintrin.h for values. */
36568 enum rtx_code comi_comparisons[32] =
36570 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
36571 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
36572 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
36574 bool need_ucomi_values[32] =
36576 true, false, false, true, true, false, false, true,
36577 true, false, false, true, true, false, false, true,
36578 false, true, true, false, false, true, true, false,
36579 false, true, true, false, false, true, true, false
36582 if (!CONST_INT_P (op2))
36584 error ("the third argument must be a comparison constant");
36585 return const0_rtx;
36587 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
36589 error ("incorrect comparison mode");
36590 return const0_rtx;
36593 if (!insn_p->operand[2].predicate (op3, SImode))
36595 error ("incorrect rounding operand");
36596 return const0_rtx;
36599 comparison = comi_comparisons[INTVAL (op2)];
36600 need_ucomi = need_ucomi_values[INTVAL (op2)];
36602 if (VECTOR_MODE_P (mode0))
36603 op0 = safe_vector_operand (op0, mode0);
36604 if (VECTOR_MODE_P (mode1))
36605 op1 = safe_vector_operand (op1, mode1);
36607 target = gen_reg_rtx (SImode);
36608 emit_move_insn (target, const0_rtx);
36609 target = gen_rtx_SUBREG (QImode, target, 0);
36611 if ((optimize && !register_operand (op0, mode0))
36612 || !insn_p->operand[0].predicate (op0, mode0))
36613 op0 = copy_to_mode_reg (mode0, op0);
36614 if ((optimize && !register_operand (op1, mode1))
36615 || !insn_p->operand[1].predicate (op1, mode1))
36616 op1 = copy_to_mode_reg (mode1, op1);
36618 if (need_ucomi)
36619 icode = icode == CODE_FOR_sse_comi_round
36620 ? CODE_FOR_sse_ucomi_round
36621 : CODE_FOR_sse2_ucomi_round;
36623 pat = GEN_FCN (icode) (op0, op1, op3);
36624 if (! pat)
36625 return 0;
36627 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
36628 if (INTVAL (op3) == NO_ROUND)
36630 pat = ix86_erase_embedded_rounding (pat);
36631 if (! pat)
36632 return 0;
36634 set_dst = SET_DEST (pat);
36636 else
36638 gcc_assert (GET_CODE (pat) == SET);
36639 set_dst = SET_DEST (pat);
36642 emit_insn (pat);
36643 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
36644 gen_rtx_fmt_ee (comparison, QImode,
36645 set_dst,
36646 const0_rtx)));
36648 return SUBREG_REG (target);
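/* Subroutine of ix86_expand_builtin to take care of insns with embedded
   rounding: the last argument is the rounding/SAE control, which must
   satisfy the insn's SImode rounding predicate; when it is NO_ROUND the
   embedded-rounding wrapper is erased again before the insn is
   emitted.  */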
36651 static rtx
36652 ix86_expand_round_builtin (const struct builtin_description *d,
36653 tree exp, rtx target)
36655 rtx pat;
36656 unsigned int i, nargs;
36657 struct
36659 rtx op;
36660 machine_mode mode;
36661 } args[6];
36662 enum insn_code icode = d->icode;
36663 const struct insn_data_d *insn_p = &insn_data[icode];
36664 machine_mode tmode = insn_p->operand[0].mode;
36665 unsigned int nargs_constant = 0;
36666 unsigned int redundant_embed_rnd = 0;
36668 switch ((enum ix86_builtin_func_type) d->flag)
36670 case UINT64_FTYPE_V2DF_INT:
36671 case UINT64_FTYPE_V4SF_INT:
36672 case UINT_FTYPE_V2DF_INT:
36673 case UINT_FTYPE_V4SF_INT:
36674 case INT64_FTYPE_V2DF_INT:
36675 case INT64_FTYPE_V4SF_INT:
36676 case INT_FTYPE_V2DF_INT:
36677 case INT_FTYPE_V4SF_INT:
36678 nargs = 2;
36679 break;
36680 case V4SF_FTYPE_V4SF_UINT_INT:
36681 case V4SF_FTYPE_V4SF_UINT64_INT:
36682 case V2DF_FTYPE_V2DF_UINT64_INT:
36683 case V4SF_FTYPE_V4SF_INT_INT:
36684 case V4SF_FTYPE_V4SF_INT64_INT:
36685 case V2DF_FTYPE_V2DF_INT64_INT:
36686 case V4SF_FTYPE_V4SF_V4SF_INT:
36687 case V2DF_FTYPE_V2DF_V2DF_INT:
36688 case V4SF_FTYPE_V4SF_V2DF_INT:
36689 case V2DF_FTYPE_V2DF_V4SF_INT:
36690 nargs = 3;
36691 break;
36692 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
36693 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
36694 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
36695 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
36696 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
36697 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
36698 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
36699 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
36700 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
36701 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
36702 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
36703 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
36704 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
36705 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
36706 nargs = 4;
36707 break;
36708 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
36709 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
36710 nargs_constant = 2;
36711 nargs = 4;
36712 break;
36713 case INT_FTYPE_V4SF_V4SF_INT_INT:
36714 case INT_FTYPE_V2DF_V2DF_INT_INT:
36715 return ix86_expand_sse_comi_round (d, exp, target);
36716 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
36717 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
36718 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
36719 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
36720 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
36721 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
36722 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
36723 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
36724 nargs = 5;
36725 break;
36726 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
36727 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
36728 nargs_constant = 4;
36729 nargs = 5;
36730 break;
36731 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
36732 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
36733 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
36734 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
36735 nargs_constant = 3;
36736 nargs = 5;
36737 break;
36738 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
36739 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
36740 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
36741 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
36742 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
36743 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
36744 nargs = 6;
36745 nargs_constant = 4;
36746 break;
36747 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
36748 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
36749 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
36750 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
36751 nargs = 6;
36752 nargs_constant = 3;
36753 break;
36754 default:
36755 gcc_unreachable ();
36757 gcc_assert (nargs <= ARRAY_SIZE (args));
36759 if (optimize
36760 || target == 0
36761 || GET_MODE (target) != tmode
36762 || !insn_p->operand[0].predicate (target, tmode))
36763 target = gen_reg_rtx (tmode);
36765 for (i = 0; i < nargs; i++)
36767 tree arg = CALL_EXPR_ARG (exp, i);
36768 rtx op = expand_normal (arg);
36769 machine_mode mode = insn_p->operand[i + 1].mode;
36770 bool match = insn_p->operand[i + 1].predicate (op, mode);
36772 if (i == nargs - nargs_constant)
36774 if (!match)
36776 switch (icode)
36778 case CODE_FOR_avx512f_getmantv8df_mask_round:
36779 case CODE_FOR_avx512f_getmantv16sf_mask_round:
36780 case CODE_FOR_avx512f_vgetmantv2df_round:
36781 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
36782 case CODE_FOR_avx512f_vgetmantv4sf_round:
36783 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
36784 error ("the immediate argument must be a 4-bit immediate");
36785 return const0_rtx;
36786 case CODE_FOR_avx512f_cmpv8df3_mask_round:
36787 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
36788 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
36789 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
36790 error ("the immediate argument must be a 5-bit immediate");
36791 return const0_rtx;
36792 default:
36793 error ("the immediate argument must be an 8-bit immediate");
36794 return const0_rtx;
36798 else if (i == nargs-1)
36800 if (!insn_p->operand[nargs].predicate (op, SImode))
36802 error ("incorrect rounding operand");
36803 return const0_rtx;
36806 /* If there is no rounding, use the normal version of the pattern. */
36807 if (INTVAL (op) == NO_ROUND)
36808 redundant_embed_rnd = 1;
36810 else
36812 if (VECTOR_MODE_P (mode))
36813 op = safe_vector_operand (op, mode);
36815 op = fixup_modeless_constant (op, mode);
36817 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36819 if (optimize || !match)
36820 op = copy_to_mode_reg (mode, op);
36822 else
36824 op = copy_to_reg (op);
36825 op = lowpart_subreg (mode, op, GET_MODE (op));
36829 args[i].op = op;
36830 args[i].mode = mode;
36833 switch (nargs)
36835 case 1:
36836 pat = GEN_FCN (icode) (target, args[0].op);
36837 break;
36838 case 2:
36839 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
36840 break;
36841 case 3:
36842 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36843 args[2].op);
36844 break;
36845 case 4:
36846 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36847 args[2].op, args[3].op);
36848 break;
36849 case 5:
36850 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36851 args[2].op, args[3].op, args[4].op);
36852 break;
36853 case 6:
36854 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36855 args[2].op, args[3].op, args[4].op,
36856 args[5].op);
36857 break;
36858 default:
36859 gcc_unreachable ();
36862 if (!pat)
36863 return 0;
36865 if (redundant_embed_rnd)
36866 pat = ix86_erase_embedded_rounding (pat);
36868 emit_insn (pat);
36869 return target;
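/* As an illustration (assuming the usual avx512fintrin.h wrappers), a
   rounding builtin reaches ix86_expand_round_builtin from code such as

     __m512 r = _mm512_add_round_ps (a, b,
                                     _MM_FROUND_TO_NEAREST_INT
                                     | _MM_FROUND_NO_EXC);

   where the trailing control becomes the rounding operand tested
   against NO_ROUND above.  */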
36872 /* Subroutine of ix86_expand_builtin to take care of special insns
36873 with a variable number of operands. */
36875 static rtx
36876 ix86_expand_special_args_builtin (const struct builtin_description *d,
36877 tree exp, rtx target)
36879 tree arg;
36880 rtx pat, op;
36881 unsigned int i, nargs, arg_adjust, memory;
36882 bool aligned_mem = false;
36883 struct
36885 rtx op;
36886 machine_mode mode;
36887 } args[3];
36888 enum insn_code icode = d->icode;
36889 bool last_arg_constant = false;
36890 const struct insn_data_d *insn_p = &insn_data[icode];
36891 machine_mode tmode = insn_p->operand[0].mode;
36892 enum { load, store } klass;
36894 switch ((enum ix86_builtin_func_type) d->flag)
36896 case VOID_FTYPE_VOID:
36897 emit_insn (GEN_FCN (icode) (target));
36898 return 0;
36899 case VOID_FTYPE_UINT64:
36900 case VOID_FTYPE_UNSIGNED:
36901 nargs = 0;
36902 klass = store;
36903 memory = 0;
36904 break;
36906 case INT_FTYPE_VOID:
36907 case USHORT_FTYPE_VOID:
36908 case UINT64_FTYPE_VOID:
36909 case UNSIGNED_FTYPE_VOID:
36910 nargs = 0;
36911 klass = load;
36912 memory = 0;
36913 break;
36914 case UINT64_FTYPE_PUNSIGNED:
36915 case V2DI_FTYPE_PV2DI:
36916 case V4DI_FTYPE_PV4DI:
36917 case V32QI_FTYPE_PCCHAR:
36918 case V16QI_FTYPE_PCCHAR:
36919 case V8SF_FTYPE_PCV4SF:
36920 case V8SF_FTYPE_PCFLOAT:
36921 case V4SF_FTYPE_PCFLOAT:
36922 case V4DF_FTYPE_PCV2DF:
36923 case V4DF_FTYPE_PCDOUBLE:
36924 case V2DF_FTYPE_PCDOUBLE:
36925 case VOID_FTYPE_PVOID:
36926 case V8DI_FTYPE_PV8DI:
36927 nargs = 1;
36928 klass = load;
36929 memory = 0;
36930 switch (icode)
36932 case CODE_FOR_sse4_1_movntdqa:
36933 case CODE_FOR_avx2_movntdqa:
36934 case CODE_FOR_avx512f_movntdqa:
36935 aligned_mem = true;
36936 break;
36937 default:
36938 break;
36940 break;
36941 case VOID_FTYPE_PV2SF_V4SF:
36942 case VOID_FTYPE_PV8DI_V8DI:
36943 case VOID_FTYPE_PV4DI_V4DI:
36944 case VOID_FTYPE_PV2DI_V2DI:
36945 case VOID_FTYPE_PCHAR_V32QI:
36946 case VOID_FTYPE_PCHAR_V16QI:
36947 case VOID_FTYPE_PFLOAT_V16SF:
36948 case VOID_FTYPE_PFLOAT_V8SF:
36949 case VOID_FTYPE_PFLOAT_V4SF:
36950 case VOID_FTYPE_PDOUBLE_V8DF:
36951 case VOID_FTYPE_PDOUBLE_V4DF:
36952 case VOID_FTYPE_PDOUBLE_V2DF:
36953 case VOID_FTYPE_PLONGLONG_LONGLONG:
36954 case VOID_FTYPE_PULONGLONG_ULONGLONG:
36955 case VOID_FTYPE_PINT_INT:
36956 nargs = 1;
36957 klass = store;
36958 /* Reserve memory operand for target. */
36959 memory = ARRAY_SIZE (args);
36960 switch (icode)
36962 /* These builtins and instructions require the memory
36963 to be properly aligned. */
36964 case CODE_FOR_avx_movntv4di:
36965 case CODE_FOR_sse2_movntv2di:
36966 case CODE_FOR_avx_movntv8sf:
36967 case CODE_FOR_sse_movntv4sf:
36968 case CODE_FOR_sse4a_vmmovntv4sf:
36969 case CODE_FOR_avx_movntv4df:
36970 case CODE_FOR_sse2_movntv2df:
36971 case CODE_FOR_sse4a_vmmovntv2df:
36972 case CODE_FOR_sse2_movntidi:
36973 case CODE_FOR_sse_movntq:
36974 case CODE_FOR_sse2_movntisi:
36975 case CODE_FOR_avx512f_movntv16sf:
36976 case CODE_FOR_avx512f_movntv8df:
36977 case CODE_FOR_avx512f_movntv8di:
36978 aligned_mem = true;
36979 break;
36980 default:
36981 break;
36983 break;
36984 case V4SF_FTYPE_V4SF_PCV2SF:
36985 case V2DF_FTYPE_V2DF_PCDOUBLE:
36986 nargs = 2;
36987 klass = load;
36988 memory = 1;
36989 break;
36990 case V8SF_FTYPE_PCV8SF_V8SI:
36991 case V4DF_FTYPE_PCV4DF_V4DI:
36992 case V4SF_FTYPE_PCV4SF_V4SI:
36993 case V2DF_FTYPE_PCV2DF_V2DI:
36994 case V8SI_FTYPE_PCV8SI_V8SI:
36995 case V4DI_FTYPE_PCV4DI_V4DI:
36996 case V4SI_FTYPE_PCV4SI_V4SI:
36997 case V2DI_FTYPE_PCV2DI_V2DI:
36998 case VOID_FTYPE_INT_INT64:
36999 nargs = 2;
37000 klass = load;
37001 memory = 0;
37002 break;
37003 case VOID_FTYPE_PV8DF_V8DF_UQI:
37004 case VOID_FTYPE_PV4DF_V4DF_UQI:
37005 case VOID_FTYPE_PV2DF_V2DF_UQI:
37006 case VOID_FTYPE_PV16SF_V16SF_UHI:
37007 case VOID_FTYPE_PV8SF_V8SF_UQI:
37008 case VOID_FTYPE_PV4SF_V4SF_UQI:
37009 case VOID_FTYPE_PV8DI_V8DI_UQI:
37010 case VOID_FTYPE_PV4DI_V4DI_UQI:
37011 case VOID_FTYPE_PV2DI_V2DI_UQI:
37012 case VOID_FTYPE_PV16SI_V16SI_UHI:
37013 case VOID_FTYPE_PV8SI_V8SI_UQI:
37014 case VOID_FTYPE_PV4SI_V4SI_UQI:
37015 switch (icode)
37017 /* These builtins and instructions require the memory
37018 to be properly aligned. */
37019 case CODE_FOR_avx512f_storev16sf_mask:
37020 case CODE_FOR_avx512f_storev16si_mask:
37021 case CODE_FOR_avx512f_storev8df_mask:
37022 case CODE_FOR_avx512f_storev8di_mask:
37023 case CODE_FOR_avx512vl_storev8sf_mask:
37024 case CODE_FOR_avx512vl_storev8si_mask:
37025 case CODE_FOR_avx512vl_storev4df_mask:
37026 case CODE_FOR_avx512vl_storev4di_mask:
37027 case CODE_FOR_avx512vl_storev4sf_mask:
37028 case CODE_FOR_avx512vl_storev4si_mask:
37029 case CODE_FOR_avx512vl_storev2df_mask:
37030 case CODE_FOR_avx512vl_storev2di_mask:
37031 aligned_mem = true;
37032 break;
37033 default:
37034 break;
37036 /* FALLTHRU */
37037 case VOID_FTYPE_PV8SF_V8SI_V8SF:
37038 case VOID_FTYPE_PV4DF_V4DI_V4DF:
37039 case VOID_FTYPE_PV4SF_V4SI_V4SF:
37040 case VOID_FTYPE_PV2DF_V2DI_V2DF:
37041 case VOID_FTYPE_PV8SI_V8SI_V8SI:
37042 case VOID_FTYPE_PV4DI_V4DI_V4DI:
37043 case VOID_FTYPE_PV4SI_V4SI_V4SI:
37044 case VOID_FTYPE_PV2DI_V2DI_V2DI:
37045 case VOID_FTYPE_PV8SI_V8DI_UQI:
37046 case VOID_FTYPE_PV8HI_V8DI_UQI:
37047 case VOID_FTYPE_PV16HI_V16SI_UHI:
37048 case VOID_FTYPE_PV16QI_V8DI_UQI:
37049 case VOID_FTYPE_PV16QI_V16SI_UHI:
37050 case VOID_FTYPE_PV4SI_V4DI_UQI:
37051 case VOID_FTYPE_PV4SI_V2DI_UQI:
37052 case VOID_FTYPE_PV8HI_V4DI_UQI:
37053 case VOID_FTYPE_PV8HI_V2DI_UQI:
37054 case VOID_FTYPE_PV8HI_V8SI_UQI:
37055 case VOID_FTYPE_PV8HI_V4SI_UQI:
37056 case VOID_FTYPE_PV16QI_V4DI_UQI:
37057 case VOID_FTYPE_PV16QI_V2DI_UQI:
37058 case VOID_FTYPE_PV16QI_V8SI_UQI:
37059 case VOID_FTYPE_PV16QI_V4SI_UQI:
37060 case VOID_FTYPE_PCHAR_V64QI_UDI:
37061 case VOID_FTYPE_PCHAR_V32QI_USI:
37062 case VOID_FTYPE_PCHAR_V16QI_UHI:
37063 case VOID_FTYPE_PSHORT_V32HI_USI:
37064 case VOID_FTYPE_PSHORT_V16HI_UHI:
37065 case VOID_FTYPE_PSHORT_V8HI_UQI:
37066 case VOID_FTYPE_PINT_V16SI_UHI:
37067 case VOID_FTYPE_PINT_V8SI_UQI:
37068 case VOID_FTYPE_PINT_V4SI_UQI:
37069 case VOID_FTYPE_PINT64_V8DI_UQI:
37070 case VOID_FTYPE_PINT64_V4DI_UQI:
37071 case VOID_FTYPE_PINT64_V2DI_UQI:
37072 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
37073 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
37074 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
37075 case VOID_FTYPE_PFLOAT_V16SF_UHI:
37076 case VOID_FTYPE_PFLOAT_V8SF_UQI:
37077 case VOID_FTYPE_PFLOAT_V4SF_UQI:
37078 case VOID_FTYPE_PV32QI_V32HI_USI:
37079 case VOID_FTYPE_PV16QI_V16HI_UHI:
37080 case VOID_FTYPE_PV8QI_V8HI_UQI:
37081 nargs = 2;
37082 klass = store;
37083 /* Reserve memory operand for target. */
37084 memory = ARRAY_SIZE (args);
37085 break;
37086 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
37087 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
37088 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
37089 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
37090 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
37091 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
37092 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
37093 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
37094 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
37095 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
37096 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
37097 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
37098 switch (icode)
37100 /* These builtins and instructions require the memory
37101 to be properly aligned. */
37102 case CODE_FOR_avx512f_loadv16sf_mask:
37103 case CODE_FOR_avx512f_loadv16si_mask:
37104 case CODE_FOR_avx512f_loadv8df_mask:
37105 case CODE_FOR_avx512f_loadv8di_mask:
37106 case CODE_FOR_avx512vl_loadv8sf_mask:
37107 case CODE_FOR_avx512vl_loadv8si_mask:
37108 case CODE_FOR_avx512vl_loadv4df_mask:
37109 case CODE_FOR_avx512vl_loadv4di_mask:
37110 case CODE_FOR_avx512vl_loadv4sf_mask:
37111 case CODE_FOR_avx512vl_loadv4si_mask:
37112 case CODE_FOR_avx512vl_loadv2df_mask:
37113 case CODE_FOR_avx512vl_loadv2di_mask:
37114 case CODE_FOR_avx512bw_loadv64qi_mask:
37115 case CODE_FOR_avx512vl_loadv32qi_mask:
37116 case CODE_FOR_avx512vl_loadv16qi_mask:
37117 case CODE_FOR_avx512bw_loadv32hi_mask:
37118 case CODE_FOR_avx512vl_loadv16hi_mask:
37119 case CODE_FOR_avx512vl_loadv8hi_mask:
37120 aligned_mem = true;
37121 break;
37122 default:
37123 break;
37125 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
37126 case V32QI_FTYPE_PCCHAR_V32QI_USI:
37127 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
37128 case V32HI_FTYPE_PCSHORT_V32HI_USI:
37129 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
37130 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
37131 case V16SI_FTYPE_PCINT_V16SI_UHI:
37132 case V8SI_FTYPE_PCINT_V8SI_UQI:
37133 case V4SI_FTYPE_PCINT_V4SI_UQI:
37134 case V8DI_FTYPE_PCINT64_V8DI_UQI:
37135 case V4DI_FTYPE_PCINT64_V4DI_UQI:
37136 case V2DI_FTYPE_PCINT64_V2DI_UQI:
37137 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
37138 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
37139 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
37140 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
37141 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
37142 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
37143 nargs = 3;
37144 klass = load;
37145 memory = 0;
37146 break;
37147 case VOID_FTYPE_UINT_UINT_UINT:
37148 case VOID_FTYPE_UINT64_UINT_UINT:
37149 case UCHAR_FTYPE_UINT_UINT_UINT:
37150 case UCHAR_FTYPE_UINT64_UINT_UINT:
37151 nargs = 3;
37152 klass = load;
37153 memory = ARRAY_SIZE (args);
37154 last_arg_constant = true;
37155 break;
37156 default:
37157 gcc_unreachable ();
37160 gcc_assert (nargs <= ARRAY_SIZE (args));
37162 if (klass == store)
37164 arg = CALL_EXPR_ARG (exp, 0);
37165 op = expand_normal (arg);
37166 gcc_assert (target == 0);
37167 if (memory)
37169 op = ix86_zero_extend_to_Pmode (op);
37170 target = gen_rtx_MEM (tmode, op);
37171 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
37172 on it. Try to improve it using get_pointer_alignment,
37173 and if the special builtin is one that requires strict
37174 mode alignment, also from its GET_MODE_ALIGNMENT.
37175 Failure to do so could lead to ix86_legitimate_combined_insn
37176 rejecting all changes to such insns. */
37177 unsigned int align = get_pointer_alignment (arg);
37178 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
37179 align = GET_MODE_ALIGNMENT (tmode);
37180 if (MEM_ALIGN (target) < align)
37181 set_mem_align (target, align);
37183 else
37184 target = force_reg (tmode, op);
37185 arg_adjust = 1;
37187 else
37189 arg_adjust = 0;
37190 if (optimize
37191 || target == 0
37192 || !register_operand (target, tmode)
37193 || GET_MODE (target) != tmode)
37194 target = gen_reg_rtx (tmode);
37197 for (i = 0; i < nargs; i++)
37199 machine_mode mode = insn_p->operand[i + 1].mode;
37200 bool match;
37202 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
37203 op = expand_normal (arg);
37204 match = insn_p->operand[i + 1].predicate (op, mode);
37206 if (last_arg_constant && (i + 1) == nargs)
37208 if (!match)
37210 if (icode == CODE_FOR_lwp_lwpvalsi3
37211 || icode == CODE_FOR_lwp_lwpinssi3
37212 || icode == CODE_FOR_lwp_lwpvaldi3
37213 || icode == CODE_FOR_lwp_lwpinsdi3)
37214 error ("the last argument must be a 32-bit immediate");
37215 else
37216 error ("the last argument must be an 8-bit immediate");
37217 return const0_rtx;
37220 else
37222 if (i == memory)
37224 /* This must be the memory operand. */
37225 op = ix86_zero_extend_to_Pmode (op);
37226 op = gen_rtx_MEM (mode, op);
37227 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
37228 on it. Try to improve it using get_pointer_alignment,
37229 and if the special builtin is one that requires strict
37230 mode alignment, also from its GET_MODE_ALIGNMENT.
37231 Failure to do so could lead to ix86_legitimate_combined_insn
37232 rejecting all changes to such insns. */
37233 unsigned int align = get_pointer_alignment (arg);
37234 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
37235 align = GET_MODE_ALIGNMENT (mode);
37236 if (MEM_ALIGN (op) < align)
37237 set_mem_align (op, align);
37239 else
37241 /* This must be a register. */
37242 if (VECTOR_MODE_P (mode))
37243 op = safe_vector_operand (op, mode);
37245 op = fixup_modeless_constant (op, mode);
37247 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
37248 op = copy_to_mode_reg (mode, op);
37249 else
37251 op = copy_to_reg (op);
37252 op = lowpart_subreg (mode, op, GET_MODE (op));
37257 args[i].op = op;
37258 args[i].mode = mode;
37261 switch (nargs)
37263 case 0:
37264 pat = GEN_FCN (icode) (target);
37265 break;
37266 case 1:
37267 pat = GEN_FCN (icode) (target, args[0].op);
37268 break;
37269 case 2:
37270 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
37271 break;
37272 case 3:
37273 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
37274 break;
37275 default:
37276 gcc_unreachable ();
37279 if (! pat)
37280 return 0;
37281 emit_insn (pat);
37282 return klass == store ? 0 : target;
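/* In ix86_expand_special_args_builtin above, store-class builtins use
   the (alignment-adjusted) MEM as the insn's destination and return 0,
   while load-class builtins return the register target; the movnt*,
   movntdqa and masked load/store cases additionally raise MEM_ALIGN to
   GET_MODE_ALIGNMENT so that ix86_legitimate_combined_insn does not
   later reject changes to those insns.  */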
37285 /* Return the integer constant in ARG. Constrain it to be in the range
37286 of the subparts of VEC_TYPE; issue an error if not. */
37288 static int
37289 get_element_number (tree vec_type, tree arg)
37291 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
37293 if (!tree_fits_uhwi_p (arg)
37294 || (elt = tree_to_uhwi (arg), elt > max))
37296 error ("selector must be an integer constant in the range 0..%wi", max);
37297 return 0;
37300 return elt;
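/* For example, for a V4SF vector type TYPE_VECTOR_SUBPARTS is 4, so
   get_element_number accepts selectors 0..3 and emits the error above
   (returning 0) for anything else.  */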
37303 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
37304 ix86_expand_vector_init. We DO have language-level syntax for this, in
37305 the form of (type){ init-list }. Except that since we can't place emms
37306 instructions from inside the compiler, we can't allow the use of MMX
37307 registers unless the user explicitly asks for it. So we do *not* define
37308 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
37309 we have builtins invoked by mmintrin.h that give us license to emit
37310 these sorts of instructions. */
37312 static rtx
37313 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
37315 machine_mode tmode = TYPE_MODE (type);
37316 machine_mode inner_mode = GET_MODE_INNER (tmode);
37317 int i, n_elt = GET_MODE_NUNITS (tmode);
37318 rtvec v = rtvec_alloc (n_elt);
37320 gcc_assert (VECTOR_MODE_P (tmode));
37321 gcc_assert (call_expr_nargs (exp) == n_elt);
37323 for (i = 0; i < n_elt; ++i)
37325 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
37326 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
37329 if (!target || !register_operand (target, tmode))
37330 target = gen_reg_rtx (tmode);
37332 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
37333 return target;
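/* Illustrative sketch, assuming <mmintrin.h> and -mmmx are available: the MMX
   vec_init builtins handled above are what the mmintrin.h "set" intrinsics
   reach at the source level, e.g.

	#include <mmintrin.h>
	__m64 v = _mm_set_pi16 (3, 2, 1, 0);

   Each initializer element arrives as one CALL_EXPR argument and is placed
   into the rtvec built above.  */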
37336 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
37337 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
37338 had a language-level syntax for referencing vector elements. */
37340 static rtx
37341 ix86_expand_vec_ext_builtin (tree exp, rtx target)
37343 machine_mode tmode, mode0;
37344 tree arg0, arg1;
37345 int elt;
37346 rtx op0;
37348 arg0 = CALL_EXPR_ARG (exp, 0);
37349 arg1 = CALL_EXPR_ARG (exp, 1);
37351 op0 = expand_normal (arg0);
37352 elt = get_element_number (TREE_TYPE (arg0), arg1);
37354 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
37355 mode0 = TYPE_MODE (TREE_TYPE (arg0));
37356 gcc_assert (VECTOR_MODE_P (mode0));
37358 op0 = force_reg (mode0, op0);
37360 if (optimize || !target || !register_operand (target, tmode))
37361 target = gen_reg_rtx (tmode);
37363 ix86_expand_vector_extract (true, target, op0, elt);
37365 return target;
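/* Illustrative sketch, assuming <emmintrin.h> and -msse2: a source-level
   element extract such as

	#include <emmintrin.h>
	int lane1 (__m128i x) { return _mm_extract_epi16 (x, 1); }

   is expanded through the corresponding vec_ext builtin, with the selector
   range-checked by get_element_number above.  */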
37368 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
37369 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
37370 a language-level syntax for referencing vector elements. */
37372 static rtx
37373 ix86_expand_vec_set_builtin (tree exp)
37375 machine_mode tmode, mode1;
37376 tree arg0, arg1, arg2;
37377 int elt;
37378 rtx op0, op1, target;
37380 arg0 = CALL_EXPR_ARG (exp, 0);
37381 arg1 = CALL_EXPR_ARG (exp, 1);
37382 arg2 = CALL_EXPR_ARG (exp, 2);
37384 tmode = TYPE_MODE (TREE_TYPE (arg0));
37385 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
37386 gcc_assert (VECTOR_MODE_P (tmode));
37388 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
37389 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
37390 elt = get_element_number (TREE_TYPE (arg0), arg2);
37392 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
37393 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
37395 op0 = force_reg (tmode, op0);
37396 op1 = force_reg (mode1, op1);
37398 /* OP0 is the source of these builtin functions and shouldn't be
37399 modified. Create a copy, use it and return it as target. */
37400 target = gen_reg_rtx (tmode);
37401 emit_move_insn (target, op0);
37402 ix86_expand_vector_set (true, target, op1, elt);
37404 return target;
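/* Illustrative sketch, assuming <emmintrin.h> and -msse2: a source-level
   element insert such as

	#include <emmintrin.h>
	__m128i set_lane0 (__m128i x, int v)
	{
	  return _mm_insert_epi16 (x, v, 0);
	}

   goes through this expander; the input vector is copied into a fresh
   register first, so the original operand is left unmodified.  */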
37407 /* Emit conditional move of SRC to DST with condition
37408 OP1 CODE OP2. */
37409 static void
37410 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
37412 rtx t;
37414 if (TARGET_CMOVE)
37416 t = ix86_expand_compare (code, op1, op2);
37417 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
37418 src, dst)));
37420 else
37422 rtx_code_label *nomove = gen_label_rtx ();
37423 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
37424 const0_rtx, GET_MODE (op1), 1, nomove);
37425 emit_move_insn (dst, src);
37426 emit_label (nomove);
37430 /* Choose max of DST and SRC and put it to DST. */
37431 static void
37432 ix86_emit_move_max (rtx dst, rtx src)
37434 ix86_emit_cmove (dst, src, LTU, dst, src);
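/* In effect ix86_emit_move_max computes an unsigned maximum; a rough C-level
   sketch of what the emitted RTL does:

	if (dst < src)	/* unsigned (LTU) comparison */
	  dst = src;

   realized either as a CMOV or, without TARGET_CMOVE, as the
   compare-and-branch sequence in ix86_emit_cmove above.  */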
37437 /* Expand an expression EXP that calls a built-in function,
37438 with result going to TARGET if that's convenient
37439 (and in mode MODE if that's convenient).
37440 SUBTARGET may be used as the target for computing one of EXP's operands.
37441 IGNORE is nonzero if the value is to be ignored. */
37443 static rtx
37444 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
37445 machine_mode mode, int ignore)
37447 size_t i;
37448 enum insn_code icode;
37449 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
37450 tree arg0, arg1, arg2, arg3, arg4;
37451 rtx op0, op1, op2, op3, op4, pat, insn;
37452 machine_mode mode0, mode1, mode2, mode3, mode4;
37453 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
37455 /* For CPU builtins that can be folded, fold first and expand the fold. */
37456 switch (fcode)
37458 case IX86_BUILTIN_CPU_INIT:
37460 /* Make it call __cpu_indicator_init in libgcc. */
37461 tree call_expr, fndecl, type;
37462 type = build_function_type_list (integer_type_node, NULL_TREE);
37463 fndecl = build_fn_decl ("__cpu_indicator_init", type);
37464 call_expr = build_call_expr (fndecl, 0);
37465 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
37467 case IX86_BUILTIN_CPU_IS:
37468 case IX86_BUILTIN_CPU_SUPPORTS:
37470 tree arg0 = CALL_EXPR_ARG (exp, 0);
37471 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
37472 gcc_assert (fold_expr != NULL_TREE);
37473 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
37477 /* Determine whether the builtin function is available under the current ISA.
37478 Originally the builtin was not created if it wasn't applicable to the
37479 current ISA based on the command line switches. With function specific
37480 options, we need to check in the context of the function making the call
37481 whether it is supported. Treat AVX512VL specially. For other flags,
37482 if isa includes more than one ISA bit, treat those as requiring any
37483 of them. For AVX512VL, require both AVX512VL and the non-AVX512VL
37484 ISAs. Similarly for 64BIT, but we shouldn't be building such builtins
37485 at all, -m64 is a whole TU option. */
37486 if (((ix86_builtins_isa[fcode].isa
37487 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_64BIT))
37488 && !(ix86_builtins_isa[fcode].isa
37489 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_64BIT)
37490 & ix86_isa_flags))
37491 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_AVX512VL)
37492 && !(ix86_isa_flags & OPTION_MASK_ISA_AVX512VL))
37493 || (ix86_builtins_isa[fcode].isa2
37494 && !(ix86_builtins_isa[fcode].isa2 & ix86_isa_flags2)))
37496 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa,
37497 ix86_builtins_isa[fcode].isa2, 0, 0,
37498 NULL, NULL, (enum fpmath_unit) 0,
37499 false);
37500 if (!opts)
37501 error ("%qE needs unknown isa option", fndecl);
37502 else
37504 gcc_assert (opts != NULL);
37505 error ("%qE needs isa option %s", fndecl, opts);
37506 free (opts);
37508 return expand_call (exp, target, ignore);
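/* Illustrative sketch (names are examples only): with function-specific
   target options an AVX2 builtin exists even in a TU compiled without
   -mavx2, e.g., assuming <immintrin.h>:

	__attribute__ ((target ("avx2")))
	__m256i add32 (__m256i a, __m256i b)
	{
	  return _mm256_add_epi32 (a, b);
	}

   so whether a builtin is actually usable has to be rechecked here against
   the ISA flags of the function being expanded, and the diagnostic above
   names the missing isa option.  */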
37511 switch (fcode)
37513 case IX86_BUILTIN_BNDMK:
37514 if (!target
37515 || GET_MODE (target) != BNDmode
37516 || !register_operand (target, BNDmode))
37517 target = gen_reg_rtx (BNDmode);
37519 arg0 = CALL_EXPR_ARG (exp, 0);
37520 arg1 = CALL_EXPR_ARG (exp, 1);
37522 op0 = expand_normal (arg0);
37523 op1 = expand_normal (arg1);
37525 if (!register_operand (op0, Pmode))
37526 op0 = ix86_zero_extend_to_Pmode (op0);
37527 if (!register_operand (op1, Pmode))
37528 op1 = ix86_zero_extend_to_Pmode (op1);
37530 /* Builtin arg1 is the size of the block, but instruction op1 should
37531 be (size - 1). */
37532 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
37533 NULL_RTX, 1, OPTAB_DIRECT);
37535 emit_insn (BNDmode == BND64mode
37536 ? gen_bnd64_mk (target, op0, op1)
37537 : gen_bnd32_mk (target, op0, op1));
37538 return target;
37540 case IX86_BUILTIN_BNDSTX:
37541 arg0 = CALL_EXPR_ARG (exp, 0);
37542 arg1 = CALL_EXPR_ARG (exp, 1);
37543 arg2 = CALL_EXPR_ARG (exp, 2);
37545 op0 = expand_normal (arg0);
37546 op1 = expand_normal (arg1);
37547 op2 = expand_normal (arg2);
37549 if (!register_operand (op0, Pmode))
37550 op0 = ix86_zero_extend_to_Pmode (op0);
37551 if (!register_operand (op1, BNDmode))
37552 op1 = copy_to_mode_reg (BNDmode, op1);
37553 if (!register_operand (op2, Pmode))
37554 op2 = ix86_zero_extend_to_Pmode (op2);
37556 emit_insn (BNDmode == BND64mode
37557 ? gen_bnd64_stx (op2, op0, op1)
37558 : gen_bnd32_stx (op2, op0, op1));
37559 return 0;
37561 case IX86_BUILTIN_BNDLDX:
37562 if (!target
37563 || GET_MODE (target) != BNDmode
37564 || !register_operand (target, BNDmode))
37565 target = gen_reg_rtx (BNDmode);
37567 arg0 = CALL_EXPR_ARG (exp, 0);
37568 arg1 = CALL_EXPR_ARG (exp, 1);
37570 op0 = expand_normal (arg0);
37571 op1 = expand_normal (arg1);
37573 if (!register_operand (op0, Pmode))
37574 op0 = ix86_zero_extend_to_Pmode (op0);
37575 if (!register_operand (op1, Pmode))
37576 op1 = ix86_zero_extend_to_Pmode (op1);
37578 emit_insn (BNDmode == BND64mode
37579 ? gen_bnd64_ldx (target, op0, op1)
37580 : gen_bnd32_ldx (target, op0, op1));
37581 return target;
37583 case IX86_BUILTIN_BNDCL:
37584 arg0 = CALL_EXPR_ARG (exp, 0);
37585 arg1 = CALL_EXPR_ARG (exp, 1);
37587 op0 = expand_normal (arg0);
37588 op1 = expand_normal (arg1);
37590 if (!register_operand (op0, Pmode))
37591 op0 = ix86_zero_extend_to_Pmode (op0);
37592 if (!register_operand (op1, BNDmode))
37593 op1 = copy_to_mode_reg (BNDmode, op1);
37595 emit_insn (BNDmode == BND64mode
37596 ? gen_bnd64_cl (op1, op0)
37597 : gen_bnd32_cl (op1, op0));
37598 return 0;
37600 case IX86_BUILTIN_BNDCU:
37601 arg0 = CALL_EXPR_ARG (exp, 0);
37602 arg1 = CALL_EXPR_ARG (exp, 1);
37604 op0 = expand_normal (arg0);
37605 op1 = expand_normal (arg1);
37607 if (!register_operand (op0, Pmode))
37608 op0 = ix86_zero_extend_to_Pmode (op0);
37609 if (!register_operand (op1, BNDmode))
37610 op1 = copy_to_mode_reg (BNDmode, op1);
37612 emit_insn (BNDmode == BND64mode
37613 ? gen_bnd64_cu (op1, op0)
37614 : gen_bnd32_cu (op1, op0));
37615 return 0;
37617 case IX86_BUILTIN_BNDRET:
37618 arg0 = CALL_EXPR_ARG (exp, 0);
37619 target = chkp_get_rtl_bounds (arg0);
37621 /* If no bounds were specified for the returned value,
37622 then use INIT bounds. This usually happens when
37623 some built-in function is expanded. */
37624 if (!target)
37626 rtx t1 = gen_reg_rtx (Pmode);
37627 rtx t2 = gen_reg_rtx (Pmode);
37628 target = gen_reg_rtx (BNDmode);
37629 emit_move_insn (t1, const0_rtx);
37630 emit_move_insn (t2, constm1_rtx);
37631 emit_insn (BNDmode == BND64mode
37632 ? gen_bnd64_mk (target, t1, t2)
37633 : gen_bnd32_mk (target, t1, t2));
37636 gcc_assert (target && REG_P (target));
37637 return target;
37639 case IX86_BUILTIN_BNDNARROW:
37641 rtx m1, m1h1, m1h2, lb, ub, t1;
37643 /* Return value and lb. */
37644 arg0 = CALL_EXPR_ARG (exp, 0);
37645 /* Bounds. */
37646 arg1 = CALL_EXPR_ARG (exp, 1);
37647 /* Size. */
37648 arg2 = CALL_EXPR_ARG (exp, 2);
37650 lb = expand_normal (arg0);
37651 op1 = expand_normal (arg1);
37652 op2 = expand_normal (arg2);
37654 /* Size was passed but we need to use (size - 1) as for bndmk. */
37655 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
37656 NULL_RTX, 1, OPTAB_DIRECT);
37658 /* Add LB to size and invert to get UB. */
37659 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
37660 op2, 1, OPTAB_DIRECT);
37661 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
37663 if (!register_operand (lb, Pmode))
37664 lb = ix86_zero_extend_to_Pmode (lb);
37665 if (!register_operand (ub, Pmode))
37666 ub = ix86_zero_extend_to_Pmode (ub);
37668 /* We need to move bounds to memory before any computations. */
37669 if (MEM_P (op1))
37670 m1 = op1;
37671 else
37673 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
37674 emit_move_insn (m1, op1);
37677 /* Generate mem expression to be used for access to LB and UB. */
37678 m1h1 = adjust_address (m1, Pmode, 0);
37679 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
37681 t1 = gen_reg_rtx (Pmode);
37683 /* Compute LB. */
37684 emit_move_insn (t1, m1h1);
37685 ix86_emit_move_max (t1, lb);
37686 emit_move_insn (m1h1, t1);
37688 /* Compute UB. UB is stored in 1's complement form. Therefore
37689 we also use max here. */
37690 emit_move_insn (t1, m1h2);
37691 ix86_emit_move_max (t1, ub);
37692 emit_move_insn (m1h2, t1);
37694 op2 = gen_reg_rtx (BNDmode);
37695 emit_move_insn (op2, m1);
37697 return chkp_join_splitted_slot (lb, op2);
37700 case IX86_BUILTIN_BNDINT:
37702 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
37704 if (!target
37705 || GET_MODE (target) != BNDmode
37706 || !register_operand (target, BNDmode))
37707 target = gen_reg_rtx (BNDmode);
37709 arg0 = CALL_EXPR_ARG (exp, 0);
37710 arg1 = CALL_EXPR_ARG (exp, 1);
37712 op0 = expand_normal (arg0);
37713 op1 = expand_normal (arg1);
37715 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
37716 rh1 = adjust_address (res, Pmode, 0);
37717 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
37719 /* Put first bounds to temporaries. */
37720 lb1 = gen_reg_rtx (Pmode);
37721 ub1 = gen_reg_rtx (Pmode);
37722 if (MEM_P (op0))
37724 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
37725 emit_move_insn (ub1, adjust_address (op0, Pmode,
37726 GET_MODE_SIZE (Pmode)));
37728 else
37730 emit_move_insn (res, op0);
37731 emit_move_insn (lb1, rh1);
37732 emit_move_insn (ub1, rh2);
37735 /* Put second bounds to temporaries. */
37736 lb2 = gen_reg_rtx (Pmode);
37737 ub2 = gen_reg_rtx (Pmode);
37738 if (MEM_P (op1))
37740 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
37741 emit_move_insn (ub2, adjust_address (op1, Pmode,
37742 GET_MODE_SIZE (Pmode)));
37744 else
37746 emit_move_insn (res, op1);
37747 emit_move_insn (lb2, rh1);
37748 emit_move_insn (ub2, rh2);
37751 /* Compute LB. */
37752 ix86_emit_move_max (lb1, lb2);
37753 emit_move_insn (rh1, lb1);
37755 /* Compute UB. UB is stored in 1's complement form. Therefore
37756 we also use max here. */
37757 ix86_emit_move_max (ub1, ub2);
37758 emit_move_insn (rh2, ub1);
37760 emit_move_insn (target, res);
37762 return target;
37765 case IX86_BUILTIN_SIZEOF:
37767 tree name;
37768 rtx symbol;
37770 if (!target
37771 || GET_MODE (target) != Pmode
37772 || !register_operand (target, Pmode))
37773 target = gen_reg_rtx (Pmode);
37775 arg0 = CALL_EXPR_ARG (exp, 0);
37776 gcc_assert (VAR_P (arg0));
37778 name = DECL_ASSEMBLER_NAME (arg0);
37779 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
37781 emit_insn (Pmode == SImode
37782 ? gen_move_size_reloc_si (target, symbol)
37783 : gen_move_size_reloc_di (target, symbol));
37785 return target;
37788 case IX86_BUILTIN_BNDLOWER:
37790 rtx mem, hmem;
37792 if (!target
37793 || GET_MODE (target) != Pmode
37794 || !register_operand (target, Pmode))
37795 target = gen_reg_rtx (Pmode);
37797 arg0 = CALL_EXPR_ARG (exp, 0);
37798 op0 = expand_normal (arg0);
37800 /* We need to move bounds to memory first. */
37801 if (MEM_P (op0))
37802 mem = op0;
37803 else
37805 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
37806 emit_move_insn (mem, op0);
37809 /* Generate mem expression to access LB and load it. */
37810 hmem = adjust_address (mem, Pmode, 0);
37811 emit_move_insn (target, hmem);
37813 return target;
37816 case IX86_BUILTIN_BNDUPPER:
37818 rtx mem, hmem, res;
37820 if (!target
37821 || GET_MODE (target) != Pmode
37822 || !register_operand (target, Pmode))
37823 target = gen_reg_rtx (Pmode);
37825 arg0 = CALL_EXPR_ARG (exp, 0);
37826 op0 = expand_normal (arg0);
37828 /* We need to move bounds to memory first. */
37829 if (MEM_P (op0))
37830 mem = op0;
37831 else
37833 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
37834 emit_move_insn (mem, op0);
37837 /* Generate mem expression to access UB. */
37838 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
37840 /* We need to invert all bits of UB. */
37841 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
37843 if (res != target)
37844 emit_move_insn (target, res);
37846 return target;
37849 case IX86_BUILTIN_MASKMOVQ:
37850 case IX86_BUILTIN_MASKMOVDQU:
37851 icode = (fcode == IX86_BUILTIN_MASKMOVQ
37852 ? CODE_FOR_mmx_maskmovq
37853 : CODE_FOR_sse2_maskmovdqu);
37854 /* Note the arg order is different from the operand order. */
37855 arg1 = CALL_EXPR_ARG (exp, 0);
37856 arg2 = CALL_EXPR_ARG (exp, 1);
37857 arg0 = CALL_EXPR_ARG (exp, 2);
37858 op0 = expand_normal (arg0);
37859 op1 = expand_normal (arg1);
37860 op2 = expand_normal (arg2);
37861 mode0 = insn_data[icode].operand[0].mode;
37862 mode1 = insn_data[icode].operand[1].mode;
37863 mode2 = insn_data[icode].operand[2].mode;
37865 op0 = ix86_zero_extend_to_Pmode (op0);
37866 op0 = gen_rtx_MEM (mode1, op0);
37868 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37869 op0 = copy_to_mode_reg (mode0, op0);
37870 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37871 op1 = copy_to_mode_reg (mode1, op1);
37872 if (!insn_data[icode].operand[2].predicate (op2, mode2))
37873 op2 = copy_to_mode_reg (mode2, op2);
37874 pat = GEN_FCN (icode) (op0, op1, op2);
37875 if (! pat)
37876 return 0;
37877 emit_insn (pat);
37878 return 0;
37880 case IX86_BUILTIN_LDMXCSR:
37881 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
37882 target = assign_386_stack_local (SImode, SLOT_TEMP);
37883 emit_move_insn (target, op0);
37884 emit_insn (gen_sse_ldmxcsr (target));
37885 return 0;
37887 case IX86_BUILTIN_STMXCSR:
37888 target = assign_386_stack_local (SImode, SLOT_TEMP);
37889 emit_insn (gen_sse_stmxcsr (target));
37890 return copy_to_mode_reg (SImode, target);
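/* Illustrative sketch, assuming <xmmintrin.h>: these two cases back the
   MXCSR access intrinsics, roughly

	#include <xmmintrin.h>
	unsigned int csr = _mm_getcsr ();	/* STMXCSR */
	_mm_setcsr (csr | 0x8040);		/* LDMXCSR; e.g. set FTZ/DAZ */

   both going through a SImode stack temporary as above.  */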
37892 case IX86_BUILTIN_CLFLUSH:
37893 arg0 = CALL_EXPR_ARG (exp, 0);
37894 op0 = expand_normal (arg0);
37895 icode = CODE_FOR_sse2_clflush;
37896 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37897 op0 = ix86_zero_extend_to_Pmode (op0);
37899 emit_insn (gen_sse2_clflush (op0));
37900 return 0;
37902 case IX86_BUILTIN_CLWB:
37903 arg0 = CALL_EXPR_ARG (exp, 0);
37904 op0 = expand_normal (arg0);
37905 icode = CODE_FOR_clwb;
37906 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37907 op0 = ix86_zero_extend_to_Pmode (op0);
37909 emit_insn (gen_clwb (op0));
37910 return 0;
37912 case IX86_BUILTIN_CLFLUSHOPT:
37913 arg0 = CALL_EXPR_ARG (exp, 0);
37914 op0 = expand_normal (arg0);
37915 icode = CODE_FOR_clflushopt;
37916 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37917 op0 = ix86_zero_extend_to_Pmode (op0);
37919 emit_insn (gen_clflushopt (op0));
37920 return 0;
37922 case IX86_BUILTIN_MONITOR:
37923 case IX86_BUILTIN_MONITORX:
37924 arg0 = CALL_EXPR_ARG (exp, 0);
37925 arg1 = CALL_EXPR_ARG (exp, 1);
37926 arg2 = CALL_EXPR_ARG (exp, 2);
37927 op0 = expand_normal (arg0);
37928 op1 = expand_normal (arg1);
37929 op2 = expand_normal (arg2);
37930 if (!REG_P (op0))
37931 op0 = ix86_zero_extend_to_Pmode (op0);
37932 if (!REG_P (op1))
37933 op1 = copy_to_mode_reg (SImode, op1);
37934 if (!REG_P (op2))
37935 op2 = copy_to_mode_reg (SImode, op2);
37937 emit_insn (fcode == IX86_BUILTIN_MONITOR
37938 ? ix86_gen_monitor (op0, op1, op2)
37939 : ix86_gen_monitorx (op0, op1, op2));
37940 return 0;
37942 case IX86_BUILTIN_MWAIT:
37943 arg0 = CALL_EXPR_ARG (exp, 0);
37944 arg1 = CALL_EXPR_ARG (exp, 1);
37945 op0 = expand_normal (arg0);
37946 op1 = expand_normal (arg1);
37947 if (!REG_P (op0))
37948 op0 = copy_to_mode_reg (SImode, op0);
37949 if (!REG_P (op1))
37950 op1 = copy_to_mode_reg (SImode, op1);
37951 emit_insn (gen_sse3_mwait (op0, op1));
37952 return 0;
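/* Illustrative sketch, assuming <pmmintrin.h> and -msse3: the MONITOR/MWAIT
   pair expanded above corresponds to

	#include <pmmintrin.h>
	_mm_monitor (addr, 0, 0);	/* arm the monitor on ADDR */
	_mm_mwait (0, 0);		/* wait until the monitored line is written */

   with the extension/hint arguments forced into SImode registers.  */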
37954 case IX86_BUILTIN_MWAITX:
37955 arg0 = CALL_EXPR_ARG (exp, 0);
37956 arg1 = CALL_EXPR_ARG (exp, 1);
37957 arg2 = CALL_EXPR_ARG (exp, 2);
37958 op0 = expand_normal (arg0);
37959 op1 = expand_normal (arg1);
37960 op2 = expand_normal (arg2);
37961 if (!REG_P (op0))
37962 op0 = copy_to_mode_reg (SImode, op0);
37963 if (!REG_P (op1))
37964 op1 = copy_to_mode_reg (SImode, op1);
37965 if (!REG_P (op2))
37966 op2 = copy_to_mode_reg (SImode, op2);
37967 emit_insn (gen_mwaitx (op0, op1, op2));
37968 return 0;
37970 case IX86_BUILTIN_CLZERO:
37971 arg0 = CALL_EXPR_ARG (exp, 0);
37972 op0 = expand_normal (arg0);
37973 if (!REG_P (op0))
37974 op0 = ix86_zero_extend_to_Pmode (op0);
37975 emit_insn (ix86_gen_clzero (op0));
37976 return 0;
37978 case IX86_BUILTIN_VEC_INIT_V2SI:
37979 case IX86_BUILTIN_VEC_INIT_V4HI:
37980 case IX86_BUILTIN_VEC_INIT_V8QI:
37981 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
37983 case IX86_BUILTIN_VEC_EXT_V2DF:
37984 case IX86_BUILTIN_VEC_EXT_V2DI:
37985 case IX86_BUILTIN_VEC_EXT_V4SF:
37986 case IX86_BUILTIN_VEC_EXT_V4SI:
37987 case IX86_BUILTIN_VEC_EXT_V8HI:
37988 case IX86_BUILTIN_VEC_EXT_V2SI:
37989 case IX86_BUILTIN_VEC_EXT_V4HI:
37990 case IX86_BUILTIN_VEC_EXT_V16QI:
37991 return ix86_expand_vec_ext_builtin (exp, target);
37993 case IX86_BUILTIN_VEC_SET_V2DI:
37994 case IX86_BUILTIN_VEC_SET_V4SF:
37995 case IX86_BUILTIN_VEC_SET_V4SI:
37996 case IX86_BUILTIN_VEC_SET_V8HI:
37997 case IX86_BUILTIN_VEC_SET_V4HI:
37998 case IX86_BUILTIN_VEC_SET_V16QI:
37999 return ix86_expand_vec_set_builtin (exp);
38001 case IX86_BUILTIN_NANQ:
38002 case IX86_BUILTIN_NANSQ:
38003 return expand_call (exp, target, ignore);
38005 case IX86_BUILTIN_RDPMC:
38006 case IX86_BUILTIN_RDTSC:
38007 case IX86_BUILTIN_RDTSCP:
38008 case IX86_BUILTIN_XGETBV:
38010 op0 = gen_reg_rtx (DImode);
38011 op1 = gen_reg_rtx (DImode);
38013 if (fcode == IX86_BUILTIN_RDPMC)
38015 arg0 = CALL_EXPR_ARG (exp, 0);
38016 op2 = expand_normal (arg0);
38017 if (!register_operand (op2, SImode))
38018 op2 = copy_to_mode_reg (SImode, op2);
38020 insn = (TARGET_64BIT
38021 ? gen_rdpmc_rex64 (op0, op1, op2)
38022 : gen_rdpmc (op0, op2));
38023 emit_insn (insn);
38025 else if (fcode == IX86_BUILTIN_XGETBV)
38027 arg0 = CALL_EXPR_ARG (exp, 0);
38028 op2 = expand_normal (arg0);
38029 if (!register_operand (op2, SImode))
38030 op2 = copy_to_mode_reg (SImode, op2);
38032 insn = (TARGET_64BIT
38033 ? gen_xgetbv_rex64 (op0, op1, op2)
38034 : gen_xgetbv (op0, op2));
38035 emit_insn (insn);
38037 else if (fcode == IX86_BUILTIN_RDTSC)
38039 insn = (TARGET_64BIT
38040 ? gen_rdtsc_rex64 (op0, op1)
38041 : gen_rdtsc (op0));
38042 emit_insn (insn);
38044 else
38046 op2 = gen_reg_rtx (SImode);
38048 insn = (TARGET_64BIT
38049 ? gen_rdtscp_rex64 (op0, op1, op2)
38050 : gen_rdtscp (op0, op2));
38051 emit_insn (insn);
38053 arg0 = CALL_EXPR_ARG (exp, 0);
38054 op4 = expand_normal (arg0);
38055 if (!address_operand (op4, VOIDmode))
38057 op4 = convert_memory_address (Pmode, op4);
38058 op4 = copy_addr_to_reg (op4);
38060 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
38063 if (target == 0)
38065 /* mode is VOIDmode if __builtin_rd* has been called
38066 without lhs. */
38067 if (mode == VOIDmode)
38068 return target;
38069 target = gen_reg_rtx (mode);
38072 if (TARGET_64BIT)
38074 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
38075 op1, 1, OPTAB_DIRECT);
38076 op0 = expand_simple_binop (DImode, IOR, op0, op1,
38077 op0, 1, OPTAB_DIRECT);
38080 emit_move_insn (target, op0);
38081 return target;
38083 case IX86_BUILTIN_FXSAVE:
38084 case IX86_BUILTIN_FXRSTOR:
38085 case IX86_BUILTIN_FXSAVE64:
38086 case IX86_BUILTIN_FXRSTOR64:
38087 case IX86_BUILTIN_FNSTENV:
38088 case IX86_BUILTIN_FLDENV:
38089 mode0 = BLKmode;
38090 switch (fcode)
38092 case IX86_BUILTIN_FXSAVE:
38093 icode = CODE_FOR_fxsave;
38094 break;
38095 case IX86_BUILTIN_FXRSTOR:
38096 icode = CODE_FOR_fxrstor;
38097 break;
38098 case IX86_BUILTIN_FXSAVE64:
38099 icode = CODE_FOR_fxsave64;
38100 break;
38101 case IX86_BUILTIN_FXRSTOR64:
38102 icode = CODE_FOR_fxrstor64;
38103 break;
38104 case IX86_BUILTIN_FNSTENV:
38105 icode = CODE_FOR_fnstenv;
38106 break;
38107 case IX86_BUILTIN_FLDENV:
38108 icode = CODE_FOR_fldenv;
38109 break;
38110 default:
38111 gcc_unreachable ();
38114 arg0 = CALL_EXPR_ARG (exp, 0);
38115 op0 = expand_normal (arg0);
38117 if (!address_operand (op0, VOIDmode))
38119 op0 = convert_memory_address (Pmode, op0);
38120 op0 = copy_addr_to_reg (op0);
38122 op0 = gen_rtx_MEM (mode0, op0);
38124 pat = GEN_FCN (icode) (op0);
38125 if (pat)
38126 emit_insn (pat);
38127 return 0;
38129 case IX86_BUILTIN_XSETBV:
38130 arg0 = CALL_EXPR_ARG (exp, 0);
38131 arg1 = CALL_EXPR_ARG (exp, 1);
38132 op0 = expand_normal (arg0);
38133 op1 = expand_normal (arg1);
38135 if (!REG_P (op0))
38136 op0 = copy_to_mode_reg (SImode, op0);
38138 if (TARGET_64BIT)
38140 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
38141 NULL, 1, OPTAB_DIRECT);
38143 op2 = gen_lowpart (SImode, op2);
38144 op1 = gen_lowpart (SImode, op1);
38145 if (!REG_P (op1))
38146 op1 = copy_to_mode_reg (SImode, op1);
38147 if (!REG_P (op2))
38148 op2 = copy_to_mode_reg (SImode, op2);
38149 icode = CODE_FOR_xsetbv_rex64;
38150 pat = GEN_FCN (icode) (op0, op1, op2);
38152 else
38154 if (!REG_P (op1))
38155 op1 = copy_to_mode_reg (DImode, op1);
38156 icode = CODE_FOR_xsetbv;
38157 pat = GEN_FCN (icode) (op0, op1);
38159 if (pat)
38160 emit_insn (pat);
38161 return 0;
38163 case IX86_BUILTIN_XSAVE:
38164 case IX86_BUILTIN_XRSTOR:
38165 case IX86_BUILTIN_XSAVE64:
38166 case IX86_BUILTIN_XRSTOR64:
38167 case IX86_BUILTIN_XSAVEOPT:
38168 case IX86_BUILTIN_XSAVEOPT64:
38169 case IX86_BUILTIN_XSAVES:
38170 case IX86_BUILTIN_XRSTORS:
38171 case IX86_BUILTIN_XSAVES64:
38172 case IX86_BUILTIN_XRSTORS64:
38173 case IX86_BUILTIN_XSAVEC:
38174 case IX86_BUILTIN_XSAVEC64:
38175 arg0 = CALL_EXPR_ARG (exp, 0);
38176 arg1 = CALL_EXPR_ARG (exp, 1);
38177 op0 = expand_normal (arg0);
38178 op1 = expand_normal (arg1);
38180 if (!address_operand (op0, VOIDmode))
38182 op0 = convert_memory_address (Pmode, op0);
38183 op0 = copy_addr_to_reg (op0);
38185 op0 = gen_rtx_MEM (BLKmode, op0);
38187 op1 = force_reg (DImode, op1);
38189 if (TARGET_64BIT)
38191 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
38192 NULL, 1, OPTAB_DIRECT);
38193 switch (fcode)
38195 case IX86_BUILTIN_XSAVE:
38196 icode = CODE_FOR_xsave_rex64;
38197 break;
38198 case IX86_BUILTIN_XRSTOR:
38199 icode = CODE_FOR_xrstor_rex64;
38200 break;
38201 case IX86_BUILTIN_XSAVE64:
38202 icode = CODE_FOR_xsave64;
38203 break;
38204 case IX86_BUILTIN_XRSTOR64:
38205 icode = CODE_FOR_xrstor64;
38206 break;
38207 case IX86_BUILTIN_XSAVEOPT:
38208 icode = CODE_FOR_xsaveopt_rex64;
38209 break;
38210 case IX86_BUILTIN_XSAVEOPT64:
38211 icode = CODE_FOR_xsaveopt64;
38212 break;
38213 case IX86_BUILTIN_XSAVES:
38214 icode = CODE_FOR_xsaves_rex64;
38215 break;
38216 case IX86_BUILTIN_XRSTORS:
38217 icode = CODE_FOR_xrstors_rex64;
38218 break;
38219 case IX86_BUILTIN_XSAVES64:
38220 icode = CODE_FOR_xsaves64;
38221 break;
38222 case IX86_BUILTIN_XRSTORS64:
38223 icode = CODE_FOR_xrstors64;
38224 break;
38225 case IX86_BUILTIN_XSAVEC:
38226 icode = CODE_FOR_xsavec_rex64;
38227 break;
38228 case IX86_BUILTIN_XSAVEC64:
38229 icode = CODE_FOR_xsavec64;
38230 break;
38231 default:
38232 gcc_unreachable ();
38235 op2 = gen_lowpart (SImode, op2);
38236 op1 = gen_lowpart (SImode, op1);
38237 pat = GEN_FCN (icode) (op0, op1, op2);
38239 else
38241 switch (fcode)
38243 case IX86_BUILTIN_XSAVE:
38244 icode = CODE_FOR_xsave;
38245 break;
38246 case IX86_BUILTIN_XRSTOR:
38247 icode = CODE_FOR_xrstor;
38248 break;
38249 case IX86_BUILTIN_XSAVEOPT:
38250 icode = CODE_FOR_xsaveopt;
38251 break;
38252 case IX86_BUILTIN_XSAVES:
38253 icode = CODE_FOR_xsaves;
38254 break;
38255 case IX86_BUILTIN_XRSTORS:
38256 icode = CODE_FOR_xrstors;
38257 break;
38258 case IX86_BUILTIN_XSAVEC:
38259 icode = CODE_FOR_xsavec;
38260 break;
38261 default:
38262 gcc_unreachable ();
38264 pat = GEN_FCN (icode) (op0, op1);
38267 if (pat)
38268 emit_insn (pat);
38269 return 0;
38271 case IX86_BUILTIN_LLWPCB:
38272 arg0 = CALL_EXPR_ARG (exp, 0);
38273 op0 = expand_normal (arg0);
38274 icode = CODE_FOR_lwp_llwpcb;
38275 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38276 op0 = ix86_zero_extend_to_Pmode (op0);
38277 emit_insn (gen_lwp_llwpcb (op0));
38278 return 0;
38280 case IX86_BUILTIN_SLWPCB:
38281 icode = CODE_FOR_lwp_slwpcb;
38282 if (!target
38283 || !insn_data[icode].operand[0].predicate (target, Pmode))
38284 target = gen_reg_rtx (Pmode);
38285 emit_insn (gen_lwp_slwpcb (target));
38286 return target;
38288 case IX86_BUILTIN_BEXTRI32:
38289 case IX86_BUILTIN_BEXTRI64:
38290 arg0 = CALL_EXPR_ARG (exp, 0);
38291 arg1 = CALL_EXPR_ARG (exp, 1);
38292 op0 = expand_normal (arg0);
38293 op1 = expand_normal (arg1);
38294 icode = (fcode == IX86_BUILTIN_BEXTRI32
38295 ? CODE_FOR_tbm_bextri_si
38296 : CODE_FOR_tbm_bextri_di);
38297 if (!CONST_INT_P (op1))
38299 error ("last argument must be an immediate");
38300 return const0_rtx;
38302 else
38304 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
38305 unsigned char lsb_index = INTVAL (op1) & 0xFF;
38306 op1 = GEN_INT (length);
38307 op2 = GEN_INT (lsb_index);
38308 pat = GEN_FCN (icode) (target, op0, op1, op2);
38309 if (pat)
38310 emit_insn (pat);
38311 return target;
38314 case IX86_BUILTIN_RDRAND16_STEP:
38315 icode = CODE_FOR_rdrandhi_1;
38316 mode0 = HImode;
38317 goto rdrand_step;
38319 case IX86_BUILTIN_RDRAND32_STEP:
38320 icode = CODE_FOR_rdrandsi_1;
38321 mode0 = SImode;
38322 goto rdrand_step;
38324 case IX86_BUILTIN_RDRAND64_STEP:
38325 icode = CODE_FOR_rdranddi_1;
38326 mode0 = DImode;
38328 rdrand_step:
38329 arg0 = CALL_EXPR_ARG (exp, 0);
38330 op1 = expand_normal (arg0);
38331 if (!address_operand (op1, VOIDmode))
38333 op1 = convert_memory_address (Pmode, op1);
38334 op1 = copy_addr_to_reg (op1);
38337 op0 = gen_reg_rtx (mode0);
38338 emit_insn (GEN_FCN (icode) (op0));
38340 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
38342 op1 = gen_reg_rtx (SImode);
38343 emit_move_insn (op1, CONST1_RTX (SImode));
38345 /* Emit SImode conditional move. */
38346 if (mode0 == HImode)
38348 if (TARGET_ZERO_EXTEND_WITH_AND
38349 && optimize_function_for_speed_p (cfun))
38351 op2 = force_reg (SImode, const0_rtx);
38353 emit_insn (gen_movstricthi
38354 (gen_lowpart (HImode, op2), op0));
38356 else
38358 op2 = gen_reg_rtx (SImode);
38360 emit_insn (gen_zero_extendhisi2 (op2, op0));
38363 else if (mode0 == SImode)
38364 op2 = op0;
38365 else
38366 op2 = gen_rtx_SUBREG (SImode, op0, 0);
38368 if (target == 0
38369 || !register_operand (target, SImode))
38370 target = gen_reg_rtx (SImode);
38372 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
38373 const0_rtx);
38374 emit_insn (gen_rtx_SET (target,
38375 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
38376 return target;
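/* Illustrative sketch, assuming <immintrin.h> and -mrdrnd: the *_STEP
   builtins return the carry flag so callers can retry, e.g.

	#include <immintrin.h>
	unsigned int r;
	while (!_rdrand32_step (&r))
	  ;	/* RDRAND may transiently fail; loop until CF is set */

   The conditional move emitted above is what turns CF into the 0/1
   return value.  */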
38378 case IX86_BUILTIN_RDSEED16_STEP:
38379 icode = CODE_FOR_rdseedhi_1;
38380 mode0 = HImode;
38381 goto rdseed_step;
38383 case IX86_BUILTIN_RDSEED32_STEP:
38384 icode = CODE_FOR_rdseedsi_1;
38385 mode0 = SImode;
38386 goto rdseed_step;
38388 case IX86_BUILTIN_RDSEED64_STEP:
38389 icode = CODE_FOR_rdseeddi_1;
38390 mode0 = DImode;
38392 rdseed_step:
38393 arg0 = CALL_EXPR_ARG (exp, 0);
38394 op1 = expand_normal (arg0);
38395 if (!address_operand (op1, VOIDmode))
38397 op1 = convert_memory_address (Pmode, op1);
38398 op1 = copy_addr_to_reg (op1);
38401 op0 = gen_reg_rtx (mode0);
38402 emit_insn (GEN_FCN (icode) (op0));
38404 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
38406 op2 = gen_reg_rtx (QImode);
38408 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
38409 const0_rtx);
38410 emit_insn (gen_rtx_SET (op2, pat));
38412 if (target == 0
38413 || !register_operand (target, SImode))
38414 target = gen_reg_rtx (SImode);
38416 emit_insn (gen_zero_extendqisi2 (target, op2));
38417 return target;
38419 case IX86_BUILTIN_SBB32:
38420 icode = CODE_FOR_subborrowsi;
38421 mode0 = SImode;
38422 goto handlecarry;
38424 case IX86_BUILTIN_SBB64:
38425 icode = CODE_FOR_subborrowdi;
38426 mode0 = DImode;
38427 goto handlecarry;
38429 case IX86_BUILTIN_ADDCARRYX32:
38430 icode = CODE_FOR_addcarrysi;
38431 mode0 = SImode;
38432 goto handlecarry;
38434 case IX86_BUILTIN_ADDCARRYX64:
38435 icode = CODE_FOR_addcarrydi;
38436 mode0 = DImode;
38438 handlecarry:
38439 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
38440 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
38441 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
38442 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
38444 op1 = expand_normal (arg0);
38445 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
38447 op2 = expand_normal (arg1);
38448 if (!register_operand (op2, mode0))
38449 op2 = copy_to_mode_reg (mode0, op2);
38451 op3 = expand_normal (arg2);
38452 if (!register_operand (op3, mode0))
38453 op3 = copy_to_mode_reg (mode0, op3);
38455 op4 = expand_normal (arg3);
38456 if (!address_operand (op4, VOIDmode))
38458 op4 = convert_memory_address (Pmode, op4);
38459 op4 = copy_addr_to_reg (op4);
38462 /* Generate CF from input operand. */
38463 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
38465 /* Generate instruction that consumes CF. */
38466 op0 = gen_reg_rtx (mode0);
38468 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
38469 pat = gen_rtx_LTU (mode0, op1, const0_rtx);
38470 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat));
38472 /* Return current CF value. */
38473 if (target == 0)
38474 target = gen_reg_rtx (QImode);
38476 PUT_MODE (pat, QImode);
38477 emit_insn (gen_rtx_SET (target, pat));
38479 /* Store the result. */
38480 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
38482 return target;
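/* Illustrative sketch, assuming <x86intrin.h> provides _addcarry_u32 (as
   recent GCC does): the add-with-carry builtins chain through CF, e.g. a
   64-bit add built from two 32-bit halves:

	#include <x86intrin.h>
	unsigned int lo, hi;
	unsigned char c = _addcarry_u32 (0, a_lo, b_lo, &lo);
	c = _addcarry_u32 (c, a_hi, b_hi, &hi);

   The expansion above regenerates CF from the c_in byte and returns the
   resulting carry as the QImode value.  */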
38484 case IX86_BUILTIN_READ_FLAGS:
38485 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
38487 if (optimize
38488 || target == NULL_RTX
38489 || !nonimmediate_operand (target, word_mode)
38490 || GET_MODE (target) != word_mode)
38491 target = gen_reg_rtx (word_mode);
38493 emit_insn (gen_pop (target));
38494 return target;
38496 case IX86_BUILTIN_WRITE_FLAGS:
38498 arg0 = CALL_EXPR_ARG (exp, 0);
38499 op0 = expand_normal (arg0);
38500 if (!general_no_elim_operand (op0, word_mode))
38501 op0 = copy_to_mode_reg (word_mode, op0);
38503 emit_insn (gen_push (op0));
38504 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
38505 return 0;
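/* Illustrative sketch: these two cases implement the EFLAGS access builtins
   (on 64-bit targets typically reachable as __builtin_ia32_readeflags_u64 /
   __builtin_ia32_writeeflags_u64; names assumed here), roughly

	unsigned long long f = __builtin_ia32_readeflags_u64 ();
	__builtin_ia32_writeeflags_u64 (f | 0x400);	/* e.g. set DF */

   via a push/pop of the flags register as above.  */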
38507 case IX86_BUILTIN_KTESTC8:
38508 icode = CODE_FOR_ktestqi;
38509 mode3 = CCCmode;
38510 goto kortest;
38512 case IX86_BUILTIN_KTESTZ8:
38513 icode = CODE_FOR_ktestqi;
38514 mode3 = CCZmode;
38515 goto kortest;
38517 case IX86_BUILTIN_KTESTC16:
38518 icode = CODE_FOR_ktesthi;
38519 mode3 = CCCmode;
38520 goto kortest;
38522 case IX86_BUILTIN_KTESTZ16:
38523 icode = CODE_FOR_ktesthi;
38524 mode3 = CCZmode;
38525 goto kortest;
38527 case IX86_BUILTIN_KTESTC32:
38528 icode = CODE_FOR_ktestsi;
38529 mode3 = CCCmode;
38530 goto kortest;
38532 case IX86_BUILTIN_KTESTZ32:
38533 icode = CODE_FOR_ktestsi;
38534 mode3 = CCZmode;
38535 goto kortest;
38537 case IX86_BUILTIN_KTESTC64:
38538 icode = CODE_FOR_ktestdi;
38539 mode3 = CCCmode;
38540 goto kortest;
38542 case IX86_BUILTIN_KTESTZ64:
38543 icode = CODE_FOR_ktestdi;
38544 mode3 = CCZmode;
38545 goto kortest;
38547 case IX86_BUILTIN_KORTESTC8:
38548 icode = CODE_FOR_kortestqi;
38549 mode3 = CCCmode;
38550 goto kortest;
38552 case IX86_BUILTIN_KORTESTZ8:
38553 icode = CODE_FOR_kortestqi;
38554 mode3 = CCZmode;
38555 goto kortest;
38557 case IX86_BUILTIN_KORTESTC16:
38558 icode = CODE_FOR_kortesthi;
38559 mode3 = CCCmode;
38560 goto kortest;
38562 case IX86_BUILTIN_KORTESTZ16:
38563 icode = CODE_FOR_kortesthi;
38564 mode3 = CCZmode;
38565 goto kortest;
38567 case IX86_BUILTIN_KORTESTC32:
38568 icode = CODE_FOR_kortestsi;
38569 mode3 = CCCmode;
38570 goto kortest;
38572 case IX86_BUILTIN_KORTESTZ32:
38573 icode = CODE_FOR_kortestsi;
38574 mode3 = CCZmode;
38575 goto kortest;
38577 case IX86_BUILTIN_KORTESTC64:
38578 icode = CODE_FOR_kortestdi;
38579 mode3 = CCCmode;
38580 goto kortest;
38582 case IX86_BUILTIN_KORTESTZ64:
38583 icode = CODE_FOR_kortestdi;
38584 mode3 = CCZmode;
38586 kortest:
38587 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
38588 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
38589 op0 = expand_normal (arg0);
38590 op1 = expand_normal (arg1);
38592 mode0 = insn_data[icode].operand[0].mode;
38593 mode1 = insn_data[icode].operand[1].mode;
38595 if (GET_MODE (op0) != VOIDmode)
38596 op0 = force_reg (GET_MODE (op0), op0);
38598 op0 = gen_lowpart (mode0, op0);
38600 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38601 op0 = copy_to_mode_reg (mode0, op0);
38603 if (GET_MODE (op1) != VOIDmode)
38604 op1 = force_reg (GET_MODE (op1), op1);
38606 op1 = gen_lowpart (mode1, op1);
38608 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38609 op1 = copy_to_mode_reg (mode1, op1);
38611 target = gen_reg_rtx (QImode);
38613 /* Emit kortest. */
38614 emit_insn (GEN_FCN (icode) (op0, op1));
38615 /* And use setcc to return result from flags. */
38616 ix86_expand_setcc (target, EQ,
38617 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
38618 return target;
38620 case IX86_BUILTIN_GATHERSIV2DF:
38621 icode = CODE_FOR_avx2_gathersiv2df;
38622 goto gather_gen;
38623 case IX86_BUILTIN_GATHERSIV4DF:
38624 icode = CODE_FOR_avx2_gathersiv4df;
38625 goto gather_gen;
38626 case IX86_BUILTIN_GATHERDIV2DF:
38627 icode = CODE_FOR_avx2_gatherdiv2df;
38628 goto gather_gen;
38629 case IX86_BUILTIN_GATHERDIV4DF:
38630 icode = CODE_FOR_avx2_gatherdiv4df;
38631 goto gather_gen;
38632 case IX86_BUILTIN_GATHERSIV4SF:
38633 icode = CODE_FOR_avx2_gathersiv4sf;
38634 goto gather_gen;
38635 case IX86_BUILTIN_GATHERSIV8SF:
38636 icode = CODE_FOR_avx2_gathersiv8sf;
38637 goto gather_gen;
38638 case IX86_BUILTIN_GATHERDIV4SF:
38639 icode = CODE_FOR_avx2_gatherdiv4sf;
38640 goto gather_gen;
38641 case IX86_BUILTIN_GATHERDIV8SF:
38642 icode = CODE_FOR_avx2_gatherdiv8sf;
38643 goto gather_gen;
38644 case IX86_BUILTIN_GATHERSIV2DI:
38645 icode = CODE_FOR_avx2_gathersiv2di;
38646 goto gather_gen;
38647 case IX86_BUILTIN_GATHERSIV4DI:
38648 icode = CODE_FOR_avx2_gathersiv4di;
38649 goto gather_gen;
38650 case IX86_BUILTIN_GATHERDIV2DI:
38651 icode = CODE_FOR_avx2_gatherdiv2di;
38652 goto gather_gen;
38653 case IX86_BUILTIN_GATHERDIV4DI:
38654 icode = CODE_FOR_avx2_gatherdiv4di;
38655 goto gather_gen;
38656 case IX86_BUILTIN_GATHERSIV4SI:
38657 icode = CODE_FOR_avx2_gathersiv4si;
38658 goto gather_gen;
38659 case IX86_BUILTIN_GATHERSIV8SI:
38660 icode = CODE_FOR_avx2_gathersiv8si;
38661 goto gather_gen;
38662 case IX86_BUILTIN_GATHERDIV4SI:
38663 icode = CODE_FOR_avx2_gatherdiv4si;
38664 goto gather_gen;
38665 case IX86_BUILTIN_GATHERDIV8SI:
38666 icode = CODE_FOR_avx2_gatherdiv8si;
38667 goto gather_gen;
38668 case IX86_BUILTIN_GATHERALTSIV4DF:
38669 icode = CODE_FOR_avx2_gathersiv4df;
38670 goto gather_gen;
38671 case IX86_BUILTIN_GATHERALTDIV8SF:
38672 icode = CODE_FOR_avx2_gatherdiv8sf;
38673 goto gather_gen;
38674 case IX86_BUILTIN_GATHERALTSIV4DI:
38675 icode = CODE_FOR_avx2_gathersiv4di;
38676 goto gather_gen;
38677 case IX86_BUILTIN_GATHERALTDIV8SI:
38678 icode = CODE_FOR_avx2_gatherdiv8si;
38679 goto gather_gen;
38680 case IX86_BUILTIN_GATHER3SIV16SF:
38681 icode = CODE_FOR_avx512f_gathersiv16sf;
38682 goto gather_gen;
38683 case IX86_BUILTIN_GATHER3SIV8DF:
38684 icode = CODE_FOR_avx512f_gathersiv8df;
38685 goto gather_gen;
38686 case IX86_BUILTIN_GATHER3DIV16SF:
38687 icode = CODE_FOR_avx512f_gatherdiv16sf;
38688 goto gather_gen;
38689 case IX86_BUILTIN_GATHER3DIV8DF:
38690 icode = CODE_FOR_avx512f_gatherdiv8df;
38691 goto gather_gen;
38692 case IX86_BUILTIN_GATHER3SIV16SI:
38693 icode = CODE_FOR_avx512f_gathersiv16si;
38694 goto gather_gen;
38695 case IX86_BUILTIN_GATHER3SIV8DI:
38696 icode = CODE_FOR_avx512f_gathersiv8di;
38697 goto gather_gen;
38698 case IX86_BUILTIN_GATHER3DIV16SI:
38699 icode = CODE_FOR_avx512f_gatherdiv16si;
38700 goto gather_gen;
38701 case IX86_BUILTIN_GATHER3DIV8DI:
38702 icode = CODE_FOR_avx512f_gatherdiv8di;
38703 goto gather_gen;
38704 case IX86_BUILTIN_GATHER3ALTSIV8DF:
38705 icode = CODE_FOR_avx512f_gathersiv8df;
38706 goto gather_gen;
38707 case IX86_BUILTIN_GATHER3ALTDIV16SF:
38708 icode = CODE_FOR_avx512f_gatherdiv16sf;
38709 goto gather_gen;
38710 case IX86_BUILTIN_GATHER3ALTSIV8DI:
38711 icode = CODE_FOR_avx512f_gathersiv8di;
38712 goto gather_gen;
38713 case IX86_BUILTIN_GATHER3ALTDIV16SI:
38714 icode = CODE_FOR_avx512f_gatherdiv16si;
38715 goto gather_gen;
38716 case IX86_BUILTIN_GATHER3SIV2DF:
38717 icode = CODE_FOR_avx512vl_gathersiv2df;
38718 goto gather_gen;
38719 case IX86_BUILTIN_GATHER3SIV4DF:
38720 icode = CODE_FOR_avx512vl_gathersiv4df;
38721 goto gather_gen;
38722 case IX86_BUILTIN_GATHER3DIV2DF:
38723 icode = CODE_FOR_avx512vl_gatherdiv2df;
38724 goto gather_gen;
38725 case IX86_BUILTIN_GATHER3DIV4DF:
38726 icode = CODE_FOR_avx512vl_gatherdiv4df;
38727 goto gather_gen;
38728 case IX86_BUILTIN_GATHER3SIV4SF:
38729 icode = CODE_FOR_avx512vl_gathersiv4sf;
38730 goto gather_gen;
38731 case IX86_BUILTIN_GATHER3SIV8SF:
38732 icode = CODE_FOR_avx512vl_gathersiv8sf;
38733 goto gather_gen;
38734 case IX86_BUILTIN_GATHER3DIV4SF:
38735 icode = CODE_FOR_avx512vl_gatherdiv4sf;
38736 goto gather_gen;
38737 case IX86_BUILTIN_GATHER3DIV8SF:
38738 icode = CODE_FOR_avx512vl_gatherdiv8sf;
38739 goto gather_gen;
38740 case IX86_BUILTIN_GATHER3SIV2DI:
38741 icode = CODE_FOR_avx512vl_gathersiv2di;
38742 goto gather_gen;
38743 case IX86_BUILTIN_GATHER3SIV4DI:
38744 icode = CODE_FOR_avx512vl_gathersiv4di;
38745 goto gather_gen;
38746 case IX86_BUILTIN_GATHER3DIV2DI:
38747 icode = CODE_FOR_avx512vl_gatherdiv2di;
38748 goto gather_gen;
38749 case IX86_BUILTIN_GATHER3DIV4DI:
38750 icode = CODE_FOR_avx512vl_gatherdiv4di;
38751 goto gather_gen;
38752 case IX86_BUILTIN_GATHER3SIV4SI:
38753 icode = CODE_FOR_avx512vl_gathersiv4si;
38754 goto gather_gen;
38755 case IX86_BUILTIN_GATHER3SIV8SI:
38756 icode = CODE_FOR_avx512vl_gathersiv8si;
38757 goto gather_gen;
38758 case IX86_BUILTIN_GATHER3DIV4SI:
38759 icode = CODE_FOR_avx512vl_gatherdiv4si;
38760 goto gather_gen;
38761 case IX86_BUILTIN_GATHER3DIV8SI:
38762 icode = CODE_FOR_avx512vl_gatherdiv8si;
38763 goto gather_gen;
38764 case IX86_BUILTIN_GATHER3ALTSIV4DF:
38765 icode = CODE_FOR_avx512vl_gathersiv4df;
38766 goto gather_gen;
38767 case IX86_BUILTIN_GATHER3ALTDIV8SF:
38768 icode = CODE_FOR_avx512vl_gatherdiv8sf;
38769 goto gather_gen;
38770 case IX86_BUILTIN_GATHER3ALTSIV4DI:
38771 icode = CODE_FOR_avx512vl_gathersiv4di;
38772 goto gather_gen;
38773 case IX86_BUILTIN_GATHER3ALTDIV8SI:
38774 icode = CODE_FOR_avx512vl_gatherdiv8si;
38775 goto gather_gen;
38776 case IX86_BUILTIN_SCATTERSIV16SF:
38777 icode = CODE_FOR_avx512f_scattersiv16sf;
38778 goto scatter_gen;
38779 case IX86_BUILTIN_SCATTERSIV8DF:
38780 icode = CODE_FOR_avx512f_scattersiv8df;
38781 goto scatter_gen;
38782 case IX86_BUILTIN_SCATTERDIV16SF:
38783 icode = CODE_FOR_avx512f_scatterdiv16sf;
38784 goto scatter_gen;
38785 case IX86_BUILTIN_SCATTERDIV8DF:
38786 icode = CODE_FOR_avx512f_scatterdiv8df;
38787 goto scatter_gen;
38788 case IX86_BUILTIN_SCATTERSIV16SI:
38789 icode = CODE_FOR_avx512f_scattersiv16si;
38790 goto scatter_gen;
38791 case IX86_BUILTIN_SCATTERSIV8DI:
38792 icode = CODE_FOR_avx512f_scattersiv8di;
38793 goto scatter_gen;
38794 case IX86_BUILTIN_SCATTERDIV16SI:
38795 icode = CODE_FOR_avx512f_scatterdiv16si;
38796 goto scatter_gen;
38797 case IX86_BUILTIN_SCATTERDIV8DI:
38798 icode = CODE_FOR_avx512f_scatterdiv8di;
38799 goto scatter_gen;
38800 case IX86_BUILTIN_SCATTERSIV8SF:
38801 icode = CODE_FOR_avx512vl_scattersiv8sf;
38802 goto scatter_gen;
38803 case IX86_BUILTIN_SCATTERSIV4SF:
38804 icode = CODE_FOR_avx512vl_scattersiv4sf;
38805 goto scatter_gen;
38806 case IX86_BUILTIN_SCATTERSIV4DF:
38807 icode = CODE_FOR_avx512vl_scattersiv4df;
38808 goto scatter_gen;
38809 case IX86_BUILTIN_SCATTERSIV2DF:
38810 icode = CODE_FOR_avx512vl_scattersiv2df;
38811 goto scatter_gen;
38812 case IX86_BUILTIN_SCATTERDIV8SF:
38813 icode = CODE_FOR_avx512vl_scatterdiv8sf;
38814 goto scatter_gen;
38815 case IX86_BUILTIN_SCATTERDIV4SF:
38816 icode = CODE_FOR_avx512vl_scatterdiv4sf;
38817 goto scatter_gen;
38818 case IX86_BUILTIN_SCATTERDIV4DF:
38819 icode = CODE_FOR_avx512vl_scatterdiv4df;
38820 goto scatter_gen;
38821 case IX86_BUILTIN_SCATTERDIV2DF:
38822 icode = CODE_FOR_avx512vl_scatterdiv2df;
38823 goto scatter_gen;
38824 case IX86_BUILTIN_SCATTERSIV8SI:
38825 icode = CODE_FOR_avx512vl_scattersiv8si;
38826 goto scatter_gen;
38827 case IX86_BUILTIN_SCATTERSIV4SI:
38828 icode = CODE_FOR_avx512vl_scattersiv4si;
38829 goto scatter_gen;
38830 case IX86_BUILTIN_SCATTERSIV4DI:
38831 icode = CODE_FOR_avx512vl_scattersiv4di;
38832 goto scatter_gen;
38833 case IX86_BUILTIN_SCATTERSIV2DI:
38834 icode = CODE_FOR_avx512vl_scattersiv2di;
38835 goto scatter_gen;
38836 case IX86_BUILTIN_SCATTERDIV8SI:
38837 icode = CODE_FOR_avx512vl_scatterdiv8si;
38838 goto scatter_gen;
38839 case IX86_BUILTIN_SCATTERDIV4SI:
38840 icode = CODE_FOR_avx512vl_scatterdiv4si;
38841 goto scatter_gen;
38842 case IX86_BUILTIN_SCATTERDIV4DI:
38843 icode = CODE_FOR_avx512vl_scatterdiv4di;
38844 goto scatter_gen;
38845 case IX86_BUILTIN_SCATTERDIV2DI:
38846 icode = CODE_FOR_avx512vl_scatterdiv2di;
38847 goto scatter_gen;
38848 case IX86_BUILTIN_GATHERPFDPD:
38849 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
38850 goto vec_prefetch_gen;
38851 case IX86_BUILTIN_SCATTERALTSIV8DF:
38852 icode = CODE_FOR_avx512f_scattersiv8df;
38853 goto scatter_gen;
38854 case IX86_BUILTIN_SCATTERALTDIV16SF:
38855 icode = CODE_FOR_avx512f_scatterdiv16sf;
38856 goto scatter_gen;
38857 case IX86_BUILTIN_SCATTERALTSIV8DI:
38858 icode = CODE_FOR_avx512f_scattersiv8di;
38859 goto scatter_gen;
38860 case IX86_BUILTIN_SCATTERALTDIV16SI:
38861 icode = CODE_FOR_avx512f_scatterdiv16si;
38862 goto scatter_gen;
38863 case IX86_BUILTIN_GATHERPFDPS:
38864 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
38865 goto vec_prefetch_gen;
38866 case IX86_BUILTIN_GATHERPFQPD:
38867 icode = CODE_FOR_avx512pf_gatherpfv8didf;
38868 goto vec_prefetch_gen;
38869 case IX86_BUILTIN_GATHERPFQPS:
38870 icode = CODE_FOR_avx512pf_gatherpfv8disf;
38871 goto vec_prefetch_gen;
38872 case IX86_BUILTIN_SCATTERPFDPD:
38873 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
38874 goto vec_prefetch_gen;
38875 case IX86_BUILTIN_SCATTERPFDPS:
38876 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
38877 goto vec_prefetch_gen;
38878 case IX86_BUILTIN_SCATTERPFQPD:
38879 icode = CODE_FOR_avx512pf_scatterpfv8didf;
38880 goto vec_prefetch_gen;
38881 case IX86_BUILTIN_SCATTERPFQPS:
38882 icode = CODE_FOR_avx512pf_scatterpfv8disf;
38883 goto vec_prefetch_gen;
38885 gather_gen:
38886 rtx half;
38887 rtx (*gen) (rtx, rtx);
38889 arg0 = CALL_EXPR_ARG (exp, 0);
38890 arg1 = CALL_EXPR_ARG (exp, 1);
38891 arg2 = CALL_EXPR_ARG (exp, 2);
38892 arg3 = CALL_EXPR_ARG (exp, 3);
38893 arg4 = CALL_EXPR_ARG (exp, 4);
38894 op0 = expand_normal (arg0);
38895 op1 = expand_normal (arg1);
38896 op2 = expand_normal (arg2);
38897 op3 = expand_normal (arg3);
38898 op4 = expand_normal (arg4);
38899 /* Note the arg order is different from the operand order. */
38900 mode0 = insn_data[icode].operand[1].mode;
38901 mode2 = insn_data[icode].operand[3].mode;
38902 mode3 = insn_data[icode].operand[4].mode;
38903 mode4 = insn_data[icode].operand[5].mode;
38905 if (target == NULL_RTX
38906 || GET_MODE (target) != insn_data[icode].operand[0].mode
38907 || !insn_data[icode].operand[0].predicate (target,
38908 GET_MODE (target)))
38909 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
38910 else
38911 subtarget = target;
38913 switch (fcode)
38915 case IX86_BUILTIN_GATHER3ALTSIV8DF:
38916 case IX86_BUILTIN_GATHER3ALTSIV8DI:
38917 half = gen_reg_rtx (V8SImode);
38918 if (!nonimmediate_operand (op2, V16SImode))
38919 op2 = copy_to_mode_reg (V16SImode, op2);
38920 emit_insn (gen_vec_extract_lo_v16si (half, op2));
38921 op2 = half;
38922 break;
38923 case IX86_BUILTIN_GATHER3ALTSIV4DF:
38924 case IX86_BUILTIN_GATHER3ALTSIV4DI:
38925 case IX86_BUILTIN_GATHERALTSIV4DF:
38926 case IX86_BUILTIN_GATHERALTSIV4DI:
38927 half = gen_reg_rtx (V4SImode);
38928 if (!nonimmediate_operand (op2, V8SImode))
38929 op2 = copy_to_mode_reg (V8SImode, op2);
38930 emit_insn (gen_vec_extract_lo_v8si (half, op2));
38931 op2 = half;
38932 break;
38933 case IX86_BUILTIN_GATHER3ALTDIV16SF:
38934 case IX86_BUILTIN_GATHER3ALTDIV16SI:
38935 half = gen_reg_rtx (mode0);
38936 if (mode0 == V8SFmode)
38937 gen = gen_vec_extract_lo_v16sf;
38938 else
38939 gen = gen_vec_extract_lo_v16si;
38940 if (!nonimmediate_operand (op0, GET_MODE (op0)))
38941 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
38942 emit_insn (gen (half, op0));
38943 op0 = half;
38944 if (GET_MODE (op3) != VOIDmode)
38946 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38947 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38948 emit_insn (gen (half, op3));
38949 op3 = half;
38951 break;
38952 case IX86_BUILTIN_GATHER3ALTDIV8SF:
38953 case IX86_BUILTIN_GATHER3ALTDIV8SI:
38954 case IX86_BUILTIN_GATHERALTDIV8SF:
38955 case IX86_BUILTIN_GATHERALTDIV8SI:
38956 half = gen_reg_rtx (mode0);
38957 if (mode0 == V4SFmode)
38958 gen = gen_vec_extract_lo_v8sf;
38959 else
38960 gen = gen_vec_extract_lo_v8si;
38961 if (!nonimmediate_operand (op0, GET_MODE (op0)))
38962 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
38963 emit_insn (gen (half, op0));
38964 op0 = half;
38965 if (GET_MODE (op3) != VOIDmode)
38967 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38968 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38969 emit_insn (gen (half, op3));
38970 op3 = half;
38972 break;
38973 default:
38974 break;
38977 /* Force the memory operand to use only a base register here. We
38978 don't want to do this for the memory operands of other builtin
38979 functions. */
38980 op1 = ix86_zero_extend_to_Pmode (op1);
38982 if (!insn_data[icode].operand[1].predicate (op0, mode0))
38983 op0 = copy_to_mode_reg (mode0, op0);
38984 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
38985 op1 = copy_to_mode_reg (Pmode, op1);
38986 if (!insn_data[icode].operand[3].predicate (op2, mode2))
38987 op2 = copy_to_mode_reg (mode2, op2);
38989 op3 = fixup_modeless_constant (op3, mode3);
38991 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
38993 if (!insn_data[icode].operand[4].predicate (op3, mode3))
38994 op3 = copy_to_mode_reg (mode3, op3);
38996 else
38998 op3 = copy_to_reg (op3);
38999 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
39001 if (!insn_data[icode].operand[5].predicate (op4, mode4))
39003 error ("the last argument must be scale 1, 2, 4, 8");
39004 return const0_rtx;
39007 /* Optimize. If mask is known to have all high bits set,
39008 replace op0 with pc_rtx to signal that the instruction
39009 overwrites the whole destination and doesn't use its
39010 previous contents. */
39011 if (optimize)
39013 if (TREE_CODE (arg3) == INTEGER_CST)
39015 if (integer_all_onesp (arg3))
39016 op0 = pc_rtx;
39018 else if (TREE_CODE (arg3) == VECTOR_CST)
39020 unsigned int negative = 0;
39021 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
39023 tree cst = VECTOR_CST_ELT (arg3, i);
39024 if (TREE_CODE (cst) == INTEGER_CST
39025 && tree_int_cst_sign_bit (cst))
39026 negative++;
39027 else if (TREE_CODE (cst) == REAL_CST
39028 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
39029 negative++;
39031 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
39032 op0 = pc_rtx;
39034 else if (TREE_CODE (arg3) == SSA_NAME
39035 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
39037 /* Recognize also when mask is like:
39038 __v2df src = _mm_setzero_pd ();
39039 __v2df mask = _mm_cmpeq_pd (src, src);
39041 __v8sf src = _mm256_setzero_ps ();
39042 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
39043 as that is a cheaper way to load all ones into
39044 a register than having to load a constant from
39045 memory. */
39046 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
39047 if (is_gimple_call (def_stmt))
39049 tree fndecl = gimple_call_fndecl (def_stmt);
39050 if (fndecl
39051 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
39052 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
39054 case IX86_BUILTIN_CMPPD:
39055 case IX86_BUILTIN_CMPPS:
39056 case IX86_BUILTIN_CMPPD256:
39057 case IX86_BUILTIN_CMPPS256:
39058 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
39059 break;
39060 /* FALLTHRU */
39061 case IX86_BUILTIN_CMPEQPD:
39062 case IX86_BUILTIN_CMPEQPS:
39063 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
39064 && initializer_zerop (gimple_call_arg (def_stmt,
39065 1)))
39066 op0 = pc_rtx;
39067 break;
39068 default:
39069 break;
39075 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
39076 if (! pat)
39077 return const0_rtx;
39078 emit_insn (pat);
39080 switch (fcode)
39082 case IX86_BUILTIN_GATHER3DIV16SF:
39083 if (target == NULL_RTX)
39084 target = gen_reg_rtx (V8SFmode);
39085 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
39086 break;
39087 case IX86_BUILTIN_GATHER3DIV16SI:
39088 if (target == NULL_RTX)
39089 target = gen_reg_rtx (V8SImode);
39090 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
39091 break;
39092 case IX86_BUILTIN_GATHER3DIV8SF:
39093 case IX86_BUILTIN_GATHERDIV8SF:
39094 if (target == NULL_RTX)
39095 target = gen_reg_rtx (V4SFmode);
39096 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
39097 break;
39098 case IX86_BUILTIN_GATHER3DIV8SI:
39099 case IX86_BUILTIN_GATHERDIV8SI:
39100 if (target == NULL_RTX)
39101 target = gen_reg_rtx (V4SImode);
39102 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
39103 break;
39104 default:
39105 target = subtarget;
39106 break;
39108 return target;
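/* Illustrative sketch, assuming <immintrin.h> and -mavx2: one of the gathers
   expanded above at the source level, with the scale operand that must be
   the literal 1, 2, 4 or 8 checked earlier:

	#include <immintrin.h>
	__m256d g (const double *base, __m128i idx)
	{
	  return _mm256_i32gather_pd (base, idx, 8);
	}

   Masked variants additionally pass the writemask seen here as op0/arg3.  */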
39110 scatter_gen:
39111 arg0 = CALL_EXPR_ARG (exp, 0);
39112 arg1 = CALL_EXPR_ARG (exp, 1);
39113 arg2 = CALL_EXPR_ARG (exp, 2);
39114 arg3 = CALL_EXPR_ARG (exp, 3);
39115 arg4 = CALL_EXPR_ARG (exp, 4);
39116 op0 = expand_normal (arg0);
39117 op1 = expand_normal (arg1);
39118 op2 = expand_normal (arg2);
39119 op3 = expand_normal (arg3);
39120 op4 = expand_normal (arg4);
39121 mode1 = insn_data[icode].operand[1].mode;
39122 mode2 = insn_data[icode].operand[2].mode;
39123 mode3 = insn_data[icode].operand[3].mode;
39124 mode4 = insn_data[icode].operand[4].mode;
39126 /* Scatter instruction stores operand op3 to memory with
39127 indices from op2 and scale from op4 under writemask op1.
39128 If index operand op2 has more elements than source operand
39129 op3, one needs to use only its low half, and vice versa. */
39130 switch (fcode)
39132 case IX86_BUILTIN_SCATTERALTSIV8DF:
39133 case IX86_BUILTIN_SCATTERALTSIV8DI:
39134 half = gen_reg_rtx (V8SImode);
39135 if (!nonimmediate_operand (op2, V16SImode))
39136 op2 = copy_to_mode_reg (V16SImode, op2);
39137 emit_insn (gen_vec_extract_lo_v16si (half, op2));
39138 op2 = half;
39139 break;
39140 case IX86_BUILTIN_SCATTERALTDIV16SF:
39141 case IX86_BUILTIN_SCATTERALTDIV16SI:
39142 half = gen_reg_rtx (mode3);
39143 if (mode3 == V8SFmode)
39144 gen = gen_vec_extract_lo_v16sf;
39145 else
39146 gen = gen_vec_extract_lo_v16si;
39147 if (!nonimmediate_operand (op3, GET_MODE (op3)))
39148 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
39149 emit_insn (gen (half, op3));
39150 op3 = half;
39151 break;
39152 default:
39153 break;
39156 /* Force the memory operand to use only a base register here. We
39157 don't want to do this for the memory operands of other builtin
39158 functions. */
39159 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
39161 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
39162 op0 = copy_to_mode_reg (Pmode, op0);
39164 op1 = fixup_modeless_constant (op1, mode1);
39166 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
39168 if (!insn_data[icode].operand[1].predicate (op1, mode1))
39169 op1 = copy_to_mode_reg (mode1, op1);
39171 else
39173 op1 = copy_to_reg (op1);
39174 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
39177 if (!insn_data[icode].operand[2].predicate (op2, mode2))
39178 op2 = copy_to_mode_reg (mode2, op2);
39180 if (!insn_data[icode].operand[3].predicate (op3, mode3))
39181 op3 = copy_to_mode_reg (mode3, op3);
39183 if (!insn_data[icode].operand[4].predicate (op4, mode4))
39185 error ("the last argument must be scale 1, 2, 4, 8");
39186 return const0_rtx;
39189 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
39190 if (! pat)
39191 return const0_rtx;
39193 emit_insn (pat);
39194 return 0;
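/* Illustrative sketch, assuming <immintrin.h> and -mavx512f: a scatter
   corresponding to the expansion above, again with a literal scale:

	#include <immintrin.h>
	void s (float *base, __m512i idx, __m512 val)
	{
	  _mm512_i32scatter_ps (base, idx, val, 4);
	}

   The masked forms route the extra __mmask16 through op1 above.  */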
39196 vec_prefetch_gen:
39197 arg0 = CALL_EXPR_ARG (exp, 0);
39198 arg1 = CALL_EXPR_ARG (exp, 1);
39199 arg2 = CALL_EXPR_ARG (exp, 2);
39200 arg3 = CALL_EXPR_ARG (exp, 3);
39201 arg4 = CALL_EXPR_ARG (exp, 4);
39202 op0 = expand_normal (arg0);
39203 op1 = expand_normal (arg1);
39204 op2 = expand_normal (arg2);
39205 op3 = expand_normal (arg3);
39206 op4 = expand_normal (arg4);
39207 mode0 = insn_data[icode].operand[0].mode;
39208 mode1 = insn_data[icode].operand[1].mode;
39209 mode3 = insn_data[icode].operand[3].mode;
39210 mode4 = insn_data[icode].operand[4].mode;
39212 op0 = fixup_modeless_constant (op0, mode0);
39214 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
39216 if (!insn_data[icode].operand[0].predicate (op0, mode0))
39217 op0 = copy_to_mode_reg (mode0, op0);
39219 else
39221 op0 = copy_to_reg (op0);
39222 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
39225 if (!insn_data[icode].operand[1].predicate (op1, mode1))
39226 op1 = copy_to_mode_reg (mode1, op1);
39228 /* Force the memory operand to use only a base register here. We
39229 don't want to do this for the memory operands of other builtin
39230 functions. */
39231 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
39233 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
39234 op2 = copy_to_mode_reg (Pmode, op2);
39236 if (!insn_data[icode].operand[3].predicate (op3, mode3))
39238 error ("the fourth argument must be scale 1, 2, 4, 8");
39239 return const0_rtx;
39242 if (!insn_data[icode].operand[4].predicate (op4, mode4))
39244 error ("incorrect hint operand");
39245 return const0_rtx;
39248 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
39249 if (! pat)
39250 return const0_rtx;
39252 emit_insn (pat);
39254 return 0;
39256 case IX86_BUILTIN_XABORT:
39257 icode = CODE_FOR_xabort;
39258 arg0 = CALL_EXPR_ARG (exp, 0);
39259 op0 = expand_normal (arg0);
39260 mode0 = insn_data[icode].operand[0].mode;
39261 if (!insn_data[icode].operand[0].predicate (op0, mode0))
39263 error ("the xabort's argument must be an 8-bit immediate");
39264 return const0_rtx;
39266 emit_insn (gen_xabort (op0));
39267 return 0;
39269 default:
39270 break;
39273 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
39274 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
39276 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
39277 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
39278 target);
39281 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
39282 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
39284 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
39285 switch (fcode)
39287 case IX86_BUILTIN_FABSQ:
39288 case IX86_BUILTIN_COPYSIGNQ:
39289 if (!TARGET_SSE)
39290 /* Emit a normal call if SSE isn't available. */
39291 return expand_call (exp, target, ignore);
39292 /* FALLTHRU */
39293 default:
39294 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
39298 if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
39299 && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
39301 i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
39302 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
39303 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
39304 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
39305 int masked = 1;
39306 machine_mode mode, wide_mode, nar_mode;
39308 nar_mode = V4SFmode;
39309 mode = V16SFmode;
39310 wide_mode = V64SFmode;
39311 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
39312 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
39314 switch (fcode)
39316 case IX86_BUILTIN_4FMAPS:
39317 fcn = gen_avx5124fmaddps_4fmaddps;
39318 masked = 0;
39319 goto v4fma_expand;
39321 case IX86_BUILTIN_4DPWSSD:
39322 nar_mode = V4SImode;
39323 mode = V16SImode;
39324 wide_mode = V64SImode;
39325 fcn = gen_avx5124vnniw_vp4dpwssd;
39326 masked = 0;
39327 goto v4fma_expand;
39329 case IX86_BUILTIN_4DPWSSDS:
39330 nar_mode = V4SImode;
39331 mode = V16SImode;
39332 wide_mode = V64SImode;
39333 fcn = gen_avx5124vnniw_vp4dpwssds;
39334 masked = 0;
39335 goto v4fma_expand;
39337 case IX86_BUILTIN_4FNMAPS:
39338 fcn = gen_avx5124fmaddps_4fnmaddps;
39339 masked = 0;
39340 goto v4fma_expand;
39342 case IX86_BUILTIN_4FNMAPS_MASK:
39343 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
39344 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
39345 goto v4fma_expand;
39347 case IX86_BUILTIN_4DPWSSD_MASK:
39348 nar_mode = V4SImode;
39349 mode = V16SImode;
39350 wide_mode = V64SImode;
39351 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
39352 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
39353 goto v4fma_expand;
39355 case IX86_BUILTIN_4DPWSSDS_MASK:
39356 nar_mode = V4SImode;
39357 mode = V16SImode;
39358 wide_mode = V64SImode;
39359 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
39360 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
39361 goto v4fma_expand;
39363 case IX86_BUILTIN_4FMAPS_MASK:
39365 tree args[4];
39366 rtx ops[4];
39367 rtx wide_reg;
39368 rtx accum;
39369 rtx addr;
39370 rtx mem;
39372 v4fma_expand:
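      /* The v4fmadd*/vp4dpwssd* instructions read their multiplicand
	 operands as a block of four consecutive vector registers, so
	 collect the four source operands into one wide (4x-sized) pseudo.  */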
39373 wide_reg = gen_reg_rtx (wide_mode);
39374 for (i = 0; i < 4; i++)
39376 args[i] = CALL_EXPR_ARG (exp, i);
39377 ops[i] = expand_normal (args[i]);
39379 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
39380 ops[i]);
39383 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
39384 accum = force_reg (mode, accum);
39386 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
39387 addr = force_reg (Pmode, addr);
39389 mem = gen_rtx_MEM (nar_mode, addr);
39391 target = gen_reg_rtx (mode);
39393 emit_move_insn (target, accum);
39395 if (! masked)
39396 emit_insn (fcn (target, accum, wide_reg, mem));
39397 else
39399 rtx merge, mask;
39400 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
39402 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
39404 if (CONST_INT_P (mask))
39405 mask = fixup_modeless_constant (mask, HImode);
39407 mask = force_reg (HImode, mask);
39409 if (GET_MODE (mask) != HImode)
39410 mask = gen_rtx_SUBREG (HImode, mask, 0);
39412 /* If merge is 0 then we're about to emit z-masked variant. */
39413 if (const0_operand (merge, mode))
39414 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
39415 /* If merge is the same as accum then emit merge-masked variant. */
39416 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
39418 merge = force_reg (mode, merge);
39419 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
39421 /* Merge with something unknown might happen if we z-mask w/ -O0. */
39422 else
39424 target = gen_reg_rtx (mode);
39425 emit_move_insn (target, merge);
39426 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
39429 return target;
39432 case IX86_BUILTIN_4FNMASS:
39433 fcn = gen_avx5124fmaddps_4fnmaddss;
39434 masked = 0;
39435 goto s4fma_expand;
39437 case IX86_BUILTIN_4FMASS:
39438 fcn = gen_avx5124fmaddps_4fmaddss;
39439 masked = 0;
39440 goto s4fma_expand;
39442 case IX86_BUILTIN_4FNMASS_MASK:
39443 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
39444 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
39445 goto s4fma_expand;
39447 case IX86_BUILTIN_4FMASS_MASK:
39449 tree args[4];
39450 rtx ops[4];
39451 rtx wide_reg;
39452 rtx accum;
39453 rtx addr;
39454 rtx mem;
39456 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
39457 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
39459 s4fma_expand:
39460 mode = V4SFmode;
39461 wide_reg = gen_reg_rtx (V64SFmode);
39462 for (i = 0; i < 4; i++)
39464 rtx tmp;
39465 args[i] = CALL_EXPR_ARG (exp, i);
39466 ops[i] = expand_normal (args[i]);
39468 tmp = gen_reg_rtx (SFmode);
39469 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
39471 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
39472 gen_rtx_SUBREG (V16SFmode, tmp, 0));
39475 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
39476 accum = force_reg (V4SFmode, accum);
39478 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
39479 addr = force_reg (Pmode, addr);
39481 mem = gen_rtx_MEM (V4SFmode, addr);
39483 target = gen_reg_rtx (V4SFmode);
39485 emit_move_insn (target, accum);
39487 if (! masked)
39488 emit_insn (fcn (target, accum, wide_reg, mem));
39489 else
39491 rtx merge, mask;
39492 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
39494 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
39496 if (CONST_INT_P (mask))
39497 mask = fixup_modeless_constant (mask, QImode);
39499 mask = force_reg (QImode, mask);
39501 if (GET_MODE (mask) != QImode)
39502 mask = gen_rtx_SUBREG (QImode, mask, 0);
39504 /* If merge is 0 then we're about to emit z-masked variant. */
39505 if (const0_operand (merge, mode))
39506 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
39507 /* If merge is the same as accum then emit merge-masked
39508 variant. */
39509 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
39511 merge = force_reg (mode, merge);
39512 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
39514 /* Merge with something unknown might happen if we z-mask
39515 w/ -O0. */
39516 else
39518 target = gen_reg_rtx (mode);
39519 emit_move_insn (target, merge);
39520 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
39523 return target;
39525 case IX86_BUILTIN_RDPID:
39526 return ix86_expand_special_args_builtin (bdesc_args2 + i, exp,
39527 target);
39528 default:
39529 return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
39533 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
39534 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
39536 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
39537 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
39540 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
39541 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
39543 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
39544 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
39547 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
39548 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
39550 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
39551 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
39554 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
39555 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
39557 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
39558 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
39561 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
39562 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
39564 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
39565 const struct builtin_description *d = bdesc_multi_arg + i;
39566 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
39567 (enum ix86_builtin_func_type)
39568 d->flag, d->comparison);
39571 gcc_unreachable ();
39574 /* This returns the target-specific builtin with code CODE if
39575 current_function_decl has visibility on this builtin, which is checked
39576 using isa flags. Returns NULL_TREE otherwise. */
39578 static tree ix86_get_builtin (enum ix86_builtins code)
39580 struct cl_target_option *opts;
39581 tree target_tree = NULL_TREE;
39583 /* Determine the isa flags of current_function_decl. */
39585 if (current_function_decl)
39586 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
39588 if (target_tree == NULL)
39589 target_tree = target_option_default_node;
39591 opts = TREE_TARGET_OPTION (target_tree);
39593 if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
39594 || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
39595 return ix86_builtin_decl (code, true);
39596 else
39597 return NULL_TREE;
39600 /* Return the function decl for the target-specific builtin
39601 for the given MPX builtin passed in FCODE. */
39602 static tree
39603 ix86_builtin_mpx_function (unsigned fcode)
39605 switch (fcode)
39607 case BUILT_IN_CHKP_BNDMK:
39608 return ix86_builtins[IX86_BUILTIN_BNDMK];
39610 case BUILT_IN_CHKP_BNDSTX:
39611 return ix86_builtins[IX86_BUILTIN_BNDSTX];
39613 case BUILT_IN_CHKP_BNDLDX:
39614 return ix86_builtins[IX86_BUILTIN_BNDLDX];
39616 case BUILT_IN_CHKP_BNDCL:
39617 return ix86_builtins[IX86_BUILTIN_BNDCL];
39619 case BUILT_IN_CHKP_BNDCU:
39620 return ix86_builtins[IX86_BUILTIN_BNDCU];
39622 case BUILT_IN_CHKP_BNDRET:
39623 return ix86_builtins[IX86_BUILTIN_BNDRET];
39625 case BUILT_IN_CHKP_INTERSECT:
39626 return ix86_builtins[IX86_BUILTIN_BNDINT];
39628 case BUILT_IN_CHKP_NARROW:
39629 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
39631 case BUILT_IN_CHKP_SIZEOF:
39632 return ix86_builtins[IX86_BUILTIN_SIZEOF];
39634 case BUILT_IN_CHKP_EXTRACT_LOWER:
39635 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
39637 case BUILT_IN_CHKP_EXTRACT_UPPER:
39638 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
39640 default:
39641 return NULL_TREE;
39644 gcc_unreachable ();
39647 /* Helper function for ix86_load_bounds and ix86_store_bounds.
39649 Return an address to be used to load/store bounds for pointer
39650 passed in SLOT.
39652 SLOT_NO is an integer constant holding the number of a target-
39653 dependent special slot to be used in case SLOT is not a memory.
39655 SPECIAL_BASE is a pointer to be used as a base of fake address
39656 to access special slots in Bounds Table. SPECIAL_BASE[-1],
39657 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
39659 static rtx
39660 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
39662 rtx addr = NULL;
39664 /* A NULL slot means we pass bounds for a pointer not passed to the
39665 function at all. A register slot means we pass the pointer in a
39666 register. In both these cases bounds are passed via the Bounds
39667 Table. Since we do not have the actual pointer stored in memory,
39668 we have to use fake addresses to access the Bounds Table. We
39669 start with (special_base - sizeof (void*)) and decrease this
39670 address by the pointer size to get addresses for other slots. */
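      /* For example, with 64-bit pointers slot number 0 is accessed at
	 special_base - 8, slot number 1 at special_base - 16, and so on.  */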
39671 if (!slot || REG_P (slot))
39673 gcc_assert (CONST_INT_P (slot_no));
39674 addr = plus_constant (Pmode, special_base,
39675 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
39677 /* If pointer is passed in a memory then its address is used to
39678 access Bounds Table. */
39679 else if (MEM_P (slot))
39681 addr = XEXP (slot, 0);
39682 if (!register_operand (addr, Pmode))
39683 addr = copy_addr_to_reg (addr);
39685 else
39686 gcc_unreachable ();
39688 return addr;
39691 /* Expand pass uses this hook to load bounds for function parameter
39692 PTR passed in SLOT in case its bounds are not passed in a register.
39694 If SLOT is a memory, then bounds are loaded as for a regular pointer
39695 loaded from memory. PTR may be NULL in case SLOT is a memory.
39696 In that case the value of PTR (if required) may be loaded from SLOT.
39698 If SLOT is NULL or a register then SLOT_NO is an integer constant
39699 holding the number of the target-dependent special slot which should be
39700 used to obtain bounds.
39702 Return loaded bounds. */
39704 static rtx
39705 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
39707 rtx reg = gen_reg_rtx (BNDmode);
39708 rtx addr;
39710 /* Get address to be used to access Bounds Table. Special slots start
39711 at the location of return address of the current function. */
39712 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
39714 /* Load pointer value from a memory if we don't have it. */
39715 if (!ptr)
39717 gcc_assert (MEM_P (slot));
39718 ptr = copy_addr_to_reg (slot);
39721 if (!register_operand (ptr, Pmode))
39722 ptr = ix86_zero_extend_to_Pmode (ptr);
39724 emit_insn (BNDmode == BND64mode
39725 ? gen_bnd64_ldx (reg, addr, ptr)
39726 : gen_bnd32_ldx (reg, addr, ptr));
39728 return reg;
39731 /* Expand pass uses this hook to store BOUNDS for call argument PTR
39732 passed in SLOT in case BOUNDS are not passed in a register.
39734 If SLOT is a memory, then BOUNDS are stored as for a regular pointer
39735 stored in memory. PTR may be NULL in case SLOT is a memory.
39736 In that case the value of PTR (if required) may be loaded from SLOT.
39738 If SLOT is NULL or a register then SLOT_NO is an integer constant
39739 holding the number of the target-dependent special slot which should be
39740 used to store BOUNDS.
39742 static void
39743 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
39745 rtx addr;
39747 /* Get address to be used to access Bounds Table. Special slots start
39748 at the location of return address of a called function. */
39749 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
39751 /* Load pointer value from a memory if we don't have it. */
39752 if (!ptr)
39754 gcc_assert (MEM_P (slot));
39755 ptr = copy_addr_to_reg (slot);
39758 if (!register_operand (ptr, Pmode))
39759 ptr = ix86_zero_extend_to_Pmode (ptr);
39761 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
39762 if (!register_operand (bounds, BNDmode))
39763 bounds = copy_to_mode_reg (BNDmode, bounds);
39765 emit_insn (BNDmode == BND64mode
39766 ? gen_bnd64_stx (addr, ptr, bounds)
39767 : gen_bnd32_stx (addr, ptr, bounds));
39770 /* Load and return bounds returned by function in SLOT. */
39772 static rtx
39773 ix86_load_returned_bounds (rtx slot)
39775 rtx res;
39777 gcc_assert (REG_P (slot));
39778 res = gen_reg_rtx (BNDmode);
39779 emit_move_insn (res, slot);
39781 return res;
39784 /* Store BOUNDS returned by function into SLOT. */
39786 static void
39787 ix86_store_returned_bounds (rtx slot, rtx bounds)
39789 gcc_assert (REG_P (slot));
39790 emit_move_insn (slot, bounds);
39793 /* Returns a function decl for a vectorized version of the combined function
39794 with combined_fn code FN and the result vector type TYPE, or NULL_TREE
39795 if it is not available. */
39797 static tree
39798 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
39799 tree type_in)
39801 machine_mode in_mode, out_mode;
39802 int in_n, out_n;
39804 if (TREE_CODE (type_out) != VECTOR_TYPE
39805 || TREE_CODE (type_in) != VECTOR_TYPE)
39806 return NULL_TREE;
39808 out_mode = TYPE_MODE (TREE_TYPE (type_out));
39809 out_n = TYPE_VECTOR_SUBPARTS (type_out);
39810 in_mode = TYPE_MODE (TREE_TYPE (type_in));
39811 in_n = TYPE_VECTOR_SUBPARTS (type_in);
39813 switch (fn)
39815 CASE_CFN_EXP2:
39816 if (out_mode == SFmode && in_mode == SFmode)
39818 if (out_n == 16 && in_n == 16)
39819 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
39821 break;
39823 CASE_CFN_IFLOOR:
39824 CASE_CFN_LFLOOR:
39825 CASE_CFN_LLFLOOR:
39826 /* The round insn does not trap on denormals. */
39827 if (flag_trapping_math || !TARGET_ROUND)
39828 break;
39830 if (out_mode == SImode && in_mode == DFmode)
39832 if (out_n == 4 && in_n == 2)
39833 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
39834 else if (out_n == 8 && in_n == 4)
39835 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
39836 else if (out_n == 16 && in_n == 8)
39837 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
39839 if (out_mode == SImode && in_mode == SFmode)
39841 if (out_n == 4 && in_n == 4)
39842 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
39843 else if (out_n == 8 && in_n == 8)
39844 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
39845 else if (out_n == 16 && in_n == 16)
39846 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
39848 break;
39850 CASE_CFN_ICEIL:
39851 CASE_CFN_LCEIL:
39852 CASE_CFN_LLCEIL:
39853 /* The round insn does not trap on denormals. */
39854 if (flag_trapping_math || !TARGET_ROUND)
39855 break;
39857 if (out_mode == SImode && in_mode == DFmode)
39859 if (out_n == 4 && in_n == 2)
39860 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
39861 else if (out_n == 8 && in_n == 4)
39862 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
39863 else if (out_n == 16 && in_n == 8)
39864 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
39866 if (out_mode == SImode && in_mode == SFmode)
39868 if (out_n == 4 && in_n == 4)
39869 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
39870 else if (out_n == 8 && in_n == 8)
39871 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
39872 else if (out_n == 16 && in_n == 16)
39873 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
39875 break;
39877 CASE_CFN_IRINT:
39878 CASE_CFN_LRINT:
39879 CASE_CFN_LLRINT:
39880 if (out_mode == SImode && in_mode == DFmode)
39882 if (out_n == 4 && in_n == 2)
39883 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
39884 else if (out_n == 8 && in_n == 4)
39885 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
39886 else if (out_n == 16 && in_n == 8)
39887 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
39889 if (out_mode == SImode && in_mode == SFmode)
39891 if (out_n == 4 && in_n == 4)
39892 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
39893 else if (out_n == 8 && in_n == 8)
39894 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
39895 else if (out_n == 16 && in_n == 16)
39896 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
39898 break;
39900 CASE_CFN_IROUND:
39901 CASE_CFN_LROUND:
39902 CASE_CFN_LLROUND:
39903 /* The round insn does not trap on denormals. */
39904 if (flag_trapping_math || !TARGET_ROUND)
39905 break;
39907 if (out_mode == SImode && in_mode == DFmode)
39909 if (out_n == 4 && in_n == 2)
39910 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
39911 else if (out_n == 8 && in_n == 4)
39912 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
39913 else if (out_n == 16 && in_n == 8)
39914 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
39916 if (out_mode == SImode && in_mode == SFmode)
39918 if (out_n == 4 && in_n == 4)
39919 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
39920 else if (out_n == 8 && in_n == 8)
39921 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
39922 else if (out_n == 16 && in_n == 16)
39923 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
39925 break;
39927 CASE_CFN_FLOOR:
39928 /* The round insn does not trap on denormals. */
39929 if (flag_trapping_math || !TARGET_ROUND)
39930 break;
39932 if (out_mode == DFmode && in_mode == DFmode)
39934 if (out_n == 2 && in_n == 2)
39935 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
39936 else if (out_n == 4 && in_n == 4)
39937 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
39938 else if (out_n == 8 && in_n == 8)
39939 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
39941 if (out_mode == SFmode && in_mode == SFmode)
39943 if (out_n == 4 && in_n == 4)
39944 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
39945 else if (out_n == 8 && in_n == 8)
39946 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
39947 else if (out_n == 16 && in_n == 16)
39948 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
39950 break;
39952 CASE_CFN_CEIL:
39953 /* The round insn does not trap on denormals. */
39954 if (flag_trapping_math || !TARGET_ROUND)
39955 break;
39957 if (out_mode == DFmode && in_mode == DFmode)
39959 if (out_n == 2 && in_n == 2)
39960 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
39961 else if (out_n == 4 && in_n == 4)
39962 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
39963 else if (out_n == 8 && in_n == 8)
39964 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
39966 if (out_mode == SFmode && in_mode == SFmode)
39968 if (out_n == 4 && in_n == 4)
39969 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
39970 else if (out_n == 8 && in_n == 8)
39971 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
39972 else if (out_n == 16 && in_n == 16)
39973 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
39975 break;
39977 CASE_CFN_TRUNC:
39978 /* The round insn does not trap on denormals. */
39979 if (flag_trapping_math || !TARGET_ROUND)
39980 break;
39982 if (out_mode == DFmode && in_mode == DFmode)
39984 if (out_n == 2 && in_n == 2)
39985 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
39986 else if (out_n == 4 && in_n == 4)
39987 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
39988 else if (out_n == 8 && in_n == 8)
39989 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
39991 if (out_mode == SFmode && in_mode == SFmode)
39993 if (out_n == 4 && in_n == 4)
39994 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
39995 else if (out_n == 8 && in_n == 8)
39996 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
39997 else if (out_n == 16 && in_n == 16)
39998 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
40000 break;
40002 CASE_CFN_RINT:
40003 /* The round insn does not trap on denormals. */
40004 if (flag_trapping_math || !TARGET_ROUND)
40005 break;
40007 if (out_mode == DFmode && in_mode == DFmode)
40009 if (out_n == 2 && in_n == 2)
40010 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
40011 else if (out_n == 4 && in_n == 4)
40012 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
40014 if (out_mode == SFmode && in_mode == SFmode)
40016 if (out_n == 4 && in_n == 4)
40017 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
40018 else if (out_n == 8 && in_n == 8)
40019 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
40021 break;
40023 CASE_CFN_FMA:
40024 if (out_mode == DFmode && in_mode == DFmode)
40026 if (out_n == 2 && in_n == 2)
40027 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
40028 if (out_n == 4 && in_n == 4)
40029 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
40031 if (out_mode == SFmode && in_mode == SFmode)
40033 if (out_n == 4 && in_n == 4)
40034 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
40035 if (out_n == 8 && in_n == 8)
40036 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
40038 break;
40040 default:
40041 break;
40044 /* Dispatch to a handler for a vectorization library. */
40045 if (ix86_veclib_handler)
40046 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
40048 return NULL_TREE;
40051 /* Handler for an SVML-style interface to
40052 a library with vectorized intrinsics. */
40054 static tree
40055 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
40057 char name[20];
40058 tree fntype, new_fndecl, args;
40059 unsigned arity;
40060 const char *bname;
40061 machine_mode el_mode, in_mode;
40062 int n, in_n;
40064 /* The SVML is suitable for unsafe math only. */
40065 if (!flag_unsafe_math_optimizations)
40066 return NULL_TREE;
40068 el_mode = TYPE_MODE (TREE_TYPE (type_out));
40069 n = TYPE_VECTOR_SUBPARTS (type_out);
40070 in_mode = TYPE_MODE (TREE_TYPE (type_in));
40071 in_n = TYPE_VECTOR_SUBPARTS (type_in);
40072 if (el_mode != in_mode
40073 || n != in_n)
40074 return NULL_TREE;
40076 switch (fn)
40078 CASE_CFN_EXP:
40079 CASE_CFN_LOG:
40080 CASE_CFN_LOG10:
40081 CASE_CFN_POW:
40082 CASE_CFN_TANH:
40083 CASE_CFN_TAN:
40084 CASE_CFN_ATAN:
40085 CASE_CFN_ATAN2:
40086 CASE_CFN_ATANH:
40087 CASE_CFN_CBRT:
40088 CASE_CFN_SINH:
40089 CASE_CFN_SIN:
40090 CASE_CFN_ASINH:
40091 CASE_CFN_ASIN:
40092 CASE_CFN_COSH:
40093 CASE_CFN_COS:
40094 CASE_CFN_ACOSH:
40095 CASE_CFN_ACOS:
40096 if ((el_mode != DFmode || n != 2)
40097 && (el_mode != SFmode || n != 4))
40098 return NULL_TREE;
40099 break;
40101 default:
40102 return NULL_TREE;
40105 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
40106 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
40108 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
40109 strcpy (name, "vmlsLn4");
40110 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
40111 strcpy (name, "vmldLn2");
40112 else if (n == 4)
40114 sprintf (name, "vmls%s", bname+10);
40115 name[strlen (name)-1] = '4';
40117 else
40118 sprintf (name, "vmld%s2", bname+10);
40120 /* Convert to uppercase. */
40121 name[4] &= ~0x20;
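  /* For example, a 4-wide float sinf becomes "vmlsSin4" and a 2-wide double
     sin becomes "vmldSin2"; log is special-cased to the "Ln" names above.  */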
40123 arity = 0;
40124 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
40125 arity++;
40127 if (arity == 1)
40128 fntype = build_function_type_list (type_out, type_in, NULL);
40129 else
40130 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
40132 /* Build a function declaration for the vectorized function. */
40133 new_fndecl = build_decl (BUILTINS_LOCATION,
40134 FUNCTION_DECL, get_identifier (name), fntype);
40135 TREE_PUBLIC (new_fndecl) = 1;
40136 DECL_EXTERNAL (new_fndecl) = 1;
40137 DECL_IS_NOVOPS (new_fndecl) = 1;
40138 TREE_READONLY (new_fndecl) = 1;
40140 return new_fndecl;
40143 /* Handler for an ACML-style interface to
40144 a library with vectorized intrinsics. */
40146 static tree
40147 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
40149 char name[20] = "__vr.._";
40150 tree fntype, new_fndecl, args;
40151 unsigned arity;
40152 const char *bname;
40153 machine_mode el_mode, in_mode;
40154 int n, in_n;
40156 /* The ACML is 64-bit only and suitable for unsafe math only, as
40157 it does not correctly support parts of IEEE with the required
40158 precision such as denormals. */
40159 if (!TARGET_64BIT
40160 || !flag_unsafe_math_optimizations)
40161 return NULL_TREE;
40163 el_mode = TYPE_MODE (TREE_TYPE (type_out));
40164 n = TYPE_VECTOR_SUBPARTS (type_out);
40165 in_mode = TYPE_MODE (TREE_TYPE (type_in));
40166 in_n = TYPE_VECTOR_SUBPARTS (type_in);
40167 if (el_mode != in_mode
40168 || n != in_n)
40169 return NULL_TREE;
40171 switch (fn)
40173 CASE_CFN_SIN:
40174 CASE_CFN_COS:
40175 CASE_CFN_EXP:
40176 CASE_CFN_LOG:
40177 CASE_CFN_LOG2:
40178 CASE_CFN_LOG10:
40179 if (el_mode == DFmode && n == 2)
40181 name[4] = 'd';
40182 name[5] = '2';
40184 else if (el_mode == SFmode && n == 4)
40186 name[4] = 's';
40187 name[5] = '4';
40189 else
40190 return NULL_TREE;
40191 break;
40193 default:
40194 return NULL_TREE;
40197 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
40198 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
40199 sprintf (name + 7, "%s", bname+10);
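  /* For example, sin on V2DFmode yields "__vrd2_sin", while sinf on
     V4SFmode yields "__vrs4_sinf".  */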
40201 arity = 0;
40202 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
40203 arity++;
40205 if (arity == 1)
40206 fntype = build_function_type_list (type_out, type_in, NULL);
40207 else
40208 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
40210 /* Build a function declaration for the vectorized function. */
40211 new_fndecl = build_decl (BUILTINS_LOCATION,
40212 FUNCTION_DECL, get_identifier (name), fntype);
40213 TREE_PUBLIC (new_fndecl) = 1;
40214 DECL_EXTERNAL (new_fndecl) = 1;
40215 DECL_IS_NOVOPS (new_fndecl) = 1;
40216 TREE_READONLY (new_fndecl) = 1;
40218 return new_fndecl;
40221 /* Returns a decl of a function that implements gather load with
40222 memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
40223 Return NULL_TREE if it is not available. */
40225 static tree
40226 ix86_vectorize_builtin_gather (const_tree mem_vectype,
40227 const_tree index_type, int scale)
40229 bool si;
40230 enum ix86_builtins code;
40232 if (! TARGET_AVX2)
40233 return NULL_TREE;
40235 if ((TREE_CODE (index_type) != INTEGER_TYPE
40236 && !POINTER_TYPE_P (index_type))
40237 || (TYPE_MODE (index_type) != SImode
40238 && TYPE_MODE (index_type) != DImode))
40239 return NULL_TREE;
40241 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
40242 return NULL_TREE;
40244 /* v*gather* insn sign extends index to pointer mode. */
40245 if (TYPE_PRECISION (index_type) < POINTER_SIZE
40246 && TYPE_UNSIGNED (index_type))
40247 return NULL_TREE;
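  /* Scale can be 1, 2, 4 or 8.  */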
40249 if (scale <= 0
40250 || scale > 8
40251 || (scale & (scale - 1)) != 0)
40252 return NULL_TREE;
40254 si = TYPE_MODE (index_type) == SImode;
40255 switch (TYPE_MODE (mem_vectype))
40257 case V2DFmode:
40258 if (TARGET_AVX512VL)
40259 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
40260 else
40261 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
40262 break;
40263 case V4DFmode:
40264 if (TARGET_AVX512VL)
40265 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
40266 else
40267 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
40268 break;
40269 case V2DImode:
40270 if (TARGET_AVX512VL)
40271 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
40272 else
40273 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
40274 break;
40275 case V4DImode:
40276 if (TARGET_AVX512VL)
40277 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
40278 else
40279 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
40280 break;
40281 case V4SFmode:
40282 if (TARGET_AVX512VL)
40283 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
40284 else
40285 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
40286 break;
40287 case V8SFmode:
40288 if (TARGET_AVX512VL)
40289 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
40290 else
40291 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
40292 break;
40293 case V4SImode:
40294 if (TARGET_AVX512VL)
40295 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
40296 else
40297 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
40298 break;
40299 case V8SImode:
40300 if (TARGET_AVX512VL)
40301 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
40302 else
40303 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
40304 break;
40305 case V8DFmode:
40306 if (TARGET_AVX512F)
40307 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
40308 else
40309 return NULL_TREE;
40310 break;
40311 case V8DImode:
40312 if (TARGET_AVX512F)
40313 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
40314 else
40315 return NULL_TREE;
40316 break;
40317 case V16SFmode:
40318 if (TARGET_AVX512F)
40319 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
40320 else
40321 return NULL_TREE;
40322 break;
40323 case V16SImode:
40324 if (TARGET_AVX512F)
40325 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
40326 else
40327 return NULL_TREE;
40328 break;
40329 default:
40330 return NULL_TREE;
40333 return ix86_get_builtin (code);
40336 /* Returns a decl of a function that implements scatter store with
40337 register type VECTYPE and index type INDEX_TYPE and SCALE.
40338 Return NULL_TREE if it is not available. */
40340 static tree
40341 ix86_vectorize_builtin_scatter (const_tree vectype,
40342 const_tree index_type, int scale)
40344 bool si;
40345 enum ix86_builtins code;
40347 if (!TARGET_AVX512F)
40348 return NULL_TREE;
40350 if ((TREE_CODE (index_type) != INTEGER_TYPE
40351 && !POINTER_TYPE_P (index_type))
40352 || (TYPE_MODE (index_type) != SImode
40353 && TYPE_MODE (index_type) != DImode))
40354 return NULL_TREE;
40356 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
40357 return NULL_TREE;
40359 /* v*scatter* insn sign extends index to pointer mode. */
40360 if (TYPE_PRECISION (index_type) < POINTER_SIZE
40361 && TYPE_UNSIGNED (index_type))
40362 return NULL_TREE;
40364 /* Scale can be 1, 2, 4 or 8. */
40365 if (scale <= 0
40366 || scale > 8
40367 || (scale & (scale - 1)) != 0)
40368 return NULL_TREE;
40370 si = TYPE_MODE (index_type) == SImode;
40371 switch (TYPE_MODE (vectype))
40373 case V8DFmode:
40374 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
40375 break;
40376 case V8DImode:
40377 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
40378 break;
40379 case V16SFmode:
40380 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
40381 break;
40382 case V16SImode:
40383 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
40384 break;
40385 default:
40386 return NULL_TREE;
40389 return ix86_builtins[code];
40392 /* Return true if it is safe to use the rsqrt optabs to optimize
40393 1.0/sqrt. */
40395 static bool
40396 use_rsqrt_p ()
40398 return (TARGET_SSE_MATH
40399 && flag_finite_math_only
40400 && !flag_trapping_math
40401 && flag_unsafe_math_optimizations);
40404 /* Returns a code for a target-specific builtin that implements
40405 reciprocal of the function, or NULL_TREE if not available. */
40407 static tree
40408 ix86_builtin_reciprocal (tree fndecl)
40410 switch (DECL_FUNCTION_CODE (fndecl))
40412 /* Vectorized version of sqrt to rsqrt conversion. */
40413 case IX86_BUILTIN_SQRTPS_NR:
40414 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
40416 case IX86_BUILTIN_SQRTPS_NR256:
40417 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
40419 default:
40420 return NULL_TREE;
40424 /* Helper for avx_vpermilps256_operand et al. This is also used by
40425 the expansion functions to turn the parallel back into a mask.
40426 The return value is 0 for no match and the imm8+1 for a match. */
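/* For example, for V4SFmode the parallel [1 0 3 2] corresponds to the
   imm8 0xb1, so this function returns 0xb2.  */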
40429 avx_vpermilp_parallel (rtx par, machine_mode mode)
40431 unsigned i, nelt = GET_MODE_NUNITS (mode);
40432 unsigned mask = 0;
40433 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
40435 if (XVECLEN (par, 0) != (int) nelt)
40436 return 0;
40438 /* Validate that all of the elements are constants, and not totally
40439 out of range. Copy the data into an integral array to make the
40440 subsequent checks easier. */
40441 for (i = 0; i < nelt; ++i)
40443 rtx er = XVECEXP (par, 0, i);
40444 unsigned HOST_WIDE_INT ei;
40446 if (!CONST_INT_P (er))
40447 return 0;
40448 ei = INTVAL (er);
40449 if (ei >= nelt)
40450 return 0;
40451 ipar[i] = ei;
40454 switch (mode)
40456 case V8DFmode:
40457 /* In the 512-bit DFmode case, we can only move elements within
40458 a 128-bit lane. First fill the second part of the mask,
40459 then fallthru. */
40460 for (i = 4; i < 6; ++i)
40462 if (ipar[i] < 4 || ipar[i] >= 6)
40463 return 0;
40464 mask |= (ipar[i] - 4) << i;
40466 for (i = 6; i < 8; ++i)
40468 if (ipar[i] < 6)
40469 return 0;
40470 mask |= (ipar[i] - 6) << i;
40472 /* FALLTHRU */
40474 case V4DFmode:
40475 /* In the 256-bit DFmode case, we can only move elements within
40476 a 128-bit lane. */
40477 for (i = 0; i < 2; ++i)
40479 if (ipar[i] >= 2)
40480 return 0;
40481 mask |= ipar[i] << i;
40483 for (i = 2; i < 4; ++i)
40485 if (ipar[i] < 2)
40486 return 0;
40487 mask |= (ipar[i] - 2) << i;
40489 break;
40491 case V16SFmode:
40492 /* In the 512-bit SFmode case, the permutation in the upper 256 bits
40493 must mirror the permutation in the lower 256 bits. */
40494 for (i = 0; i < 8; ++i)
40495 if (ipar[i] + 8 != ipar[i + 8])
40496 return 0;
40497 /* FALLTHRU */
40499 case V8SFmode:
40500 /* In the 256-bit SFmode case, we have full freedom of
40501 movement within the low 128-bit lane, but the high 128-bit
40502 lane must mirror the exact same pattern. */
40503 for (i = 0; i < 4; ++i)
40504 if (ipar[i] + 4 != ipar[i + 4])
40505 return 0;
40506 nelt = 4;
40507 /* FALLTHRU */
40509 case V2DFmode:
40510 case V4SFmode:
40511 /* In the 128-bit case, we've full freedom in the placement of
40512 the elements from the source operand. */
40513 for (i = 0; i < nelt; ++i)
40514 mask |= ipar[i] << (i * (nelt / 2));
40515 break;
40517 default:
40518 gcc_unreachable ();
40521 /* Make sure success has a non-zero value by adding one. */
40522 return mask + 1;
40525 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
40526 the expansion functions to turn the parallel back into a mask.
40527 The return value is 0 for no match and the imm8+1 for a match. */
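/* For example, for V4DFmode the parallel [2 3 4 5] selects the high half
   of the first operand and the low half of the second, i.e. imm8 0x21,
   so this function returns 0x22.  */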
40530 avx_vperm2f128_parallel (rtx par, machine_mode mode)
40532 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
40533 unsigned mask = 0;
40534 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
40536 if (XVECLEN (par, 0) != (int) nelt)
40537 return 0;
40539 /* Validate that all of the elements are constants, and not totally
40540 out of range. Copy the data into an integral array to make the
40541 subsequent checks easier. */
40542 for (i = 0; i < nelt; ++i)
40544 rtx er = XVECEXP (par, 0, i);
40545 unsigned HOST_WIDE_INT ei;
40547 if (!CONST_INT_P (er))
40548 return 0;
40549 ei = INTVAL (er);
40550 if (ei >= 2 * nelt)
40551 return 0;
40552 ipar[i] = ei;
40555 /* Validate that each half of the permute is a run of consecutive elements. */
40556 for (i = 0; i < nelt2 - 1; ++i)
40557 if (ipar[i] + 1 != ipar[i + 1])
40558 return 0;
40559 for (i = nelt2; i < nelt - 1; ++i)
40560 if (ipar[i] + 1 != ipar[i + 1])
40561 return 0;
40563 /* Reconstruct the mask. */
40564 for (i = 0; i < 2; ++i)
40566 unsigned e = ipar[i * nelt2];
40567 if (e % nelt2)
40568 return 0;
40569 e /= nelt2;
40570 mask |= e << (i * 4);
40573 /* Make sure success has a non-zero value by adding one. */
40574 return mask + 1;
40577 /* Return a register priority for hard reg REGNO. */
40578 static int
40579 ix86_register_priority (int hard_regno)
40581 /* ebp and r13 as the base always want a displacement, r12 as the
40582 base always wants an index. So discourage their usage in an
40583 address. */
40584 if (hard_regno == R12_REG || hard_regno == R13_REG)
40585 return 0;
40586 if (hard_regno == BP_REG)
40587 return 1;
40588 /* New x86-64 int registers result in bigger code size. Discourage
40589 them. */
40590 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
40591 return 2;
40592 /* New x86-64 SSE registers result in bigger code size. Discourage
40593 them. */
40594 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
40595 return 2;
40596 /* Usage of AX register results in smaller code. Prefer it. */
40597 if (hard_regno == AX_REG)
40598 return 4;
40599 return 3;
40602 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
40604 Put float CONST_DOUBLE in the constant pool instead of fp regs.
40605 QImode must go into class Q_REGS.
40606 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
40607 movdf to do mem-to-mem moves through integer regs. */
40609 static reg_class_t
40610 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
40612 machine_mode mode = GET_MODE (x);
40614 /* We're only allowed to return a subclass of CLASS. Many of the
40615 following checks fail for NO_REGS, so eliminate that early. */
40616 if (regclass == NO_REGS)
40617 return NO_REGS;
40619 /* All classes can load zeros. */
40620 if (x == CONST0_RTX (mode))
40621 return regclass;
40623 /* Force constants into memory if we are loading a (nonzero) constant into
40624 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
40625 instructions to load from a constant. */
40626 if (CONSTANT_P (x)
40627 && (MAYBE_MMX_CLASS_P (regclass)
40628 || MAYBE_SSE_CLASS_P (regclass)
40629 || MAYBE_MASK_CLASS_P (regclass)))
40630 return NO_REGS;
40632 /* Floating-point constants need more complex checks. */
40633 if (CONST_DOUBLE_P (x))
40635 /* General regs can load everything. */
40636 if (INTEGER_CLASS_P (regclass))
40637 return regclass;
40639 /* Floats can load 0 and 1 plus some others. Note that we eliminated
40640 zero above. We only want to wind up preferring 80387 registers if
40641 we plan on doing computation with them. */
40642 if (IS_STACK_MODE (mode)
40643 && standard_80387_constant_p (x) > 0)
40645 /* Limit class to FP regs. */
40646 if (FLOAT_CLASS_P (regclass))
40647 return FLOAT_REGS;
40648 else if (regclass == FP_TOP_SSE_REGS)
40649 return FP_TOP_REG;
40650 else if (regclass == FP_SECOND_SSE_REGS)
40651 return FP_SECOND_REG;
40654 return NO_REGS;
40657 /* Prefer SSE regs only, if we can use them for math. */
40658 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40659 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
40661 /* Generally when we see PLUS here, it's the function invariant
40662 (plus soft-fp const_int), which can only be computed into general
40663 regs. */
40664 if (GET_CODE (x) == PLUS)
40665 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
40667 /* QImode constants are easy to load, but non-constant QImode data
40668 must go into Q_REGS. */
40669 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
40671 if (Q_CLASS_P (regclass))
40672 return regclass;
40673 else if (reg_class_subset_p (Q_REGS, regclass))
40674 return Q_REGS;
40675 else
40676 return NO_REGS;
40679 return regclass;
40682 /* Discourage putting floating-point values in SSE registers unless
40683 SSE math is being used, and likewise for the 387 registers. */
40684 static reg_class_t
40685 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
40687 machine_mode mode = GET_MODE (x);
40689 /* Restrict the output reload class to the register bank that we are doing
40690 math on. If we would like not to return a subset of CLASS, reject this
40691 alternative: if reload cannot do this, it will still use its choice. */
40692 mode = GET_MODE (x);
40693 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40694 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
40696 if (IS_STACK_MODE (mode))
40698 if (regclass == FP_TOP_SSE_REGS)
40699 return FP_TOP_REG;
40700 else if (regclass == FP_SECOND_SSE_REGS)
40701 return FP_SECOND_REG;
40702 else
40703 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
40706 return regclass;
40709 static reg_class_t
40710 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
40711 machine_mode mode, secondary_reload_info *sri)
40713 /* Double-word spills from general registers to non-offsettable memory
40714 references (zero-extended addresses) require special handling. */
40715 if (TARGET_64BIT
40716 && MEM_P (x)
40717 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
40718 && INTEGER_CLASS_P (rclass)
40719 && !offsettable_memref_p (x))
40721 sri->icode = (in_p
40722 ? CODE_FOR_reload_noff_load
40723 : CODE_FOR_reload_noff_store);
40724 /* Add the cost of moving address to a temporary. */
40725 sri->extra_cost = 1;
40727 return NO_REGS;
40730 /* QImode spills from non-QI registers require an
40731 intermediate register on 32-bit targets. */
40732 if (mode == QImode
40733 && ((!TARGET_64BIT && !in_p
40734 && INTEGER_CLASS_P (rclass)
40735 && MAYBE_NON_Q_CLASS_P (rclass))
40736 || (!TARGET_AVX512DQ
40737 && MAYBE_MASK_CLASS_P (rclass))))
40739 int regno = true_regnum (x);
40741 /* Return Q_REGS if the operand is in memory. */
40742 if (regno == -1)
40743 return Q_REGS;
40745 return NO_REGS;
40748 /* This condition handles corner case where an expression involving
40749 pointers gets vectorized. We're trying to use the address of a
40750 stack slot as a vector initializer.
40752 (set (reg:V2DI 74 [ vect_cst_.2 ])
40753 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
40755 Eventually frame gets turned into sp+offset like this:
40757 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40758 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
40759 (const_int 392 [0x188]))))
40761 That later gets turned into:
40763 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40764 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
40765 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
40767 We'll have the following reload recorded:
40769 Reload 0: reload_in (DI) =
40770 (plus:DI (reg/f:DI 7 sp)
40771 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
40772 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40773 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
40774 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
40775 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40776 reload_reg_rtx: (reg:V2DI 22 xmm1)
40778 Which isn't going to work since SSE instructions can't handle scalar
40779 additions. Returning GENERAL_REGS forces the addition into integer
40780 register and reload can handle subsequent reloads without problems. */
40782 if (in_p && GET_CODE (x) == PLUS
40783 && SSE_CLASS_P (rclass)
40784 && SCALAR_INT_MODE_P (mode))
40785 return GENERAL_REGS;
40787 return NO_REGS;
40790 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
40792 static bool
40793 ix86_class_likely_spilled_p (reg_class_t rclass)
40795 switch (rclass)
40797 case AREG:
40798 case DREG:
40799 case CREG:
40800 case BREG:
40801 case AD_REGS:
40802 case SIREG:
40803 case DIREG:
40804 case SSE_FIRST_REG:
40805 case FP_TOP_REG:
40806 case FP_SECOND_REG:
40807 case BND_REGS:
40808 return true;
40810 default:
40811 break;
40814 return false;
40817 /* If we are copying between registers from different register sets
40818 (e.g. FP and integer), we may need a memory location.
40820 The function can't work reliably when one of the CLASSES is a class
40821 containing registers from multiple sets. We avoid this by never combining
40822 different sets in a single alternative in the machine description.
40823 Ensure that this constraint holds to avoid unexpected surprises.
40825 When STRICT is false, we are being called from REGISTER_MOVE_COST,
40826 so do not enforce these sanity checks.
40828 To optimize register_move_cost performance, define inline variant. */
40830 static inline bool
40831 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
40832 machine_mode mode, int strict)
40834 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
40835 return false;
40837 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
40838 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
40839 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
40840 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
40841 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
40842 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
40843 || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
40844 || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
40846 gcc_assert (!strict || lra_in_progress);
40847 return true;
40850 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
40851 return true;
40853 /* Between mask and general, we have moves no larger than word size. */
40854 if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
40855 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
40856 return true;
40858 /* ??? This is a lie. We do have moves between mmx/general, and for
40859 mmx/sse2. But by saying we need secondary memory we discourage the
40860 register allocator from using the mmx registers unless needed. */
40861 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
40862 return true;
40864 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
40866 /* SSE1 doesn't have any direct moves from other classes. */
40867 if (!TARGET_SSE2)
40868 return true;
40870 /* If the target says that inter-unit moves are more expensive
40871 than moving through memory, then don't generate them. */
40872 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
40873 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
40874 return true;
40876 /* Between SSE and general, we have moves no larger than word size. */
40877 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40878 return true;
40881 return false;
40884 bool
40885 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
40886 machine_mode mode, int strict)
40888 return inline_secondary_memory_needed (class1, class2, mode, strict);
40891 /* Implement the TARGET_CLASS_MAX_NREGS hook.
40893 On the 80386, this is the size of MODE in words,
40894 except in the FP regs, where a single reg is always enough. */
40896 static unsigned char
40897 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
40899 if (MAYBE_INTEGER_CLASS_P (rclass))
40901 if (mode == XFmode)
40902 return (TARGET_64BIT ? 2 : 3);
40903 else if (mode == XCmode)
40904 return (TARGET_64BIT ? 4 : 6);
40905 else
40906 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
40908 else
40910 if (COMPLEX_MODE_P (mode))
40911 return 2;
40912 else
40913 return 1;
40917 /* Return true if the registers in CLASS cannot represent the change from
40918 modes FROM to TO. */
40920 bool
40921 ix86_cannot_change_mode_class (machine_mode from, machine_mode to,
40922 enum reg_class regclass)
40924 if (from == to)
40925 return false;
40927 /* x87 registers can't do subreg at all, as all values are reformatted
40928 to extended precision. */
40929 if (MAYBE_FLOAT_CLASS_P (regclass))
40930 return true;
40932 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
40934 /* Vector registers do not support QI or HImode loads. If we don't
40935 disallow a change to these modes, reload will assume it's ok to
40936 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
40937 the vec_dupv4hi pattern. */
40938 if (GET_MODE_SIZE (from) < 4)
40939 return true;
40942 return false;
40945 /* Return the cost of moving data of mode M between a
40946 register and memory. A value of 2 is the default; this cost is
40947 relative to those in `REGISTER_MOVE_COST'.
40949 This function is used extensively by register_move_cost that is used to
40950 build tables at startup. Make it inline in this case.
40951 When IN is 2, return maximum of in and out move cost.
40953 If moving between registers and memory is more expensive than
40954 between two registers, you should define this macro to express the
40955 relative cost.
40957 Also model the increased moving costs of QImode registers in
40958 non-Q_REGS classes.
40960 static inline int
40961 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
40962 int in)
40964 int cost;
40965 if (FLOAT_CLASS_P (regclass))
40967 int index;
40968 switch (mode)
40970 case SFmode:
40971 index = 0;
40972 break;
40973 case DFmode:
40974 index = 1;
40975 break;
40976 case XFmode:
40977 index = 2;
40978 break;
40979 default:
40980 return 100;
40982 if (in == 2)
40983 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
40984 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
40986 if (SSE_CLASS_P (regclass))
40988 int index;
40989 switch (GET_MODE_SIZE (mode))
40991 case 4:
40992 index = 0;
40993 break;
40994 case 8:
40995 index = 1;
40996 break;
40997 case 16:
40998 index = 2;
40999 break;
41000 default:
41001 return 100;
41003 if (in == 2)
41004 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
41005 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
41007 if (MMX_CLASS_P (regclass))
41009 int index;
41010 switch (GET_MODE_SIZE (mode))
41012 case 4:
41013 index = 0;
41014 break;
41015 case 8:
41016 index = 1;
41017 break;
41018 default:
41019 return 100;
41021 if (in == 2)
41022 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
41023 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
41025 switch (GET_MODE_SIZE (mode))
41027 case 1:
41028 if (Q_CLASS_P (regclass) || TARGET_64BIT)
41030 if (!in)
41031 return ix86_cost->int_store[0];
41032 if (TARGET_PARTIAL_REG_DEPENDENCY
41033 && optimize_function_for_speed_p (cfun))
41034 cost = ix86_cost->movzbl_load;
41035 else
41036 cost = ix86_cost->int_load[0];
41037 if (in == 2)
41038 return MAX (cost, ix86_cost->int_store[0]);
41039 return cost;
41041 else
41043 if (in == 2)
41044 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
41045 if (in)
41046 return ix86_cost->movzbl_load;
41047 else
41048 return ix86_cost->int_store[0] + 4;
41050 break;
41051 case 2:
41052 if (in == 2)
41053 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
41054 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
41055 default:
41056 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
41057 if (mode == TFmode)
41058 mode = XFmode;
41059 if (in == 2)
41060 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
41061 else if (in)
41062 cost = ix86_cost->int_load[2];
41063 else
41064 cost = ix86_cost->int_store[2];
41065 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
41069 static int
41070 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
41071 bool in)
41073 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
41077 /* Return the cost of moving data from a register in class CLASS1 to
41078 one in class CLASS2.
41080 It is not required that the cost always equal 2 when FROM is the same as TO;
41081 on some machines it is expensive to move between registers if they are not
41082 general registers. */
41084 static int
41085 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
41086 reg_class_t class2_i)
41088 enum reg_class class1 = (enum reg_class) class1_i;
41089 enum reg_class class2 = (enum reg_class) class2_i;
41091 /* In case we require secondary memory, compute cost of the store followed
41092 by load. In order to avoid bad register allocation choices, we need
41093 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
41095 if (inline_secondary_memory_needed (class1, class2, mode, 0))
41097 int cost = 1;
41099 cost += inline_memory_move_cost (mode, class1, 2);
41100 cost += inline_memory_move_cost (mode, class2, 2);
41102 /* In case of copying from general_purpose_register we may emit multiple
41103 stores followed by single load causing memory size mismatch stall.
41104 Count this as arbitrarily high cost of 20. */
41105 if (targetm.class_max_nregs (class1, mode)
41106 > targetm.class_max_nregs (class2, mode))
41107 cost += 20;
41109 /* In the case of FP/MMX moves, the registers actually overlap, and we
41110 have to switch modes in order to treat them differently. */
41111 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
41112 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
41113 cost += 20;
41115 return cost;
41118 /* Moves between SSE/MMX and integer unit are expensive. */
41119 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
41120 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
41122 /* ??? By keeping returned value relatively high, we limit the number
41123 of moves between integer and MMX/SSE registers for all targets.
41124 Additionally, high value prevents problem with x86_modes_tieable_p(),
41125 where integer modes in MMX/SSE registers are not tieable
41126 because of missing QImode and HImode moves to, from or between
41127 MMX/SSE registers. */
41128 return MAX (8, ix86_cost->mmxsse_to_integer);
41130 if (MAYBE_FLOAT_CLASS_P (class1))
41131 return ix86_cost->fp_move;
41132 if (MAYBE_SSE_CLASS_P (class1))
41133 return ix86_cost->sse_move;
41134 if (MAYBE_MMX_CLASS_P (class1))
41135 return ix86_cost->mmx_move;
41136 return 2;
41139 /* Return TRUE if hard register REGNO can hold a value of machine-mode
41140 MODE. */
41142 bool
41143 ix86_hard_regno_mode_ok (int regno, machine_mode mode)
41145 /* Flags, and only flags, can hold CCmode values. */
41146 if (CC_REGNO_P (regno))
41147 return GET_MODE_CLASS (mode) == MODE_CC;
41148 if (GET_MODE_CLASS (mode) == MODE_CC
41149 || GET_MODE_CLASS (mode) == MODE_RANDOM
41150 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
41151 return false;
41152 if (STACK_REGNO_P (regno))
41153 return VALID_FP_MODE_P (mode);
41154 if (MASK_REGNO_P (regno))
41155 return (VALID_MASK_REG_MODE (mode)
41156 || (TARGET_AVX512BW
41157 && VALID_MASK_AVX512BW_MODE (mode)));
41158 if (BND_REGNO_P (regno))
41159 return VALID_BND_REG_MODE (mode);
41160 if (SSE_REGNO_P (regno))
41162 /* We implement the move patterns for all vector modes into and
41163 out of SSE registers, even when no operation instructions
41164 are available. */
41166 /* For AVX-512 we allow, regardless of regno:
41167 - XImode
41168 - any 512-bit wide vector mode
41169 - any scalar mode. */
41170 if (TARGET_AVX512F
41171 && (mode == XImode
41172 || VALID_AVX512F_REG_MODE (mode)
41173 || VALID_AVX512F_SCALAR_MODE (mode)))
41174 return true;
41176 /* For AVX-5124FMAPS allow V64SFmode for special regnos. */
41177 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
41178 && MOD4_SSE_REGNO_P (regno)
41179 && mode == V64SFmode)
41180 return true;
41182 /* For AVX-5124VNNIW allow V64SImode for special regnos. */
41183 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
41184 && MOD4_SSE_REGNO_P (regno)
41185 && mode == V64SImode)
41186 return true;
41188 /* TODO check for QI/HI scalars. */
41189 /* AVX512VL allows SSE regs 16+ for 128/256-bit modes. */
41190 if (TARGET_AVX512VL
41191 && (mode == OImode
41192 || mode == TImode
41193 || VALID_AVX256_REG_MODE (mode)
41194 || VALID_AVX512VL_128_REG_MODE (mode)))
41195 return true;
41197 /* xmm16-xmm31 are only available for AVX-512. */
41198 if (EXT_REX_SSE_REGNO_P (regno))
41199 return false;
41201 /* OImode and AVX modes are available only when AVX is enabled. */
41202 return ((TARGET_AVX
41203 && VALID_AVX256_REG_OR_OI_MODE (mode))
41204 || VALID_SSE_REG_MODE (mode)
41205 || VALID_SSE2_REG_MODE (mode)
41206 || VALID_MMX_REG_MODE (mode)
41207 || VALID_MMX_REG_MODE_3DNOW (mode));
41209 if (MMX_REGNO_P (regno))
41211 /* We implement the move patterns for 3DNOW modes even in MMX mode,
41212 so if the register is available at all, then we can move data of
41213 the given mode into or out of it. */
41214 return (VALID_MMX_REG_MODE (mode)
41215 || VALID_MMX_REG_MODE_3DNOW (mode));
41218 if (mode == QImode)
41220 /* Take care with QImode values - they can live in non-QI regs,
41221 but then they do cause partial register stalls. */
41222 if (ANY_QI_REGNO_P (regno))
41223 return true;
41224 if (!TARGET_PARTIAL_REG_STALL)
41225 return true;
41226 /* LRA checks if the hard register is OK for the given mode.
41227 QImode values can live in non-QI regs, so we allow all
41228 registers here. */
41229 if (lra_in_progress)
41230 return true;
41231 return !can_create_pseudo_p ();
41233 /* We handle both integer and floats in the general purpose registers. */
41234 else if (VALID_INT_MODE_P (mode))
41235 return true;
41236 else if (VALID_FP_MODE_P (mode))
41237 return true;
41238 else if (VALID_DFP_MODE_P (mode))
41239 return true;
41240 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
41241 on to use that value in smaller contexts, this can easily force a
41242 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
41243 supporting DImode, allow it. */
41244 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
41245 return true;
41247 return false;
41250 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
41251 tieable integer mode. */
41253 static bool
41254 ix86_tieable_integer_mode_p (machine_mode mode)
41256 switch (mode)
41258 case HImode:
41259 case SImode:
41260 return true;
41262 case QImode:
41263 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
41265 case DImode:
41266 return TARGET_64BIT;
41268 default:
41269 return false;
41273 /* Return true if MODE1 is accessible in a register that can hold MODE2
41274 without copying. That is, all register classes that can hold MODE2
41275 can also hold MODE1. */
41277 bool
41278 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
41280 if (mode1 == mode2)
41281 return true;
41283 if (ix86_tieable_integer_mode_p (mode1)
41284 && ix86_tieable_integer_mode_p (mode2))
41285 return true;
41287 /* MODE2 being XFmode implies fp stack or general regs, which means we
41288 can tie any smaller floating point modes to it. Note that we do not
41289 tie this with TFmode. */
41290 if (mode2 == XFmode)
41291 return mode1 == SFmode || mode1 == DFmode;
41293 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
41294 that we can tie it with SFmode. */
41295 if (mode2 == DFmode)
41296 return mode1 == SFmode;
41298 /* If MODE2 is only appropriate for an SSE register, then tie with
41299 any other mode acceptable to SSE registers. */
41300 if (GET_MODE_SIZE (mode2) == 32
41301 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
41302 return (GET_MODE_SIZE (mode1) == 32
41303 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
41304 if (GET_MODE_SIZE (mode2) == 16
41305 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
41306 return (GET_MODE_SIZE (mode1) == 16
41307 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
41309 /* If MODE2 is appropriate for an MMX register, then tie
41310 with any other mode acceptable to MMX registers. */
41311 if (GET_MODE_SIZE (mode2) == 8
41312 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
41313 return (GET_MODE_SIZE (mode1) == 8
41314 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
41316 return false;
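/* Editorial examples (not in the original source): HImode and SImode tie with
   each other because both are tieable integer modes; XFmode ties with SFmode
   and DFmode but not with TFmode; and a 16-byte mode such as V4SFmode ties
   only with other 16-byte modes that are valid in SSE registers.  */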
41319 /* Return the cost of moving between two registers of mode MODE. */
41321 static int
41322 ix86_set_reg_reg_cost (machine_mode mode)
41324 unsigned int units = UNITS_PER_WORD;
41326 switch (GET_MODE_CLASS (mode))
41328 default:
41329 break;
41331 case MODE_CC:
41332 units = GET_MODE_SIZE (CCmode);
41333 break;
41335 case MODE_FLOAT:
41336 if ((TARGET_SSE && mode == TFmode)
41337 || (TARGET_80387 && mode == XFmode)
41338 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
41339 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
41340 units = GET_MODE_SIZE (mode);
41341 break;
41343 case MODE_COMPLEX_FLOAT:
41344 if ((TARGET_SSE && mode == TCmode)
41345 || (TARGET_80387 && mode == XCmode)
41346 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
41347 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
41348 units = GET_MODE_SIZE (mode);
41349 break;
41351 case MODE_VECTOR_INT:
41352 case MODE_VECTOR_FLOAT:
41353 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
41354 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
41355 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
41356 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
41357 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
41358 units = GET_MODE_SIZE (mode);
41361 /* Return the cost of moving between two registers of mode MODE,
41362 assuming that the move will be in pieces of at most UNITS bytes. */
41363 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
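/* Editorial worked example (not in the original source): for DImode on a
   32-bit target none of the cases above apply, so UNITS stays at
   UNITS_PER_WORD == 4 and the register-to-register set is costed at
   COSTS_N_INSNS (CEIL (8, 4)) == COSTS_N_INSNS (2), i.e. two word moves.  */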
41366 /* Compute a (partial) cost for rtx X. Return true if the complete
41367 cost has been computed, and false if subexpressions should be
41368 scanned. In either case, *TOTAL contains the cost result. */
41370 static bool
41371 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
41372 int *total, bool speed)
41374 rtx mask;
41375 enum rtx_code code = GET_CODE (x);
41376 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
41377 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
41378 int src_cost;
41380 switch (code)
41382 case SET:
41383 if (register_operand (SET_DEST (x), VOIDmode)
41384 && reg_or_0_operand (SET_SRC (x), VOIDmode))
41386 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
41387 return true;
41390 if (register_operand (SET_SRC (x), VOIDmode))
41391 /* Avoid potentially incorrect high cost from rtx_costs
41392 for non-tieable SUBREGs. */
41393 src_cost = 0;
41394 else
41396 src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
41398 if (CONSTANT_P (SET_SRC (x)))
41399 /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
41400 a small value, possibly zero for cheap constants. */
41401 src_cost += COSTS_N_INSNS (1);
41404 *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
41405 return true;
41407 case CONST_INT:
41408 case CONST:
41409 case LABEL_REF:
41410 case SYMBOL_REF:
41411 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
41412 *total = 3;
41413 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
41414 *total = 2;
41415 else if (flag_pic && SYMBOLIC_CONST (x)
41416 && !(TARGET_64BIT
41417 && (GET_CODE (x) == LABEL_REF
41418 || (GET_CODE (x) == SYMBOL_REF
41419 && SYMBOL_REF_LOCAL_P (x))))
41420 /* Use 0 cost for CONST to improve its propagation. */
41421 && (TARGET_64BIT || GET_CODE (x) != CONST))
41422 *total = 1;
41423 else
41424 *total = 0;
41425 return true;
41427 case CONST_DOUBLE:
41428 if (IS_STACK_MODE (mode))
41429 switch (standard_80387_constant_p (x))
41431 case -1:
41432 case 0:
41433 break;
41434 case 1: /* 0.0 */
41435 *total = 1;
41436 return true;
41437 default: /* Other constants */
41438 *total = 2;
41439 return true;
41441 /* FALLTHRU */
41443 case CONST_VECTOR:
41444 switch (standard_sse_constant_p (x, mode))
41446 case 0:
41447 break;
41448 case 1: /* 0: xor eliminates false dependency */
41449 *total = 0;
41450 return true;
41451 default: /* -1: cmp contains false dependency */
41452 *total = 1;
41453 return true;
41455 /* FALLTHRU */
41457 case CONST_WIDE_INT:
41458 /* Fall back to (MEM (SYMBOL_REF)), since that's where
41459 it'll probably end up. Add a penalty for size. */
41460 *total = (COSTS_N_INSNS (1)
41461 + (!TARGET_64BIT && flag_pic)
41462 + (GET_MODE_SIZE (mode) <= 4
41463 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
41464 return true;
41466 case ZERO_EXTEND:
41467 /* Zero extension is often completely free on x86_64, so make
41468 it as cheap as possible. */
41469 if (TARGET_64BIT && mode == DImode
41470 && GET_MODE (XEXP (x, 0)) == SImode)
41471 *total = 1;
41472 else if (TARGET_ZERO_EXTEND_WITH_AND)
41473 *total = cost->add;
41474 else
41475 *total = cost->movzx;
41476 return false;
41478 case SIGN_EXTEND:
41479 *total = cost->movsx;
41480 return false;
41482 case ASHIFT:
41483 if (SCALAR_INT_MODE_P (mode)
41484 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
41485 && CONST_INT_P (XEXP (x, 1)))
41487 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
41488 if (value == 1)
41490 *total = cost->add;
41491 return false;
41493 if ((value == 2 || value == 3)
41494 && cost->lea <= cost->shift_const)
41496 *total = cost->lea;
41497 return false;
41500 /* FALLTHRU */
41502 case ROTATE:
41503 case ASHIFTRT:
41504 case LSHIFTRT:
41505 case ROTATERT:
41506 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41508 /* ??? Should be SSE vector operation cost. */
41509 /* At least for published AMD latencies, this really is the same
41510 as the latency for a simple fpu operation like fabs. */
41511 /* V*QImode is emulated with 1-11 insns. */
41512 if (mode == V16QImode || mode == V32QImode)
41514 int count = 11;
41515 if (TARGET_XOP && mode == V16QImode)
41517 /* For XOP we use vpshab, which requires a broadcast of the
41518 value to the variable shift insn. For constants this
41519 means a V16QImode constant in memory; even when we can perform
41520 the shift with one insn, set the cost to prefer paddb. */
41521 if (CONSTANT_P (XEXP (x, 1)))
41523 *total = (cost->fabs
41524 + rtx_cost (XEXP (x, 0), mode, code, 0, speed)
41525 + (speed ? 2 : COSTS_N_BYTES (16)));
41526 return true;
41528 count = 3;
41530 else if (TARGET_SSSE3)
41531 count = 7;
41532 *total = cost->fabs * count;
41534 else
41535 *total = cost->fabs;
41537 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41539 if (CONST_INT_P (XEXP (x, 1)))
41541 if (INTVAL (XEXP (x, 1)) > 32)
41542 *total = cost->shift_const + COSTS_N_INSNS (2);
41543 else
41544 *total = cost->shift_const * 2;
41546 else
41548 if (GET_CODE (XEXP (x, 1)) == AND)
41549 *total = cost->shift_var * 2;
41550 else
41551 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
41554 else
41556 if (CONST_INT_P (XEXP (x, 1)))
41557 *total = cost->shift_const;
41558 else if (SUBREG_P (XEXP (x, 1))
41559 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
41561 /* Return the cost after shift-and truncation. */
41562 *total = cost->shift_var;
41563 return true;
41565 else
41566 *total = cost->shift_var;
41568 return false;
41570 case FMA:
41572 rtx sub;
41574 gcc_assert (FLOAT_MODE_P (mode));
41575 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
41577 /* ??? SSE scalar/vector cost should be used here. */
41578 /* ??? Bald assumption that fma has the same cost as fmul. */
41579 *total = cost->fmul;
41580 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
41582 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
41583 sub = XEXP (x, 0);
41584 if (GET_CODE (sub) == NEG)
41585 sub = XEXP (sub, 0);
41586 *total += rtx_cost (sub, mode, FMA, 0, speed);
41588 sub = XEXP (x, 2);
41589 if (GET_CODE (sub) == NEG)
41590 sub = XEXP (sub, 0);
41591 *total += rtx_cost (sub, mode, FMA, 2, speed);
41592 return true;
41595 case MULT:
41596 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41598 /* ??? SSE scalar cost should be used here. */
41599 *total = cost->fmul;
41600 return false;
41602 else if (X87_FLOAT_MODE_P (mode))
41604 *total = cost->fmul;
41605 return false;
41607 else if (FLOAT_MODE_P (mode))
41609 /* ??? SSE vector cost should be used here. */
41610 *total = cost->fmul;
41611 return false;
41613 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41615 /* V*QImode is emulated with 7-13 insns. */
41616 if (mode == V16QImode || mode == V32QImode)
41618 int extra = 11;
41619 if (TARGET_XOP && mode == V16QImode)
41620 extra = 5;
41621 else if (TARGET_SSSE3)
41622 extra = 6;
41623 *total = cost->fmul * 2 + cost->fabs * extra;
41625 /* V*DImode is emulated with 5-8 insns. */
41626 else if (mode == V2DImode || mode == V4DImode)
41628 if (TARGET_XOP && mode == V2DImode)
41629 *total = cost->fmul * 2 + cost->fabs * 3;
41630 else
41631 *total = cost->fmul * 3 + cost->fabs * 5;
41633 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
41634 insns, including two PMULUDQ. */
41635 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
41636 *total = cost->fmul * 2 + cost->fabs * 5;
41637 else
41638 *total = cost->fmul;
41639 return false;
41641 else
41643 rtx op0 = XEXP (x, 0);
41644 rtx op1 = XEXP (x, 1);
41645 int nbits;
41646 if (CONST_INT_P (XEXP (x, 1)))
41648 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
41649 for (nbits = 0; value != 0; value &= value - 1)
41650 nbits++;
41652 else
41653 /* This is arbitrary. */
41654 nbits = 7;
41656 /* Compute costs correctly for widening multiplication. */
41657 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
41658 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
41659 == GET_MODE_SIZE (mode))
41661 int is_mulwiden = 0;
41662 machine_mode inner_mode = GET_MODE (op0);
41664 if (GET_CODE (op0) == GET_CODE (op1))
41665 is_mulwiden = 1, op1 = XEXP (op1, 0);
41666 else if (CONST_INT_P (op1))
41668 if (GET_CODE (op0) == SIGN_EXTEND)
41669 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
41670 == INTVAL (op1);
41671 else
41672 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
41675 if (is_mulwiden)
41676 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
41679 *total = (cost->mult_init[MODE_INDEX (mode)]
41680 + nbits * cost->mult_bit
41681 + rtx_cost (op0, mode, outer_code, opno, speed)
41682 + rtx_cost (op1, mode, outer_code, opno, speed));
41684 return true;
41687 case DIV:
41688 case UDIV:
41689 case MOD:
41690 case UMOD:
41691 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41692 /* ??? SSE cost should be used here. */
41693 *total = cost->fdiv;
41694 else if (X87_FLOAT_MODE_P (mode))
41695 *total = cost->fdiv;
41696 else if (FLOAT_MODE_P (mode))
41697 /* ??? SSE vector cost should be used here. */
41698 *total = cost->fdiv;
41699 else
41700 *total = cost->divide[MODE_INDEX (mode)];
41701 return false;
41703 case PLUS:
41704 if (GET_MODE_CLASS (mode) == MODE_INT
41705 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
41707 if (GET_CODE (XEXP (x, 0)) == PLUS
41708 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
41709 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
41710 && CONSTANT_P (XEXP (x, 1)))
41712 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
41713 if (val == 2 || val == 4 || val == 8)
41715 *total = cost->lea;
41716 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
41717 outer_code, opno, speed);
41718 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
41719 outer_code, opno, speed);
41720 *total += rtx_cost (XEXP (x, 1), mode,
41721 outer_code, opno, speed);
41722 return true;
41725 else if (GET_CODE (XEXP (x, 0)) == MULT
41726 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
41728 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
41729 if (val == 2 || val == 4 || val == 8)
41731 *total = cost->lea;
41732 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41733 outer_code, opno, speed);
41734 *total += rtx_cost (XEXP (x, 1), mode,
41735 outer_code, opno, speed);
41736 return true;
41739 else if (GET_CODE (XEXP (x, 0)) == PLUS)
41741 /* Add with carry, ignore the cost of adding a carry flag. */
41742 if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode))
41743 *total = cost->add;
41744 else
41746 *total = cost->lea;
41747 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41748 outer_code, opno, speed);
41751 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
41752 outer_code, opno, speed);
41753 *total += rtx_cost (XEXP (x, 1), mode,
41754 outer_code, opno, speed);
41755 return true;
41758 /* FALLTHRU */
41760 case MINUS:
41761 /* Subtract with borrow, ignore the cost of subtracting a carry flag. */
41762 if (GET_MODE_CLASS (mode) == MODE_INT
41763 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD
41764 && GET_CODE (XEXP (x, 0)) == MINUS
41765 && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode))
41767 *total = cost->add;
41768 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41769 outer_code, opno, speed);
41770 *total += rtx_cost (XEXP (x, 1), mode,
41771 outer_code, opno, speed);
41772 return true;
41775 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41777 /* ??? SSE cost should be used here. */
41778 *total = cost->fadd;
41779 return false;
41781 else if (X87_FLOAT_MODE_P (mode))
41783 *total = cost->fadd;
41784 return false;
41786 else if (FLOAT_MODE_P (mode))
41788 /* ??? SSE vector cost should be used here. */
41789 *total = cost->fadd;
41790 return false;
41792 /* FALLTHRU */
41794 case AND:
41795 case IOR:
41796 case XOR:
41797 if (GET_MODE_CLASS (mode) == MODE_INT
41798 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41800 *total = (cost->add * 2
41801 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
41802 << (GET_MODE (XEXP (x, 0)) != DImode))
41803 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
41804 << (GET_MODE (XEXP (x, 1)) != DImode)));
41805 return true;
41807 /* FALLTHRU */
41809 case NEG:
41810 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41812 /* ??? SSE cost should be used here. */
41813 *total = cost->fchs;
41814 return false;
41816 else if (X87_FLOAT_MODE_P (mode))
41818 *total = cost->fchs;
41819 return false;
41821 else if (FLOAT_MODE_P (mode))
41823 /* ??? SSE vector cost should be used here. */
41824 *total = cost->fchs;
41825 return false;
41827 /* FALLTHRU */
41829 case NOT:
41830 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41832 /* ??? Should be SSE vector operation cost. */
41833 /* At least for published AMD latencies, this really is the same
41834 as the latency for a simple fpu operation like fabs. */
41835 *total = cost->fabs;
41837 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41838 *total = cost->add * 2;
41839 else
41840 *total = cost->add;
41841 return false;
41843 case COMPARE:
41844 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
41845 && XEXP (XEXP (x, 0), 1) == const1_rtx
41846 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
41847 && XEXP (x, 1) == const0_rtx)
41849 /* This kind of construct is implemented using test[bwl].
41850 Treat it as if we had an AND. */
41851 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
41852 *total = (cost->add
41853 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
41854 opno, speed)
41855 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
41856 return true;
41859 /* The embedded comparison operand is completely free. */
41860 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
41861 && XEXP (x, 1) == const0_rtx)
41862 *total = 0;
41864 return false;
41866 case FLOAT_EXTEND:
41867 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
41868 *total = 0;
41869 return false;
41871 case ABS:
41872 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41873 /* ??? SSE cost should be used here. */
41874 *total = cost->fabs;
41875 else if (X87_FLOAT_MODE_P (mode))
41876 *total = cost->fabs;
41877 else if (FLOAT_MODE_P (mode))
41878 /* ??? SSE vector cost should be used here. */
41879 *total = cost->fabs;
41880 return false;
41882 case SQRT:
41883 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41884 /* ??? SSE cost should be used here. */
41885 *total = cost->fsqrt;
41886 else if (X87_FLOAT_MODE_P (mode))
41887 *total = cost->fsqrt;
41888 else if (FLOAT_MODE_P (mode))
41889 /* ??? SSE vector cost should be used here. */
41890 *total = cost->fsqrt;
41891 return false;
41893 case UNSPEC:
41894 if (XINT (x, 1) == UNSPEC_TP)
41895 *total = 0;
41896 return false;
41898 case VEC_SELECT:
41899 case VEC_CONCAT:
41900 case VEC_DUPLICATE:
41901 /* ??? Assume all of these vector manipulation patterns are
41902 recognizable, in which case they all pretty much have the
41903 same cost. */
41904 *total = cost->fabs;
41905 return true;
41906 case VEC_MERGE:
41907 mask = XEXP (x, 2);
41908 /* This is a masked instruction; assume the same cost
41909 as the non-masked variant. */
41910 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
41911 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
41912 else
41913 *total = cost->fabs;
41914 return true;
41916 default:
41917 return false;
41921 #if TARGET_MACHO
41923 static int current_machopic_label_num;
41925 /* Given a symbol name and its associated stub, write out the
41926 definition of the stub. */
41928 void
41929 machopic_output_stub (FILE *file, const char *symb, const char *stub)
41931 unsigned int length;
41932 char *binder_name, *symbol_name, lazy_ptr_name[32];
41933 int label = ++current_machopic_label_num;
41935 /* For 64-bit we shouldn't get here. */
41936 gcc_assert (!TARGET_64BIT);
41938 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
41939 symb = targetm.strip_name_encoding (symb);
41941 length = strlen (stub);
41942 binder_name = XALLOCAVEC (char, length + 32);
41943 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
41945 length = strlen (symb);
41946 symbol_name = XALLOCAVEC (char, length + 32);
41947 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
41949 sprintf (lazy_ptr_name, "L%d$lz", label);
41951 if (MACHOPIC_ATT_STUB)
41952 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
41953 else if (MACHOPIC_PURE)
41954 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
41955 else
41956 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
41958 fprintf (file, "%s:\n", stub);
41959 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
41961 if (MACHOPIC_ATT_STUB)
41963 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
41965 else if (MACHOPIC_PURE)
41967 /* PIC stub. */
41968 /* 25-byte PIC stub using "CALL get_pc_thunk". */
41969 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
41970 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
41971 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
41972 label, lazy_ptr_name, label);
41973 fprintf (file, "\tjmp\t*%%ecx\n");
41975 else
41976 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
41978 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
41979 it needs no stub-binding-helper. */
41980 if (MACHOPIC_ATT_STUB)
41981 return;
41983 fprintf (file, "%s:\n", binder_name);
41985 if (MACHOPIC_PURE)
41987 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
41988 fprintf (file, "\tpushl\t%%ecx\n");
41990 else
41991 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
41993 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
41995 /* N.B. Keep the correspondence of these
41996 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
41997 old-pic/new-pic/non-pic stubs; altering this will break
41998 compatibility with existing dylibs. */
41999 if (MACHOPIC_PURE)
42001 /* 25-byte PIC stub using "CALL get_pc_thunk". */
42002 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
42004 else
42005 /* 16-byte -mdynamic-no-pic stub. */
42006 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
42008 fprintf (file, "%s:\n", lazy_ptr_name);
42009 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
42010 fprintf (file, ASM_LONG "%s\n", binder_name);
42012 #endif /* TARGET_MACHO */
42014 /* Order the registers for register allocator. */
42016 void
42017 x86_order_regs_for_local_alloc (void)
42019 int pos = 0;
42020 int i;
42022 /* First allocate the local general purpose registers. */
42023 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
42024 if (GENERAL_REGNO_P (i) && call_used_regs[i])
42025 reg_alloc_order [pos++] = i;
42027 /* Global general purpose registers. */
42028 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
42029 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
42030 reg_alloc_order [pos++] = i;
42032 /* x87 registers come first in case we are doing FP math
42033 using them. */
42034 if (!TARGET_SSE_MATH)
42035 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
42036 reg_alloc_order [pos++] = i;
42038 /* SSE registers. */
42039 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
42040 reg_alloc_order [pos++] = i;
42041 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
42042 reg_alloc_order [pos++] = i;
42044 /* Extended REX SSE registers. */
42045 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
42046 reg_alloc_order [pos++] = i;
42048 /* Mask registers. */
42049 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
42050 reg_alloc_order [pos++] = i;
42052 /* MPX bound registers. */
42053 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
42054 reg_alloc_order [pos++] = i;
42056 /* x87 registers. */
42057 if (TARGET_SSE_MATH)
42058 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
42059 reg_alloc_order [pos++] = i;
42061 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
42062 reg_alloc_order [pos++] = i;
42064 /* Initialize the rest of the array, as we do not allocate some
42065 registers at all. */
42066 while (pos < FIRST_PSEUDO_REGISTER)
42067 reg_alloc_order [pos++] = 0;
42070 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
42071 in struct attribute_spec handler. */
42072 static tree
42073 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
42074 tree args,
42075 int,
42076 bool *no_add_attrs)
42078 if (TREE_CODE (*node) != FUNCTION_TYPE
42079 && TREE_CODE (*node) != METHOD_TYPE
42080 && TREE_CODE (*node) != FIELD_DECL
42081 && TREE_CODE (*node) != TYPE_DECL)
42083 warning (OPT_Wattributes, "%qE attribute only applies to functions",
42084 name);
42085 *no_add_attrs = true;
42086 return NULL_TREE;
42088 if (TARGET_64BIT)
42090 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
42091 name);
42092 *no_add_attrs = true;
42093 return NULL_TREE;
42095 if (is_attribute_p ("callee_pop_aggregate_return", name))
42097 tree cst;
42099 cst = TREE_VALUE (args);
42100 if (TREE_CODE (cst) != INTEGER_CST)
42102 warning (OPT_Wattributes,
42103 "%qE attribute requires an integer constant argument",
42104 name);
42105 *no_add_attrs = true;
42107 else if (compare_tree_int (cst, 0) != 0
42108 && compare_tree_int (cst, 1) != 0)
42110 warning (OPT_Wattributes,
42111 "argument to %qE attribute is neither zero, nor one",
42112 name);
42113 *no_add_attrs = true;
42116 return NULL_TREE;
42119 return NULL_TREE;
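/* Editorial usage example (not in the original source): on 32-bit targets the
   attribute takes the integer constant 0 or 1, e.g.
     struct big f (void) __attribute__ ((callee_pop_aggregate_return (1)));
   stating whether the callee pops the hidden aggregate-return pointer.  */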
42122 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
42123 struct attribute_spec.handler. */
42124 static tree
42125 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
42126 bool *no_add_attrs)
42128 if (TREE_CODE (*node) != FUNCTION_TYPE
42129 && TREE_CODE (*node) != METHOD_TYPE
42130 && TREE_CODE (*node) != FIELD_DECL
42131 && TREE_CODE (*node) != TYPE_DECL)
42133 warning (OPT_Wattributes, "%qE attribute only applies to functions",
42134 name);
42135 *no_add_attrs = true;
42136 return NULL_TREE;
42139 /* Can combine regparm with all attributes but fastcall. */
42140 if (is_attribute_p ("ms_abi", name))
42142 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
42144 error ("ms_abi and sysv_abi attributes are not compatible");
42147 return NULL_TREE;
42149 else if (is_attribute_p ("sysv_abi", name))
42151 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
42153 error ("ms_abi and sysv_abi attributes are not compatible");
42156 return NULL_TREE;
42159 return NULL_TREE;
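/* Editorial usage example (not in the original source):
     int f (int) __attribute__ ((ms_abi));
     int g (int) __attribute__ ((sysv_abi));
   select the respective calling convention; combining ms_abi and sysv_abi on
   the same declaration is rejected by the checks above.  */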
42162 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
42163 struct attribute_spec.handler. */
42164 static tree
42165 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
42166 bool *no_add_attrs)
42168 tree *type = NULL;
42169 if (DECL_P (*node))
42171 if (TREE_CODE (*node) == TYPE_DECL)
42172 type = &TREE_TYPE (*node);
42174 else
42175 type = node;
42177 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
42179 warning (OPT_Wattributes, "%qE attribute ignored",
42180 name);
42181 *no_add_attrs = true;
42184 else if ((is_attribute_p ("ms_struct", name)
42185 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
42186 || ((is_attribute_p ("gcc_struct", name)
42187 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
42189 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
42190 name);
42191 *no_add_attrs = true;
42194 return NULL_TREE;
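/* Editorial usage example (not in the original source):
     struct __attribute__ ((ms_struct)) S { char c; int i; };
   requests Microsoft-compatible record layout, while gcc_struct requests the
   native GCC layout; mixing the two on one type is rejected above.  */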
42197 static tree
42198 ix86_handle_fndecl_attribute (tree *node, tree name, tree, int,
42199 bool *no_add_attrs)
42201 if (TREE_CODE (*node) != FUNCTION_DECL)
42203 warning (OPT_Wattributes, "%qE attribute only applies to functions",
42204 name);
42205 *no_add_attrs = true;
42207 return NULL_TREE;
42210 static tree
42211 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
42212 int, bool *)
42214 return NULL_TREE;
42217 static tree
42218 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
42220 /* DECL_RESULT and DECL_ARGUMENTS do not exist there yet,
42221 but the function type contains args and return type data. */
42222 tree func_type = *node;
42223 tree return_type = TREE_TYPE (func_type);
42225 int nargs = 0;
42226 tree current_arg_type = TYPE_ARG_TYPES (func_type);
42227 while (current_arg_type
42228 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
42230 if (nargs == 0)
42232 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
42233 error ("interrupt service routine should have a pointer "
42234 "as the first argument");
42236 else if (nargs == 1)
42238 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
42239 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
42240 error ("interrupt service routine should have unsigned %s"
42241 "int as the second argument",
42242 TARGET_64BIT
42243 ? (TARGET_X32 ? "long long " : "long ")
42244 : "");
42246 nargs++;
42247 current_arg_type = TREE_CHAIN (current_arg_type);
42249 if (!nargs || nargs > 2)
42250 error ("interrupt service routine can only have a pointer argument "
42251 "and an optional integer argument");
42252 if (! VOID_TYPE_P (return_type))
42253 error ("interrupt service routine can't have non-void return value");
42255 return NULL_TREE;
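/* Editorial usage example (not in the original source): a handler accepted by
   the checks above takes a pointer first, an optional word-sized unsigned
   integer second, and returns void; e.g., with a user-defined frame type on a
   64-bit target:
     struct intr_frame;
     void __attribute__ ((interrupt))
     handler (struct intr_frame *frame, unsigned long error_code);
   On 32-bit targets the second argument would be unsigned int.  */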
42258 static bool
42259 ix86_ms_bitfield_layout_p (const_tree record_type)
42261 return ((TARGET_MS_BITFIELD_LAYOUT
42262 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
42263 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
42266 /* Returns an expression indicating where the this parameter is
42267 located on entry to the FUNCTION. */
42269 static rtx
42270 x86_this_parameter (tree function)
42272 tree type = TREE_TYPE (function);
42273 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
42274 int nregs;
42276 if (TARGET_64BIT)
42278 const int *parm_regs;
42280 if (ix86_function_type_abi (type) == MS_ABI)
42281 parm_regs = x86_64_ms_abi_int_parameter_registers;
42282 else
42283 parm_regs = x86_64_int_parameter_registers;
42284 return gen_rtx_REG (Pmode, parm_regs[aggr]);
42287 nregs = ix86_function_regparm (type, function);
42289 if (nregs > 0 && !stdarg_p (type))
42291 int regno;
42292 unsigned int ccvt = ix86_get_callcvt (type);
42294 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
42295 regno = aggr ? DX_REG : CX_REG;
42296 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
42298 regno = CX_REG;
42299 if (aggr)
42300 return gen_rtx_MEM (SImode,
42301 plus_constant (Pmode, stack_pointer_rtx, 4));
42303 else
42305 regno = AX_REG;
42306 if (aggr)
42308 regno = DX_REG;
42309 if (nregs == 1)
42310 return gen_rtx_MEM (SImode,
42311 plus_constant (Pmode,
42312 stack_pointer_rtx, 4));
42315 return gen_rtx_REG (SImode, regno);
42318 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
42319 aggr ? 8 : 4));
42322 /* Determine whether x86_output_mi_thunk can succeed. */
42324 static bool
42325 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
42326 const_tree function)
42328 /* 64-bit can handle anything. */
42329 if (TARGET_64BIT)
42330 return true;
42332 /* For 32-bit, everything's fine if we have one free register. */
42333 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
42334 return true;
42336 /* Need a free register for vcall_offset. */
42337 if (vcall_offset)
42338 return false;
42340 /* Need a free register for GOT references. */
42341 if (flag_pic && !targetm.binds_local_p (function))
42342 return false;
42344 /* Otherwise ok. */
42345 return true;
42348 /* Output the assembler code for a thunk function. THUNK_DECL is the
42349 declaration for the thunk function itself, FUNCTION is the decl for
42350 the target function. DELTA is an immediate constant offset to be
42351 added to THIS. If VCALL_OFFSET is nonzero, the word at
42352 *(*this + vcall_offset) should be added to THIS. */
42354 static void
42355 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
42356 HOST_WIDE_INT vcall_offset, tree function)
42358 rtx this_param = x86_this_parameter (function);
42359 rtx this_reg, tmp, fnaddr;
42360 unsigned int tmp_regno;
42361 rtx_insn *insn;
42363 if (TARGET_64BIT)
42364 tmp_regno = R10_REG;
42365 else
42367 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
42368 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
42369 tmp_regno = AX_REG;
42370 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
42371 tmp_regno = DX_REG;
42372 else
42373 tmp_regno = CX_REG;
42376 emit_note (NOTE_INSN_PROLOGUE_END);
42378 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
42379 pull it in now and let DELTA benefit. */
42380 if (REG_P (this_param))
42381 this_reg = this_param;
42382 else if (vcall_offset)
42384 /* Put the this parameter into %eax. */
42385 this_reg = gen_rtx_REG (Pmode, AX_REG);
42386 emit_move_insn (this_reg, this_param);
42388 else
42389 this_reg = NULL_RTX;
42391 /* Adjust the this parameter by a fixed constant. */
42392 if (delta)
42394 rtx delta_rtx = GEN_INT (delta);
42395 rtx delta_dst = this_reg ? this_reg : this_param;
42397 if (TARGET_64BIT)
42399 if (!x86_64_general_operand (delta_rtx, Pmode))
42401 tmp = gen_rtx_REG (Pmode, tmp_regno);
42402 emit_move_insn (tmp, delta_rtx);
42403 delta_rtx = tmp;
42407 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
42410 /* Adjust the this parameter by a value stored in the vtable. */
42411 if (vcall_offset)
42413 rtx vcall_addr, vcall_mem, this_mem;
42415 tmp = gen_rtx_REG (Pmode, tmp_regno);
42417 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
42418 if (Pmode != ptr_mode)
42419 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
42420 emit_move_insn (tmp, this_mem);
42422 /* Adjust the this parameter. */
42423 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
42424 if (TARGET_64BIT
42425 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
42427 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
42428 emit_move_insn (tmp2, GEN_INT (vcall_offset));
42429 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
42432 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
42433 if (Pmode != ptr_mode)
42434 emit_insn (gen_addsi_1_zext (this_reg,
42435 gen_rtx_REG (ptr_mode,
42436 REGNO (this_reg)),
42437 vcall_mem));
42438 else
42439 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
42442 /* If necessary, drop THIS back to its stack slot. */
42443 if (this_reg && this_reg != this_param)
42444 emit_move_insn (this_param, this_reg);
42446 fnaddr = XEXP (DECL_RTL (function), 0);
42447 if (TARGET_64BIT)
42449 if (!flag_pic || targetm.binds_local_p (function)
42450 || TARGET_PECOFF)
42452 else
42454 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
42455 tmp = gen_rtx_CONST (Pmode, tmp);
42456 fnaddr = gen_const_mem (Pmode, tmp);
42459 else
42461 if (!flag_pic || targetm.binds_local_p (function))
42463 #if TARGET_MACHO
42464 else if (TARGET_MACHO)
42466 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
42467 fnaddr = XEXP (fnaddr, 0);
42469 #endif /* TARGET_MACHO */
42470 else
42472 tmp = gen_rtx_REG (Pmode, CX_REG);
42473 output_set_got (tmp, NULL_RTX);
42475 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
42476 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
42477 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
42478 fnaddr = gen_const_mem (Pmode, fnaddr);
42482 /* Our sibling call patterns do not allow memories, because we have no
42483 predicate that can distinguish between frame and non-frame memory.
42484 For our purposes here, we can get away with (ab)using a jump pattern,
42485 because we're going to do no optimization. */
42486 if (MEM_P (fnaddr))
42488 if (sibcall_insn_operand (fnaddr, word_mode))
42490 fnaddr = XEXP (DECL_RTL (function), 0);
42491 tmp = gen_rtx_MEM (QImode, fnaddr);
42492 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
42493 tmp = emit_call_insn (tmp);
42494 SIBLING_CALL_P (tmp) = 1;
42496 else
42497 emit_jump_insn (gen_indirect_jump (fnaddr));
42499 else
42501 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
42503 // CM_LARGE_PIC always uses a pseudo PIC register, which is
42504 // uninitialized. Since FUNCTION is local and calling it
42505 // doesn't go through the PLT, we use scratch register %r11 as
42506 // the PIC register and initialize it here.
42507 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
42508 ix86_init_large_pic_reg (tmp_regno);
42509 fnaddr = legitimize_pic_address (fnaddr,
42510 gen_rtx_REG (Pmode, tmp_regno));
42513 if (!sibcall_insn_operand (fnaddr, word_mode))
42515 tmp = gen_rtx_REG (word_mode, tmp_regno);
42516 if (GET_MODE (fnaddr) != word_mode)
42517 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
42518 emit_move_insn (tmp, fnaddr);
42519 fnaddr = tmp;
42522 tmp = gen_rtx_MEM (QImode, fnaddr);
42523 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
42524 tmp = emit_call_insn (tmp);
42525 SIBLING_CALL_P (tmp) = 1;
42527 emit_barrier ();
42529 /* Emit just enough of rest_of_compilation to get the insns emitted.
42530 Note that use_thunk calls assemble_start_function et al. */
42531 insn = get_insns ();
42532 shorten_branches (insn);
42533 final_start_function (insn, file, 1);
42534 final (insn, file, 1);
42535 final_end_function ();
42538 static void
42539 x86_file_start (void)
42541 default_file_start ();
42542 if (TARGET_16BIT)
42543 fputs ("\t.code16gcc\n", asm_out_file);
42544 #if TARGET_MACHO
42545 darwin_file_start ();
42546 #endif
42547 if (X86_FILE_START_VERSION_DIRECTIVE)
42548 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
42549 if (X86_FILE_START_FLTUSED)
42550 fputs ("\t.global\t__fltused\n", asm_out_file);
42551 if (ix86_asm_dialect == ASM_INTEL)
42552 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
42556 x86_field_alignment (tree type, int computed)
42558 machine_mode mode;
42560 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
42561 return computed;
42562 if (TARGET_IAMCU)
42563 return iamcu_alignment (type, computed);
42564 mode = TYPE_MODE (strip_array_types (type));
42565 if (mode == DFmode || mode == DCmode
42566 || GET_MODE_CLASS (mode) == MODE_INT
42567 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
42568 return MIN (32, computed);
42569 return computed;
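/* Editorial example (not in the original source): without -malign-double on a
   32-bit target, a double (DFmode) structure field has its alignment capped
   at 32 bits by the MIN above, preserving the traditional ix86 struct
   layout.  */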
42572 /* Print call to TARGET to FILE. */
42574 static void
42575 x86_print_call_or_nop (FILE *file, const char *target)
42577 if (flag_nop_mcount)
42578 fprintf (file, "1:\tnopl 0x00(%%eax,%%eax,1)\n"); /* 5 byte nop. */
42579 else
42580 fprintf (file, "1:\tcall\t%s\n", target);
42583 /* Output assembler code to FILE to increment profiler label # LABELNO
42584 for profiling a function entry. */
42585 void
42586 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
42588 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
42589 : MCOUNT_NAME);
42590 if (TARGET_64BIT)
42592 #ifndef NO_PROFILE_COUNTERS
42593 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
42594 #endif
42596 if (!TARGET_PECOFF && flag_pic)
42597 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
42598 else
42599 x86_print_call_or_nop (file, mcount_name);
42601 else if (flag_pic)
42603 #ifndef NO_PROFILE_COUNTERS
42604 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
42605 LPREFIX, labelno);
42606 #endif
42607 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
42609 else
42611 #ifndef NO_PROFILE_COUNTERS
42612 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
42613 LPREFIX, labelno);
42614 #endif
42615 x86_print_call_or_nop (file, mcount_name);
42618 if (flag_record_mcount)
42620 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
42621 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
42622 fprintf (file, "\t.previous\n");
42626 /* We don't have exact information about the insn sizes, but we may assume
42627 quite safely that we are informed about all 1-byte insns and memory
42628 address sizes. This is enough to eliminate unnecessary padding in
42629 99% of cases. */
42631 static int
42632 min_insn_size (rtx_insn *insn)
42634 int l = 0, len;
42636 if (!INSN_P (insn) || !active_insn_p (insn))
42637 return 0;
42639 /* Discard alignments we've emitted, and jump instructions. */
42640 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
42641 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
42642 return 0;
42644 /* Important case - calls are always 5 bytes.
42645 It is common to have many calls in a row. */
42646 if (CALL_P (insn)
42647 && symbolic_reference_mentioned_p (PATTERN (insn))
42648 && !SIBLING_CALL_P (insn))
42649 return 5;
42650 len = get_attr_length (insn);
42651 if (len <= 1)
42652 return 1;
42654 /* For normal instructions we rely on get_attr_length being exact,
42655 with a few exceptions. */
42656 if (!JUMP_P (insn))
42658 enum attr_type type = get_attr_type (insn);
42660 switch (type)
42662 case TYPE_MULTI:
42663 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
42664 || asm_noperands (PATTERN (insn)) >= 0)
42665 return 0;
42666 break;
42667 case TYPE_OTHER:
42668 case TYPE_FCMP:
42669 break;
42670 default:
42671 /* Otherwise trust get_attr_length. */
42672 return len;
42675 l = get_attr_length_address (insn);
42676 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
42677 l = 4;
42679 if (l)
42680 return 1+l;
42681 else
42682 return 2;
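/* Editorial example (not in the original source): a call through a symbolic
   reference is counted as 5 bytes above, an inline asm statement as 0 since
   its size cannot be bounded here, and insns whose length attribute is not
   trusted are estimated from their address length, with a minimum of 2.  */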
42685 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
42687 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
42688 16-byte window. */
42690 static void
42691 ix86_avoid_jump_mispredicts (void)
42693 rtx_insn *insn, *start = get_insns ();
42694 int nbytes = 0, njumps = 0;
42695 bool isjump = false;
42697 /* Look for all minimal intervals of instructions containing 4 jumps.
42698 The intervals are bounded by START and INSN. NBYTES is the total
42699 size of the instructions in the interval, including INSN and not including
42700 START. When NBYTES is smaller than 16 bytes, it is possible
42701 that START and INSN end up in the same 16-byte page.
42703 The smallest offset at which INSN can start within the page is the case
42704 where START ends at offset 0. The offset of INSN is then NBYTES - sizeof (INSN).
42705 We add a p2align to the 16-byte window with max skip 15 - NBYTES + sizeof (INSN).
42707 Don't consider an asm goto as a jump: while it can contain a jump, it doesn't
42708 have to, since control transfer to its label(s) can be performed through other
42709 means; also, we estimate the minimum length of all asm stmts as 0. */
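/* Editorial worked example (not in the original source): if INSN is the
   fourth jump in the current window and the interval holds NBYTES == 12
   bytes with min_insn_size (INSN) == 2, the code below emits a pad of
   15 - 12 + 2 == 5 bytes, pushing INSN out of the 16-byte window shared
   with the three preceding jumps.  */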
42710 for (insn = start; insn; insn = NEXT_INSN (insn))
42712 int min_size;
42714 if (LABEL_P (insn))
42716 int align = label_to_alignment (insn);
42717 int max_skip = label_to_max_skip (insn);
42719 if (max_skip > 15)
42720 max_skip = 15;
42721 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
42722 already in the current 16-byte page, because otherwise
42723 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
42724 bytes to reach a 16-byte boundary. */
42725 if (align <= 0
42726 || (align <= 3 && max_skip != (1 << align) - 1))
42727 max_skip = 0;
42728 if (dump_file)
42729 fprintf (dump_file, "Label %i with max_skip %i\n",
42730 INSN_UID (insn), max_skip);
42731 if (max_skip)
42733 while (nbytes + max_skip >= 16)
42735 start = NEXT_INSN (start);
42736 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
42737 || CALL_P (start))
42738 njumps--, isjump = true;
42739 else
42740 isjump = false;
42741 nbytes -= min_insn_size (start);
42744 continue;
42747 min_size = min_insn_size (insn);
42748 nbytes += min_size;
42749 if (dump_file)
42750 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
42751 INSN_UID (insn), min_size);
42752 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
42753 || CALL_P (insn))
42754 njumps++;
42755 else
42756 continue;
42758 while (njumps > 3)
42760 start = NEXT_INSN (start);
42761 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
42762 || CALL_P (start))
42763 njumps--, isjump = true;
42764 else
42765 isjump = false;
42766 nbytes -= min_insn_size (start);
42768 gcc_assert (njumps >= 0);
42769 if (dump_file)
42770 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
42771 INSN_UID (start), INSN_UID (insn), nbytes);
42773 if (njumps == 3 && isjump && nbytes < 16)
42775 int padsize = 15 - nbytes + min_insn_size (insn);
42777 if (dump_file)
42778 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
42779 INSN_UID (insn), padsize);
42780 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
42784 #endif
42786 /* AMD Athlon works faster
42787 when RET is not the destination of a conditional jump or directly preceded
42788 by another jump instruction. We avoid the penalty by inserting a NOP just
42789 before the RET instruction in such cases. */
42790 static void
42791 ix86_pad_returns (void)
42793 edge e;
42794 edge_iterator ei;
42796 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42798 basic_block bb = e->src;
42799 rtx_insn *ret = BB_END (bb);
42800 rtx_insn *prev;
42801 bool replace = false;
42803 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
42804 || optimize_bb_for_size_p (bb))
42805 continue;
42806 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
42807 if (active_insn_p (prev) || LABEL_P (prev))
42808 break;
42809 if (prev && LABEL_P (prev))
42811 edge e;
42812 edge_iterator ei;
42814 FOR_EACH_EDGE (e, ei, bb->preds)
42815 if (EDGE_FREQUENCY (e) && e->src->index >= 0
42816 && !(e->flags & EDGE_FALLTHRU))
42818 replace = true;
42819 break;
42822 if (!replace)
42824 prev = prev_active_insn (ret);
42825 if (prev
42826 && ((JUMP_P (prev) && any_condjump_p (prev))
42827 || CALL_P (prev)))
42828 replace = true;
42829 /* Empty functions get a branch mispredict even when
42830 the jump destination is not visible to us. */
42831 if (!prev && !optimize_function_for_size_p (cfun))
42832 replace = true;
42834 if (replace)
42836 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
42837 delete_insn (ret);
42842 /* Count the minimum number of instructions in BB. Return 4 if the
42843 number of instructions >= 4. */
42845 static int
42846 ix86_count_insn_bb (basic_block bb)
42848 rtx_insn *insn;
42849 int insn_count = 0;
42851 /* Count number of instructions in this block. Return 4 if the number
42852 of instructions >= 4. */
42853 FOR_BB_INSNS (bb, insn)
42855 /* This only happens in exit blocks. */
42856 if (JUMP_P (insn)
42857 && ANY_RETURN_P (PATTERN (insn)))
42858 break;
42860 if (NONDEBUG_INSN_P (insn)
42861 && GET_CODE (PATTERN (insn)) != USE
42862 && GET_CODE (PATTERN (insn)) != CLOBBER)
42864 insn_count++;
42865 if (insn_count >= 4)
42866 return insn_count;
42870 return insn_count;
42874 /* Count the minimum number of instructions in code path in BB.
42875 Return 4 if the number of instructions >= 4. */
42877 static int
42878 ix86_count_insn (basic_block bb)
42880 edge e;
42881 edge_iterator ei;
42882 int min_prev_count;
42884 /* Only bother counting instructions along paths with no
42885 more than 2 basic blocks between entry and exit. Given
42886 that BB has an edge to exit, determine if a predecessor
42887 of BB has an edge from entry. If so, compute the number
42888 of instructions in the predecessor block. If there
42889 happen to be multiple such blocks, compute the minimum. */
42890 min_prev_count = 4;
42891 FOR_EACH_EDGE (e, ei, bb->preds)
42893 edge prev_e;
42894 edge_iterator prev_ei;
42896 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
42898 min_prev_count = 0;
42899 break;
42901 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
42903 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
42905 int count = ix86_count_insn_bb (e->src);
42906 if (count < min_prev_count)
42907 min_prev_count = count;
42908 break;
42913 if (min_prev_count < 4)
42914 min_prev_count += ix86_count_insn_bb (bb);
42916 return min_prev_count;
42919 /* Pad short function to 4 instructions. */
42921 static void
42922 ix86_pad_short_function (void)
42924 edge e;
42925 edge_iterator ei;
42927 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42929 rtx_insn *ret = BB_END (e->src);
42930 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
42932 int insn_count = ix86_count_insn (e->src);
42934 /* Pad short function. */
42935 if (insn_count < 4)
42937 rtx_insn *insn = ret;
42939 /* Find epilogue. */
42940 while (insn
42941 && (!NOTE_P (insn)
42942 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
42943 insn = PREV_INSN (insn);
42945 if (!insn)
42946 insn = ret;
42948 /* Two NOPs count as one instruction. */
42949 insn_count = 2 * (4 - insn_count);
42950 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
42956 /* Fix up a Windows system unwinder issue. If an EH region falls through into
42957 the epilogue, the Windows system unwinder will apply epilogue logic and
42958 produce incorrect offsets. This can be avoided by adding a nop between
42959 the last insn that can throw and the first insn of the epilogue. */
42961 static void
42962 ix86_seh_fixup_eh_fallthru (void)
42964 edge e;
42965 edge_iterator ei;
42967 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42969 rtx_insn *insn, *next;
42971 /* Find the beginning of the epilogue. */
42972 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
42973 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
42974 break;
42975 if (insn == NULL)
42976 continue;
42978 /* We only care about preceding insns that can throw. */
42979 insn = prev_active_insn (insn);
42980 if (insn == NULL || !can_throw_internal (insn))
42981 continue;
42983 /* Do not separate calls from their debug information. */
42984 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
42985 if (NOTE_P (next)
42986 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
42987 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
42988 insn = next;
42989 else
42990 break;
42992 emit_insn_after (gen_nops (const1_rtx), insn);
42996 /* Given a register number BASE, the lowest of a group of registers, update
42997 regsets IN and OUT with the registers that should be avoided in input
42998 and output operands respectively when trying to avoid generating a modr/m
42999 byte for -fmitigate-rop. */
43001 static void
43002 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
43004 SET_HARD_REG_BIT (out, base);
43005 SET_HARD_REG_BIT (out, base + 1);
43006 SET_HARD_REG_BIT (in, base + 2);
43007 SET_HARD_REG_BIT (in, base + 3);
43010 /* Called if -fmitigate-rop is in effect. Try to rewrite instructions so
43011 that certain encodings of modr/m bytes do not occur. */
43012 static void
43013 ix86_mitigate_rop (void)
43015 HARD_REG_SET input_risky;
43016 HARD_REG_SET output_risky;
43017 HARD_REG_SET inout_risky;
43019 CLEAR_HARD_REG_SET (output_risky);
43020 CLEAR_HARD_REG_SET (input_risky);
43021 SET_HARD_REG_BIT (output_risky, AX_REG);
43022 SET_HARD_REG_BIT (output_risky, CX_REG);
43023 SET_HARD_REG_BIT (input_risky, BX_REG);
43024 SET_HARD_REG_BIT (input_risky, DX_REG);
43025 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
43026 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
43027 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
43028 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
43029 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
43030 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
43031 COPY_HARD_REG_SET (inout_risky, input_risky);
43032 IOR_HARD_REG_SET (inout_risky, output_risky);
43034 df_note_add_problem ();
43035 /* Fix up what stack-regs did. */
43036 df_insn_rescan_all ();
43037 df_analyze ();
43039 regrename_init (true);
43040 regrename_analyze (NULL);
43042 auto_vec<du_head_p> cands;
43044 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
43046 if (!NONDEBUG_INSN_P (insn))
43047 continue;
43049 if (GET_CODE (PATTERN (insn)) == USE
43050 || GET_CODE (PATTERN (insn)) == CLOBBER)
43051 continue;
43053 extract_insn (insn);
43055 int opno0, opno1;
43056 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
43057 recog_data.n_operands, &opno0,
43058 &opno1);
43060 if (!ix86_rop_should_change_byte_p (modrm))
43061 continue;
43063 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
43065 /* This happens when regrename has to fail a block. */
43066 if (!info->op_info)
43067 continue;
43069 if (info->op_info[opno0].n_chains != 0)
43071 gcc_assert (info->op_info[opno0].n_chains == 1);
43072 du_head_p op0c;
43073 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
43074 if (op0c->target_data_1 + op0c->target_data_2 == 0
43075 && !op0c->cannot_rename)
43076 cands.safe_push (op0c);
43078 op0c->target_data_1++;
43080 if (info->op_info[opno1].n_chains != 0)
43082 gcc_assert (info->op_info[opno1].n_chains == 1);
43083 du_head_p op1c;
43084 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
43085 if (op1c->target_data_1 + op1c->target_data_2 == 0
43086 && !op1c->cannot_rename)
43087 cands.safe_push (op1c);
43089 op1c->target_data_2++;
43093 int i;
43094 du_head_p head;
43095 FOR_EACH_VEC_ELT (cands, i, head)
43097 int old_reg, best_reg;
43098 HARD_REG_SET unavailable;
43100 CLEAR_HARD_REG_SET (unavailable);
43101 if (head->target_data_1)
43102 IOR_HARD_REG_SET (unavailable, output_risky);
43103 if (head->target_data_2)
43104 IOR_HARD_REG_SET (unavailable, input_risky);
43106 int n_uses;
43107 reg_class superclass = regrename_find_superclass (head, &n_uses,
43108 &unavailable);
43109 old_reg = head->regno;
43110 best_reg = find_rename_reg (head, superclass, &unavailable,
43111 old_reg, false);
43112 bool ok = regrename_do_replace (head, best_reg);
43113 gcc_assert (ok);
43114 if (dump_file)
43115 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
43116 reg_names[best_reg], reg_class_names[superclass]);
43120 regrename_finish ();
43122 df_analyze ();
43124 basic_block bb;
43125 regset_head live;
43127 INIT_REG_SET (&live);
43129 FOR_EACH_BB_FN (bb, cfun)
43131 rtx_insn *insn;
43133 COPY_REG_SET (&live, DF_LR_OUT (bb));
43134 df_simulate_initialize_backwards (bb, &live);
43136 FOR_BB_INSNS_REVERSE (bb, insn)
43138 if (!NONDEBUG_INSN_P (insn))
43139 continue;
43141 df_simulate_one_insn_backwards (bb, insn, &live);
43143 if (GET_CODE (PATTERN (insn)) == USE
43144 || GET_CODE (PATTERN (insn)) == CLOBBER)
43145 continue;
43147 extract_insn (insn);
43148 constrain_operands_cached (insn, reload_completed);
43149 int opno0, opno1;
43150 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
43151 recog_data.n_operands, &opno0,
43152 &opno1);
43153 if (modrm < 0
43154 || !ix86_rop_should_change_byte_p (modrm)
43155 || opno0 == opno1)
43156 continue;
43158 rtx oldreg = recog_data.operand[opno1];
43159 preprocess_constraints (insn);
43160 const operand_alternative *alt = which_op_alt ();
43162 int i;
43163 for (i = 0; i < recog_data.n_operands; i++)
43164 if (i != opno1
43165 && alt[i].earlyclobber
43166 && reg_overlap_mentioned_p (recog_data.operand[i],
43167 oldreg))
43168 break;
43170 if (i < recog_data.n_operands)
43171 continue;
43173 if (dump_file)
43174 fprintf (dump_file,
43175 "attempting to fix modrm byte in insn %d:"
43176 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
43177 reg_class_names[alt[opno1].cl]);
43179 HARD_REG_SET unavailable;
43180 REG_SET_TO_HARD_REG_SET (unavailable, &live);
43181 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
43182 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
43183 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
43184 IOR_HARD_REG_SET (unavailable, output_risky);
43185 IOR_COMPL_HARD_REG_SET (unavailable,
43186 reg_class_contents[alt[opno1].cl]);
43188 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
43189 if (!TEST_HARD_REG_BIT (unavailable, i))
43190 break;
43191 if (i == FIRST_PSEUDO_REGISTER)
43193 if (dump_file)
43194 fprintf (dump_file, ", none available\n");
43195 continue;
43197 if (dump_file)
43198 fprintf (dump_file, " -> %d\n", i);
43199 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
43200 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
43201 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
43206 /* Implement machine specific optimizations. We implement padding of returns
43207 for K8 CPUs and a pass to avoid four jumps in a single 16-byte window. */
43208 static void
43209 ix86_reorg (void)
43211 /* We are freeing block_for_insn in the toplev to keep compatibility
43212 with old MDEP_REORGS that are not CFG based. Recompute it now. */
43213 compute_bb_for_insn ();
43215 if (flag_mitigate_rop)
43216 ix86_mitigate_rop ();
43218 if (TARGET_SEH && current_function_has_exception_handlers ())
43219 ix86_seh_fixup_eh_fallthru ();
43221 if (optimize && optimize_function_for_speed_p (cfun))
43223 if (TARGET_PAD_SHORT_FUNCTION)
43224 ix86_pad_short_function ();
43225 else if (TARGET_PAD_RETURNS)
43226 ix86_pad_returns ();
43227 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
43228 if (TARGET_FOUR_JUMP_LIMIT)
43229 ix86_avoid_jump_mispredicts ();
43230 #endif
43234 /* Return nonzero when a QImode register that must be represented via a REX
43235 prefix is used. */
43236 bool
43237 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
43239 int i;
43240 extract_insn_cached (insn);
43241 for (i = 0; i < recog_data.n_operands; i++)
43242 if (GENERAL_REG_P (recog_data.operand[i])
43243 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
43244 return true;
43245 return false;
43248 /* Return true when INSN mentions a register that must be encoded using a REX
43249 prefix. */
43250 bool
43251 x86_extended_reg_mentioned_p (rtx insn)
43253 subrtx_iterator::array_type array;
43254 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
43256 const_rtx x = *iter;
43257 if (REG_P (x)
43258 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
43259 return true;
43261 return false;
43264 /* If profitable, negate (without causing overflow) integer constant
43265 of mode MODE at location LOC. Return true in this case. */
43266 bool
43267 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
43269 HOST_WIDE_INT val;
43271 if (!CONST_INT_P (*loc))
43272 return false;
43274 switch (mode)
43276 case DImode:
43277 /* DImode x86_64 constants must fit in 32 bits. */
43278 gcc_assert (x86_64_immediate_operand (*loc, mode));
43280 mode = SImode;
43281 break;
43283 case SImode:
43284 case HImode:
43285 case QImode:
43286 break;
43288 default:
43289 gcc_unreachable ();
43292 /* Avoid overflows. */
43293 if (mode_signbit_p (mode, *loc))
43294 return false;
43296 val = INTVAL (*loc);
43298 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
43299 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
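/* Illustrative example: "addl $128, %eax" needs a 32-bit immediate, while the
   negated form "subl $-128, %eax" fits in a sign-extended 8-bit immediate;
   conversely -128 is kept as an add, since negating it to 128 would grow the
   encoding.  */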
43300 if ((val < 0 && val != -128)
43301 || val == 128)
43303 *loc = GEN_INT (-val);
43304 return true;
43307 return false;
43310 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
43311 optabs would emit if we didn't have TFmode patterns. */
43313 void
43314 x86_emit_floatuns (rtx operands[2])
43316 rtx_code_label *neglab, *donelab;
43317 rtx i0, i1, f0, in, out;
43318 machine_mode mode, inmode;
43320 inmode = GET_MODE (operands[1]);
43321 gcc_assert (inmode == SImode || inmode == DImode);
43323 out = operands[0];
43324 in = force_reg (inmode, operands[1]);
43325 mode = GET_MODE (out);
43326 neglab = gen_label_rtx ();
43327 donelab = gen_label_rtx ();
43328 f0 = gen_reg_rtx (mode);
43330 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
43332 expand_float (out, in, 0);
43334 emit_jump_insn (gen_jump (donelab));
43335 emit_barrier ();
43337 emit_label (neglab);
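/* The value has the sign bit set, so it cannot be converted directly as a
   signed number.  Halve it with a sticky low bit, i0 = (in >> 1) | (in & 1),
   convert that, and double the result; the sticky bit keeps the rounding
   correct.  */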
43339 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
43340 1, OPTAB_DIRECT);
43341 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
43342 1, OPTAB_DIRECT);
43343 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
43345 expand_float (f0, i0, 0);
43347 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
43349 emit_label (donelab);
43352 static bool canonicalize_perm (struct expand_vec_perm_d *d);
43353 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
43354 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
43355 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
43357 /* Get a vector mode of the same size as the original but with elements
43358 twice as wide. This is only guaranteed to apply to integral vectors. */
43360 static inline machine_mode
43361 get_mode_wider_vector (machine_mode o)
43363 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
43364 machine_mode n = GET_MODE_WIDER_MODE (o);
43365 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
43366 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
43367 return n;
43370 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
43371 fill target with val via vec_duplicate. */
43373 static bool
43374 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
43376 bool ok;
43377 rtx_insn *insn;
43378 rtx dup;
43380 /* First attempt to recognize VAL as-is. */
43381 dup = gen_rtx_VEC_DUPLICATE (mode, val);
43382 insn = emit_insn (gen_rtx_SET (target, dup));
43383 if (recog_memoized (insn) < 0)
43385 rtx_insn *seq;
43386 machine_mode innermode = GET_MODE_INNER (mode);
43387 rtx reg;
43389 /* If that fails, force VAL into a register. */
43391 start_sequence ();
43392 reg = force_reg (innermode, val);
43393 if (GET_MODE (reg) != innermode)
43394 reg = gen_lowpart (innermode, reg);
43395 XEXP (dup, 0) = reg;
43396 seq = get_insns ();
43397 end_sequence ();
43398 if (seq)
43399 emit_insn_before (seq, insn);
43401 ok = recog_memoized (insn) >= 0;
43402 gcc_assert (ok);
43404 return true;
43407 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
43408 with all elements equal to VAR. Return true if successful. */
43410 static bool
43411 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
43412 rtx target, rtx val)
43414 bool ok;
43416 switch (mode)
43418 case V2SImode:
43419 case V2SFmode:
43420 if (!mmx_ok)
43421 return false;
43422 /* FALLTHRU */
43424 case V4DFmode:
43425 case V4DImode:
43426 case V8SFmode:
43427 case V8SImode:
43428 case V2DFmode:
43429 case V2DImode:
43430 case V4SFmode:
43431 case V4SImode:
43432 case V16SImode:
43433 case V8DImode:
43434 case V16SFmode:
43435 case V8DFmode:
43436 return ix86_vector_duplicate_value (mode, target, val);
43438 case V4HImode:
43439 if (!mmx_ok)
43440 return false;
43441 if (TARGET_SSE || TARGET_3DNOW_A)
43443 rtx x;
43445 val = gen_lowpart (SImode, val);
43446 x = gen_rtx_TRUNCATE (HImode, val);
43447 x = gen_rtx_VEC_DUPLICATE (mode, x);
43448 emit_insn (gen_rtx_SET (target, x));
43449 return true;
43451 goto widen;
43453 case V8QImode:
43454 if (!mmx_ok)
43455 return false;
43456 goto widen;
43458 case V8HImode:
43459 if (TARGET_AVX2)
43460 return ix86_vector_duplicate_value (mode, target, val);
43462 if (TARGET_SSE2)
43464 struct expand_vec_perm_d dperm;
43465 rtx tmp1, tmp2;
43467 permute:
43468 memset (&dperm, 0, sizeof (dperm));
43469 dperm.target = target;
43470 dperm.vmode = mode;
43471 dperm.nelt = GET_MODE_NUNITS (mode);
43472 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
43473 dperm.one_operand_p = true;
43475 /* Extend to SImode using a paradoxical SUBREG. */
43476 tmp1 = gen_reg_rtx (SImode);
43477 emit_move_insn (tmp1, gen_lowpart (SImode, val));
43479 /* Insert the SImode value as low element of a V4SImode vector. */
43480 tmp2 = gen_reg_rtx (V4SImode);
43481 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
43482 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
43484 ok = (expand_vec_perm_1 (&dperm)
43485 || expand_vec_perm_broadcast_1 (&dperm));
43486 gcc_assert (ok);
43487 return ok;
43489 goto widen;
43491 case V16QImode:
43492 if (TARGET_AVX2)
43493 return ix86_vector_duplicate_value (mode, target, val);
43495 if (TARGET_SSE2)
43496 goto permute;
43497 goto widen;
43499 widen:
43500 /* Replicate the value once into the next wider mode and recurse. */
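/* For example, a V8QImode broadcast of byte B first forms the HImode value
   (B << 8) | B and then broadcasts that value as V4HImode, which is bitwise
   the same vector.  */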
43502 machine_mode smode, wsmode, wvmode;
43503 rtx x;
43505 smode = GET_MODE_INNER (mode);
43506 wvmode = get_mode_wider_vector (mode);
43507 wsmode = GET_MODE_INNER (wvmode);
43509 val = convert_modes (wsmode, smode, val, true);
43510 x = expand_simple_binop (wsmode, ASHIFT, val,
43511 GEN_INT (GET_MODE_BITSIZE (smode)),
43512 NULL_RTX, 1, OPTAB_LIB_WIDEN);
43513 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
43515 x = gen_reg_rtx (wvmode);
43516 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
43517 gcc_assert (ok);
43518 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
43519 return ok;
43522 case V16HImode:
43523 case V32QImode:
43524 if (TARGET_AVX2)
43525 return ix86_vector_duplicate_value (mode, target, val);
43526 else
43528 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
43529 rtx x = gen_reg_rtx (hvmode);
43531 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
43532 gcc_assert (ok);
43534 x = gen_rtx_VEC_CONCAT (mode, x, x);
43535 emit_insn (gen_rtx_SET (target, x));
43537 return true;
43539 case V64QImode:
43540 case V32HImode:
43541 if (TARGET_AVX512BW)
43542 return ix86_vector_duplicate_value (mode, target, val);
43543 else
43545 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
43546 rtx x = gen_reg_rtx (hvmode);
43548 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
43549 gcc_assert (ok);
43551 x = gen_rtx_VEC_CONCAT (mode, x, x);
43552 emit_insn (gen_rtx_SET (target, x));
43554 return true;
43556 default:
43557 return false;
43561 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
43562 whose ONE_VAR element is VAR, and other elements are zero. Return true
43563 if successful. */
43565 static bool
43566 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
43567 rtx target, rtx var, int one_var)
43569 machine_mode vsimode;
43570 rtx new_target;
43571 rtx x, tmp;
43572 bool use_vector_set = false;
43574 switch (mode)
43576 case V2DImode:
43577 /* For SSE4.1, we normally use vector set. But if the second
43578 element is zero and inter-unit moves are OK, we use movq
43579 instead. */
43580 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
43581 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
43582 && one_var == 0));
43583 break;
43584 case V16QImode:
43585 case V4SImode:
43586 case V4SFmode:
43587 use_vector_set = TARGET_SSE4_1;
43588 break;
43589 case V8HImode:
43590 use_vector_set = TARGET_SSE2;
43591 break;
43592 case V4HImode:
43593 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
43594 break;
43595 case V32QImode:
43596 case V16HImode:
43597 case V8SImode:
43598 case V8SFmode:
43599 case V4DFmode:
43600 use_vector_set = TARGET_AVX;
43601 break;
43602 case V4DImode:
43603 /* Use ix86_expand_vector_set in 64bit mode only. */
43604 use_vector_set = TARGET_AVX && TARGET_64BIT;
43605 break;
43606 default:
43607 break;
43610 if (use_vector_set)
43612 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
43613 var = force_reg (GET_MODE_INNER (mode), var);
43614 ix86_expand_vector_set (mmx_ok, target, var, one_var);
43615 return true;
43618 switch (mode)
43620 case V2SFmode:
43621 case V2SImode:
43622 if (!mmx_ok)
43623 return false;
43624 /* FALLTHRU */
43626 case V2DFmode:
43627 case V2DImode:
43628 if (one_var != 0)
43629 return false;
43630 var = force_reg (GET_MODE_INNER (mode), var);
43631 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
43632 emit_insn (gen_rtx_SET (target, x));
43633 return true;
43635 case V4SFmode:
43636 case V4SImode:
43637 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
43638 new_target = gen_reg_rtx (mode);
43639 else
43640 new_target = target;
43641 var = force_reg (GET_MODE_INNER (mode), var);
43642 x = gen_rtx_VEC_DUPLICATE (mode, var);
43643 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
43644 emit_insn (gen_rtx_SET (new_target, x));
43645 if (one_var != 0)
43647 /* We need to shuffle the value to the correct position, so
43648 create a new pseudo to store the intermediate result. */
43650 /* With SSE2, we can use the integer shuffle insns. */
43651 if (mode != V4SFmode && TARGET_SSE2)
43653 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
43654 const1_rtx,
43655 GEN_INT (one_var == 1 ? 0 : 1),
43656 GEN_INT (one_var == 2 ? 0 : 1),
43657 GEN_INT (one_var == 3 ? 0 : 1)));
43658 if (target != new_target)
43659 emit_move_insn (target, new_target);
43660 return true;
43663 /* Otherwise convert the intermediate result to V4SFmode and
43664 use the SSE1 shuffle instructions. */
43665 if (mode != V4SFmode)
43667 tmp = gen_reg_rtx (V4SFmode);
43668 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
43670 else
43671 tmp = new_target;
43673 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
43674 const1_rtx,
43675 GEN_INT (one_var == 1 ? 0 : 1),
43676 GEN_INT (one_var == 2 ? 0+4 : 1+4),
43677 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
43679 if (mode != V4SFmode)
43680 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
43681 else if (tmp != target)
43682 emit_move_insn (target, tmp);
43684 else if (target != new_target)
43685 emit_move_insn (target, new_target);
43686 return true;
43688 case V8HImode:
43689 case V16QImode:
43690 vsimode = V4SImode;
43691 goto widen;
43692 case V4HImode:
43693 case V8QImode:
43694 if (!mmx_ok)
43695 return false;
43696 vsimode = V2SImode;
43697 goto widen;
43698 widen:
43699 if (one_var != 0)
43700 return false;
43702 /* Zero extend the variable element to SImode and recurse. */
43703 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
43705 x = gen_reg_rtx (vsimode);
43706 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
43707 var, one_var))
43708 gcc_unreachable ();
43710 emit_move_insn (target, gen_lowpart (mode, x));
43711 return true;
43713 default:
43714 return false;
43718 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
43719 consisting of the values in VALS. It is known that all elements
43720 except ONE_VAR are constants. Return true if successful. */
43722 static bool
43723 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
43724 rtx target, rtx vals, int one_var)
43726 rtx var = XVECEXP (vals, 0, one_var);
43727 machine_mode wmode;
43728 rtx const_vec, x;
43730 const_vec = copy_rtx (vals);
43731 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
43732 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
43734 switch (mode)
43736 case V2DFmode:
43737 case V2DImode:
43738 case V2SFmode:
43739 case V2SImode:
43740 /* For the two element vectors, it's just as easy to use
43741 the general case. */
43742 return false;
43744 case V4DImode:
43745 /* Use ix86_expand_vector_set in 64bit mode only. */
43746 if (!TARGET_64BIT)
43747 return false;
43748 /* FALLTHRU */
43749 case V4DFmode:
43750 case V8SFmode:
43751 case V8SImode:
43752 case V16HImode:
43753 case V32QImode:
43754 case V4SFmode:
43755 case V4SImode:
43756 case V8HImode:
43757 case V4HImode:
43758 break;
43760 case V16QImode:
43761 if (TARGET_SSE4_1)
43762 break;
43763 wmode = V8HImode;
43764 goto widen;
43765 case V8QImode:
43766 wmode = V4HImode;
43767 goto widen;
43768 widen:
43769 /* There's no way to set one QImode entry easily. Combine
43770 the variable value with its adjacent constant value, and
43771 promote to an HImode set. */
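/* For example, with a variable byte V at odd index 5 of a V16QImode vector,
   V is combined with the constant at index 4 into the HImode value
   (V << 8) | C and stored as element 2 of the V8HImode view.  */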
43772 x = XVECEXP (vals, 0, one_var ^ 1);
43773 if (one_var & 1)
43775 var = convert_modes (HImode, QImode, var, true);
43776 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
43777 NULL_RTX, 1, OPTAB_LIB_WIDEN);
43778 x = GEN_INT (INTVAL (x) & 0xff);
43780 else
43782 var = convert_modes (HImode, QImode, var, true);
43783 x = gen_int_mode (INTVAL (x) << 8, HImode);
43785 if (x != const0_rtx)
43786 var = expand_simple_binop (HImode, IOR, var, x, var,
43787 1, OPTAB_LIB_WIDEN);
43789 x = gen_reg_rtx (wmode);
43790 emit_move_insn (x, gen_lowpart (wmode, const_vec));
43791 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
43793 emit_move_insn (target, gen_lowpart (mode, x));
43794 return true;
43796 default:
43797 return false;
43800 emit_move_insn (target, const_vec);
43801 ix86_expand_vector_set (mmx_ok, target, var, one_var);
43802 return true;
43805 /* A subroutine of ix86_expand_vector_init_general. Use vector
43806 concatenate to handle the most general case: all values variable,
43807 and none identical. */
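/* For example, a V8SFmode build from eight scalars combines the scalars
   pairwise into four V2SFmode vectors, those into two V4SFmode vectors, and
   finally concatenates the two halves into the V8SFmode target.  */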
43809 static void
43810 ix86_expand_vector_init_concat (machine_mode mode,
43811 rtx target, rtx *ops, int n)
43813 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
43814 rtx first[16], second[8], third[4];
43815 rtvec v;
43816 int i, j;
43818 switch (n)
43820 case 2:
43821 switch (mode)
43823 case V16SImode:
43824 cmode = V8SImode;
43825 break;
43826 case V16SFmode:
43827 cmode = V8SFmode;
43828 break;
43829 case V8DImode:
43830 cmode = V4DImode;
43831 break;
43832 case V8DFmode:
43833 cmode = V4DFmode;
43834 break;
43835 case V8SImode:
43836 cmode = V4SImode;
43837 break;
43838 case V8SFmode:
43839 cmode = V4SFmode;
43840 break;
43841 case V4DImode:
43842 cmode = V2DImode;
43843 break;
43844 case V4DFmode:
43845 cmode = V2DFmode;
43846 break;
43847 case V4SImode:
43848 cmode = V2SImode;
43849 break;
43850 case V4SFmode:
43851 cmode = V2SFmode;
43852 break;
43853 case V2DImode:
43854 cmode = DImode;
43855 break;
43856 case V2SImode:
43857 cmode = SImode;
43858 break;
43859 case V2DFmode:
43860 cmode = DFmode;
43861 break;
43862 case V2SFmode:
43863 cmode = SFmode;
43864 break;
43865 default:
43866 gcc_unreachable ();
43869 if (!register_operand (ops[1], cmode))
43870 ops[1] = force_reg (cmode, ops[1]);
43871 if (!register_operand (ops[0], cmode))
43872 ops[0] = force_reg (cmode, ops[0]);
43873 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
43874 ops[1])));
43875 break;
43877 case 4:
43878 switch (mode)
43880 case V4DImode:
43881 cmode = V2DImode;
43882 break;
43883 case V4DFmode:
43884 cmode = V2DFmode;
43885 break;
43886 case V4SImode:
43887 cmode = V2SImode;
43888 break;
43889 case V4SFmode:
43890 cmode = V2SFmode;
43891 break;
43892 default:
43893 gcc_unreachable ();
43895 goto half;
43897 case 8:
43898 switch (mode)
43900 case V8DImode:
43901 cmode = V2DImode;
43902 hmode = V4DImode;
43903 break;
43904 case V8DFmode:
43905 cmode = V2DFmode;
43906 hmode = V4DFmode;
43907 break;
43908 case V8SImode:
43909 cmode = V2SImode;
43910 hmode = V4SImode;
43911 break;
43912 case V8SFmode:
43913 cmode = V2SFmode;
43914 hmode = V4SFmode;
43915 break;
43916 default:
43917 gcc_unreachable ();
43919 goto half;
43921 case 16:
43922 switch (mode)
43924 case V16SImode:
43925 cmode = V2SImode;
43926 hmode = V4SImode;
43927 gmode = V8SImode;
43928 break;
43929 case V16SFmode:
43930 cmode = V2SFmode;
43931 hmode = V4SFmode;
43932 gmode = V8SFmode;
43933 break;
43934 default:
43935 gcc_unreachable ();
43937 goto half;
43939 half:
43940 /* FIXME: We process inputs backward to help RA. PR 36222. */
43941 i = n - 1;
43942 j = (n >> 1) - 1;
43943 for (; i > 0; i -= 2, j--)
43945 first[j] = gen_reg_rtx (cmode);
43946 v = gen_rtvec (2, ops[i - 1], ops[i]);
43947 ix86_expand_vector_init (false, first[j],
43948 gen_rtx_PARALLEL (cmode, v));
43951 n >>= 1;
43952 if (n > 4)
43954 gcc_assert (hmode != VOIDmode);
43955 gcc_assert (gmode != VOIDmode);
43956 for (i = j = 0; i < n; i += 2, j++)
43958 second[j] = gen_reg_rtx (hmode);
43959 ix86_expand_vector_init_concat (hmode, second [j],
43960 &first [i], 2);
43962 n >>= 1;
43963 for (i = j = 0; i < n; i += 2, j++)
43965 third[j] = gen_reg_rtx (gmode);
43966 ix86_expand_vector_init_concat (gmode, third[j],
43967 &second[i], 2);
43969 n >>= 1;
43970 ix86_expand_vector_init_concat (mode, target, third, n);
43972 else if (n > 2)
43974 gcc_assert (hmode != VOIDmode);
43975 for (i = j = 0; i < n; i += 2, j++)
43977 second[j] = gen_reg_rtx (hmode);
43978 ix86_expand_vector_init_concat (hmode, second [j],
43979 &first [i], 2);
43981 n >>= 1;
43982 ix86_expand_vector_init_concat (mode, target, second, n);
43984 else
43985 ix86_expand_vector_init_concat (mode, target, first, n);
43986 break;
43988 default:
43989 gcc_unreachable ();
43993 /* A subroutine of ix86_expand_vector_init_general. Use vector
43994 interleave to handle the most general case: all values variable,
43995 and none identical. */
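/* Each pair of adjacent scalars is placed in the two low elements of a
   fresh vector; successive interleave-low operations (punpckl*) then merge
   those vectors until the whole result vector is assembled.  */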
43997 static void
43998 ix86_expand_vector_init_interleave (machine_mode mode,
43999 rtx target, rtx *ops, int n)
44001 machine_mode first_imode, second_imode, third_imode, inner_mode;
44002 int i, j;
44003 rtx op0, op1;
44004 rtx (*gen_load_even) (rtx, rtx, rtx);
44005 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
44006 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
44008 switch (mode)
44010 case V8HImode:
44011 gen_load_even = gen_vec_setv8hi;
44012 gen_interleave_first_low = gen_vec_interleave_lowv4si;
44013 gen_interleave_second_low = gen_vec_interleave_lowv2di;
44014 inner_mode = HImode;
44015 first_imode = V4SImode;
44016 second_imode = V2DImode;
44017 third_imode = VOIDmode;
44018 break;
44019 case V16QImode:
44020 gen_load_even = gen_vec_setv16qi;
44021 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
44022 gen_interleave_second_low = gen_vec_interleave_lowv4si;
44023 inner_mode = QImode;
44024 first_imode = V8HImode;
44025 second_imode = V4SImode;
44026 third_imode = V2DImode;
44027 break;
44028 default:
44029 gcc_unreachable ();
44032 for (i = 0; i < n; i++)
44034 /* Extend the odd element to SImode using a paradoxical SUBREG. */
44035 op0 = gen_reg_rtx (SImode);
44036 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
44038 /* Insert the SImode value as low element of V4SImode vector. */
44039 op1 = gen_reg_rtx (V4SImode);
44040 op0 = gen_rtx_VEC_MERGE (V4SImode,
44041 gen_rtx_VEC_DUPLICATE (V4SImode,
44042 op0),
44043 CONST0_RTX (V4SImode),
44044 const1_rtx);
44045 emit_insn (gen_rtx_SET (op1, op0));
44047 /* Cast the V4SImode vector back to a vector in the original mode. */
44048 op0 = gen_reg_rtx (mode);
44049 emit_move_insn (op0, gen_lowpart (mode, op1));
44051 /* Load even elements into the second position. */
44052 emit_insn (gen_load_even (op0,
44053 force_reg (inner_mode,
44054 ops [i + i + 1]),
44055 const1_rtx));
44057 /* Cast vector to FIRST_IMODE vector. */
44058 ops[i] = gen_reg_rtx (first_imode);
44059 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
44062 /* Interleave low FIRST_IMODE vectors. */
44063 for (i = j = 0; i < n; i += 2, j++)
44065 op0 = gen_reg_rtx (first_imode);
44066 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
44068 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
44069 ops[j] = gen_reg_rtx (second_imode);
44070 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
44073 /* Interleave low SECOND_IMODE vectors. */
44074 switch (second_imode)
44076 case V4SImode:
44077 for (i = j = 0; i < n / 2; i += 2, j++)
44079 op0 = gen_reg_rtx (second_imode);
44080 emit_insn (gen_interleave_second_low (op0, ops[i],
44081 ops[i + 1]));
44083 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
44084 vector. */
44085 ops[j] = gen_reg_rtx (third_imode);
44086 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
44088 second_imode = V2DImode;
44089 gen_interleave_second_low = gen_vec_interleave_lowv2di;
44090 /* FALLTHRU */
44092 case V2DImode:
44093 op0 = gen_reg_rtx (second_imode);
44094 emit_insn (gen_interleave_second_low (op0, ops[0],
44095 ops[1]));
44097 /* Cast the SECOND_IMODE vector back to a vector in the original
44098 mode. */
44099 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
44100 break;
44102 default:
44103 gcc_unreachable ();
44107 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
44108 all values variable, and none identical. */
44110 static void
44111 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
44112 rtx target, rtx vals)
44114 rtx ops[64], op0, op1, op2, op3, op4, op5;
44115 machine_mode half_mode = VOIDmode;
44116 machine_mode quarter_mode = VOIDmode;
44117 int n, i;
44119 switch (mode)
44121 case V2SFmode:
44122 case V2SImode:
44123 if (!mmx_ok && !TARGET_SSE)
44124 break;
44125 /* FALLTHRU */
44127 case V16SImode:
44128 case V16SFmode:
44129 case V8DFmode:
44130 case V8DImode:
44131 case V8SFmode:
44132 case V8SImode:
44133 case V4DFmode:
44134 case V4DImode:
44135 case V4SFmode:
44136 case V4SImode:
44137 case V2DFmode:
44138 case V2DImode:
44139 n = GET_MODE_NUNITS (mode);
44140 for (i = 0; i < n; i++)
44141 ops[i] = XVECEXP (vals, 0, i);
44142 ix86_expand_vector_init_concat (mode, target, ops, n);
44143 return;
44145 case V2TImode:
44146 for (i = 0; i < 2; i++)
44147 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
44148 op0 = gen_reg_rtx (V4DImode);
44149 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
44150 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
44151 return;
44153 case V4TImode:
44154 for (i = 0; i < 4; i++)
44155 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
44156 ops[4] = gen_reg_rtx (V4DImode);
44157 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
44158 ops[5] = gen_reg_rtx (V4DImode);
44159 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
44160 op0 = gen_reg_rtx (V8DImode);
44161 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
44162 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
44163 return;
44165 case V32QImode:
44166 half_mode = V16QImode;
44167 goto half;
44169 case V16HImode:
44170 half_mode = V8HImode;
44171 goto half;
44173 half:
44174 n = GET_MODE_NUNITS (mode);
44175 for (i = 0; i < n; i++)
44176 ops[i] = XVECEXP (vals, 0, i);
44177 op0 = gen_reg_rtx (half_mode);
44178 op1 = gen_reg_rtx (half_mode);
44179 ix86_expand_vector_init_interleave (half_mode, op0, ops,
44180 n >> 2);
44181 ix86_expand_vector_init_interleave (half_mode, op1,
44182 &ops [n >> 1], n >> 2);
44183 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
44184 return;
44186 case V64QImode:
44187 quarter_mode = V16QImode;
44188 half_mode = V32QImode;
44189 goto quarter;
44191 case V32HImode:
44192 quarter_mode = V8HImode;
44193 half_mode = V16HImode;
44194 goto quarter;
44196 quarter:
44197 n = GET_MODE_NUNITS (mode);
44198 for (i = 0; i < n; i++)
44199 ops[i] = XVECEXP (vals, 0, i);
44200 op0 = gen_reg_rtx (quarter_mode);
44201 op1 = gen_reg_rtx (quarter_mode);
44202 op2 = gen_reg_rtx (quarter_mode);
44203 op3 = gen_reg_rtx (quarter_mode);
44204 op4 = gen_reg_rtx (half_mode);
44205 op5 = gen_reg_rtx (half_mode);
44206 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
44207 n >> 3);
44208 ix86_expand_vector_init_interleave (quarter_mode, op1,
44209 &ops [n >> 2], n >> 3);
44210 ix86_expand_vector_init_interleave (quarter_mode, op2,
44211 &ops [n >> 1], n >> 3);
44212 ix86_expand_vector_init_interleave (quarter_mode, op3,
44213 &ops [(n >> 1) | (n >> 2)], n >> 3);
44214 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
44215 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
44216 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
44217 return;
44219 case V16QImode:
44220 if (!TARGET_SSE4_1)
44221 break;
44222 /* FALLTHRU */
44224 case V8HImode:
44225 if (!TARGET_SSE2)
44226 break;
44228 /* Don't use ix86_expand_vector_init_interleave if we can't
44229 move from GPR to SSE register directly. */
44230 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
44231 break;
44233 n = GET_MODE_NUNITS (mode);
44234 for (i = 0; i < n; i++)
44235 ops[i] = XVECEXP (vals, 0, i);
44236 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
44237 return;
44239 case V4HImode:
44240 case V8QImode:
44241 break;
44243 default:
44244 gcc_unreachable ();
44248 int i, j, n_elts, n_words, n_elt_per_word;
44249 machine_mode inner_mode;
44250 rtx words[4], shift;
44252 inner_mode = GET_MODE_INNER (mode);
44253 n_elts = GET_MODE_NUNITS (mode);
44254 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
44255 n_elt_per_word = n_elts / n_words;
44256 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
44258 for (i = 0; i < n_words; ++i)
44260 rtx word = NULL_RTX;
44262 for (j = 0; j < n_elt_per_word; ++j)
44264 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
44265 elt = convert_modes (word_mode, inner_mode, elt, true);
44267 if (j == 0)
44268 word = elt;
44269 else
44271 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
44272 word, 1, OPTAB_LIB_WIDEN);
44273 word = expand_simple_binop (word_mode, IOR, word, elt,
44274 word, 1, OPTAB_LIB_WIDEN);
44278 words[i] = word;
44281 if (n_words == 1)
44282 emit_move_insn (target, gen_lowpart (mode, words[0]));
44283 else if (n_words == 2)
44285 rtx tmp = gen_reg_rtx (mode);
44286 emit_clobber (tmp);
44287 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
44288 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
44289 emit_move_insn (target, tmp);
44291 else if (n_words == 4)
44293 rtx tmp = gen_reg_rtx (V4SImode);
44294 gcc_assert (word_mode == SImode);
44295 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
44296 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
44297 emit_move_insn (target, gen_lowpart (mode, tmp));
44299 else
44300 gcc_unreachable ();
44304 /* Initialize vector TARGET via VALS. Suppress the use of MMX
44305 instructions unless MMX_OK is true. */
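/* Strategy: all-constant initializers are loaded from the constant pool; if
   every element is identical the value is broadcast; if only one element is
   variable it is patched into a constant vector; the fully general case
   falls back to concatenation / interleaving.  */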
44307 void
44308 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
44310 machine_mode mode = GET_MODE (target);
44311 machine_mode inner_mode = GET_MODE_INNER (mode);
44312 int n_elts = GET_MODE_NUNITS (mode);
44313 int n_var = 0, one_var = -1;
44314 bool all_same = true, all_const_zero = true;
44315 int i;
44316 rtx x;
44318 /* First, handle initialization from vector elements (rather than scalars). */
44319 if (n_elts != XVECLEN (vals, 0))
44321 rtx subtarget = target;
44322 x = XVECEXP (vals, 0, 0);
44323 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
44324 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
44326 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
44327 if (inner_mode == QImode || inner_mode == HImode)
44329 mode = mode_for_vector (SImode,
44330 n_elts * GET_MODE_SIZE (inner_mode) / 4);
44331 inner_mode
44332 = mode_for_vector (SImode,
44333 n_elts * GET_MODE_SIZE (inner_mode) / 8);
44334 ops[0] = gen_lowpart (inner_mode, ops[0]);
44335 ops[1] = gen_lowpart (inner_mode, ops[1]);
44336 subtarget = gen_reg_rtx (mode);
44338 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
44339 if (subtarget != target)
44340 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
44341 return;
44343 gcc_unreachable ();
44346 for (i = 0; i < n_elts; ++i)
44348 x = XVECEXP (vals, 0, i);
44349 if (!(CONST_SCALAR_INT_P (x)
44350 || CONST_DOUBLE_P (x)
44351 || CONST_FIXED_P (x)))
44352 n_var++, one_var = i;
44353 else if (x != CONST0_RTX (inner_mode))
44354 all_const_zero = false;
44355 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
44356 all_same = false;
44359 /* Constants are best loaded from the constant pool. */
44360 if (n_var == 0)
44362 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
44363 return;
44366 /* If all values are identical, broadcast the value. */
44367 if (all_same
44368 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
44369 XVECEXP (vals, 0, 0)))
44370 return;
44372 /* Values where only one field is non-constant are best loaded from
44373 the pool and overwritten via move later. */
44374 if (n_var == 1)
44376 if (all_const_zero
44377 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
44378 XVECEXP (vals, 0, one_var),
44379 one_var))
44380 return;
44382 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
44383 return;
44386 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
44389 void
44390 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
44392 machine_mode mode = GET_MODE (target);
44393 machine_mode inner_mode = GET_MODE_INNER (mode);
44394 machine_mode half_mode;
44395 bool use_vec_merge = false;
44396 rtx tmp;
44397 static rtx (*gen_extract[6][2]) (rtx, rtx)
44399 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
44400 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
44401 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
44402 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
44403 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
44404 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
44406 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
44408 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
44409 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
44410 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
44411 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
44412 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
44413 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
44415 int i, j, n;
44416 machine_mode mmode = VOIDmode;
44417 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
44419 switch (mode)
44421 case V2SFmode:
44422 case V2SImode:
44423 if (mmx_ok)
44425 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
44426 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
44427 if (elt == 0)
44428 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
44429 else
44430 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
44431 emit_insn (gen_rtx_SET (target, tmp));
44432 return;
44434 break;
44436 case V2DImode:
44437 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
44438 if (use_vec_merge)
44439 break;
44441 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
44442 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
44443 if (elt == 0)
44444 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
44445 else
44446 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
44447 emit_insn (gen_rtx_SET (target, tmp));
44448 return;
44450 case V2DFmode:
44452 rtx op0, op1;
44454 /* For the two element vectors, we implement a VEC_CONCAT with
44455 the extraction of the other element. */
44457 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
44458 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
44460 if (elt == 0)
44461 op0 = val, op1 = tmp;
44462 else
44463 op0 = tmp, op1 = val;
44465 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
44466 emit_insn (gen_rtx_SET (target, tmp));
44468 return;
44470 case V4SFmode:
44471 use_vec_merge = TARGET_SSE4_1;
44472 if (use_vec_merge)
44473 break;
44475 switch (elt)
44477 case 0:
44478 use_vec_merge = true;
44479 break;
44481 case 1:
44482 /* tmp = target = A B C D */
44483 tmp = copy_to_reg (target);
44484 /* target = A A B B */
44485 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
44486 /* target = X A B B */
44487 ix86_expand_vector_set (false, target, val, 0);
44488 /* target = A X C D */
44489 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
44490 const1_rtx, const0_rtx,
44491 GEN_INT (2+4), GEN_INT (3+4)));
44492 return;
44494 case 2:
44495 /* tmp = target = A B C D */
44496 tmp = copy_to_reg (target);
44497 /* tmp = X B C D */
44498 ix86_expand_vector_set (false, tmp, val, 0);
44499 /* target = A B X D */
44500 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
44501 const0_rtx, const1_rtx,
44502 GEN_INT (0+4), GEN_INT (3+4)));
44503 return;
44505 case 3:
44506 /* tmp = target = A B C D */
44507 tmp = copy_to_reg (target);
44508 /* tmp = X B C D */
44509 ix86_expand_vector_set (false, tmp, val, 0);
44510 /* target = A B C X */
44511 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
44512 const0_rtx, const1_rtx,
44513 GEN_INT (2+4), GEN_INT (0+4)));
44514 return;
44516 default:
44517 gcc_unreachable ();
44519 break;
44521 case V4SImode:
44522 use_vec_merge = TARGET_SSE4_1;
44523 if (use_vec_merge)
44524 break;
44526 /* Element 0 handled by vec_merge below. */
44527 if (elt == 0)
44529 use_vec_merge = true;
44530 break;
44533 if (TARGET_SSE2)
44535 /* With SSE2, use integer shuffles to swap element 0 and ELT,
44536 store into element 0, then shuffle them back. */
44538 rtx order[4];
44540 order[0] = GEN_INT (elt);
44541 order[1] = const1_rtx;
44542 order[2] = const2_rtx;
44543 order[3] = GEN_INT (3);
44544 order[elt] = const0_rtx;
44546 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
44547 order[1], order[2], order[3]));
44549 ix86_expand_vector_set (false, target, val, 0);
44551 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
44552 order[1], order[2], order[3]));
44554 else
44556 /* For SSE1, we have to reuse the V4SF code. */
44557 rtx t = gen_reg_rtx (V4SFmode);
44558 emit_move_insn (t, gen_lowpart (V4SFmode, target));
44559 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
44560 emit_move_insn (target, gen_lowpart (mode, t));
44562 return;
44564 case V8HImode:
44565 use_vec_merge = TARGET_SSE2;
44566 break;
44567 case V4HImode:
44568 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
44569 break;
44571 case V16QImode:
44572 use_vec_merge = TARGET_SSE4_1;
44573 break;
44575 case V8QImode:
44576 break;
44578 case V32QImode:
44579 half_mode = V16QImode;
44580 j = 0;
44581 n = 16;
44582 goto half;
44584 case V16HImode:
44585 half_mode = V8HImode;
44586 j = 1;
44587 n = 8;
44588 goto half;
44590 case V8SImode:
44591 half_mode = V4SImode;
44592 j = 2;
44593 n = 4;
44594 goto half;
44596 case V4DImode:
44597 half_mode = V2DImode;
44598 j = 3;
44599 n = 2;
44600 goto half;
44602 case V8SFmode:
44603 half_mode = V4SFmode;
44604 j = 4;
44605 n = 4;
44606 goto half;
44608 case V4DFmode:
44609 half_mode = V2DFmode;
44610 j = 5;
44611 n = 2;
44612 goto half;
44614 half:
44615 /* Compute offset. */
44616 i = elt / n;
44617 elt %= n;
44619 gcc_assert (i <= 1);
44621 /* Extract the half. */
44622 tmp = gen_reg_rtx (half_mode);
44623 emit_insn (gen_extract[j][i] (tmp, target));
44625 /* Put val in tmp at elt. */
44626 ix86_expand_vector_set (false, tmp, val, elt);
44628 /* Put it back. */
44629 emit_insn (gen_insert[j][i] (target, target, tmp));
44630 return;
44632 case V8DFmode:
44633 if (TARGET_AVX512F)
44635 mmode = QImode;
44636 gen_blendm = gen_avx512f_blendmv8df;
44638 break;
44640 case V8DImode:
44641 if (TARGET_AVX512F)
44643 mmode = QImode;
44644 gen_blendm = gen_avx512f_blendmv8di;
44646 break;
44648 case V16SFmode:
44649 if (TARGET_AVX512F)
44651 mmode = HImode;
44652 gen_blendm = gen_avx512f_blendmv16sf;
44654 break;
44656 case V16SImode:
44657 if (TARGET_AVX512F)
44659 mmode = HImode;
44660 gen_blendm = gen_avx512f_blendmv16si;
44662 break;
44664 case V32HImode:
44665 if (TARGET_AVX512F && TARGET_AVX512BW)
44667 mmode = SImode;
44668 gen_blendm = gen_avx512bw_blendmv32hi;
44670 break;
44672 case V64QImode:
44673 if (TARGET_AVX512F && TARGET_AVX512BW)
44675 mmode = DImode;
44676 gen_blendm = gen_avx512bw_blendmv64qi;
44678 break;
44680 default:
44681 break;
44684 if (mmode != VOIDmode)
44686 tmp = gen_reg_rtx (mode);
44687 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
44688 /* The avx512*_blendm<mode> expanders have different operand order
44689 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
44690 elements where the mask is set and the second input operand otherwise;
44691 in {sse,avx}*_*blend* the first input operand is used for elements
44692 where the mask is clear and second input operand otherwise. */
44693 emit_insn (gen_blendm (target, target, tmp,
44694 force_reg (mmode,
44695 gen_int_mode (1 << elt, mmode))));
44697 else if (use_vec_merge)
44699 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
44700 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
44701 emit_insn (gen_rtx_SET (target, tmp));
44703 else
44705 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
44707 emit_move_insn (mem, target);
44709 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
44710 emit_move_insn (tmp, val);
44712 emit_move_insn (target, mem);
44716 void
44717 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
44719 machine_mode mode = GET_MODE (vec);
44720 machine_mode inner_mode = GET_MODE_INNER (mode);
44721 bool use_vec_extr = false;
44722 rtx tmp;
44724 switch (mode)
44726 case V2SImode:
44727 case V2SFmode:
44728 if (!mmx_ok)
44729 break;
44730 /* FALLTHRU */
44732 case V2DFmode:
44733 case V2DImode:
44734 case V2TImode:
44735 case V4TImode:
44736 use_vec_extr = true;
44737 break;
44739 case V4SFmode:
44740 use_vec_extr = TARGET_SSE4_1;
44741 if (use_vec_extr)
44742 break;
44744 switch (elt)
44746 case 0:
44747 tmp = vec;
44748 break;
44750 case 1:
44751 case 3:
44752 tmp = gen_reg_rtx (mode);
44753 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
44754 GEN_INT (elt), GEN_INT (elt),
44755 GEN_INT (elt+4), GEN_INT (elt+4)));
44756 break;
44758 case 2:
44759 tmp = gen_reg_rtx (mode);
44760 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
44761 break;
44763 default:
44764 gcc_unreachable ();
44766 vec = tmp;
44767 use_vec_extr = true;
44768 elt = 0;
44769 break;
44771 case V4SImode:
44772 use_vec_extr = TARGET_SSE4_1;
44773 if (use_vec_extr)
44774 break;
44776 if (TARGET_SSE2)
44778 switch (elt)
44780 case 0:
44781 tmp = vec;
44782 break;
44784 case 1:
44785 case 3:
44786 tmp = gen_reg_rtx (mode);
44787 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
44788 GEN_INT (elt), GEN_INT (elt),
44789 GEN_INT (elt), GEN_INT (elt)));
44790 break;
44792 case 2:
44793 tmp = gen_reg_rtx (mode);
44794 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
44795 break;
44797 default:
44798 gcc_unreachable ();
44800 vec = tmp;
44801 use_vec_extr = true;
44802 elt = 0;
44804 else
44806 /* For SSE1, we have to reuse the V4SF code. */
44807 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
44808 gen_lowpart (V4SFmode, vec), elt);
44809 return;
44811 break;
44813 case V8HImode:
44814 use_vec_extr = TARGET_SSE2;
44815 break;
44816 case V4HImode:
44817 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
44818 break;
44820 case V16QImode:
44821 use_vec_extr = TARGET_SSE4_1;
44822 break;
44824 case V8SFmode:
44825 if (TARGET_AVX)
44827 tmp = gen_reg_rtx (V4SFmode);
44828 if (elt < 4)
44829 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
44830 else
44831 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
44832 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44833 return;
44835 break;
44837 case V4DFmode:
44838 if (TARGET_AVX)
44840 tmp = gen_reg_rtx (V2DFmode);
44841 if (elt < 2)
44842 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
44843 else
44844 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
44845 ix86_expand_vector_extract (false, target, tmp, elt & 1);
44846 return;
44848 break;
44850 case V32QImode:
44851 if (TARGET_AVX)
44853 tmp = gen_reg_rtx (V16QImode);
44854 if (elt < 16)
44855 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
44856 else
44857 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
44858 ix86_expand_vector_extract (false, target, tmp, elt & 15);
44859 return;
44861 break;
44863 case V16HImode:
44864 if (TARGET_AVX)
44866 tmp = gen_reg_rtx (V8HImode);
44867 if (elt < 8)
44868 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
44869 else
44870 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
44871 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44872 return;
44874 break;
44876 case V8SImode:
44877 if (TARGET_AVX)
44879 tmp = gen_reg_rtx (V4SImode);
44880 if (elt < 4)
44881 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
44882 else
44883 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
44884 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44885 return;
44887 break;
44889 case V4DImode:
44890 if (TARGET_AVX)
44892 tmp = gen_reg_rtx (V2DImode);
44893 if (elt < 2)
44894 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
44895 else
44896 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
44897 ix86_expand_vector_extract (false, target, tmp, elt & 1);
44898 return;
44900 break;
44902 case V32HImode:
44903 if (TARGET_AVX512BW)
44905 tmp = gen_reg_rtx (V16HImode);
44906 if (elt < 16)
44907 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
44908 else
44909 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
44910 ix86_expand_vector_extract (false, target, tmp, elt & 15);
44911 return;
44913 break;
44915 case V64QImode:
44916 if (TARGET_AVX512BW)
44918 tmp = gen_reg_rtx (V32QImode);
44919 if (elt < 32)
44920 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
44921 else
44922 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
44923 ix86_expand_vector_extract (false, target, tmp, elt & 31);
44924 return;
44926 break;
44928 case V16SFmode:
44929 tmp = gen_reg_rtx (V8SFmode);
44930 if (elt < 8)
44931 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
44932 else
44933 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
44934 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44935 return;
44937 case V8DFmode:
44938 tmp = gen_reg_rtx (V4DFmode);
44939 if (elt < 4)
44940 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
44941 else
44942 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
44943 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44944 return;
44946 case V16SImode:
44947 tmp = gen_reg_rtx (V8SImode);
44948 if (elt < 8)
44949 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
44950 else
44951 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
44952 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44953 return;
44955 case V8DImode:
44956 tmp = gen_reg_rtx (V4DImode);
44957 if (elt < 4)
44958 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
44959 else
44960 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
44961 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44962 return;
44964 case V8QImode:
44965 /* ??? Could extract the appropriate HImode element and shift. */
44966 default:
44967 break;
44970 if (use_vec_extr)
44972 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
44973 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
44975 /* Let the rtl optimizers know about the zero extension performed. */
44976 if (inner_mode == QImode || inner_mode == HImode)
44978 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
44979 target = gen_lowpart (SImode, target);
44982 emit_insn (gen_rtx_SET (target, tmp));
44984 else
44986 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
44988 emit_move_insn (mem, vec);
44990 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
44991 emit_move_insn (target, tmp);
44995 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
44996 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
44997 The upper bits of DEST are undefined, though they shouldn't cause
44998 exceptions (some bits from src or all zeros are ok). */
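/* For example, for a 128-bit integer vector and I == 128, this is a 64-bit
   logical right shift of the whole vector (done in V1TImode), so the upper
   half lands in the lower half.  */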
45000 static void
45001 emit_reduc_half (rtx dest, rtx src, int i)
45003 rtx tem, d = dest;
45004 switch (GET_MODE (src))
45006 case V4SFmode:
45007 if (i == 128)
45008 tem = gen_sse_movhlps (dest, src, src);
45009 else
45010 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
45011 GEN_INT (1 + 4), GEN_INT (1 + 4));
45012 break;
45013 case V2DFmode:
45014 tem = gen_vec_interleave_highv2df (dest, src, src);
45015 break;
45016 case V16QImode:
45017 case V8HImode:
45018 case V4SImode:
45019 case V2DImode:
45020 d = gen_reg_rtx (V1TImode);
45021 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
45022 GEN_INT (i / 2));
45023 break;
45024 case V8SFmode:
45025 if (i == 256)
45026 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
45027 else
45028 tem = gen_avx_shufps256 (dest, src, src,
45029 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
45030 break;
45031 case V4DFmode:
45032 if (i == 256)
45033 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
45034 else
45035 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
45036 break;
45037 case V32QImode:
45038 case V16HImode:
45039 case V8SImode:
45040 case V4DImode:
45041 if (i == 256)
45043 if (GET_MODE (dest) != V4DImode)
45044 d = gen_reg_rtx (V4DImode);
45045 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
45046 gen_lowpart (V4DImode, src),
45047 const1_rtx);
45049 else
45051 d = gen_reg_rtx (V2TImode);
45052 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
45053 GEN_INT (i / 2));
45055 break;
45056 case V64QImode:
45057 case V32HImode:
45058 case V16SImode:
45059 case V16SFmode:
45060 case V8DImode:
45061 case V8DFmode:
45062 if (i > 128)
45063 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
45064 gen_lowpart (V16SImode, src),
45065 gen_lowpart (V16SImode, src),
45066 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
45067 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
45068 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
45069 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
45070 GEN_INT (0xC), GEN_INT (0xD),
45071 GEN_INT (0xE), GEN_INT (0xF),
45072 GEN_INT (0x10), GEN_INT (0x11),
45073 GEN_INT (0x12), GEN_INT (0x13),
45074 GEN_INT (0x14), GEN_INT (0x15),
45075 GEN_INT (0x16), GEN_INT (0x17));
45076 else
45077 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
45078 gen_lowpart (V16SImode, src),
45079 GEN_INT (i == 128 ? 0x2 : 0x1),
45080 GEN_INT (0x3),
45081 GEN_INT (0x3),
45082 GEN_INT (0x3),
45083 GEN_INT (i == 128 ? 0x6 : 0x5),
45084 GEN_INT (0x7),
45085 GEN_INT (0x7),
45086 GEN_INT (0x7),
45087 GEN_INT (i == 128 ? 0xA : 0x9),
45088 GEN_INT (0xB),
45089 GEN_INT (0xB),
45090 GEN_INT (0xB),
45091 GEN_INT (i == 128 ? 0xE : 0xD),
45092 GEN_INT (0xF),
45093 GEN_INT (0xF),
45094 GEN_INT (0xF));
45095 break;
45096 default:
45097 gcc_unreachable ();
45099 emit_insn (tem);
45100 if (d != dest)
45101 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
45104 /* Expand a vector reduction. FN is the binary pattern to reduce;
45105 DEST is the destination; IN is the input vector. */
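/* The upper half of the vector is folded onto the lower half log2(nelts)
   times; only element 0 of DEST is guaranteed to hold the final reduction
   value.  */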
45107 void
45108 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
45110 rtx half, dst, vec = in;
45111 machine_mode mode = GET_MODE (in);
45112 int i;
45114 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
45115 if (TARGET_SSE4_1
45116 && mode == V8HImode
45117 && fn == gen_uminv8hi3)
45119 emit_insn (gen_sse4_1_phminposuw (dest, in));
45120 return;
45123 for (i = GET_MODE_BITSIZE (mode);
45124 i > GET_MODE_UNIT_BITSIZE (mode);
45125 i >>= 1)
45127 half = gen_reg_rtx (mode);
45128 emit_reduc_half (half, vec, i);
45129 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
45130 dst = dest;
45131 else
45132 dst = gen_reg_rtx (mode);
45133 emit_insn (fn (dst, half, vec));
45134 vec = dst;
45138 /* Target hook for scalar_mode_supported_p. */
45139 static bool
45140 ix86_scalar_mode_supported_p (machine_mode mode)
45142 if (DECIMAL_FLOAT_MODE_P (mode))
45143 return default_decimal_float_supported_p ();
45144 else if (mode == TFmode)
45145 return true;
45146 else
45147 return default_scalar_mode_supported_p (mode);
45150 /* Implements target hook vector_mode_supported_p. */
45151 static bool
45152 ix86_vector_mode_supported_p (machine_mode mode)
45154 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
45155 return true;
45156 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
45157 return true;
45158 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
45159 return true;
45160 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
45161 return true;
45162 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
45163 return true;
45164 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
45165 return true;
45166 return false;
45169 /* Target hook for c_mode_for_suffix. */
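/* On ix86 the 'q' suffix selects __float128 (TFmode) constants and 'w'
   selects __float80 (XFmode) constants.  */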
45170 static machine_mode
45171 ix86_c_mode_for_suffix (char suffix)
45173 if (suffix == 'q')
45174 return TFmode;
45175 if (suffix == 'w')
45176 return XFmode;
45178 return VOIDmode;
45181 /* Worker function for TARGET_MD_ASM_ADJUST.
45183 We implement asm flag outputs, and maintain source compatibility
45184 with the old cc0-based compiler. */
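/* An asm flag output uses a constraint of the form "=@cc<cond>", for
   example (illustrative only, assuming int a, b and bool z are in scope):
     asm ("cmpl %2, %1" : "=@ccz" (z) : "r" (a), "r" (b));
   which stores the zero flag into z after the comparison.  The loop below
   rewrites such outputs into a read of the flags register followed by a
   setcc/zero-extend sequence.  */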
45186 static rtx_insn *
45187 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
45188 vec<const char *> &constraints,
45189 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
45191 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
45192 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
45194 bool saw_asm_flag = false;
45196 start_sequence ();
45197 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
45199 const char *con = constraints[i];
45200 if (strncmp (con, "=@cc", 4) != 0)
45201 continue;
45202 con += 4;
45203 if (strchr (con, ',') != NULL)
45205 error ("alternatives not allowed in asm flag output");
45206 continue;
45209 bool invert = false;
45210 if (con[0] == 'n')
45211 invert = true, con++;
45213 machine_mode mode = CCmode;
45214 rtx_code code = UNKNOWN;
45216 switch (con[0])
45218 case 'a':
45219 if (con[1] == 0)
45220 mode = CCAmode, code = EQ;
45221 else if (con[1] == 'e' && con[2] == 0)
45222 mode = CCCmode, code = NE;
45223 break;
45224 case 'b':
45225 if (con[1] == 0)
45226 mode = CCCmode, code = EQ;
45227 else if (con[1] == 'e' && con[2] == 0)
45228 mode = CCAmode, code = NE;
45229 break;
45230 case 'c':
45231 if (con[1] == 0)
45232 mode = CCCmode, code = EQ;
45233 break;
45234 case 'e':
45235 if (con[1] == 0)
45236 mode = CCZmode, code = EQ;
45237 break;
45238 case 'g':
45239 if (con[1] == 0)
45240 mode = CCGCmode, code = GT;
45241 else if (con[1] == 'e' && con[2] == 0)
45242 mode = CCGCmode, code = GE;
45243 break;
45244 case 'l':
45245 if (con[1] == 0)
45246 mode = CCGCmode, code = LT;
45247 else if (con[1] == 'e' && con[2] == 0)
45248 mode = CCGCmode, code = LE;
45249 break;
45250 case 'o':
45251 if (con[1] == 0)
45252 mode = CCOmode, code = EQ;
45253 break;
45254 case 'p':
45255 if (con[1] == 0)
45256 mode = CCPmode, code = EQ;
45257 break;
45258 case 's':
45259 if (con[1] == 0)
45260 mode = CCSmode, code = EQ;
45261 break;
45262 case 'z':
45263 if (con[1] == 0)
45264 mode = CCZmode, code = EQ;
45265 break;
45267 if (code == UNKNOWN)
45269 error ("unknown asm flag output %qs", constraints[i]);
45270 continue;
45272 if (invert)
45273 code = reverse_condition (code);
45275 rtx dest = outputs[i];
45276 if (!saw_asm_flag)
45278 /* This is the first asm flag output. Here we put the flags
45279 register in as the real output and adjust the condition to
45280 allow it. */
45281 constraints[i] = "=Bf";
45282 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
45283 saw_asm_flag = true;
45285 else
45287 /* We don't need the flags register as output twice. */
45288 constraints[i] = "=X";
45289 outputs[i] = gen_rtx_SCRATCH (SImode);
45292 rtx x = gen_rtx_REG (mode, FLAGS_REG);
45293 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
45295 machine_mode dest_mode = GET_MODE (dest);
45296 if (!SCALAR_INT_MODE_P (dest_mode))
45298 error ("invalid type for asm flag output");
45299 continue;
45302 if (dest_mode == DImode && !TARGET_64BIT)
45303 dest_mode = SImode;
45305 if (dest_mode != QImode)
45307 rtx destqi = gen_reg_rtx (QImode);
45308 emit_insn (gen_rtx_SET (destqi, x));
45310 if (TARGET_ZERO_EXTEND_WITH_AND
45311 && optimize_function_for_speed_p (cfun))
45313 x = force_reg (dest_mode, const0_rtx);
45315 emit_insn (gen_movstrictqi
45316 (gen_lowpart (QImode, x), destqi));
45318 else
45319 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
45322 if (dest_mode != GET_MODE (dest))
45324 rtx tmp = gen_reg_rtx (SImode);
45326 emit_insn (gen_rtx_SET (tmp, x));
45327 emit_insn (gen_zero_extendsidi2 (dest, tmp));
45329 else
45330 emit_insn (gen_rtx_SET (dest, x));
45332 rtx_insn *seq = get_insns ();
45333 end_sequence ();
45335 if (saw_asm_flag)
45336 return seq;
45337 else
45339 /* If we had no asm flag outputs, clobber the flags. */
45340 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
45341 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
45342 return NULL;
45346 /* Implements target vector targetm.asm.encode_section_info. */
45348 static void ATTRIBUTE_UNUSED
45349 ix86_encode_section_info (tree decl, rtx rtl, int first)
45351 default_encode_section_info (decl, rtl, first);
45353 if (ix86_in_large_data_p (decl))
45354 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
45357 /* Worker function for REVERSE_CONDITION. */
45359 enum rtx_code
45360 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
45362 return (mode != CCFPmode && mode != CCFPUmode
45363 ? reverse_condition (code)
45364 : reverse_condition_maybe_unordered (code));
45367 /* Output code to perform an x87 FP register move, from OPERANDS[1]
45368 to OPERANDS[0]. */
45370 const char *
45371 output_387_reg_move (rtx_insn *insn, rtx *operands)
45373 if (REG_P (operands[0]))
45375 if (REG_P (operands[1])
45376 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
45378 if (REGNO (operands[0]) == FIRST_STACK_REG)
45379 return output_387_ffreep (operands, 0);
45380 return "fstp\t%y0";
45382 if (STACK_TOP_P (operands[0]))
45383 return "fld%Z1\t%y1";
45384 return "fst\t%y0";
45386 else if (MEM_P (operands[0]))
45388 gcc_assert (REG_P (operands[1]));
45389 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
45390 return "fstp%Z0\t%y0";
45391 else
45393 /* There is no non-popping store to memory for XFmode.
45394 So if we need one, follow the store with a load. */
45395 if (GET_MODE (operands[0]) == XFmode)
45396 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
45397 else
45398 return "fst%Z0\t%y0";
45401 else
45402 gcc_unreachable();
45405 /* Output code to perform a conditional jump to LABEL if the C2 flag in the
45406 FP status register is set. */
45408 void
45409 ix86_emit_fp_unordered_jump (rtx label)
45411 rtx reg = gen_reg_rtx (HImode);
45412 rtx temp;
45414 emit_insn (gen_x86_fnstsw_1 (reg));
45416 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
45418 emit_insn (gen_x86_sahf_1 (reg));
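/* sahf has just copied the C0/C2/C3 status bits into CF/PF/ZF, so the C2
test can be expressed as an UNORDERED (parity) test of the flags register.  */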
45420 temp = gen_rtx_REG (CCmode, FLAGS_REG);
45421 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
45423 else
45425 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
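/* The 0x04 bit of the upper status-word byte tested here is C2; the NE
branch built below is taken when it is set.  */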
45427 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
45428 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
45431 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
45432 gen_rtx_LABEL_REF (VOIDmode, label),
45433 pc_rtx);
45434 temp = gen_rtx_SET (pc_rtx, temp);
45436 emit_jump_insn (temp);
45437 predict_jump (REG_BR_PROB_BASE * 10 / 100);
45440 /* Output code to perform a log1p XFmode calculation. */
45442 void ix86_emit_i387_log1p (rtx op0, rtx op1)
45444 rtx_code_label *label1 = gen_label_rtx ();
45445 rtx_code_label *label2 = gen_label_rtx ();
45447 rtx tmp = gen_reg_rtx (XFmode);
45448 rtx tmp2 = gen_reg_rtx (XFmode);
45449 rtx test;
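/* A sketch of the expansion below, in the style of the other helpers in
this file (the threshold 0.2928... is 1 - sqrt(2)/2, the documented
accuracy limit of fyl2xp1):
     if (fabs (op1) >= 1 - sqrt (2) / 2)
       op0 = fyl2x (1.0 + op1, log (2));    -- large inputs
     else
       op0 = fyl2xp1 (op1, log (2));        -- small inputs, op1 kept exact
*/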
45451 emit_insn (gen_absxf2 (tmp, op1));
45452 test = gen_rtx_GE (VOIDmode, tmp,
45453 const_double_from_real_value (
45454 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
45455 XFmode));
45456 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
45458 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
45459 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
45460 emit_jump (label2);
45462 emit_label (label1);
45463 emit_move_insn (tmp, CONST1_RTX (XFmode));
45464 emit_insn (gen_addxf3 (tmp, op1, tmp));
45465 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
45466 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
45468 emit_label (label2);
45471 /* Emit x87 code to compute round (OP1), storing the result into OP0. */
45472 void ix86_emit_i387_round (rtx op0, rtx op1)
45474 machine_mode inmode = GET_MODE (op1);
45475 machine_mode outmode = GET_MODE (op0);
45476 rtx e1, e2, res, tmp, tmp1, half;
45477 rtx scratch = gen_reg_rtx (HImode);
45478 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
45479 rtx_code_label *jump_label = gen_label_rtx ();
45480 rtx insn;
45481 rtx (*gen_abs) (rtx, rtx);
45482 rtx (*gen_neg) (rtx, rtx);
45484 switch (inmode)
45486 case SFmode:
45487 gen_abs = gen_abssf2;
45488 break;
45489 case DFmode:
45490 gen_abs = gen_absdf2;
45491 break;
45492 case XFmode:
45493 gen_abs = gen_absxf2;
45494 break;
45495 default:
45496 gcc_unreachable ();
45499 switch (outmode)
45501 case SFmode:
45502 gen_neg = gen_negsf2;
45503 break;
45504 case DFmode:
45505 gen_neg = gen_negdf2;
45506 break;
45507 case XFmode:
45508 gen_neg = gen_negxf2;
45509 break;
45510 case HImode:
45511 gen_neg = gen_neghi2;
45512 break;
45513 case SImode:
45514 gen_neg = gen_negsi2;
45515 break;
45516 case DImode:
45517 gen_neg = gen_negdi2;
45518 break;
45519 default:
45520 gcc_unreachable ();
45523 e1 = gen_reg_rtx (inmode);
45524 e2 = gen_reg_rtx (inmode);
45525 res = gen_reg_rtx (outmode);
45527 half = const_double_from_real_value (dconsthalf, inmode);
45529 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
45531 /* scratch = fxam(op1) */
45532 emit_insn (gen_rtx_SET (scratch,
45533 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
45534 UNSPEC_FXAM)));
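/* fxam stores its condition codes in the upper byte of the status word;
C1 (the 0x02 bit tested further below) holds the sign of op1 and decides
whether the rounded result must be negated.  */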
45535 /* e1 = fabs(op1) */
45536 emit_insn (gen_abs (e1, op1));
45538 /* e2 = e1 + 0.5 */
45539 half = force_reg (inmode, half);
45540 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
45542 /* res = floor(e2) */
45543 if (inmode != XFmode)
45545 tmp1 = gen_reg_rtx (XFmode);
45547 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
45549 else
45550 tmp1 = e2;
45552 switch (outmode)
45554 case SFmode:
45555 case DFmode:
45557 rtx tmp0 = gen_reg_rtx (XFmode);
45559 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
45561 emit_insn (gen_rtx_SET (res,
45562 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
45563 UNSPEC_TRUNC_NOOP)));
45565 break;
45566 case XFmode:
45567 emit_insn (gen_frndintxf2_floor (res, tmp1));
45568 break;
45569 case HImode:
45570 emit_insn (gen_lfloorxfhi2 (res, tmp1));
45571 break;
45572 case SImode:
45573 emit_insn (gen_lfloorxfsi2 (res, tmp1));
45574 break;
45575 case DImode:
45576 emit_insn (gen_lfloorxfdi2 (res, tmp1));
45577 break;
45578 default:
45579 gcc_unreachable ();
45582 /* flags = signbit(a) */
45583 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
45585 /* if (flags) then res = -res */
45586 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
45587 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
45588 gen_rtx_LABEL_REF (VOIDmode, jump_label),
45589 pc_rtx);
45590 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
45591 predict_jump (REG_BR_PROB_BASE * 50 / 100);
45592 JUMP_LABEL (insn) = jump_label;
45594 emit_insn (gen_neg (res, res));
45596 emit_label (jump_label);
45597 LABEL_NUSES (jump_label) = 1;
45599 emit_move_insn (op0, res);
45602 /* Output code to perform a Newton-Raphson approximation of a single precision
45603 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
45605 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
45607 rtx x0, x1, e0, e1;
45609 x0 = gen_reg_rtx (mode);
45610 e0 = gen_reg_rtx (mode);
45611 e1 = gen_reg_rtx (mode);
45612 x1 = gen_reg_rtx (mode);
45614 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
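/* This is one Newton-Raphson step for 1/b with estimate x0:
x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0 * x0),
computed below as e1 - e0.  */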
45616 b = force_reg (mode, b);
45618 /* x0 = rcp(b) estimate */
45619 if (mode == V16SFmode || mode == V8DFmode)
45621 if (TARGET_AVX512ER)
45623 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45624 UNSPEC_RCP28)));
45625 /* res = a * x0 */
45626 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
45627 return;
45629 else
45630 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45631 UNSPEC_RCP14)));
45633 else
45634 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45635 UNSPEC_RCP)));
45637 /* e0 = x0 * b */
45638 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
45640 /* e0 = x0 * e0 */
45641 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
45643 /* e1 = x0 + x0 */
45644 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
45646 /* x1 = e1 - e0 */
45647 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
45649 /* res = a * x1 */
45650 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
45653 /* Output code to perform a Newton-Raphson approximation of a
45654 single precision floating point [reciprocal] square root. */
45656 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
45658 rtx x0, e0, e1, e2, e3, mthree, mhalf;
45659 REAL_VALUE_TYPE r;
45660 int unspec;
45662 x0 = gen_reg_rtx (mode);
45663 e0 = gen_reg_rtx (mode);
45664 e1 = gen_reg_rtx (mode);
45665 e2 = gen_reg_rtx (mode);
45666 e3 = gen_reg_rtx (mode);
45668 if (TARGET_AVX512ER && mode == V16SFmode)
45670 if (recip)
45671 /* res = rsqrt28(a) estimate */
45672 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45673 UNSPEC_RSQRT28)));
45674 else
45676 /* x0 = rsqrt28(a) estimate */
45677 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45678 UNSPEC_RSQRT28)));
45679 /* res = rcp28(x0) estimate */
45680 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
45681 UNSPEC_RCP28)));
45683 return;
45686 real_from_integer (&r, VOIDmode, -3, SIGNED);
45687 mthree = const_double_from_real_value (r, SFmode);
45689 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
45690 mhalf = const_double_from_real_value (r, SFmode);
45691 unspec = UNSPEC_RSQRT;
45693 if (VECTOR_MODE_P (mode))
45695 mthree = ix86_build_const_vector (mode, true, mthree);
45696 mhalf = ix86_build_const_vector (mode, true, mhalf);
45697 /* There is no 512-bit rsqrt. There is however rsqrt14. */
45698 if (GET_MODE_SIZE (mode) == 64)
45699 unspec = UNSPEC_RSQRT14;
45702 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
45703 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
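/* Both forms follow from one Newton-Raphson step for 1/sqrt(a) with
estimate x0: x1 = 0.5 * x0 * (3 - a * x0 * x0) = -0.5 * x0 * (a * x0 * x0 - 3).
Multiplying by a gives the sqrt variant, since sqrt(a) = a / sqrt(a).  */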
45705 a = force_reg (mode, a);
45707 /* x0 = rsqrt(a) estimate */
45708 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45709 unspec)));
45711 /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */
45712 if (!recip)
45714 rtx zero = force_reg (mode, CONST0_RTX(mode));
45715 rtx mask;
45717 /* Handle masked compare. */
45718 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
45720 mask = gen_reg_rtx (HImode);
45721 /* Imm value 0x4 corresponds to not-equal comparison. */
45722 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
45723 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
45725 else
45727 mask = gen_reg_rtx (mode);
45728 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
45729 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
45733 /* e0 = x0 * a */
45734 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
45735 /* e1 = e0 * x0 */
45736 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
45738 /* e2 = e1 - 3. */
45739 mthree = force_reg (mode, mthree);
45740 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
45742 mhalf = force_reg (mode, mhalf);
45743 if (recip)
45744 /* e3 = -.5 * x0 */
45745 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
45746 else
45747 /* e3 = -.5 * e0 */
45748 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
45749 /* ret = e2 * e3 */
45750 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
45753 #ifdef TARGET_SOLARIS
45754 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
45756 static void
45757 i386_solaris_elf_named_section (const char *name, unsigned int flags,
45758 tree decl)
45760 /* With Binutils 2.15, the "@unwind" marker must be specified on
45761 every occurrence of the ".eh_frame" section, not just the first
45762 one. */
45763 if (TARGET_64BIT
45764 && strcmp (name, ".eh_frame") == 0)
45766 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
45767 flags & SECTION_WRITE ? "aw" : "a");
45768 return;
45771 #ifndef USE_GAS
45772 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
45774 solaris_elf_asm_comdat_section (name, flags, decl);
45775 return;
45777 #endif
45779 default_elf_asm_named_section (name, flags, decl);
45781 #endif /* TARGET_SOLARIS */
45783 /* Return the mangling of TYPE if it is an extended fundamental type. */
45785 static const char *
45786 ix86_mangle_type (const_tree type)
45788 type = TYPE_MAIN_VARIANT (type);
45790 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
45791 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
45792 return NULL;
45794 switch (TYPE_MODE (type))
45796 case TFmode:
45797 /* __float128 is "g". */
45798 return "g";
45799 case XFmode:
45800 /* "long double" or __float80 is "e". */
45801 return "e";
45802 default:
45803 return NULL;
45807 #ifdef TARGET_THREAD_SSP_OFFSET
45808 /* If using TLS guards, don't waste time creating and expanding
45809 __stack_chk_guard decl and MEM as we are going to ignore it. */
45810 static tree
45811 ix86_stack_protect_guard (void)
45813 if (TARGET_SSP_TLS_GUARD)
45814 return NULL_TREE;
45815 return default_stack_protect_guard ();
45817 #endif
45819 /* For 32-bit code we can save PIC register setup by using
45820 __stack_chk_fail_local hidden function instead of calling
45821 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
45822 register, so it is better to call __stack_chk_fail directly. */
45824 static tree ATTRIBUTE_UNUSED
45825 ix86_stack_protect_fail (void)
45827 return TARGET_64BIT
45828 ? default_external_stack_protect_fail ()
45829 : default_hidden_stack_protect_fail ();
45832 /* Select a format to encode pointers in exception handling data. CODE
45833 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
45834 true if the symbol may be affected by dynamic relocations.
45836 ??? All x86 object file formats are capable of representing this.
45837 After all, the relocation needed is the same as for the call insn.
45838 Whether or not a particular assembler allows us to enter such, I
45839 guess we'll have to see. */
45840 int
45841 asm_preferred_eh_data_format (int code, int global)
45843 if (flag_pic)
45845 int type = DW_EH_PE_sdata8;
45846 if (!TARGET_64BIT
45847 || ix86_cmodel == CM_SMALL_PIC
45848 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
45849 type = DW_EH_PE_sdata4;
45850 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
45852 if (ix86_cmodel == CM_SMALL
45853 || (ix86_cmodel == CM_MEDIUM && code))
45854 return DW_EH_PE_udata4;
45855 return DW_EH_PE_absptr;
45858 /* Expand a copysign operation: give the positive value ABS_VALUE the sign
45859 of SIGN and store the result in RESULT. If MASK is non-null, it shall be
45860 a mask to mask out the sign-bit. */
45861 static void
45862 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
45864 machine_mode mode = GET_MODE (sign);
45865 rtx sgn = gen_reg_rtx (mode);
45866 if (mask == NULL_RTX)
45868 machine_mode vmode;
45870 if (mode == SFmode)
45871 vmode = V4SFmode;
45872 else if (mode == DFmode)
45873 vmode = V2DFmode;
45874 else
45875 vmode = mode;
45877 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
45878 if (!VECTOR_MODE_P (mode))
45880 /* We need to generate a scalar mode mask in this case. */
45881 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
45882 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
45883 mask = gen_reg_rtx (mode);
45884 emit_insn (gen_rtx_SET (mask, tmp));
45887 else
45888 mask = gen_rtx_NOT (mode, mask);
45889 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
45890 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
45893 /* Expand fabs (OP0) and return a new rtx that holds the result. The
45894 mask for masking out the sign-bit is stored in *SMASK, if that is
45895 non-null. */
45896 static rtx
45897 ix86_expand_sse_fabs (rtx op0, rtx *smask)
45899 machine_mode vmode, mode = GET_MODE (op0);
45900 rtx xa, mask;
45902 xa = gen_reg_rtx (mode);
45903 if (mode == SFmode)
45904 vmode = V4SFmode;
45905 else if (mode == DFmode)
45906 vmode = V2DFmode;
45907 else
45908 vmode = mode;
45909 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
45910 if (!VECTOR_MODE_P (mode))
45912 /* We need to generate a scalar mode mask in this case. */
45913 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
45914 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
45915 mask = gen_reg_rtx (mode);
45916 emit_insn (gen_rtx_SET (mask, tmp));
45918 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
45920 if (smask)
45921 *smask = mask;
45923 return xa;
45926 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
45927 swapping the operands if SWAP_OPERANDS is true. The expanded
45928 code is a forward jump to a newly created label in case the
45929 comparison is true. The generated label rtx is returned. */
45930 static rtx_code_label *
45931 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
45932 bool swap_operands)
45934 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
45935 rtx_code_label *label;
45936 rtx tmp;
45938 if (swap_operands)
45939 std::swap (op0, op1);
45941 label = gen_label_rtx ();
45942 tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
45943 emit_insn (gen_rtx_SET (tmp, gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
45944 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
45945 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
45946 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
45947 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
45948 JUMP_LABEL (tmp) = label;
45950 return label;
45953 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
45954 using comparison code CODE. Operands are swapped for the comparison if
45955 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
45956 static rtx
45957 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
45958 bool swap_operands)
45960 rtx (*insn)(rtx, rtx, rtx, rtx);
45961 machine_mode mode = GET_MODE (op0);
45962 rtx mask = gen_reg_rtx (mode);
45964 if (swap_operands)
45965 std::swap (op0, op1);
45967 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
45969 emit_insn (insn (mask, op0, op1,
45970 gen_rtx_fmt_ee (code, mode, op0, op1)));
45971 return mask;
45974 /* Generate and return a rtx of mode MODE for 2**n where n is the number
45975 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
45976 static rtx
45977 ix86_gen_TWO52 (machine_mode mode)
45979 REAL_VALUE_TYPE TWO52r;
45980 rtx TWO52;
45982 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
45983 TWO52 = const_double_from_real_value (TWO52r, mode);
45984 TWO52 = force_reg (mode, TWO52);
45986 return TWO52;
45989 /* Expand SSE sequence for computing lround from OP1 storing
45990 into OP0. */
45991 void
45992 ix86_expand_lround (rtx op0, rtx op1)
45994 /* C code for the stuff we're doing below:
45995 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
45996 return (long)tmp;
45998 machine_mode mode = GET_MODE (op1);
45999 const struct real_format *fmt;
46000 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
46001 rtx adj;
46003 /* load nextafter (0.5, 0.0) */
46004 fmt = REAL_MODE_FORMAT (mode);
46005 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
46006 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
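/* pred_half is the largest representable value below 0.5.  Adding a full
0.5 could round an argument just below a halfway point up to the next
integer before the truncation; the slightly smaller constant avoids that.  */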
46008 /* adj = copysign (0.5, op1) */
46009 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
46010 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
46012 /* adj = op1 + adj */
46013 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
46015 /* op0 = (imode)adj */
46016 expand_fix (op0, adj, 0);
46019 /* Expand SSE2 sequence for computing lfloor or lceil from OP1 storing
46020 into OP0. */
46021 void
46022 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
46024 /* C code for the stuff we're doing below (for do_floor):
46025 xi = (long)op1;
46026 xi -= (double)xi > op1 ? 1 : 0;
46027 return xi;
46029 machine_mode fmode = GET_MODE (op1);
46030 machine_mode imode = GET_MODE (op0);
46031 rtx ireg, freg, tmp;
46032 rtx_code_label *label;
46034 /* reg = (long)op1 */
46035 ireg = gen_reg_rtx (imode);
46036 expand_fix (ireg, op1, 0);
46038 /* freg = (double)reg */
46039 freg = gen_reg_rtx (fmode);
46040 expand_float (freg, ireg, 0);
46042 /* ireg = (freg > op1) ? ireg - 1 : ireg */
46043 label = ix86_expand_sse_compare_and_jump (UNLE,
46044 freg, op1, !do_floor);
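/* The branch skips the adjustment.  For floor we jump when
(double) ireg <= op1 and otherwise subtract 1; for ceil the operands are
swapped, so we jump when op1 <= (double) ireg and otherwise add 1.  */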
46045 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
46046 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
46047 emit_move_insn (ireg, tmp);
46049 emit_label (label);
46050 LABEL_NUSES (label) = 1;
46052 emit_move_insn (op0, ireg);
46055 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
46056 result in OPERAND0. */
46057 void
46058 ix86_expand_rint (rtx operand0, rtx operand1)
46060 /* C code for the stuff we're doing below:
46061 xa = fabs (operand1);
46062 if (!isless (xa, 2**52))
46063 return operand1;
46064 xa = xa + 2**52 - 2**52;
46065 return copysign (xa, operand1);
46067 machine_mode mode = GET_MODE (operand0);
46068 rtx res, xa, TWO52, mask;
46069 rtx_code_label *label;
46071 res = gen_reg_rtx (mode);
46072 emit_move_insn (res, operand1);
46074 /* xa = abs (operand1) */
46075 xa = ix86_expand_sse_fabs (res, &mask);
46077 /* if (!isless (xa, TWO52)) goto label; */
46078 TWO52 = ix86_gen_TWO52 (mode);
46079 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
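/* For |xa| < 2^52 (2^23 for SFmode), adding and then subtracting TWO52
leaves no fractional bits in the intermediate sum, so xa ends up rounded
to an integer in the current rounding mode.  */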
46081 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46082 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
46084 ix86_sse_copysign_to_positive (res, xa, res, mask);
46086 emit_label (label);
46087 LABEL_NUSES (label) = 1;
46089 emit_move_insn (operand0, res);
46092 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
46093 into OPERAND0. */
46094 void
46095 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
46097 /* C code for the stuff we expand below.
46098 double xa = fabs (x), x2;
46099 if (!isless (xa, TWO52))
46100 return x;
46101 xa = xa + TWO52 - TWO52;
46102 x2 = copysign (xa, x);
46103 Compensate. Floor:
46104 if (x2 > x)
46105 x2 -= 1;
46106 Compensate. Ceil:
46107 if (x2 < x)
46108 x2 -= -1;
46109 return x2;
46111 machine_mode mode = GET_MODE (operand0);
46112 rtx xa, TWO52, tmp, one, res, mask;
46113 rtx_code_label *label;
46115 TWO52 = ix86_gen_TWO52 (mode);
46117 /* Temporary for holding the result, initialized to the input
46118 operand to ease control flow. */
46119 res = gen_reg_rtx (mode);
46120 emit_move_insn (res, operand1);
46122 /* xa = abs (operand1) */
46123 xa = ix86_expand_sse_fabs (res, &mask);
46125 /* if (!isless (xa, TWO52)) goto label; */
46126 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46128 /* xa = xa + TWO52 - TWO52; */
46129 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46130 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
46132 /* xa = copysign (xa, operand1) */
46133 ix86_sse_copysign_to_positive (xa, xa, res, mask);
46135 /* generate 1.0 or -1.0 */
46136 one = force_reg (mode,
46137 const_double_from_real_value (do_floor
46138 ? dconst1 : dconstm1, mode));
46140 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
46141 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
46142 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46143 /* We always need to subtract here to preserve signed zero. */
46144 tmp = expand_simple_binop (mode, MINUS,
46145 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46146 emit_move_insn (res, tmp);
46148 emit_label (label);
46149 LABEL_NUSES (label) = 1;
46151 emit_move_insn (operand0, res);
46154 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
46155 into OPERAND0. */
46156 void
46157 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
46159 /* C code for the stuff we expand below.
46160 double xa = fabs (x), x2;
46161 if (!isless (xa, TWO52))
46162 return x;
46163 x2 = (double)(long)x;
46164 Compensate. Floor:
46165 if (x2 > x)
46166 x2 -= 1;
46167 Compensate. Ceil:
46168 if (x2 < x)
46169 x2 += 1;
46170 if (HONOR_SIGNED_ZEROS (mode))
46171 return copysign (x2, x);
46172 return x2;
46174 machine_mode mode = GET_MODE (operand0);
46175 rtx xa, xi, TWO52, tmp, one, res, mask;
46176 rtx_code_label *label;
46178 TWO52 = ix86_gen_TWO52 (mode);
46180 /* Temporary for holding the result, initialized to the input
46181 operand to ease control flow. */
46182 res = gen_reg_rtx (mode);
46183 emit_move_insn (res, operand1);
46185 /* xa = abs (operand1) */
46186 xa = ix86_expand_sse_fabs (res, &mask);
46188 /* if (!isless (xa, TWO52)) goto label; */
46189 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46191 /* xa = (double)(long)x */
46192 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
46193 expand_fix (xi, res, 0);
46194 expand_float (xa, xi, 0);
46196 /* generate 1.0 */
46197 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
46199 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
46200 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
46201 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46202 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
46203 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46204 emit_move_insn (res, tmp);
46206 if (HONOR_SIGNED_ZEROS (mode))
46207 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
46209 emit_label (label);
46210 LABEL_NUSES (label) = 1;
46212 emit_move_insn (operand0, res);
46215 /* Expand SSE sequence for computing round from OPERAND1 storing
46216 into OPERAND0. This sequence works without relying on DImode truncation
46217 via cvttsd2siq, which is only available on 64-bit targets. */
46218 void
46219 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
46221 /* C code for the stuff we expand below.
46222 double xa = fabs (x), xa2, x2;
46223 if (!isless (xa, TWO52))
46224 return x;
46225 Using the absolute value and copying back sign makes
46226 -0.0 -> -0.0 correct.
46227 xa2 = xa + TWO52 - TWO52;
46228 Compensate.
46229 dxa = xa2 - xa;
46230 if (dxa <= -0.5)
46231 xa2 += 1;
46232 else if (dxa > 0.5)
46233 xa2 -= 1;
46234 x2 = copysign (xa2, x);
46235 return x2;
46237 machine_mode mode = GET_MODE (operand0);
46238 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
46239 rtx_code_label *label;
46241 TWO52 = ix86_gen_TWO52 (mode);
46243 /* Temporary for holding the result, initialized to the input
46244 operand to ease control flow. */
46245 res = gen_reg_rtx (mode);
46246 emit_move_insn (res, operand1);
46248 /* xa = abs (operand1) */
46249 xa = ix86_expand_sse_fabs (res, &mask);
46251 /* if (!isless (xa, TWO52)) goto label; */
46252 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46254 /* xa2 = xa + TWO52 - TWO52; */
46255 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46256 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
46258 /* dxa = xa2 - xa; */
46259 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
46261 /* generate 0.5, 1.0 and -0.5 */
46262 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
46263 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
46264 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
46265 0, OPTAB_DIRECT);
46267 /* Compensate. */
46268 tmp = gen_reg_rtx (mode);
46269 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
46270 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
46271 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46272 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46273 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
46274 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
46275 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46276 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46278 /* res = copysign (xa2, operand1) */
46279 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
46281 emit_label (label);
46282 LABEL_NUSES (label) = 1;
46284 emit_move_insn (operand0, res);
46287 /* Expand SSE sequence for computing trunc from OPERAND1 storing
46288 into OPERAND0. */
46289 void
46290 ix86_expand_trunc (rtx operand0, rtx operand1)
46292 /* C code for SSE variant we expand below.
46293 double xa = fabs (x), x2;
46294 if (!isless (xa, TWO52))
46295 return x;
46296 x2 = (double)(long)x;
46297 if (HONOR_SIGNED_ZEROS (mode))
46298 return copysign (x2, x);
46299 return x2;
46301 machine_mode mode = GET_MODE (operand0);
46302 rtx xa, xi, TWO52, res, mask;
46303 rtx_code_label *label;
46305 TWO52 = ix86_gen_TWO52 (mode);
46307 /* Temporary for holding the result, initialized to the input
46308 operand to ease control flow. */
46309 res = gen_reg_rtx (mode);
46310 emit_move_insn (res, operand1);
46312 /* xa = abs (operand1) */
46313 xa = ix86_expand_sse_fabs (res, &mask);
46315 /* if (!isless (xa, TWO52)) goto label; */
46316 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46318 /* x = (double)(long)x */
46319 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
46320 expand_fix (xi, res, 0);
46321 expand_float (res, xi, 0);
46323 if (HONOR_SIGNED_ZEROS (mode))
46324 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
46326 emit_label (label);
46327 LABEL_NUSES (label) = 1;
46329 emit_move_insn (operand0, res);
46332 /* Expand SSE sequence for computing trunc from OPERAND1 storing
46333 into OPERAND0. */
46334 void
46335 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
46337 machine_mode mode = GET_MODE (operand0);
46338 rtx xa, mask, TWO52, one, res, smask, tmp;
46339 rtx_code_label *label;
46341 /* C code for SSE variant we expand below.
46342 double xa = fabs (x), x2;
46343 if (!isless (xa, TWO52))
46344 return x;
46345 xa2 = xa + TWO52 - TWO52;
46346 Compensate:
46347 if (xa2 > xa)
46348 xa2 -= 1.0;
46349 x2 = copysign (xa2, x);
46350 return x2;
46353 TWO52 = ix86_gen_TWO52 (mode);
46355 /* Temporary for holding the result, initialized to the input
46356 operand to ease control flow. */
46357 res = gen_reg_rtx (mode);
46358 emit_move_insn (res, operand1);
46360 /* xa = abs (operand1) */
46361 xa = ix86_expand_sse_fabs (res, &smask);
46363 /* if (!isless (xa, TWO52)) goto label; */
46364 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46366 /* res = xa + TWO52 - TWO52; */
46367 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46368 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
46369 emit_move_insn (res, tmp);
46371 /* generate 1.0 */
46372 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
46374 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
46375 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
46376 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
46377 tmp = expand_simple_binop (mode, MINUS,
46378 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
46379 emit_move_insn (res, tmp);
46381 /* res = copysign (res, operand1) */
46382 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
46384 emit_label (label);
46385 LABEL_NUSES (label) = 1;
46387 emit_move_insn (operand0, res);
46390 /* Expand SSE sequence for computing round from OPERAND1 storing
46391 into OPERAND0. */
46392 void
46393 ix86_expand_round (rtx operand0, rtx operand1)
46395 /* C code for the stuff we're doing below:
46396 double xa = fabs (x);
46397 if (!isless (xa, TWO52))
46398 return x;
46399 xa = (double)(long)(xa + nextafter (0.5, 0.0));
46400 return copysign (xa, x);
46402 machine_mode mode = GET_MODE (operand0);
46403 rtx res, TWO52, xa, xi, half, mask;
46404 rtx_code_label *label;
46405 const struct real_format *fmt;
46406 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
46408 /* Temporary for holding the result, initialized to the input
46409 operand to ease control flow. */
46410 res = gen_reg_rtx (mode);
46411 emit_move_insn (res, operand1);
46413 TWO52 = ix86_gen_TWO52 (mode);
46414 xa = ix86_expand_sse_fabs (res, &mask);
46415 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46417 /* load nextafter (0.5, 0.0) */
46418 fmt = REAL_MODE_FORMAT (mode);
46419 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
46420 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
46422 /* xa = xa + 0.5 */
46423 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
46424 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
46426 /* xa = (double)(int64_t)xa */
46427 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
46428 expand_fix (xi, xa, 0);
46429 expand_float (xa, xi, 0);
46431 /* res = copysign (xa, operand1) */
46432 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
46434 emit_label (label);
46435 LABEL_NUSES (label) = 1;
46437 emit_move_insn (operand0, res);
46440 /* Expand SSE sequence for computing round
46441 from OP1 storing into OP0 using sse4 round insn. */
46442 void
46443 ix86_expand_round_sse4 (rtx op0, rtx op1)
46445 machine_mode mode = GET_MODE (op0);
46446 rtx e1, e2, res, half;
46447 const struct real_format *fmt;
46448 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
46449 rtx (*gen_copysign) (rtx, rtx, rtx);
46450 rtx (*gen_round) (rtx, rtx, rtx);
46452 switch (mode)
46454 case SFmode:
46455 gen_copysign = gen_copysignsf3;
46456 gen_round = gen_sse4_1_roundsf2;
46457 break;
46458 case DFmode:
46459 gen_copysign = gen_copysigndf3;
46460 gen_round = gen_sse4_1_rounddf2;
46461 break;
46462 default:
46463 gcc_unreachable ();
46466 /* round (a) = trunc (a + copysign (0.5, a)) */
46468 /* load nextafter (0.5, 0.0) */
46469 fmt = REAL_MODE_FORMAT (mode);
46470 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
46471 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
46472 half = const_double_from_real_value (pred_half, mode);
46474 /* e1 = copysign (0.5, op1) */
46475 e1 = gen_reg_rtx (mode);
46476 emit_insn (gen_copysign (e1, half, op1));
46478 /* e2 = op1 + e1 */
46479 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
46481 /* res = trunc (e2) */
46482 res = gen_reg_rtx (mode);
46483 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
46485 emit_move_insn (op0, res);
46489 /* Table of valid machine attributes. */
46490 static const struct attribute_spec ix86_attribute_table[] =
46492 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
46493 affects_type_identity } */
46494 /* Stdcall attribute says callee is responsible for popping arguments
46495 if they are not variable. */
46496 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46497 true },
46498 /* Fastcall attribute says callee is responsible for popping arguments
46499 if they are not variable. */
46500 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46501 true },
46502 /* Thiscall attribute says callee is responsible for popping arguments
46503 if they are not variable. */
46504 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46505 true },
46506 /* Cdecl attribute says the callee is a normal C declaration */
46507 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46508 true },
46509 /* Regparm attribute specifies how many integer arguments are to be
46510 passed in registers. */
46511 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
46512 true },
46513 /* Sseregparm attribute says we are using x86_64 calling conventions
46514 for FP arguments. */
46515 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46516 true },
46517 /* The transactional memory builtins are implicitly regparm or fastcall
46518 depending on the ABI. Override the generic do-nothing attribute that
46519 these builtins were declared with. */
46520 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
46521 true },
46522 /* force_align_arg_pointer says this function realigns the stack at entry. */
46523 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
46524 false, true, true, ix86_handle_force_align_arg_pointer_attribute, false },
46525 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
46526 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
46527 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
46528 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
46529 false },
46530 #endif
46531 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
46532 false },
46533 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
46534 false },
46535 #ifdef SUBTARGET_ATTRIBUTE_TABLE
46536 SUBTARGET_ATTRIBUTE_TABLE,
46537 #endif
46538 /* ms_abi and sysv_abi calling convention function attributes. */
46539 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
46540 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
46541 { "ms_abi va_list", 0, 0, false, false, false, NULL, false },
46542 { "sysv_abi va_list", 0, 0, false, false, false, NULL, false },
46543 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
46544 false },
46545 { "callee_pop_aggregate_return", 1, 1, false, true, true,
46546 ix86_handle_callee_pop_aggregate_return, true },
46547 { "interrupt", 0, 0, false, true, true,
46548 ix86_handle_interrupt_attribute, false },
46549 { "no_caller_saved_registers", 0, 0, false, true, true,
46550 ix86_handle_no_caller_saved_registers_attribute, false },
46551 { "naked", 0, 0, true, false, false,
46552 ix86_handle_fndecl_attribute, false },
46554 /* End element. */
46555 { NULL, 0, 0, false, false, false, NULL, false }
46558 /* Implement targetm.vectorize.builtin_vectorization_cost. */
46559 static int
46560 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
46561 tree vectype, int)
46563 switch (type_of_cost)
46565 case scalar_stmt:
46566 return ix86_cost->scalar_stmt_cost;
46568 case scalar_load:
46569 return ix86_cost->scalar_load_cost;
46571 case scalar_store:
46572 return ix86_cost->scalar_store_cost;
46574 case vector_stmt:
46575 return ix86_cost->vec_stmt_cost;
46577 case vector_load:
46578 return ix86_cost->vec_align_load_cost;
46580 case vector_store:
46581 return ix86_cost->vec_store_cost;
46583 case vec_to_scalar:
46584 return ix86_cost->vec_to_scalar_cost;
46586 case scalar_to_vec:
46587 return ix86_cost->scalar_to_vec_cost;
46589 case unaligned_load:
46590 case unaligned_store:
46591 return ix86_cost->vec_unalign_load_cost;
46593 case cond_branch_taken:
46594 return ix86_cost->cond_taken_branch_cost;
46596 case cond_branch_not_taken:
46597 return ix86_cost->cond_not_taken_branch_cost;
46599 case vec_perm:
46600 case vec_promote_demote:
46601 return ix86_cost->vec_stmt_cost;
46603 case vec_construct:
46604 return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
46606 default:
46607 gcc_unreachable ();
46611 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
46612 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
46613 insn every time. */
46615 static GTY(()) rtx_insn *vselect_insn;
46617 /* Initialize vselect_insn. */
46619 static void
46620 init_vselect_insn (void)
46622 unsigned i;
46623 rtx x;
46625 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
46626 for (i = 0; i < MAX_VECT_LEN; ++i)
46627 XVECEXP (x, 0, i) = const0_rtx;
46628 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
46629 const0_rtx), x);
46630 x = gen_rtx_SET (const0_rtx, x);
46631 start_sequence ();
46632 vselect_insn = emit_insn (x);
46633 end_sequence ();
46636 /* Construct (set target (vec_select op0 (parallel perm))) and
46637 return true if that's a valid instruction in the active ISA. */
46639 static bool
46640 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
46641 unsigned nelt, bool testing_p)
46643 unsigned int i;
46644 rtx x, save_vconcat;
46645 int icode;
46647 if (vselect_insn == NULL_RTX)
46648 init_vselect_insn ();
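/* Rewrite the cached insn in place: install the selector, operand and
target, ask recog_memoized whether the active ISA has a matching pattern,
then restore the scratch operands so the insn can be reused.  */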
46650 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
46651 PUT_NUM_ELEM (XVEC (x, 0), nelt);
46652 for (i = 0; i < nelt; ++i)
46653 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
46654 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
46655 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
46656 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
46657 SET_DEST (PATTERN (vselect_insn)) = target;
46658 icode = recog_memoized (vselect_insn);
46660 if (icode >= 0 && !testing_p)
46661 emit_insn (copy_rtx (PATTERN (vselect_insn)));
46663 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
46664 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
46665 INSN_CODE (vselect_insn) = -1;
46667 return icode >= 0;
46670 /* Similar, but generate a vec_concat from op0 and op1 as well. */
46672 static bool
46673 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
46674 const unsigned char *perm, unsigned nelt,
46675 bool testing_p)
46677 machine_mode v2mode;
46678 rtx x;
46679 bool ok;
46681 if (vselect_insn == NULL_RTX)
46682 init_vselect_insn ();
46684 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
46685 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
46686 PUT_MODE (x, v2mode);
46687 XEXP (x, 0) = op0;
46688 XEXP (x, 1) = op1;
46689 ok = expand_vselect (target, x, perm, nelt, testing_p);
46690 XEXP (x, 0) = const0_rtx;
46691 XEXP (x, 1) = const0_rtx;
46692 return ok;
46695 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46696 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
46698 static bool
46699 expand_vec_perm_blend (struct expand_vec_perm_d *d)
46701 machine_mode mmode, vmode = d->vmode;
46702 unsigned i, mask, nelt = d->nelt;
46703 rtx target, op0, op1, maskop, x;
46704 rtx rperm[32], vperm;
46706 if (d->one_operand_p)
46707 return false;
46708 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
46709 && (TARGET_AVX512BW
46710 || GET_MODE_UNIT_SIZE (vmode) >= 4))
46712 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
46714 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
46716 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
46718 else
46719 return false;
46721 /* This is a blend, not a permute. Elements must stay in their
46722 respective lanes. */
46723 for (i = 0; i < nelt; ++i)
46725 unsigned e = d->perm[i];
46726 if (!(e == i || e == i + nelt))
46727 return false;
46730 if (d->testing_p)
46731 return true;
46733 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
46734 decision should be extracted elsewhere, so that we only try that
46735 sequence once all budget==3 options have been tried. */
46736 target = d->target;
46737 op0 = d->op0;
46738 op1 = d->op1;
46739 mask = 0;
46741 switch (vmode)
46743 case V8DFmode:
46744 case V16SFmode:
46745 case V4DFmode:
46746 case V8SFmode:
46747 case V2DFmode:
46748 case V4SFmode:
46749 case V8HImode:
46750 case V8SImode:
46751 case V32HImode:
46752 case V64QImode:
46753 case V16SImode:
46754 case V8DImode:
46755 for (i = 0; i < nelt; ++i)
46756 mask |= (d->perm[i] >= nelt) << i;
46757 break;
46759 case V2DImode:
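/* There is no two-element DImode blend with an immediate; view the
operands as V8HImode and use pblendw, expanding each DImode selector
into four contiguous bits of the mask.  */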
46760 for (i = 0; i < 2; ++i)
46761 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
46762 vmode = V8HImode;
46763 goto do_subreg;
46765 case V4SImode:
46766 for (i = 0; i < 4; ++i)
46767 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
46768 vmode = V8HImode;
46769 goto do_subreg;
46771 case V16QImode:
46772 /* See if bytes move in pairs so we can use pblendw with
46773 an immediate argument, rather than pblendvb with a vector
46774 argument. */
46775 for (i = 0; i < 16; i += 2)
46776 if (d->perm[i] + 1 != d->perm[i + 1])
46778 use_pblendvb:
46779 for (i = 0; i < nelt; ++i)
46780 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
46782 finish_pblendvb:
46783 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
46784 vperm = force_reg (vmode, vperm);
46786 if (GET_MODE_SIZE (vmode) == 16)
46787 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
46788 else
46789 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
46790 if (target != d->target)
46791 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46792 return true;
46795 for (i = 0; i < 8; ++i)
46796 mask |= (d->perm[i * 2] >= 16) << i;
46797 vmode = V8HImode;
46798 /* FALLTHRU */
46800 do_subreg:
46801 target = gen_reg_rtx (vmode);
46802 op0 = gen_lowpart (vmode, op0);
46803 op1 = gen_lowpart (vmode, op1);
46804 break;
46806 case V32QImode:
46807 /* See if bytes move in pairs. If not, vpblendvb must be used. */
46808 for (i = 0; i < 32; i += 2)
46809 if (d->perm[i] + 1 != d->perm[i + 1])
46810 goto use_pblendvb;
46811 /* See if bytes move in quadruplets. If yes, vpblendd
46812 with immediate can be used. */
46813 for (i = 0; i < 32; i += 4)
46814 if (d->perm[i] + 2 != d->perm[i + 2])
46815 break;
46816 if (i < 32)
46818 /* See if bytes move the same in both lanes. If yes,
46819 vpblendw with immediate can be used. */
46820 for (i = 0; i < 16; i += 2)
46821 if (d->perm[i] + 16 != d->perm[i + 16])
46822 goto use_pblendvb;
46824 /* Use vpblendw. */
46825 for (i = 0; i < 16; ++i)
46826 mask |= (d->perm[i * 2] >= 32) << i;
46827 vmode = V16HImode;
46828 goto do_subreg;
46831 /* Use vpblendd. */
46832 for (i = 0; i < 8; ++i)
46833 mask |= (d->perm[i * 4] >= 32) << i;
46834 vmode = V8SImode;
46835 goto do_subreg;
46837 case V16HImode:
46838 /* See if words move in pairs. If yes, vpblendd can be used. */
46839 for (i = 0; i < 16; i += 2)
46840 if (d->perm[i] + 1 != d->perm[i + 1])
46841 break;
46842 if (i < 16)
46844 /* See if words move the same in both lanes. If not,
46845 vpblendvb must be used. */
46846 for (i = 0; i < 8; i++)
46847 if (d->perm[i] + 8 != d->perm[i + 8])
46849 /* Use vpblendvb. */
46850 for (i = 0; i < 32; ++i)
46851 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
46853 vmode = V32QImode;
46854 nelt = 32;
46855 target = gen_reg_rtx (vmode);
46856 op0 = gen_lowpart (vmode, op0);
46857 op1 = gen_lowpart (vmode, op1);
46858 goto finish_pblendvb;
46861 /* Use vpblendw. */
46862 for (i = 0; i < 16; ++i)
46863 mask |= (d->perm[i] >= 16) << i;
46864 break;
46867 /* Use vpblendd. */
46868 for (i = 0; i < 8; ++i)
46869 mask |= (d->perm[i * 2] >= 16) << i;
46870 vmode = V8SImode;
46871 goto do_subreg;
46873 case V4DImode:
46874 /* Use vpblendd. */
46875 for (i = 0; i < 4; ++i)
46876 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
46877 vmode = V8SImode;
46878 goto do_subreg;
46880 default:
46881 gcc_unreachable ();
46884 switch (vmode)
46886 case V8DFmode:
46887 case V8DImode:
46888 mmode = QImode;
46889 break;
46890 case V16SFmode:
46891 case V16SImode:
46892 mmode = HImode;
46893 break;
46894 case V32HImode:
46895 mmode = SImode;
46896 break;
46897 case V64QImode:
46898 mmode = DImode;
46899 break;
46900 default:
46901 mmode = VOIDmode;
46904 if (mmode != VOIDmode)
46905 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
46906 else
46907 maskop = GEN_INT (mask);
46909 /* This matches five different patterns with the different modes. */
46910 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
46911 x = gen_rtx_SET (target, x);
46912 emit_insn (x);
46913 if (target != d->target)
46914 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46916 return true;
46919 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46920 in terms of the variable form of vpermilps.
46922 Note that we will have already failed the immediate input vpermilps,
46923 which requires that the high and low part shuffle be identical; the
46924 variable form doesn't require that. */
46926 static bool
46927 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
46929 rtx rperm[8], vperm;
46930 unsigned i;
46932 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
46933 return false;
46935 /* We can only permute within the 128-bit lane. */
46936 for (i = 0; i < 8; ++i)
46938 unsigned e = d->perm[i];
46939 if (i < 4 ? e >= 4 : e < 4)
46940 return false;
46943 if (d->testing_p)
46944 return true;
46946 for (i = 0; i < 8; ++i)
46948 unsigned e = d->perm[i];
46950 /* Within each 128-bit lane, the elements of op0 are numbered
46951 from 0 and the elements of op1 are numbered from 4. */
46952 if (e >= 8 + 4)
46953 e -= 8;
46954 else if (e >= 4)
46955 e -= 4;
46957 rperm[i] = GEN_INT (e);
46960 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
46961 vperm = force_reg (V8SImode, vperm);
46962 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
46964 return true;
46967 /* Return true if permutation D can be performed as VMODE permutation
46968 instead. */
46970 static bool
46971 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
46973 unsigned int i, j, chunk;
46975 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
46976 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
46977 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
46978 return false;
46980 if (GET_MODE_NUNITS (vmode) >= d->nelt)
46981 return true;
46983 chunk = d->nelt / GET_MODE_NUNITS (vmode);
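/* CHUNK narrow elements form one element of VMODE; the permutation is
expressible in VMODE iff every chunk starts on a chunk boundary and keeps
its elements consecutive.  */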
46984 for (i = 0; i < d->nelt; i += chunk)
46985 if (d->perm[i] & (chunk - 1))
46986 return false;
46987 else
46988 for (j = 1; j < chunk; ++j)
46989 if (d->perm[i] + j != d->perm[i + j])
46990 return false;
46992 return true;
46995 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46996 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
46998 static bool
46999 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
47001 unsigned i, nelt, eltsz, mask;
47002 unsigned char perm[64];
47003 machine_mode vmode = V16QImode;
47004 rtx rperm[64], vperm, target, op0, op1;
47006 nelt = d->nelt;
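/* Rough plan: with two operands use vperm2i128 (when whole 128-bit halves
move) or XOP vpperm; with one operand the 32- and 64-byte modes first try
vpermq and then vpermd/vpermps, and everything else falls back to
[v]pshufb, which cannot move bytes across 128-bit lanes.  */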
47008 if (!d->one_operand_p)
47010 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
47012 if (TARGET_AVX2
47013 && valid_perm_using_mode_p (V2TImode, d))
47015 if (d->testing_p)
47016 return true;
47018 /* Use vperm2i128 insn. The pattern uses
47019 V4DImode instead of V2TImode. */
47020 target = d->target;
47021 if (d->vmode != V4DImode)
47022 target = gen_reg_rtx (V4DImode);
47023 op0 = gen_lowpart (V4DImode, d->op0);
47024 op1 = gen_lowpart (V4DImode, d->op1);
47025 rperm[0]
47026 = GEN_INT ((d->perm[0] / (nelt / 2))
47027 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
47028 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
47029 if (target != d->target)
47030 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
47031 return true;
47033 return false;
47036 else
47038 if (GET_MODE_SIZE (d->vmode) == 16)
47040 if (!TARGET_SSSE3)
47041 return false;
47043 else if (GET_MODE_SIZE (d->vmode) == 32)
47045 if (!TARGET_AVX2)
47046 return false;
47048 /* V4DImode should already be handled through
47049 expand_vselect by the vpermq instruction. */
47050 gcc_assert (d->vmode != V4DImode);
47052 vmode = V32QImode;
47053 if (d->vmode == V8SImode
47054 || d->vmode == V16HImode
47055 || d->vmode == V32QImode)
47057 /* First see if vpermq can be used for
47058 V8SImode/V16HImode/V32QImode. */
47059 if (valid_perm_using_mode_p (V4DImode, d))
47061 for (i = 0; i < 4; i++)
47062 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
47063 if (d->testing_p)
47064 return true;
47065 target = gen_reg_rtx (V4DImode);
47066 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
47067 perm, 4, false))
47069 emit_move_insn (d->target,
47070 gen_lowpart (d->vmode, target));
47071 return true;
47073 return false;
47076 /* Next see if vpermd can be used. */
47077 if (valid_perm_using_mode_p (V8SImode, d))
47078 vmode = V8SImode;
47080 /* Or if vpermps can be used. */
47081 else if (d->vmode == V8SFmode)
47082 vmode = V8SImode;
47084 if (vmode == V32QImode)
47086 /* vpshufb only works within 128-bit lanes; it cannot
47087 shuffle bytes between the lanes. */
47088 for (i = 0; i < nelt; ++i)
47089 if ((d->perm[i] ^ i) & (nelt / 2))
47090 return false;
47093 else if (GET_MODE_SIZE (d->vmode) == 64)
47095 if (!TARGET_AVX512BW)
47096 return false;
47098 /* If vpermq didn't work, vpshufb won't work either. */
47099 if (d->vmode == V8DFmode || d->vmode == V8DImode)
47100 return false;
47102 vmode = V64QImode;
47103 if (d->vmode == V16SImode
47104 || d->vmode == V32HImode
47105 || d->vmode == V64QImode)
47107 /* First see if vpermq can be used for
47108 V16SImode/V32HImode/V64QImode. */
47109 if (valid_perm_using_mode_p (V8DImode, d))
47111 for (i = 0; i < 8; i++)
47112 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
47113 if (d->testing_p)
47114 return true;
47115 target = gen_reg_rtx (V8DImode);
47116 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
47117 perm, 8, false))
47119 emit_move_insn (d->target,
47120 gen_lowpart (d->vmode, target));
47121 return true;
47123 return false;
47126 /* Next see if vpermd can be used. */
47127 if (valid_perm_using_mode_p (V16SImode, d))
47128 vmode = V16SImode;
47130 /* Or if vpermps can be used. */
47131 else if (d->vmode == V16SFmode)
47132 vmode = V16SImode;
47133 if (vmode == V64QImode)
47135 /* vpshufb only works within 128-bit lanes; it cannot
47136 shuffle bytes between the lanes. */
47137 for (i = 0; i < nelt; ++i)
47138 if ((d->perm[i] ^ i) & (nelt / 4))
47139 return false;
47142 else
47143 return false;
47146 if (d->testing_p)
47147 return true;
47149 if (vmode == V8SImode)
47150 for (i = 0; i < 8; ++i)
47151 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
47152 else if (vmode == V16SImode)
47153 for (i = 0; i < 16; ++i)
47154 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
47155 else
47157 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47158 if (!d->one_operand_p)
47159 mask = 2 * nelt - 1;
47160 else if (vmode == V16QImode)
47161 mask = nelt - 1;
47162 else if (vmode == V64QImode)
47163 mask = nelt / 4 - 1;
47164 else
47165 mask = nelt / 2 - 1;
47167 for (i = 0; i < nelt; ++i)
47169 unsigned j, e = d->perm[i] & mask;
47170 for (j = 0; j < eltsz; ++j)
47171 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
47175 vperm = gen_rtx_CONST_VECTOR (vmode,
47176 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
47177 vperm = force_reg (vmode, vperm);
47179 target = d->target;
47180 if (d->vmode != vmode)
47181 target = gen_reg_rtx (vmode);
47182 op0 = gen_lowpart (vmode, d->op0);
47183 if (d->one_operand_p)
47185 if (vmode == V16QImode)
47186 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
47187 else if (vmode == V32QImode)
47188 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
47189 else if (vmode == V64QImode)
47190 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
47191 else if (vmode == V8SFmode)
47192 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
47193 else if (vmode == V8SImode)
47194 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
47195 else if (vmode == V16SFmode)
47196 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
47197 else if (vmode == V16SImode)
47198 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
47199 else
47200 gcc_unreachable ();
47202 else
47204 op1 = gen_lowpart (vmode, d->op1);
47205 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
47207 if (target != d->target)
47208 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
47210 return true;
47213 /* For V*[QHS]Imode permutations, check whether the same permutation
47214 can be performed in a 2x, 4x or 8x wider inner element mode. */
47216 static bool
47217 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
47218 struct expand_vec_perm_d *nd)
47220 int i;
47221 machine_mode mode = VOIDmode;
47223 switch (d->vmode)
47225 case V16QImode: mode = V8HImode; break;
47226 case V32QImode: mode = V16HImode; break;
47227 case V64QImode: mode = V32HImode; break;
47228 case V8HImode: mode = V4SImode; break;
47229 case V16HImode: mode = V8SImode; break;
47230 case V32HImode: mode = V16SImode; break;
47231 case V4SImode: mode = V2DImode; break;
47232 case V8SImode: mode = V4DImode; break;
47233 case V16SImode: mode = V8DImode; break;
47234 default: return false;
47236 for (i = 0; i < d->nelt; i += 2)
47237 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
47238 return false;
47239 nd->vmode = mode;
47240 nd->nelt = d->nelt / 2;
47241 for (i = 0; i < nd->nelt; i++)
47242 nd->perm[i] = d->perm[2 * i] / 2;
47243 if (GET_MODE_INNER (mode) != DImode)
47244 canonicalize_vector_int_perm (nd, nd);
47245 if (nd != d)
47247 nd->one_operand_p = d->one_operand_p;
47248 nd->testing_p = d->testing_p;
47249 if (d->op0 == d->op1)
47250 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
47251 else
47253 nd->op0 = gen_lowpart (nd->vmode, d->op0);
47254 nd->op1 = gen_lowpart (nd->vmode, d->op1);
47256 if (d->testing_p)
47257 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
47258 else
47259 nd->target = gen_reg_rtx (nd->vmode);
47261 return true;
47264 /* Try to expand one-operand permutation with constant mask. */
47266 static bool
47267 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
47269 machine_mode mode = GET_MODE (d->op0);
47270 machine_mode maskmode = mode;
47271 rtx (*gen) (rtx, rtx, rtx) = NULL;
47272 rtx target, op0, mask;
47273 rtx vec[64];
47275 if (!rtx_equal_p (d->op0, d->op1))
47276 return false;
47278 if (!TARGET_AVX512F)
47279 return false;
47281 switch (mode)
47283 case V16SImode:
47284 gen = gen_avx512f_permvarv16si;
47285 break;
47286 case V16SFmode:
47287 gen = gen_avx512f_permvarv16sf;
47288 maskmode = V16SImode;
47289 break;
47290 case V8DImode:
47291 gen = gen_avx512f_permvarv8di;
47292 break;
47293 case V8DFmode:
47294 gen = gen_avx512f_permvarv8df;
47295 maskmode = V8DImode;
47296 break;
47297 default:
47298 return false;
47301 target = d->target;
47302 op0 = d->op0;
47303 for (int i = 0; i < d->nelt; ++i)
47304 vec[i] = GEN_INT (d->perm[i]);
47305 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
47306 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
47307 return true;
47310 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
47311 in a single instruction. */
47313 static bool
47314 expand_vec_perm_1 (struct expand_vec_perm_d *d)
47316 unsigned i, nelt = d->nelt;
47317 struct expand_vec_perm_d nd;
47319 /* Check plain VEC_SELECT first, because AVX has instructions that could
47320 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
47321 input where SEL+CONCAT may not. */
47322 if (d->one_operand_p)
47324 int mask = nelt - 1;
47325 bool identity_perm = true;
47326 bool broadcast_perm = true;
47328 for (i = 0; i < nelt; i++)
47330 nd.perm[i] = d->perm[i] & mask;
47331 if (nd.perm[i] != i)
47332 identity_perm = false;
47333 if (nd.perm[i])
47334 broadcast_perm = false;
47337 if (identity_perm)
47339 if (!d->testing_p)
47340 emit_move_insn (d->target, d->op0);
47341 return true;
47343 else if (broadcast_perm && TARGET_AVX2)
47345 /* Use vpbroadcast{b,w,d}. */
47346 rtx (*gen) (rtx, rtx) = NULL;
47347 switch (d->vmode)
47349 case V64QImode:
47350 if (TARGET_AVX512BW)
47351 gen = gen_avx512bw_vec_dupv64qi_1;
47352 break;
47353 case V32QImode:
47354 gen = gen_avx2_pbroadcastv32qi_1;
47355 break;
47356 case V32HImode:
47357 if (TARGET_AVX512BW)
47358 gen = gen_avx512bw_vec_dupv32hi_1;
47359 break;
47360 case V16HImode:
47361 gen = gen_avx2_pbroadcastv16hi_1;
47362 break;
47363 case V16SImode:
47364 if (TARGET_AVX512F)
47365 gen = gen_avx512f_vec_dupv16si_1;
47366 break;
47367 case V8SImode:
47368 gen = gen_avx2_pbroadcastv8si_1;
47369 break;
47370 case V16QImode:
47371 gen = gen_avx2_pbroadcastv16qi;
47372 break;
47373 case V8HImode:
47374 gen = gen_avx2_pbroadcastv8hi;
47375 break;
47376 case V16SFmode:
47377 if (TARGET_AVX512F)
47378 gen = gen_avx512f_vec_dupv16sf_1;
47379 break;
47380 case V8SFmode:
47381 gen = gen_avx2_vec_dupv8sf_1;
47382 break;
47383 case V8DFmode:
47384 if (TARGET_AVX512F)
47385 gen = gen_avx512f_vec_dupv8df_1;
47386 break;
47387 case V8DImode:
47388 if (TARGET_AVX512F)
47389 gen = gen_avx512f_vec_dupv8di_1;
47390 break;
47391 /* For other modes prefer other shuffles this function creates. */
47392 default: break;
47394 if (gen != NULL)
47396 if (!d->testing_p)
47397 emit_insn (gen (d->target, d->op0));
47398 return true;
47402 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
47403 return true;
47405 /* There are plenty of patterns in sse.md that are written for
47406 SEL+CONCAT and are not replicated for a single op. Perhaps
47407 that should be changed, to avoid the nastiness here. */
47409 /* Recognize interleave style patterns, which means incrementing
47410 every other permutation operand. */
47411 for (i = 0; i < nelt; i += 2)
47413 nd.perm[i] = d->perm[i] & mask;
47414 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
47416 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
47417 d->testing_p))
47418 return true;
47420 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
47421 if (nelt >= 4)
47423 for (i = 0; i < nelt; i += 4)
47425 nd.perm[i + 0] = d->perm[i + 0] & mask;
47426 nd.perm[i + 1] = d->perm[i + 1] & mask;
47427 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
47428 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
47431 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
47432 d->testing_p))
47433 return true;
47437 /* Finally, try the fully general two operand permute. */
47438 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
47439 d->testing_p))
47440 return true;
47442 /* Recognize interleave style patterns with reversed operands. */
47443 if (!d->one_operand_p)
47445 for (i = 0; i < nelt; ++i)
47447 unsigned e = d->perm[i];
47448 if (e >= nelt)
47449 e -= nelt;
47450 else
47451 e += nelt;
47452 nd.perm[i] = e;
47455 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
47456 d->testing_p))
47457 return true;
47460 /* Try the SSE4.1 blend variable merge instructions. */
47461 if (expand_vec_perm_blend (d))
47462 return true;
47464 /* Try one of the AVX vpermil variable permutations. */
47465 if (expand_vec_perm_vpermil (d))
47466 return true;
47468 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
47469 vpshufb, vpermd, vpermps or vpermq variable permutation. */
47470 if (expand_vec_perm_pshufb (d))
47471 return true;
47473 /* Try the AVX2 vpalignr instruction. */
47474 if (expand_vec_perm_palignr (d, true))
47475 return true;
47477 /* Try the AVX512F vperm{s,d} instructions. */
47478 if (ix86_expand_vec_one_operand_perm_avx512 (d))
47479 return true;
47481 /* Try the AVX512F vpermi2 instructions. */
47482 if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
47483 return true;
47485 /* See if we can get the same permutation in different vector integer
47486 mode. */
47487 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
47489 if (!d->testing_p)
47490 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
47491 return true;
47493 return false;
47496 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
47497 in terms of a pair of pshuflw + pshufhw instructions. */
47499 static bool
47500 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
47502 unsigned char perm2[MAX_VECT_LEN];
47503 unsigned i;
47504 bool ok;
47506 if (d->vmode != V8HImode || !d->one_operand_p)
47507 return false;
47509 /* The two permutations only operate in 64-bit lanes. */
47510 for (i = 0; i < 4; ++i)
47511 if (d->perm[i] >= 4)
47512 return false;
47513 for (i = 4; i < 8; ++i)
47514 if (d->perm[i] < 4)
47515 return false;
47517 if (d->testing_p)
47518 return true;
47520 /* Emit the pshuflw. */
47521 memcpy (perm2, d->perm, 4);
47522 for (i = 4; i < 8; ++i)
47523 perm2[i] = i;
47524 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
47525 gcc_assert (ok);
47527 /* Emit the pshufhw. */
47528 memcpy (perm2 + 4, d->perm + 4, 4);
47529 for (i = 0; i < 4; ++i)
47530 perm2[i] = i;
47531 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
47532 gcc_assert (ok);
47534 return true;
47537 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47538 the permutation using the SSSE3 palignr instruction. This succeeds
47539 when all of the elements in PERM fit within one vector and we merely
47540 need to shift them down so that a single vector permutation has a
47541 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
47542 the vpalignr instruction itself can perform the requested permutation. */
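/* E.g. for two V8HImode operands and the permutation {3 4 5 6 7 8 9 10}
   all selected elements fall inside one 16-byte window of the op1:op0
   concatenation, so a single palignr shifting it down by three elements
   (six bytes) already produces the desired order.  */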
47544 static bool
47545 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
47547 unsigned i, nelt = d->nelt;
47548 unsigned min, max, minswap, maxswap;
47549 bool in_order, ok, swap = false;
47550 rtx shift, target;
47551 struct expand_vec_perm_d dcopy;
47553 /* Even with AVX, palignr only operates on 128-bit vectors;
47554 with AVX2 it operates within each 128-bit lane separately. */
47555 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
47556 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
47557 return false;
47559 min = 2 * nelt;
47560 max = 0;
47561 minswap = 2 * nelt;
47562 maxswap = 0;
47563 for (i = 0; i < nelt; ++i)
47565 unsigned e = d->perm[i];
47566 unsigned eswap = d->perm[i] ^ nelt;
47567 if (GET_MODE_SIZE (d->vmode) == 32)
47569 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
47570 eswap = e ^ (nelt / 2);
47572 if (e < min)
47573 min = e;
47574 if (e > max)
47575 max = e;
47576 if (eswap < minswap)
47577 minswap = eswap;
47578 if (eswap > maxswap)
47579 maxswap = eswap;
47581 if (min == 0
47582 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
47584 if (d->one_operand_p
47585 || minswap == 0
47586 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
47587 ? nelt / 2 : nelt))
47588 return false;
47589 swap = true;
47590 min = minswap;
47591 max = maxswap;
47594 /* Given that we have SSSE3, we know we'll be able to implement the
47595 single operand permutation after the palignr with pshufb for
47596 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
47597 first. */
47598 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
47599 return true;
47601 dcopy = *d;
47602 if (swap)
47604 dcopy.op0 = d->op1;
47605 dcopy.op1 = d->op0;
47606 for (i = 0; i < nelt; ++i)
47607 dcopy.perm[i] ^= nelt;
47610 in_order = true;
47611 for (i = 0; i < nelt; ++i)
47613 unsigned e = dcopy.perm[i];
47614 if (GET_MODE_SIZE (d->vmode) == 32
47615 && e >= nelt
47616 && (e & (nelt / 2 - 1)) < min)
47617 e = e - min - (nelt / 2);
47618 else
47619 e = e - min;
47620 if (e != i)
47621 in_order = false;
47622 dcopy.perm[i] = e;
47624 dcopy.one_operand_p = true;
47626 if (single_insn_only_p && !in_order)
47627 return false;
47629 /* For AVX2, test whether we can permute the result in one instruction. */
47630 if (d->testing_p)
47632 if (in_order)
47633 return true;
47634 dcopy.op1 = dcopy.op0;
47635 return expand_vec_perm_1 (&dcopy);
47638 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
47639 if (GET_MODE_SIZE (d->vmode) == 16)
47641 target = gen_reg_rtx (TImode);
47642 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
47643 gen_lowpart (TImode, dcopy.op0), shift));
47645 else
47647 target = gen_reg_rtx (V2TImode);
47648 emit_insn (gen_avx2_palignrv2ti (target,
47649 gen_lowpart (V2TImode, dcopy.op1),
47650 gen_lowpart (V2TImode, dcopy.op0),
47651 shift));
47654 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
47656 /* Test for the degenerate case where the alignment by itself
47657 produces the desired permutation. */
47658 if (in_order)
47660 emit_move_insn (d->target, dcopy.op0);
47661 return true;
47664 ok = expand_vec_perm_1 (&dcopy);
47665 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
47667 return ok;
47670 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
47671 the permutation using the SSE4_1 pblendv instruction. Potentially
47672 reduces the permutation from 2 pshufb and an ior to 1 pshufb and a pblendv. */
47674 static bool
47675 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
47677 unsigned i, which, nelt = d->nelt;
47678 struct expand_vec_perm_d dcopy, dcopy1;
47679 machine_mode vmode = d->vmode;
47680 bool ok;
47682 /* Use the same checks as in expand_vec_perm_blend. */
47683 if (d->one_operand_p)
47684 return false;
47685 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
47687 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
47689 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
47691 else
47692 return false;
47694 /* Figure out which permutation elements do not stay in their
47695 respective lanes. */
47696 for (i = 0, which = 0; i < nelt; ++i)
47698 unsigned e = d->perm[i];
47699 if (e != i)
47700 which |= (e < nelt ? 1 : 2);
47702 /* We can pblend the part where elements do not stay in their
47703 respective lanes only when these elements all come from the same
47704 half of the permutation.
47705 {0 1 8 3 4 5 9 7} is ok, as 8 and 9 are not at their respective
47706 positions, but both 8 and 9 >= 8;
47707 {0 1 8 3 4 5 2 7} is not ok, as 2 and 8 are not at their
47708 respective positions and 8 >= 8, but 2 is not. */
47709 if (which != 1 && which != 2)
47710 return false;
47711 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
47712 return true;
47714 /* First apply a one-operand permutation to the part where
47715 elements do not stay in their respective lanes. */
47716 dcopy = *d;
47717 if (which == 2)
47718 dcopy.op0 = dcopy.op1 = d->op1;
47719 else
47720 dcopy.op0 = dcopy.op1 = d->op0;
47721 if (!d->testing_p)
47722 dcopy.target = gen_reg_rtx (vmode);
47723 dcopy.one_operand_p = true;
47725 for (i = 0; i < nelt; ++i)
47726 dcopy.perm[i] = d->perm[i] & (nelt - 1);
47728 ok = expand_vec_perm_1 (&dcopy);
47729 if (GET_MODE_SIZE (vmode) != 16 && !ok)
47730 return false;
47731 else
47732 gcc_assert (ok);
47733 if (d->testing_p)
47734 return true;
47736 /* Next we put permuted elements into their positions. */
47737 dcopy1 = *d;
47738 if (which == 2)
47739 dcopy1.op1 = dcopy.target;
47740 else
47741 dcopy1.op0 = dcopy.target;
47743 for (i = 0; i < nelt; ++i)
47744 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
47746 ok = expand_vec_perm_blend (&dcopy1);
47747 gcc_assert (ok);
47749 return true;
47752 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
47754 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47755 a two vector permutation into a single vector permutation by using
47756 an interleave operation to merge the vectors. */
47758 static bool
47759 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
47761 struct expand_vec_perm_d dremap, dfinal;
47762 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
47763 unsigned HOST_WIDE_INT contents;
47764 unsigned char remap[2 * MAX_VECT_LEN];
47765 rtx_insn *seq;
47766 bool ok, same_halves = false;
47768 if (GET_MODE_SIZE (d->vmode) == 16)
47770 if (d->one_operand_p)
47771 return false;
47773 else if (GET_MODE_SIZE (d->vmode) == 32)
47775 if (!TARGET_AVX)
47776 return false;
47777 /* For 32-byte modes allow even d->one_operand_p.
47778 The lack of cross-lane shuffling in some instructions
47779 might prevent a single insn shuffle. */
47780 dfinal = *d;
47781 dfinal.testing_p = true;
47782 /* If expand_vec_perm_interleave3 can expand this into
47783 a 3 insn sequence, give up and let it be expanded as
47784 3 insn sequence. While that is one insn longer,
47785 it doesn't need a memory operand and, in the common
47786 case where both the interleave low and high permutations
47787 with the same operands are adjacent, needs 4 insns
47788 for both after CSE. */
47789 if (expand_vec_perm_interleave3 (&dfinal))
47790 return false;
47792 else
47793 return false;
47795 /* Examine from whence the elements come. */
47796 contents = 0;
47797 for (i = 0; i < nelt; ++i)
47798 contents |= HOST_WIDE_INT_1U << d->perm[i];
47800 memset (remap, 0xff, sizeof (remap));
47801 dremap = *d;
47803 if (GET_MODE_SIZE (d->vmode) == 16)
47805 unsigned HOST_WIDE_INT h1, h2, h3, h4;
47807 /* Split the two input vectors into 4 halves. */
47808 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
47809 h2 = h1 << nelt2;
47810 h3 = h2 << nelt2;
47811 h4 = h3 << nelt2;
47813 /* If the elements all come from the low halves, use interleave low; similarly
47814 for interleave high. If the elements are from mis-matched halves, we
47815 can use shufps for V4SF/V4SI or do a DImode shuffle. */
47816 if ((contents & (h1 | h3)) == contents)
47818 /* punpckl* */
47819 for (i = 0; i < nelt2; ++i)
47821 remap[i] = i * 2;
47822 remap[i + nelt] = i * 2 + 1;
47823 dremap.perm[i * 2] = i;
47824 dremap.perm[i * 2 + 1] = i + nelt;
47826 if (!TARGET_SSE2 && d->vmode == V4SImode)
47827 dremap.vmode = V4SFmode;
47829 else if ((contents & (h2 | h4)) == contents)
47831 /* punpckh* */
47832 for (i = 0; i < nelt2; ++i)
47834 remap[i + nelt2] = i * 2;
47835 remap[i + nelt + nelt2] = i * 2 + 1;
47836 dremap.perm[i * 2] = i + nelt2;
47837 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
47839 if (!TARGET_SSE2 && d->vmode == V4SImode)
47840 dremap.vmode = V4SFmode;
47842 else if ((contents & (h1 | h4)) == contents)
47844 /* shufps */
47845 for (i = 0; i < nelt2; ++i)
47847 remap[i] = i;
47848 remap[i + nelt + nelt2] = i + nelt2;
47849 dremap.perm[i] = i;
47850 dremap.perm[i + nelt2] = i + nelt + nelt2;
47852 if (nelt != 4)
47854 /* shufpd */
47855 dremap.vmode = V2DImode;
47856 dremap.nelt = 2;
47857 dremap.perm[0] = 0;
47858 dremap.perm[1] = 3;
47861 else if ((contents & (h2 | h3)) == contents)
47863 /* shufps */
47864 for (i = 0; i < nelt2; ++i)
47866 remap[i + nelt2] = i;
47867 remap[i + nelt] = i + nelt2;
47868 dremap.perm[i] = i + nelt2;
47869 dremap.perm[i + nelt2] = i + nelt;
47871 if (nelt != 4)
47873 /* shufpd */
47874 dremap.vmode = V2DImode;
47875 dremap.nelt = 2;
47876 dremap.perm[0] = 1;
47877 dremap.perm[1] = 2;
47880 else
47881 return false;
47883 else
47885 unsigned int nelt4 = nelt / 4, nzcnt = 0;
47886 unsigned HOST_WIDE_INT q[8];
47887 unsigned int nonzero_halves[4];
47889 /* Split the two input vectors into 8 quarters. */
47890 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
47891 for (i = 1; i < 8; ++i)
47892 q[i] = q[0] << (nelt4 * i);
47893 for (i = 0; i < 4; ++i)
47894 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
47896 nonzero_halves[nzcnt] = i;
47897 ++nzcnt;
47900 if (nzcnt == 1)
47902 gcc_assert (d->one_operand_p);
47903 nonzero_halves[1] = nonzero_halves[0];
47904 same_halves = true;
47906 else if (d->one_operand_p)
47908 gcc_assert (nonzero_halves[0] == 0);
47909 gcc_assert (nonzero_halves[1] == 1);
47912 if (nzcnt <= 2)
47914 if (d->perm[0] / nelt2 == nonzero_halves[1])
47916 /* Attempt to increase the likelihood that dfinal
47917 shuffle will be intra-lane. */
47918 std::swap (nonzero_halves[0], nonzero_halves[1]);
47921 /* vperm2f128 or vperm2i128. */
47922 for (i = 0; i < nelt2; ++i)
47924 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
47925 remap[i + nonzero_halves[0] * nelt2] = i;
47926 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
47927 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
47930 if (d->vmode != V8SFmode
47931 && d->vmode != V4DFmode
47932 && d->vmode != V8SImode)
47934 dremap.vmode = V8SImode;
47935 dremap.nelt = 8;
47936 for (i = 0; i < 4; ++i)
47938 dremap.perm[i] = i + nonzero_halves[0] * 4;
47939 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
47943 else if (d->one_operand_p)
47944 return false;
47945 else if (TARGET_AVX2
47946 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
47948 /* vpunpckl* */
47949 for (i = 0; i < nelt4; ++i)
47951 remap[i] = i * 2;
47952 remap[i + nelt] = i * 2 + 1;
47953 remap[i + nelt2] = i * 2 + nelt2;
47954 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
47955 dremap.perm[i * 2] = i;
47956 dremap.perm[i * 2 + 1] = i + nelt;
47957 dremap.perm[i * 2 + nelt2] = i + nelt2;
47958 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
47961 else if (TARGET_AVX2
47962 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
47964 /* vpunpckh* */
47965 for (i = 0; i < nelt4; ++i)
47967 remap[i + nelt4] = i * 2;
47968 remap[i + nelt + nelt4] = i * 2 + 1;
47969 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
47970 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
47971 dremap.perm[i * 2] = i + nelt4;
47972 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
47973 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
47974 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
47977 else
47978 return false;
47981 /* Use the remapping array set up above to move the elements from their
47982 swizzled locations into their final destinations. */
47983 dfinal = *d;
47984 for (i = 0; i < nelt; ++i)
47986 unsigned e = remap[d->perm[i]];
47987 gcc_assert (e < nelt);
47988 /* If same_halves is true, both halves of the remapped vector are the
47989 same. Avoid cross-lane accesses if possible. */
47990 if (same_halves && i >= nelt2)
47992 gcc_assert (e < nelt2);
47993 dfinal.perm[i] = e + nelt2;
47995 else
47996 dfinal.perm[i] = e;
47998 if (!d->testing_p)
48000 dremap.target = gen_reg_rtx (dremap.vmode);
48001 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
48003 dfinal.op1 = dfinal.op0;
48004 dfinal.one_operand_p = true;
48006 /* Test if the final remap can be done with a single insn. For V4SFmode or
48007 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
48008 start_sequence ();
48009 ok = expand_vec_perm_1 (&dfinal);
48010 seq = get_insns ();
48011 end_sequence ();
48013 if (!ok)
48014 return false;
48016 if (d->testing_p)
48017 return true;
48019 if (dremap.vmode != dfinal.vmode)
48021 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
48022 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
48025 ok = expand_vec_perm_1 (&dremap);
48026 gcc_assert (ok);
48028 emit_insn (seq);
48029 return true;
48032 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
48033 a single vector cross-lane permutation into vpermq followed
48034 by any of the single insn permutations. */
48036 static bool
48037 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
48039 struct expand_vec_perm_d dremap, dfinal;
48040 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
48041 unsigned contents[2];
48042 bool ok;
48044 if (!(TARGET_AVX2
48045 && (d->vmode == V32QImode || d->vmode == V16HImode)
48046 && d->one_operand_p))
48047 return false;
48049 contents[0] = 0;
48050 contents[1] = 0;
48051 for (i = 0; i < nelt2; ++i)
48053 contents[0] |= 1u << (d->perm[i] / nelt4);
48054 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
48057 for (i = 0; i < 2; ++i)
48059 unsigned int cnt = 0;
48060 for (j = 0; j < 4; ++j)
48061 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
48062 return false;
48065 if (d->testing_p)
48066 return true;
48068 dremap = *d;
48069 dremap.vmode = V4DImode;
48070 dremap.nelt = 4;
48071 dremap.target = gen_reg_rtx (V4DImode);
48072 dremap.op0 = gen_lowpart (V4DImode, d->op0);
48073 dremap.op1 = dremap.op0;
48074 dremap.one_operand_p = true;
48075 for (i = 0; i < 2; ++i)
48077 unsigned int cnt = 0;
48078 for (j = 0; j < 4; ++j)
48079 if ((contents[i] & (1u << j)) != 0)
48080 dremap.perm[2 * i + cnt++] = j;
48081 for (; cnt < 2; ++cnt)
48082 dremap.perm[2 * i + cnt] = 0;
48085 dfinal = *d;
48086 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
48087 dfinal.op1 = dfinal.op0;
48088 dfinal.one_operand_p = true;
48089 for (i = 0, j = 0; i < nelt; ++i)
48091 if (i == nelt2)
48092 j = 2;
48093 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
48094 if ((d->perm[i] / nelt4) == dremap.perm[j])
48096 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
48097 dfinal.perm[i] |= nelt4;
48098 else
48099 gcc_unreachable ();
48102 ok = expand_vec_perm_1 (&dremap);
48103 gcc_assert (ok);
48105 ok = expand_vec_perm_1 (&dfinal);
48106 gcc_assert (ok);
48108 return true;
48111 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
48112 a vector permutation using two instructions, vperm2f128 resp.
48113 vperm2i128 followed by any single in-lane permutation. */
48115 static bool
48116 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
48118 struct expand_vec_perm_d dfirst, dsecond;
48119 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
48120 bool ok;
48122 if (!TARGET_AVX
48123 || GET_MODE_SIZE (d->vmode) != 32
48124 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
48125 return false;
48127 dsecond = *d;
48128 dsecond.one_operand_p = false;
48129 dsecond.testing_p = true;
48131 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
48132 immediate. For perm < 16 the second permutation uses
48133 d->op0 as first operand, for perm >= 16 it uses d->op1
48134 as first operand. The second operand is the result of
48135 vperm2[fi]128. */
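/* E.g. perm == 6 (low lane taken from lane 2, high lane from lane 1)
   gives the immediate ((6 << 2) | 6) & 0x33 == 0x12, i.e. the second
   operand's low lane goes into the low half of the result and the
   first operand's high lane into the high half.  */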
48136 for (perm = 0; perm < 32; perm++)
48138 /* Ignore permutations which do not move anything cross-lane. */
48139 if (perm < 16)
48141 /* The second shuffle for e.g. V4DFmode has
48142 0123 and ABCD operands.
48143 Ignore AB23, as 23 is already in the second lane
48144 of the first operand. */
48145 if ((perm & 0xc) == (1 << 2)) continue;
48146 /* And 01CD, as 01 is in the first lane of the first
48147 operand. */
48148 if ((perm & 3) == 0) continue;
48149 /* And 4567, as then the vperm2[fi]128 doesn't change
48150 anything on the original 4567 second operand. */
48151 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
48153 else
48155 /* The second shuffle for e.g. V4DFmode has
48156 4567 and ABCD operands.
48157 Ignore AB67, as 67 is already in the second lane
48158 of the first operand. */
48159 if ((perm & 0xc) == (3 << 2)) continue;
48160 /* And 45CD, as 45 is in the first lane of the first
48161 operand. */
48162 if ((perm & 3) == 2) continue;
48163 /* And 0123, as then the vperm2[fi]128 doesn't change
48164 anything on the original 0123 first operand. */
48165 if ((perm & 0xf) == (1 << 2)) continue;
48168 for (i = 0; i < nelt; i++)
48170 j = d->perm[i] / nelt2;
48171 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
48172 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
48173 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
48174 dsecond.perm[i] = d->perm[i] & (nelt - 1);
48175 else
48176 break;
48179 if (i == nelt)
48181 start_sequence ();
48182 ok = expand_vec_perm_1 (&dsecond);
48183 end_sequence ();
48185 else
48186 ok = false;
48188 if (ok)
48190 if (d->testing_p)
48191 return true;
48193 /* Found a usable second shuffle. dfirst will be
48194 vperm2f128 on d->op0 and d->op1. */
48195 dsecond.testing_p = false;
48196 dfirst = *d;
48197 dfirst.target = gen_reg_rtx (d->vmode);
48198 for (i = 0; i < nelt; i++)
48199 dfirst.perm[i] = (i & (nelt2 - 1))
48200 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
48202 canonicalize_perm (&dfirst);
48203 ok = expand_vec_perm_1 (&dfirst);
48204 gcc_assert (ok);
48206 /* And dsecond is some single insn shuffle, taking
48207 d->op0 and result of vperm2f128 (if perm < 16) or
48208 d->op1 and result of vperm2f128 (otherwise). */
48209 if (perm >= 16)
48210 dsecond.op0 = dsecond.op1;
48211 dsecond.op1 = dfirst.target;
48213 ok = expand_vec_perm_1 (&dsecond);
48214 gcc_assert (ok);
48216 return true;
48219 /* For one operand, the only useful vperm2f128 permutation is 0x01
48220 aka lanes swap. */
48221 if (d->one_operand_p)
48222 return false;
48225 return false;
48228 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
48229 a two vector permutation using 2 intra-lane interleave insns
48230 and cross-lane shuffle for 32-byte vectors. */
48232 static bool
48233 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
48235 unsigned i, nelt;
48236 rtx (*gen) (rtx, rtx, rtx);
48238 if (d->one_operand_p)
48239 return false;
48240 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
48242 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
48244 else
48245 return false;
48247 nelt = d->nelt;
48248 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
48249 return false;
48250 for (i = 0; i < nelt; i += 2)
48251 if (d->perm[i] != d->perm[0] + i / 2
48252 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
48253 return false;
48255 if (d->testing_p)
48256 return true;
48258 switch (d->vmode)
48260 case V32QImode:
48261 if (d->perm[0])
48262 gen = gen_vec_interleave_highv32qi;
48263 else
48264 gen = gen_vec_interleave_lowv32qi;
48265 break;
48266 case V16HImode:
48267 if (d->perm[0])
48268 gen = gen_vec_interleave_highv16hi;
48269 else
48270 gen = gen_vec_interleave_lowv16hi;
48271 break;
48272 case V8SImode:
48273 if (d->perm[0])
48274 gen = gen_vec_interleave_highv8si;
48275 else
48276 gen = gen_vec_interleave_lowv8si;
48277 break;
48278 case V4DImode:
48279 if (d->perm[0])
48280 gen = gen_vec_interleave_highv4di;
48281 else
48282 gen = gen_vec_interleave_lowv4di;
48283 break;
48284 case V8SFmode:
48285 if (d->perm[0])
48286 gen = gen_vec_interleave_highv8sf;
48287 else
48288 gen = gen_vec_interleave_lowv8sf;
48289 break;
48290 case V4DFmode:
48291 if (d->perm[0])
48292 gen = gen_vec_interleave_highv4df;
48293 else
48294 gen = gen_vec_interleave_lowv4df;
48295 break;
48296 default:
48297 gcc_unreachable ();
48300 emit_insn (gen (d->target, d->op0, d->op1));
48301 return true;
48304 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
48305 a single vector permutation using a single intra-lane vector
48306 permutation, vperm2f128 swapping the lanes and vblend* insn blending
48307 the non-swapped and swapped vectors together. */
48309 static bool
48310 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
48312 struct expand_vec_perm_d dfirst, dsecond;
48313 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
48314 rtx_insn *seq;
48315 bool ok;
48316 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
48318 if (!TARGET_AVX
48319 || TARGET_AVX2
48320 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
48321 || !d->one_operand_p)
48322 return false;
48324 dfirst = *d;
48325 for (i = 0; i < nelt; i++)
48326 dfirst.perm[i] = 0xff;
48327 for (i = 0, msk = 0; i < nelt; i++)
48329 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
48330 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
48331 return false;
48332 dfirst.perm[j] = d->perm[i];
48333 if (j != i)
48334 msk |= (1 << i);
48336 for (i = 0; i < nelt; i++)
48337 if (dfirst.perm[i] == 0xff)
48338 dfirst.perm[i] = i;
48340 if (!d->testing_p)
48341 dfirst.target = gen_reg_rtx (dfirst.vmode);
48343 start_sequence ();
48344 ok = expand_vec_perm_1 (&dfirst);
48345 seq = get_insns ();
48346 end_sequence ();
48348 if (!ok)
48349 return false;
48351 if (d->testing_p)
48352 return true;
48354 emit_insn (seq);
48356 dsecond = *d;
48357 dsecond.op0 = dfirst.target;
48358 dsecond.op1 = dfirst.target;
48359 dsecond.one_operand_p = true;
48360 dsecond.target = gen_reg_rtx (dsecond.vmode);
48361 for (i = 0; i < nelt; i++)
48362 dsecond.perm[i] = i ^ nelt2;
48364 ok = expand_vec_perm_1 (&dsecond);
48365 gcc_assert (ok);
48367 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
48368 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
48369 return true;
48372 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
48373 permutation using two vperm2f128, followed by a vshufpd insn blending
48374 the two vectors together. */
48376 static bool
48377 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
48379 struct expand_vec_perm_d dfirst, dsecond, dthird;
48380 bool ok;
48382 if (!TARGET_AVX || (d->vmode != V4DFmode))
48383 return false;
48385 if (d->testing_p)
48386 return true;
48388 dfirst = *d;
48389 dsecond = *d;
48390 dthird = *d;
48392 dfirst.perm[0] = (d->perm[0] & ~1);
48393 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
48394 dfirst.perm[2] = (d->perm[2] & ~1);
48395 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
48396 dsecond.perm[0] = (d->perm[1] & ~1);
48397 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
48398 dsecond.perm[2] = (d->perm[3] & ~1);
48399 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
48400 dthird.perm[0] = (d->perm[0] % 2);
48401 dthird.perm[1] = (d->perm[1] % 2) + 4;
48402 dthird.perm[2] = (d->perm[2] % 2) + 2;
48403 dthird.perm[3] = (d->perm[3] % 2) + 6;
48405 dfirst.target = gen_reg_rtx (dfirst.vmode);
48406 dsecond.target = gen_reg_rtx (dsecond.vmode);
48407 dthird.op0 = dfirst.target;
48408 dthird.op1 = dsecond.target;
48409 dthird.one_operand_p = false;
48411 canonicalize_perm (&dfirst);
48412 canonicalize_perm (&dsecond);
48414 ok = expand_vec_perm_1 (&dfirst)
48415 && expand_vec_perm_1 (&dsecond)
48416 && expand_vec_perm_1 (&dthird);
48418 gcc_assert (ok);
48420 return true;
48423 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
48424 permutation with two pshufb insns and an ior. We should have already
48425 failed all two instruction sequences. */
48427 static bool
48428 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
48430 rtx rperm[2][16], vperm, l, h, op, m128;
48431 unsigned int i, nelt, eltsz;
48433 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
48434 return false;
48435 gcc_assert (!d->one_operand_p);
48437 if (d->testing_p)
48438 return true;
48440 nelt = d->nelt;
48441 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48443 /* Generate two permutation masks. If the required element is within
48444 the given vector it is shuffled into the proper lane. If the required
48445 element is in the other vector, force a zero into the lane by setting
48446 bit 7 in the permutation mask. */
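/* E.g. for the V8HImode permutation {0 9 2 11 4 13 6 15} the op0 mask
   selects bytes {0 1 -128 -128 4 5 -128 -128 ...} and the op1 mask
   selects bytes {-128 -128 2 3 -128 -128 6 7 ...}; or-ing the two
   pshufb results then merges the interleaved halves.  */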
48447 m128 = GEN_INT (-128);
48448 for (i = 0; i < nelt; ++i)
48450 unsigned j, e = d->perm[i];
48451 unsigned which = (e >= nelt);
48452 if (e >= nelt)
48453 e -= nelt;
48455 for (j = 0; j < eltsz; ++j)
48457 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
48458 rperm[1-which][i*eltsz + j] = m128;
48462 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
48463 vperm = force_reg (V16QImode, vperm);
48465 l = gen_reg_rtx (V16QImode);
48466 op = gen_lowpart (V16QImode, d->op0);
48467 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
48469 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
48470 vperm = force_reg (V16QImode, vperm);
48472 h = gen_reg_rtx (V16QImode);
48473 op = gen_lowpart (V16QImode, d->op1);
48474 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
48476 op = d->target;
48477 if (d->vmode != V16QImode)
48478 op = gen_reg_rtx (V16QImode);
48479 emit_insn (gen_iorv16qi3 (op, l, h));
48480 if (op != d->target)
48481 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48483 return true;
48486 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
48487 with two vpshufb insns, vpermq and vpor. We should have already failed
48488 all two or three instruction sequences. */
48490 static bool
48491 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
48493 rtx rperm[2][32], vperm, l, h, hp, op, m128;
48494 unsigned int i, nelt, eltsz;
48496 if (!TARGET_AVX2
48497 || !d->one_operand_p
48498 || (d->vmode != V32QImode && d->vmode != V16HImode))
48499 return false;
48501 if (d->testing_p)
48502 return true;
48504 nelt = d->nelt;
48505 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48507 /* Generate two permutation masks. If the required element is within
48508 the same lane, it is shuffled in. If the required element is from the
48509 other lane, force a zero by setting bit 7 in the permutation mask.
48510 The other mask has non-negative elements where an element
48511 is requested from the other lane, but also moved to the other lane,
48512 so that the result of vpshufb can have the two V2TImode halves
48513 swapped. */
48514 m128 = GEN_INT (-128);
48515 for (i = 0; i < nelt; ++i)
48517 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48518 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
48520 for (j = 0; j < eltsz; ++j)
48522 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
48523 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
48527 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
48528 vperm = force_reg (V32QImode, vperm);
48530 h = gen_reg_rtx (V32QImode);
48531 op = gen_lowpart (V32QImode, d->op0);
48532 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
48534 /* Swap the 128-bit lanes of h into hp. */
48535 hp = gen_reg_rtx (V4DImode);
48536 op = gen_lowpart (V4DImode, h);
48537 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
48538 const1_rtx));
48540 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
48541 vperm = force_reg (V32QImode, vperm);
48543 l = gen_reg_rtx (V32QImode);
48544 op = gen_lowpart (V32QImode, d->op0);
48545 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
48547 op = d->target;
48548 if (d->vmode != V32QImode)
48549 op = gen_reg_rtx (V32QImode);
48550 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
48551 if (op != d->target)
48552 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48554 return true;
48557 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48558 and extract-odd permutations of two V32QImode or V16HImode operands
48559 with two vpshufb insns, vpor and vpermq. We should have already
48560 failed all two or three instruction sequences. */
48562 static bool
48563 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
48565 rtx rperm[2][32], vperm, l, h, ior, op, m128;
48566 unsigned int i, nelt, eltsz;
48568 if (!TARGET_AVX2
48569 || d->one_operand_p
48570 || (d->vmode != V32QImode && d->vmode != V16HImode))
48571 return false;
48573 for (i = 0; i < d->nelt; ++i)
48574 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
48575 return false;
48577 if (d->testing_p)
48578 return true;
48580 nelt = d->nelt;
48581 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48583 /* Generate two permutation masks. In the first permutation mask
48584 the first quarter will contain indexes for the first half
48585 of the op0, the second quarter will contain bit 7 set, third quarter
48586 will contain indexes for the second half of the op0 and the
48587 last quarter bit 7 set. In the second permutation mask
48588 the first quarter will contain bit 7 set, the second quarter
48589 indexes for the first half of the op1, the third quarter bit 7 set
48590 and last quarter indexes for the second half of the op1.
48591 I.e. the first mask e.g. for V32QImode extract even will be:
48592 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
48593 (all values masked with 0xf except for -128) and second mask
48594 for extract even will be
48595 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
48596 m128 = GEN_INT (-128);
48597 for (i = 0; i < nelt; ++i)
48599 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48600 unsigned which = d->perm[i] >= nelt;
48601 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
48603 for (j = 0; j < eltsz; ++j)
48605 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
48606 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
48610 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
48611 vperm = force_reg (V32QImode, vperm);
48613 l = gen_reg_rtx (V32QImode);
48614 op = gen_lowpart (V32QImode, d->op0);
48615 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
48617 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
48618 vperm = force_reg (V32QImode, vperm);
48620 h = gen_reg_rtx (V32QImode);
48621 op = gen_lowpart (V32QImode, d->op1);
48622 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
48624 ior = gen_reg_rtx (V32QImode);
48625 emit_insn (gen_iorv32qi3 (ior, l, h));
48627 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
48628 op = gen_reg_rtx (V4DImode);
48629 ior = gen_lowpart (V4DImode, ior);
48630 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
48631 const1_rtx, GEN_INT (3)));
48632 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48634 return true;
48637 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48638 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
48639 with two "and" and "pack" or two "shift" and "pack" insns. We should
48640 have already failed all two instruction sequences. */
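/* E.g. extract-even on two V16QImode operands: mask every V8HImode
   word of both operands with 0xff to clear the odd bytes, then
   packuswb packs the remaining low bytes of both operands into a
   single V16QImode result; extract-odd instead shifts each word
   right by 8 before packing.  */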
48642 static bool
48643 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
48645 rtx op, dop0, dop1, t, rperm[16];
48646 unsigned i, odd, c, s, nelt = d->nelt;
48647 bool end_perm = false;
48648 machine_mode half_mode;
48649 rtx (*gen_and) (rtx, rtx, rtx);
48650 rtx (*gen_pack) (rtx, rtx, rtx);
48651 rtx (*gen_shift) (rtx, rtx, rtx);
48653 if (d->one_operand_p)
48654 return false;
48656 switch (d->vmode)
48658 case V8HImode:
48659 /* Required for "pack". */
48660 if (!TARGET_SSE4_1)
48661 return false;
48662 c = 0xffff;
48663 s = 16;
48664 half_mode = V4SImode;
48665 gen_and = gen_andv4si3;
48666 gen_pack = gen_sse4_1_packusdw;
48667 gen_shift = gen_lshrv4si3;
48668 break;
48669 case V16QImode:
48670 /* No check as all instructions are SSE2. */
48671 c = 0xff;
48672 s = 8;
48673 half_mode = V8HImode;
48674 gen_and = gen_andv8hi3;
48675 gen_pack = gen_sse2_packuswb;
48676 gen_shift = gen_lshrv8hi3;
48677 break;
48678 case V16HImode:
48679 if (!TARGET_AVX2)
48680 return false;
48681 c = 0xffff;
48682 s = 16;
48683 half_mode = V8SImode;
48684 gen_and = gen_andv8si3;
48685 gen_pack = gen_avx2_packusdw;
48686 gen_shift = gen_lshrv8si3;
48687 end_perm = true;
48688 break;
48689 case V32QImode:
48690 if (!TARGET_AVX2)
48691 return false;
48692 c = 0xff;
48693 s = 8;
48694 half_mode = V16HImode;
48695 gen_and = gen_andv16hi3;
48696 gen_pack = gen_avx2_packuswb;
48697 gen_shift = gen_lshrv16hi3;
48698 end_perm = true;
48699 break;
48700 default:
48701 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
48702 general shuffles. */
48703 return false;
48706 /* Check that permutation is even or odd. */
48707 odd = d->perm[0];
48708 if (odd > 1)
48709 return false;
48711 for (i = 1; i < nelt; ++i)
48712 if (d->perm[i] != 2 * i + odd)
48713 return false;
48715 if (d->testing_p)
48716 return true;
48718 dop0 = gen_reg_rtx (half_mode);
48719 dop1 = gen_reg_rtx (half_mode);
48720 if (odd == 0)
48722 for (i = 0; i < nelt / 2; i++)
48723 rperm[i] = GEN_INT (c);
48724 t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
48725 t = force_reg (half_mode, t);
48726 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
48727 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
48729 else
48731 emit_insn (gen_shift (dop0,
48732 gen_lowpart (half_mode, d->op0),
48733 GEN_INT (s)));
48734 emit_insn (gen_shift (dop1,
48735 gen_lowpart (half_mode, d->op1),
48736 GEN_INT (s)));
48738 /* In the AVX2 256-bit case we need to permute the pack result. */
48739 if (TARGET_AVX2 && end_perm)
48741 op = gen_reg_rtx (d->vmode);
48742 t = gen_reg_rtx (V4DImode);
48743 emit_insn (gen_pack (op, dop0, dop1));
48744 emit_insn (gen_avx2_permv4di_1 (t,
48745 gen_lowpart (V4DImode, op),
48746 const0_rtx,
48747 const2_rtx,
48748 const1_rtx,
48749 GEN_INT (3)));
48750 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
48752 else
48753 emit_insn (gen_pack (d->target, dop0, dop1));
48755 return true;
48758 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48759 and extract-odd permutations of two V64QI operands
48760 with two "shift", two "trunc" and one "concat" insns for "odd"
48761 and two "trunc" and one "concat" insn for "even".
48762 We should have already failed all two instruction sequences. */
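/* E.g. extract-even: view each operand as V32HImode and truncate it
   (vpmovwb) to V32QImode, keeping the low byte of every word, then
   concatenate the two halves; extract-odd first shifts every word
   right by 8 so the odd bytes become the low bytes.  */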
48764 static bool
48765 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
48767 rtx t1, t2, t3, t4;
48768 unsigned i, odd, nelt = d->nelt;
48770 if (!TARGET_AVX512BW
48771 || d->one_operand_p
48772 || d->vmode != V64QImode)
48773 return false;
48775 /* Check that permutation is even or odd. */
48776 odd = d->perm[0];
48777 if (odd > 1)
48778 return false;
48780 for (i = 1; i < nelt; ++i)
48781 if (d->perm[i] != 2 * i + odd)
48782 return false;
48784 if (d->testing_p)
48785 return true;
48788 if (odd)
48790 t1 = gen_reg_rtx (V32HImode);
48791 t2 = gen_reg_rtx (V32HImode);
48792 emit_insn (gen_lshrv32hi3 (t1,
48793 gen_lowpart (V32HImode, d->op0),
48794 GEN_INT (8)));
48795 emit_insn (gen_lshrv32hi3 (t2,
48796 gen_lowpart (V32HImode, d->op1),
48797 GEN_INT (8)));
48799 else
48801 t1 = gen_lowpart (V32HImode, d->op0);
48802 t2 = gen_lowpart (V32HImode, d->op1);
48805 t3 = gen_reg_rtx (V32QImode);
48806 t4 = gen_reg_rtx (V32QImode);
48807 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
48808 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
48809 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
48811 return true;
48814 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
48815 and extract-odd permutations. */
48817 static bool
48818 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
48820 rtx t1, t2, t3, t4, t5;
48822 switch (d->vmode)
48824 case V4DFmode:
48825 if (d->testing_p)
48826 break;
48827 t1 = gen_reg_rtx (V4DFmode);
48828 t2 = gen_reg_rtx (V4DFmode);
48830 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
48831 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
48832 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
48834 /* Now an unpck[lh]pd will produce the result required. */
48835 if (odd)
48836 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
48837 else
48838 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
48839 emit_insn (t3);
48840 break;
48842 case V8SFmode:
48844 int mask = odd ? 0xdd : 0x88;
48846 if (d->testing_p)
48847 break;
48848 t1 = gen_reg_rtx (V8SFmode);
48849 t2 = gen_reg_rtx (V8SFmode);
48850 t3 = gen_reg_rtx (V8SFmode);
48852 /* Shuffle within the 128-bit lanes to produce:
48853 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
48854 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
48855 GEN_INT (mask)));
48857 /* Shuffle the lanes around to produce:
48858 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
48859 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
48860 GEN_INT (0x3)));
48862 /* Shuffle within the 128-bit lanes to produce:
48863 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
48864 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
48866 /* Shuffle within the 128-bit lanes to produce:
48867 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
48868 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
48870 /* Shuffle the lanes around to produce:
48871 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
48872 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
48873 GEN_INT (0x20)));
48875 break;
48877 case V2DFmode:
48878 case V4SFmode:
48879 case V2DImode:
48880 case V4SImode:
48881 /* These are always directly implementable by expand_vec_perm_1. */
48882 gcc_unreachable ();
48884 case V8HImode:
48885 if (TARGET_SSE4_1)
48886 return expand_vec_perm_even_odd_pack (d);
48887 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
48888 return expand_vec_perm_pshufb2 (d);
48889 else
48891 if (d->testing_p)
48892 break;
48893 /* We need 2*log2(N)-1 operations to achieve odd/even
48894 with interleave. */
48895 t1 = gen_reg_rtx (V8HImode);
48896 t2 = gen_reg_rtx (V8HImode);
48897 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
48898 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
48899 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
48900 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
48901 if (odd)
48902 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
48903 else
48904 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
48905 emit_insn (t3);
48907 break;
48909 case V16QImode:
48910 return expand_vec_perm_even_odd_pack (d);
48912 case V16HImode:
48913 case V32QImode:
48914 return expand_vec_perm_even_odd_pack (d);
48916 case V64QImode:
48917 return expand_vec_perm_even_odd_trunc (d);
48919 case V4DImode:
48920 if (!TARGET_AVX2)
48922 struct expand_vec_perm_d d_copy = *d;
48923 d_copy.vmode = V4DFmode;
48924 if (d->testing_p)
48925 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
48926 else
48927 d_copy.target = gen_reg_rtx (V4DFmode);
48928 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
48929 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
48930 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48932 if (!d->testing_p)
48933 emit_move_insn (d->target,
48934 gen_lowpart (V4DImode, d_copy.target));
48935 return true;
48937 return false;
48940 if (d->testing_p)
48941 break;
48943 t1 = gen_reg_rtx (V4DImode);
48944 t2 = gen_reg_rtx (V4DImode);
48946 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
48947 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
48948 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
48950 /* Now a vpunpck[lh]qdq will produce the result required. */
48951 if (odd)
48952 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
48953 else
48954 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
48955 emit_insn (t3);
48956 break;
48958 case V8SImode:
48959 if (!TARGET_AVX2)
48961 struct expand_vec_perm_d d_copy = *d;
48962 d_copy.vmode = V8SFmode;
48963 if (d->testing_p)
48964 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
48965 else
48966 d_copy.target = gen_reg_rtx (V8SFmode);
48967 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
48968 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
48969 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48971 if (!d->testing_p)
48972 emit_move_insn (d->target,
48973 gen_lowpart (V8SImode, d_copy.target));
48974 return true;
48976 return false;
48979 if (d->testing_p)
48980 break;
48982 t1 = gen_reg_rtx (V8SImode);
48983 t2 = gen_reg_rtx (V8SImode);
48984 t3 = gen_reg_rtx (V4DImode);
48985 t4 = gen_reg_rtx (V4DImode);
48986 t5 = gen_reg_rtx (V4DImode);
48988 /* Shuffle the lanes around into
48989 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
48990 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
48991 gen_lowpart (V4DImode, d->op1),
48992 GEN_INT (0x20)));
48993 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
48994 gen_lowpart (V4DImode, d->op1),
48995 GEN_INT (0x31)));
48997 /* Swap the 2nd and 3rd position in each lane into
48998 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
48999 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
49000 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
49001 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
49002 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
49004 /* Now a vpunpck[lh]qdq will produce
49005 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
49006 if (odd)
49007 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
49008 gen_lowpart (V4DImode, t2));
49009 else
49010 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
49011 gen_lowpart (V4DImode, t2));
49012 emit_insn (t3);
49013 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
49014 break;
49016 default:
49017 gcc_unreachable ();
49020 return true;
49023 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
49024 extract-even and extract-odd permutations. */
49026 static bool
49027 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
49029 unsigned i, odd, nelt = d->nelt;
49031 odd = d->perm[0];
49032 if (odd != 0 && odd != 1)
49033 return false;
49035 for (i = 1; i < nelt; ++i)
49036 if (d->perm[i] != 2 * i + odd)
49037 return false;
49039 return expand_vec_perm_even_odd_1 (d, odd);
49042 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
49043 permutations. We assume that expand_vec_perm_1 has already failed. */
49045 static bool
49046 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
49048 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
49049 machine_mode vmode = d->vmode;
49050 unsigned char perm2[4];
49051 rtx op0 = d->op0, dest;
49052 bool ok;
49054 switch (vmode)
49056 case V4DFmode:
49057 case V8SFmode:
49058 /* These are special-cased in sse.md so that we can optionally
49059 use the vbroadcast instruction. They expand to two insns
49060 if the input happens to be in a register. */
49061 gcc_unreachable ();
49063 case V2DFmode:
49064 case V2DImode:
49065 case V4SFmode:
49066 case V4SImode:
49067 /* These are always implementable using standard shuffle patterns. */
49068 gcc_unreachable ();
49070 case V8HImode:
49071 case V16QImode:
49072 /* These can be implemented via interleave. We save one insn by
49073 stopping once we have promoted to V4SImode and then use pshufd. */
49074 if (d->testing_p)
49075 return true;
49078 rtx dest;
49079 rtx (*gen) (rtx, rtx, rtx)
49080 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
49081 : gen_vec_interleave_lowv8hi;
49083 if (elt >= nelt2)
49085 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
49086 : gen_vec_interleave_highv8hi;
49087 elt -= nelt2;
49089 nelt2 /= 2;
49091 dest = gen_reg_rtx (vmode);
49092 emit_insn (gen (dest, op0, op0));
49093 vmode = get_mode_wider_vector (vmode);
49094 op0 = gen_lowpart (vmode, dest);
49096 while (vmode != V4SImode);
49098 memset (perm2, elt, 4);
49099 dest = gen_reg_rtx (V4SImode);
49100 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
49101 gcc_assert (ok);
49102 if (!d->testing_p)
49103 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
49104 return true;
49106 case V64QImode:
49107 case V32QImode:
49108 case V16HImode:
49109 case V8SImode:
49110 case V4DImode:
49111 /* For AVX2 broadcasts of the first element vpbroadcast* or
49112 vpermq should be used by expand_vec_perm_1. */
49113 gcc_assert (!TARGET_AVX2 || d->perm[0]);
49114 return false;
49116 default:
49117 gcc_unreachable ();
49121 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
49122 broadcast permutations. */
49124 static bool
49125 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
49127 unsigned i, elt, nelt = d->nelt;
49129 if (!d->one_operand_p)
49130 return false;
49132 elt = d->perm[0];
49133 for (i = 1; i < nelt; ++i)
49134 if (d->perm[i] != elt)
49135 return false;
49137 return expand_vec_perm_broadcast_1 (d);
49140 /* Implement arbitrary permutations of two V64QImode operands
49141 with 2 vpermi2w, 2 vpshufb and one vpor instruction. */
49142 static bool
49143 expand_vec_perm_vpermi2_vpshub2 (struct expand_vec_perm_d *d)
49145 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
49146 return false;
49148 if (d->testing_p)
49149 return true;
49151 struct expand_vec_perm_d ds[2];
49152 rtx rperm[128], vperm, target0, target1;
49153 unsigned int i, nelt;
49154 machine_mode vmode;
49156 nelt = d->nelt;
49157 vmode = V64QImode;
49159 for (i = 0; i < 2; i++)
49161 ds[i] = *d;
49162 ds[i].vmode = V32HImode;
49163 ds[i].nelt = 32;
49164 ds[i].target = gen_reg_rtx (V32HImode);
49165 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
49166 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
49169 /* Prepare the permutations so that the first one (ds[0]) takes care of
49170 putting the even bytes into the right positions or one position
49171 higher, and the second one (ds[1]) takes care of
49172 putting the odd bytes into the right positions or one
49173 position lower. */
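/* ds[0] and ds[1] are V32HImode permutations, so each one places the
   word containing a requested byte at the word position of the result
   byte; the vpshufb masks built below then pick the right byte within
   that word ((i & 14) addresses the word inside its 16-byte lane,
   d->perm[i] & 1 the byte) and zero the other bytes (constm1 sets
   bit 7), so the final vpor merges the two results.  */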
49175 for (i = 0; i < nelt; i++)
49177 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
49178 if (i & 1)
49180 rperm[i] = constm1_rtx;
49181 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
49183 else
49185 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
49186 rperm[i + 64] = constm1_rtx;
49190 bool ok = expand_vec_perm_1 (&ds[0]);
49191 gcc_assert (ok);
49192 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
49194 ok = expand_vec_perm_1 (&ds[1]);
49195 gcc_assert (ok);
49196 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
49198 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
49199 vperm = force_reg (vmode, vperm);
49200 target0 = gen_reg_rtx (V64QImode);
49201 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
49203 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
49204 vperm = force_reg (vmode, vperm);
49205 target1 = gen_reg_rtx (V64QImode);
49206 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
49208 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
49209 return true;
49212 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
49213 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
49214 all the shorter instruction sequences. */
49216 static bool
49217 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
49219 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
49220 unsigned int i, nelt, eltsz;
49221 bool used[4];
49223 if (!TARGET_AVX2
49224 || d->one_operand_p
49225 || (d->vmode != V32QImode && d->vmode != V16HImode))
49226 return false;
49228 if (d->testing_p)
49229 return true;
49231 nelt = d->nelt;
49232 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
49234 /* Generate 4 permutation masks. If the required element is within
49235 the same lane, it is shuffled in. If the required element is from the
49236 other lane, force a zero by setting bit 7 in the permutation mask.
49237 The other mask has non-negative elements where an element
49238 is requested from the other lane, but also moved to the other lane,
49239 so that the result of vpshufb can have the two V2TImode halves
49240 swapped. */
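/* Masks 0 and 1 cover elements taken from op0, masks 2 and 3 elements
   taken from op1; within each pair the even mask handles same-lane
   elements and the odd mask cross-lane elements, whose pshufb result
   is lane-swapped with vpermq before the final ors.  */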
49241 m128 = GEN_INT (-128);
49242 for (i = 0; i < 32; ++i)
49244 rperm[0][i] = m128;
49245 rperm[1][i] = m128;
49246 rperm[2][i] = m128;
49247 rperm[3][i] = m128;
49249 used[0] = false;
49250 used[1] = false;
49251 used[2] = false;
49252 used[3] = false;
49253 for (i = 0; i < nelt; ++i)
49255 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
49256 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
49257 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
49259 for (j = 0; j < eltsz; ++j)
49260 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
49261 used[which] = true;
49264 for (i = 0; i < 2; ++i)
49266 if (!used[2 * i + 1])
49268 h[i] = NULL_RTX;
49269 continue;
49271 vperm = gen_rtx_CONST_VECTOR (V32QImode,
49272 gen_rtvec_v (32, rperm[2 * i + 1]));
49273 vperm = force_reg (V32QImode, vperm);
49274 h[i] = gen_reg_rtx (V32QImode);
49275 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
49276 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
49279 /* Swap the 128-bit lanes of h[X]. */
49280 for (i = 0; i < 2; ++i)
49282 if (h[i] == NULL_RTX)
49283 continue;
49284 op = gen_reg_rtx (V4DImode);
49285 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
49286 const2_rtx, GEN_INT (3), const0_rtx,
49287 const1_rtx));
49288 h[i] = gen_lowpart (V32QImode, op);
49291 for (i = 0; i < 2; ++i)
49293 if (!used[2 * i])
49295 l[i] = NULL_RTX;
49296 continue;
49298 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
49299 vperm = force_reg (V32QImode, vperm);
49300 l[i] = gen_reg_rtx (V32QImode);
49301 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
49302 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
49305 for (i = 0; i < 2; ++i)
49307 if (h[i] && l[i])
49309 op = gen_reg_rtx (V32QImode);
49310 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
49311 l[i] = op;
49313 else if (h[i])
49314 l[i] = h[i];
49317 gcc_assert (l[0] && l[1]);
49318 op = d->target;
49319 if (d->vmode != V32QImode)
49320 op = gen_reg_rtx (V32QImode);
49321 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
49322 if (op != d->target)
49323 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
49324 return true;
49327 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
49328 With all of the interface bits taken care of, perform the expansion
49329 in D and return true on success. */
49331 static bool
49332 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
49334 /* Try a single instruction expansion. */
49335 if (expand_vec_perm_1 (d))
49336 return true;
49338 /* Try sequences of two instructions. */
49340 if (expand_vec_perm_pshuflw_pshufhw (d))
49341 return true;
49343 if (expand_vec_perm_palignr (d, false))
49344 return true;
49346 if (expand_vec_perm_interleave2 (d))
49347 return true;
49349 if (expand_vec_perm_broadcast (d))
49350 return true;
49352 if (expand_vec_perm_vpermq_perm_1 (d))
49353 return true;
49355 if (expand_vec_perm_vperm2f128 (d))
49356 return true;
49358 if (expand_vec_perm_pblendv (d))
49359 return true;
49361 /* Try sequences of three instructions. */
49363 if (expand_vec_perm_even_odd_pack (d))
49364 return true;
49366 if (expand_vec_perm_2vperm2f128_vshuf (d))
49367 return true;
49369 if (expand_vec_perm_pshufb2 (d))
49370 return true;
49372 if (expand_vec_perm_interleave3 (d))
49373 return true;
49375 if (expand_vec_perm_vperm2f128_vblend (d))
49376 return true;
49378 /* Try sequences of four instructions. */
49380 if (expand_vec_perm_even_odd_trunc (d))
49381 return true;
49382 if (expand_vec_perm_vpshufb2_vpermq (d))
49383 return true;
49385 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
49386 return true;
49388 if (expand_vec_perm_vpermi2_vpshub2 (d))
49389 return true;
49391 /* ??? Look for narrow permutations whose element orderings would
49392 allow the promotion to a wider mode. */
49394 /* ??? Look for sequences of interleave or a wider permute that place
49395 the data into the correct lanes for a half-vector shuffle like
49396 pshuf[lh]w or vpermilps. */
49398 /* ??? Look for sequences of interleave that produce the desired results.
49399 The combinatorics of punpck[lh] get pretty ugly... */
49401 if (expand_vec_perm_even_odd (d))
49402 return true;
49404 /* Even longer sequences. */
49405 if (expand_vec_perm_vpshufb4_vpermq2 (d))
49406 return true;
49408 /* See if we can get the same permutation in different vector integer
49409 mode. */
49410 struct expand_vec_perm_d nd;
49411 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
49413 if (!d->testing_p)
49414 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
49415 return true;
49418 return false;
49421 /* If a permutation only uses one operand, make it clear. Returns true
49422 if the permutation references both operands. */
49424 static bool
49425 canonicalize_perm (struct expand_vec_perm_d *d)
49427 int i, which, nelt = d->nelt;
49429 for (i = which = 0; i < nelt; ++i)
49430 which |= (d->perm[i] < nelt ? 1 : 2);
49432 d->one_operand_p = true;
49433 switch (which)
49435 default:
49436 gcc_unreachable();
49438 case 3:
49439 if (!rtx_equal_p (d->op0, d->op1))
49441 d->one_operand_p = false;
49442 break;
49444 /* The elements of PERM do not suggest that only the first operand
49445 is used, but both operands are identical. Allow easier matching
49446 of the permutation by folding the permutation into the single
49447 input vector. */
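/* For example, with nelt == 4, identical operands and perm {0, 5, 1, 4},
   the fold below yields the single-operand perm {0, 1, 1, 0}.  */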
49448 /* FALLTHRU */
49450 case 2:
49451 for (i = 0; i < nelt; ++i)
49452 d->perm[i] &= nelt - 1;
49453 d->op0 = d->op1;
49454 break;
49456 case 1:
49457 d->op1 = d->op0;
49458 break;
49461 return (which == 3);
49464 bool
49465 ix86_expand_vec_perm_const (rtx operands[4])
49467 struct expand_vec_perm_d d;
49468 unsigned char perm[MAX_VECT_LEN];
49469 int i, nelt;
49470 bool two_args;
49471 rtx sel;
49473 d.target = operands[0];
49474 d.op0 = operands[1];
49475 d.op1 = operands[2];
49476 sel = operands[3];
49478 d.vmode = GET_MODE (d.target);
49479 gcc_assert (VECTOR_MODE_P (d.vmode));
49480 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49481 d.testing_p = false;
49483 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
49484 gcc_assert (XVECLEN (sel, 0) == nelt);
49485 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
49487 for (i = 0; i < nelt; ++i)
49489 rtx e = XVECEXP (sel, 0, i);
49490 int ei = INTVAL (e) & (2 * nelt - 1);
49491 d.perm[i] = ei;
49492 perm[i] = ei;
49495 two_args = canonicalize_perm (&d);
49497 if (ix86_expand_vec_perm_const_1 (&d))
49498 return true;
49500 /* If the selector says both arguments are needed, but the operands are the
49501 same, the above tried to expand with one_operand_p and flattened selector.
49502 If that didn't work, retry without one_operand_p; we succeeded with that
49503 during testing. */
49504 if (two_args && d.one_operand_p)
49506 d.one_operand_p = false;
49507 memcpy (d.perm, perm, sizeof (perm));
49508 return ix86_expand_vec_perm_const_1 (&d);
49511 return false;
49514 /* Implement targetm.vectorize.vec_perm_const_ok. */
49516 static bool
49517 ix86_vectorize_vec_perm_const_ok (machine_mode vmode,
49518 const unsigned char *sel)
49520 struct expand_vec_perm_d d;
49521 unsigned int i, nelt, which;
49522 bool ret;
49524 d.vmode = vmode;
49525 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49526 d.testing_p = true;
49528 /* Given sufficient ISA support we can just return true here
49529 for selected vector modes. */
49530 switch (d.vmode)
49532 case V16SFmode:
49533 case V16SImode:
49534 case V8DImode:
49535 case V8DFmode:
49536 if (TARGET_AVX512F)
49537 /* All implementable with a single vpermi2 insn. */
49538 return true;
49539 break;
49540 case V32HImode:
49541 if (TARGET_AVX512BW)
49542 /* All implementable with a single vpermi2 insn. */
49543 return true;
49544 break;
49545 case V64QImode:
49546 if (TARGET_AVX512BW)
49547 /* Implementable with 2 vpermi2, 2 vpshufb and 1 or insn. */
49548 return true;
49549 break;
49550 case V8SImode:
49551 case V8SFmode:
49552 case V4DFmode:
49553 case V4DImode:
49554 if (TARGET_AVX512VL)
49555 /* All implementable with a single vpermi2 insn. */
49556 return true;
49557 break;
49558 case V16HImode:
49559 if (TARGET_AVX2)
49560 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
49561 return true;
49562 break;
49563 case V32QImode:
49564 if (TARGET_AVX2)
49565 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
49566 return true;
49567 break;
49568 case V4SImode:
49569 case V4SFmode:
49570 case V8HImode:
49571 case V16QImode:
49572 /* All implementable with a single vpperm insn. */
49573 if (TARGET_XOP)
49574 return true;
49575 /* All implementable with 2 pshufb + 1 ior. */
49576 if (TARGET_SSSE3)
49577 return true;
49578 break;
49579 case V2DImode:
49580 case V2DFmode:
49581 /* All implementable with shufpd or unpck[lh]pd. */
49582 return true;
49583 default:
49584 return false;
49587 /* Extract the values from the vector CST into the permutation
49588 array in D. */
49589 memcpy (d.perm, sel, nelt);
49590 for (i = which = 0; i < nelt; ++i)
49592 unsigned char e = d.perm[i];
49593 gcc_assert (e < 2 * nelt);
49594 which |= (e < nelt ? 1 : 2);
49597 /* For all elements from second vector, fold the elements to first. */
49598 if (which == 2)
49599 for (i = 0; i < nelt; ++i)
49600 d.perm[i] -= nelt;
49602 /* Check whether the mask can be applied to the vector type. */
49603 d.one_operand_p = (which != 3);
49605 /* Implementable with shufps or pshufd. */
49606 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
49607 return true;
49609 /* Otherwise we have to go through the motions and see if we can
49610 figure out how to generate the requested permutation. */
49611 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
49612 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
49613 if (!d.one_operand_p)
49614 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
49616 start_sequence ();
49617 ret = ix86_expand_vec_perm_const_1 (&d);
49618 end_sequence ();
49620 return ret;
49623 void
49624 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
49626 struct expand_vec_perm_d d;
49627 unsigned i, nelt;
49629 d.target = targ;
49630 d.op0 = op0;
49631 d.op1 = op1;
49632 d.vmode = GET_MODE (targ);
49633 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49634 d.one_operand_p = false;
49635 d.testing_p = false;
49637 for (i = 0; i < nelt; ++i)
49638 d.perm[i] = i * 2 + odd;
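/* For example, V4SImode with odd == 1 requests elements {1, 3, 5, 7}
   of the operand pair.  */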
49640 /* We'll either be able to implement the permutation directly... */
49641 if (expand_vec_perm_1 (&d))
49642 return;
49644 /* ... or we use the special-case patterns. */
49645 expand_vec_perm_even_odd_1 (&d, odd);
49648 static void
49649 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
49651 struct expand_vec_perm_d d;
49652 unsigned i, nelt, base;
49653 bool ok;
49655 d.target = targ;
49656 d.op0 = op0;
49657 d.op1 = op1;
49658 d.vmode = GET_MODE (targ);
49659 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49660 d.one_operand_p = false;
49661 d.testing_p = false;
49663 base = high_p ? nelt / 2 : 0;
49664 for (i = 0; i < nelt / 2; ++i)
49666 d.perm[i * 2] = i + base;
49667 d.perm[i * 2 + 1] = i + base + nelt;
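/* For example, with nelt == 4 this builds {0, 4, 1, 5} for the low half
   and {2, 6, 3, 7} for the high half.  */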
49670 /* Note that for AVX this isn't one instruction. */
49671 ok = ix86_expand_vec_perm_const_1 (&d);
49672 gcc_assert (ok);
49676 /* Expand a vector operation CODE for a V*QImode in terms of the
49677 same operation on V*HImode. */
49679 void
49680 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
49682 machine_mode qimode = GET_MODE (dest);
49683 machine_mode himode;
49684 rtx (*gen_il) (rtx, rtx, rtx);
49685 rtx (*gen_ih) (rtx, rtx, rtx);
49686 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
49687 struct expand_vec_perm_d d;
49688 bool ok, full_interleave;
49689 bool uns_p = false;
49690 int i;
49692 switch (qimode)
49694 case V16QImode:
49695 himode = V8HImode;
49696 gen_il = gen_vec_interleave_lowv16qi;
49697 gen_ih = gen_vec_interleave_highv16qi;
49698 break;
49699 case V32QImode:
49700 himode = V16HImode;
49701 gen_il = gen_avx2_interleave_lowv32qi;
49702 gen_ih = gen_avx2_interleave_highv32qi;
49703 break;
49704 case V64QImode:
49705 himode = V32HImode;
49706 gen_il = gen_avx512bw_interleave_lowv64qi;
49707 gen_ih = gen_avx512bw_interleave_highv64qi;
49708 break;
49709 default:
49710 gcc_unreachable ();
49713 op2_l = op2_h = op2;
49714 switch (code)
49716 case MULT:
49717 /* Unpack data such that we've got a source byte in each low byte of
49718 each word. We don't care what goes into the high byte of each word.
49719 Rather than trying to get zero in there, it is most convenient to let
49720 it be a copy of the low byte. */
49721 op2_l = gen_reg_rtx (qimode);
49722 op2_h = gen_reg_rtx (qimode);
49723 emit_insn (gen_il (op2_l, op2, op2));
49724 emit_insn (gen_ih (op2_h, op2, op2));
49725 /* FALLTHRU */
49727 op1_l = gen_reg_rtx (qimode);
49728 op1_h = gen_reg_rtx (qimode);
49729 emit_insn (gen_il (op1_l, op1, op1));
49730 emit_insn (gen_ih (op1_h, op1, op1));
49731 full_interleave = qimode == V16QImode;
49732 break;
49734 case ASHIFT:
49735 case LSHIFTRT:
49736 uns_p = true;
49737 /* FALLTHRU */
49738 case ASHIFTRT:
49739 op1_l = gen_reg_rtx (himode);
49740 op1_h = gen_reg_rtx (himode);
49741 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
49742 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
49743 full_interleave = true;
49744 break;
49745 default:
49746 gcc_unreachable ();
49749 /* Perform the operation. */
49750 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
49751 1, OPTAB_DIRECT);
49752 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
49753 1, OPTAB_DIRECT);
49754 gcc_assert (res_l && res_h);
49756 /* Merge the data back into the right place. */
49757 d.target = dest;
49758 d.op0 = gen_lowpart (qimode, res_l);
49759 d.op1 = gen_lowpart (qimode, res_h);
49760 d.vmode = qimode;
49761 d.nelt = GET_MODE_NUNITS (qimode);
49762 d.one_operand_p = false;
49763 d.testing_p = false;
49765 if (full_interleave)
49767 /* For SSE2, we used a full interleave, so the desired
49768 results are in the even elements. */
49769 for (i = 0; i < d.nelt; ++i)
49770 d.perm[i] = i * 2;
49772 else
49774 /* For AVX, the interleave used above was not cross-lane. So the
49775 extraction picks the even elements, but with the second and third quarters swapped.
49776 Happily, that is even one insn shorter than even extraction.
49777 For AVX512BW we have 4 lanes. We extract evens from within a lane,
49778 always first from the first and then from the second source operand;
49779 the index bits above the low 4 bits remain the same.
49780 Thus, for d.nelt == 32 we want permutation
49781 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
49782 and for d.nelt == 64 we want permutation
49783 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
49784 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
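/* As a spot check of the formula below: for d.nelt == 32 and i == 9,
   (18 & 14) + 32 + 0 == 34, and for i == 17, (34 & 14) + 0 + 16 == 18,
   both matching the listing above.  */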
49785 for (i = 0; i < d.nelt; ++i)
49786 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
49789 ok = ix86_expand_vec_perm_const_1 (&d);
49790 gcc_assert (ok);
49792 set_unique_reg_note (get_last_insn (), REG_EQUAL,
49793 gen_rtx_fmt_ee (code, qimode, op1, op2));
49796 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
49797 if op is CONST_VECTOR with all odd elements equal to their
49798 preceding element. */
49800 static bool
49801 const_vector_equal_evenodd_p (rtx op)
49803 machine_mode mode = GET_MODE (op);
49804 int i, nunits = GET_MODE_NUNITS (mode);
49805 if (GET_CODE (op) != CONST_VECTOR
49806 || nunits != CONST_VECTOR_NUNITS (op))
49807 return false;
49808 for (i = 0; i < nunits; i += 2)
49809 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
49810 return false;
49811 return true;
49814 void
49815 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
49816 bool uns_p, bool odd_p)
49818 machine_mode mode = GET_MODE (op1);
49819 machine_mode wmode = GET_MODE (dest);
49820 rtx x;
49821 rtx orig_op1 = op1, orig_op2 = op2;
49823 if (!nonimmediate_operand (op1, mode))
49824 op1 = force_reg (mode, op1);
49825 if (!nonimmediate_operand (op2, mode))
49826 op2 = force_reg (mode, op2);
49828 /* We only play even/odd games with vectors of SImode. */
49829 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
49831 /* If we're looking for the odd results, shift those members down to
49832 the even slots. For some cpus this is faster than a PSHUFD. */
49833 if (odd_p)
49835 /* For XOP use vpmacsdqh, but only for smult, as it is only
49836 signed. */
49837 if (TARGET_XOP && mode == V4SImode && !uns_p)
49839 x = force_reg (wmode, CONST0_RTX (wmode));
49840 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
49841 return;
49844 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
49845 if (!const_vector_equal_evenodd_p (orig_op1))
49846 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
49847 x, NULL, 1, OPTAB_DIRECT);
49848 if (!const_vector_equal_evenodd_p (orig_op2))
49849 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
49850 x, NULL, 1, OPTAB_DIRECT);
49851 op1 = gen_lowpart (mode, op1);
49852 op2 = gen_lowpart (mode, op2);
49855 if (mode == V16SImode)
49857 if (uns_p)
49858 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
49859 else
49860 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
49862 else if (mode == V8SImode)
49864 if (uns_p)
49865 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
49866 else
49867 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
49869 else if (uns_p)
49870 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
49871 else if (TARGET_SSE4_1)
49872 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
49873 else
49875 rtx s1, s2, t0, t1, t2;
49877 /* The easiest way to implement this without PMULDQ is to go through
49878 the motions as if we are performing a full 64-bit multiply, except
49879 that we need to do less shuffling of the elements. */
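/* A sketch of the identity used below (mod 2^64, for each pair of even
   SImode elements A and B): with Sa and Sb the all-ones sign masks
   (equal to 2^32 - 1 exactly when the corresponding value is negative),
   the signed product is
     umul (A, B) + ((umul (Sa, B) + umul (Sb, A)) << 32)
   because ((2^32 - 1) * B) << 32 == -B << 32.  The statements below
   compute exactly these pieces.  */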
49881 /* Compute the sign-extension, aka highparts, of the two operands. */
49882 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
49883 op1, pc_rtx, pc_rtx);
49884 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
49885 op2, pc_rtx, pc_rtx);
49887 /* Multiply LO(A) * HI(B), and vice-versa. */
49888 t1 = gen_reg_rtx (wmode);
49889 t2 = gen_reg_rtx (wmode);
49890 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
49891 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
49893 /* Multiply LO(A) * LO(B). */
49894 t0 = gen_reg_rtx (wmode);
49895 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
49897 /* Combine and shift the highparts into place. */
49898 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
49899 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
49900 1, OPTAB_DIRECT);
49902 /* Combine high and low parts. */
49903 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
49904 return;
49906 emit_insn (x);
49909 void
49910 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
49911 bool uns_p, bool high_p)
49913 machine_mode wmode = GET_MODE (dest);
49914 machine_mode mode = GET_MODE (op1);
49915 rtx t1, t2, t3, t4, mask;
49917 switch (mode)
49919 case V4SImode:
49920 t1 = gen_reg_rtx (mode);
49921 t2 = gen_reg_rtx (mode);
49922 if (TARGET_XOP && !uns_p)
49924 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
49925 shuffle the elements once so that all elements are in the right
49926 place for immediate use: { A C B D }. */
49927 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
49928 const1_rtx, GEN_INT (3)));
49929 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
49930 const1_rtx, GEN_INT (3)));
49932 else
49934 /* Put the elements into place for the multiply. */
49935 ix86_expand_vec_interleave (t1, op1, op1, high_p);
49936 ix86_expand_vec_interleave (t2, op2, op2, high_p);
49937 high_p = false;
49939 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
49940 break;
49942 case V8SImode:
49943 /* Shuffle the elements between the lanes. After this we
49944 have { A B E F | C D G H } for each operand. */
49945 t1 = gen_reg_rtx (V4DImode);
49946 t2 = gen_reg_rtx (V4DImode);
49947 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
49948 const0_rtx, const2_rtx,
49949 const1_rtx, GEN_INT (3)));
49950 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
49951 const0_rtx, const2_rtx,
49952 const1_rtx, GEN_INT (3)));
49954 /* Shuffle the elements within the lanes. After this we
49955 have { A A B B | C C D D } or { E E F F | G G H H }. */
49956 t3 = gen_reg_rtx (V8SImode);
49957 t4 = gen_reg_rtx (V8SImode);
49958 mask = GEN_INT (high_p
49959 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
49960 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
49961 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
49962 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
49964 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
49965 break;
49967 case V8HImode:
49968 case V16HImode:
49969 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
49970 uns_p, OPTAB_DIRECT);
49971 t2 = expand_binop (mode,
49972 uns_p ? umul_highpart_optab : smul_highpart_optab,
49973 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
49974 gcc_assert (t1 && t2);
49976 t3 = gen_reg_rtx (mode);
49977 ix86_expand_vec_interleave (t3, t1, t2, high_p);
49978 emit_move_insn (dest, gen_lowpart (wmode, t3));
49979 break;
49981 case V16QImode:
49982 case V32QImode:
49983 case V32HImode:
49984 case V16SImode:
49985 case V64QImode:
49986 t1 = gen_reg_rtx (wmode);
49987 t2 = gen_reg_rtx (wmode);
49988 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
49989 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
49991 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
49992 break;
49994 default:
49995 gcc_unreachable ();
49999 void
50000 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
50002 rtx res_1, res_2, res_3, res_4;
50004 res_1 = gen_reg_rtx (V4SImode);
50005 res_2 = gen_reg_rtx (V4SImode);
50006 res_3 = gen_reg_rtx (V2DImode);
50007 res_4 = gen_reg_rtx (V2DImode);
50008 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
50009 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
50011 /* Move the results in element 2 down to element 1; we don't care
50012 what goes in elements 2 and 3. Then we can merge the parts
50013 back together with an interleave.
50015 Note that two other sequences were tried:
50016 (1) Use interleaves at the start instead of psrldq, which allows
50017 us to use a single shufps to merge things back at the end.
50018 (2) Use shufps here to combine the two vectors, then pshufd to
50019 put the elements in the correct order.
50020 In both cases the cost of the reformatting stall was too high
50021 and the overall sequence slower. */
50023 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
50024 const0_rtx, const2_rtx,
50025 const0_rtx, const0_rtx));
50026 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
50027 const0_rtx, const2_rtx,
50028 const0_rtx, const0_rtx));
50029 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
50031 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
50034 void
50035 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
50037 machine_mode mode = GET_MODE (op0);
50038 rtx t1, t2, t3, t4, t5, t6;
50040 if (TARGET_AVX512DQ && mode == V8DImode)
50041 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
50042 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
50043 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
50044 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
50045 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
50046 else if (TARGET_XOP && mode == V2DImode)
50048 /* op1: A,B,C,D, op2: E,F,G,H */
50049 op1 = gen_lowpart (V4SImode, op1);
50050 op2 = gen_lowpart (V4SImode, op2);
50052 t1 = gen_reg_rtx (V4SImode);
50053 t2 = gen_reg_rtx (V4SImode);
50054 t3 = gen_reg_rtx (V2DImode);
50055 t4 = gen_reg_rtx (V2DImode);
50057 /* t1: B,A,D,C */
50058 emit_insn (gen_sse2_pshufd_1 (t1, op1,
50059 GEN_INT (1),
50060 GEN_INT (0),
50061 GEN_INT (3),
50062 GEN_INT (2)));
50064 /* t2: (B*E),(A*F),(D*G),(C*H) */
50065 emit_insn (gen_mulv4si3 (t2, t1, op2));
50067 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
50068 emit_insn (gen_xop_phadddq (t3, t2));
50070 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
50071 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
50073 /* Multiply lower parts and add all */
50074 t5 = gen_reg_rtx (V2DImode);
50075 emit_insn (gen_vec_widen_umult_even_v4si (t5,
50076 gen_lowpart (V4SImode, op1),
50077 gen_lowpart (V4SImode, op2)));
50078 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
50081 else
50083 machine_mode nmode;
50084 rtx (*umul) (rtx, rtx, rtx);
50086 if (mode == V2DImode)
50088 umul = gen_vec_widen_umult_even_v4si;
50089 nmode = V4SImode;
50091 else if (mode == V4DImode)
50093 umul = gen_vec_widen_umult_even_v8si;
50094 nmode = V8SImode;
50096 else if (mode == V8DImode)
50098 umul = gen_vec_widen_umult_even_v16si;
50099 nmode = V16SImode;
50101 else
50102 gcc_unreachable ();
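/* The generic path below uses the decomposition (per 64-bit element,
   mod 2^64):
     (2^32 * Ah + Al) * (2^32 * Bh + Bl) == Al * Bl + ((Ah * Bl + Al * Bh) << 32)
   where the Ah * Bh term disappears because it is shifted out entirely.  */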
50105 /* Multiply low parts. */
50106 t1 = gen_reg_rtx (mode);
50107 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
50109 /* Shift input vectors right 32 bits so we can multiply high parts. */
50110 t6 = GEN_INT (32);
50111 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
50112 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
50114 /* Multiply high parts by low parts. */
50115 t4 = gen_reg_rtx (mode);
50116 t5 = gen_reg_rtx (mode);
50117 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
50118 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
50120 /* Combine and shift the highparts back. */
50121 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
50122 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
50124 /* Combine high and low parts. */
50125 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
50128 set_unique_reg_note (get_last_insn (), REG_EQUAL,
50129 gen_rtx_MULT (mode, op1, op2));
50132 /* Return 1 if control transfer instruction INSN
50133 should be encoded with bnd prefix.
50134 If insn is NULL then return 1 when control
50135 transfer instructions should be prefixed with
50136 bnd by default for current function. */
50138 bool
50139 ix86_bnd_prefixed_insn_p (rtx insn)
50141 /* For call insns check special flag. */
50142 if (insn && CALL_P (insn))
50144 rtx call = get_call_rtx_from (insn);
50145 if (call)
50146 return CALL_EXPR_WITH_BOUNDS_P (call);
50149 /* All other insns are prefixed only if function is instrumented. */
50150 return chkp_function_instrumented_p (current_function_decl);
50153 /* Calculate integer abs() using only SSE2 instructions. */
50155 void
50156 ix86_expand_sse2_abs (rtx target, rtx input)
50158 machine_mode mode = GET_MODE (target);
50159 rtx tmp0, tmp1, x;
50161 switch (mode)
50163 /* For 32-bit signed integer X, the best way to calculate the absolute
50164 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
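/* E.g. for X == -5: X >> 31 == -1, (-1 ^ -5) == 4, and 4 - (-1) == 5.  */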
50165 case V4SImode:
50166 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
50167 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
50168 NULL, 0, OPTAB_DIRECT);
50169 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
50170 NULL, 0, OPTAB_DIRECT);
50171 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
50172 target, 0, OPTAB_DIRECT);
50173 break;
50175 /* For 16-bit signed integer X, the best way to calculate the absolute
50176 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
50177 case V8HImode:
50178 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
50180 x = expand_simple_binop (mode, SMAX, tmp0, input,
50181 target, 0, OPTAB_DIRECT);
50182 break;
50184 /* For 8-bit signed integer X, the best way to calculate the absolute
50185 value of X is min ((unsigned char) X, (unsigned char) (-X)),
50186 as SSE2 provides the PMINUB insn. */
50187 case V16QImode:
50188 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
50190 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
50191 target, 0, OPTAB_DIRECT);
50192 break;
50194 default:
50195 gcc_unreachable ();
50198 if (x != target)
50199 emit_move_insn (target, x);
50202 /* Expand an extract from a vector register through pextr insn.
50203 Return true if successful. */
50205 bool
50206 ix86_expand_pextr (rtx *operands)
50208 rtx dst = operands[0];
50209 rtx src = operands[1];
50211 unsigned int size = INTVAL (operands[2]);
50212 unsigned int pos = INTVAL (operands[3]);
50214 if (SUBREG_P (dst))
50216 /* Reject non-lowpart subregs. */
50217 if (SUBREG_BYTE (dst) > 0)
50218 return false;
50219 dst = SUBREG_REG (dst);
50222 if (SUBREG_P (src))
50224 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
50225 src = SUBREG_REG (src);
50228 switch (GET_MODE (src))
50230 case V16QImode:
50231 case V8HImode:
50232 case V4SImode:
50233 case V2DImode:
50234 case V1TImode:
50235 case TImode:
50237 machine_mode srcmode, dstmode;
50238 rtx d, pat;
50240 dstmode = mode_for_size (size, MODE_INT, 0);
50242 switch (dstmode)
50244 case QImode:
50245 if (!TARGET_SSE4_1)
50246 return false;
50247 srcmode = V16QImode;
50248 break;
50250 case HImode:
50251 if (!TARGET_SSE2)
50252 return false;
50253 srcmode = V8HImode;
50254 break;
50256 case SImode:
50257 if (!TARGET_SSE4_1)
50258 return false;
50259 srcmode = V4SImode;
50260 break;
50262 case DImode:
50263 gcc_assert (TARGET_64BIT);
50264 if (!TARGET_SSE4_1)
50265 return false;
50266 srcmode = V2DImode;
50267 break;
50269 default:
50270 return false;
50273 /* Reject extractions from misaligned positions. */
50274 if (pos & (size-1))
50275 return false;
50277 if (GET_MODE (dst) == dstmode)
50278 d = dst;
50279 else
50280 d = gen_reg_rtx (dstmode);
50282 /* Construct insn pattern. */
50283 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
50284 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
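/* For example, extracting a 16-bit field at bit position 32 of a TImode
   source uses srcmode V8HImode and selects element 32 / 16 == 2.  */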
50286 /* Let the rtl optimizers know about the zero extension performed. */
50287 if (dstmode == QImode || dstmode == HImode)
50289 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
50290 d = gen_lowpart (SImode, d);
50293 emit_insn (gen_rtx_SET (d, pat));
50295 if (d != dst)
50296 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
50297 return true;
50300 default:
50301 return false;
50305 /* Expand an insert into a vector register through pinsr insn.
50306 Return true if successful. */
50308 bool
50309 ix86_expand_pinsr (rtx *operands)
50311 rtx dst = operands[0];
50312 rtx src = operands[3];
50314 unsigned int size = INTVAL (operands[1]);
50315 unsigned int pos = INTVAL (operands[2]);
50317 if (SUBREG_P (dst))
50319 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
50320 dst = SUBREG_REG (dst);
50323 switch (GET_MODE (dst))
50325 case V16QImode:
50326 case V8HImode:
50327 case V4SImode:
50328 case V2DImode:
50329 case V1TImode:
50330 case TImode:
50332 machine_mode srcmode, dstmode;
50333 rtx (*pinsr)(rtx, rtx, rtx, rtx);
50334 rtx d;
50336 srcmode = mode_for_size (size, MODE_INT, 0);
50338 switch (srcmode)
50340 case QImode:
50341 if (!TARGET_SSE4_1)
50342 return false;
50343 dstmode = V16QImode;
50344 pinsr = gen_sse4_1_pinsrb;
50345 break;
50347 case HImode:
50348 if (!TARGET_SSE2)
50349 return false;
50350 dstmode = V8HImode;
50351 pinsr = gen_sse2_pinsrw;
50352 break;
50354 case SImode:
50355 if (!TARGET_SSE4_1)
50356 return false;
50357 dstmode = V4SImode;
50358 pinsr = gen_sse4_1_pinsrd;
50359 break;
50361 case DImode:
50362 gcc_assert (TARGET_64BIT);
50363 if (!TARGET_SSE4_1)
50364 return false;
50365 dstmode = V2DImode;
50366 pinsr = gen_sse4_1_pinsrq;
50367 break;
50369 default:
50370 return false;
50373 /* Reject insertions to misaligned positions. */
50374 if (pos & (size-1))
50375 return false;
50377 if (SUBREG_P (src))
50379 unsigned int srcpos = SUBREG_BYTE (src);
50381 if (srcpos > 0)
50383 rtx extr_ops[4];
50385 extr_ops[0] = gen_reg_rtx (srcmode);
50386 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
50387 extr_ops[2] = GEN_INT (size);
50388 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
50390 if (!ix86_expand_pextr (extr_ops))
50391 return false;
50393 src = extr_ops[0];
50395 else
50396 src = gen_lowpart (srcmode, SUBREG_REG (src));
50399 if (GET_MODE (dst) == dstmode)
50400 d = dst;
50401 else
50402 d = gen_reg_rtx (dstmode);
50404 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
50405 gen_lowpart (srcmode, src),
50406 GEN_INT (1 << (pos / size))));
50407 if (d != dst)
50408 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
50409 return true;
50412 default:
50413 return false;
50417 /* This function returns the calling abi specific va_list type node.
50418 It returns the FNDECL specific va_list type. */
50420 static tree
50421 ix86_fn_abi_va_list (tree fndecl)
50423 if (!TARGET_64BIT)
50424 return va_list_type_node;
50425 gcc_assert (fndecl != NULL_TREE);
50427 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
50428 return ms_va_list_type_node;
50429 else
50430 return sysv_va_list_type_node;
50433 /* Returns the canonical va_list type specified by TYPE. If there
50434 is no valid TYPE provided, it returns NULL_TREE. */
50436 static tree
50437 ix86_canonical_va_list_type (tree type)
50439 if (TARGET_64BIT)
50441 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
50442 return ms_va_list_type_node;
50444 if ((TREE_CODE (type) == ARRAY_TYPE
50445 && integer_zerop (array_type_nelts (type)))
50446 || POINTER_TYPE_P (type))
50448 tree elem_type = TREE_TYPE (type);
50449 if (TREE_CODE (elem_type) == RECORD_TYPE
50450 && lookup_attribute ("sysv_abi va_list",
50451 TYPE_ATTRIBUTES (elem_type)))
50452 return sysv_va_list_type_node;
50455 return NULL_TREE;
50458 return std_canonical_va_list_type (type);
50461 /* Iterate through the target-specific builtin types for va_list.
50462 IDX denotes the iterator, *PTREE is set to the result type of
50463 the va_list builtin, and *PNAME to its internal type.
50464 Returns zero if there is no element for this index, otherwise
50465 IDX should be increased upon the next call.
50466 Note, do not iterate a base builtin's name like __builtin_va_list.
50467 Used from c_common_nodes_and_builtins. */
50469 static int
50470 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
50472 if (TARGET_64BIT)
50474 switch (idx)
50476 default:
50477 break;
50479 case 0:
50480 *ptree = ms_va_list_type_node;
50481 *pname = "__builtin_ms_va_list";
50482 return 1;
50484 case 1:
50485 *ptree = sysv_va_list_type_node;
50486 *pname = "__builtin_sysv_va_list";
50487 return 1;
50491 return 0;
50494 #undef TARGET_SCHED_DISPATCH
50495 #define TARGET_SCHED_DISPATCH has_dispatch
50496 #undef TARGET_SCHED_DISPATCH_DO
50497 #define TARGET_SCHED_DISPATCH_DO do_dispatch
50498 #undef TARGET_SCHED_REASSOCIATION_WIDTH
50499 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
50500 #undef TARGET_SCHED_REORDER
50501 #define TARGET_SCHED_REORDER ix86_sched_reorder
50502 #undef TARGET_SCHED_ADJUST_PRIORITY
50503 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
50504 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
50505 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
50506 ix86_dependencies_evaluation_hook
50508 /* The size of the dispatch window is the total number of bytes of
50509 object code allowed in a window. */
50510 #define DISPATCH_WINDOW_SIZE 16
50512 /* Number of dispatch windows considered for scheduling. */
50513 #define MAX_DISPATCH_WINDOWS 3
50515 /* Maximum number of instructions in a window. */
50516 #define MAX_INSN 4
50518 /* Maximum number of immediate operands in a window. */
50519 #define MAX_IMM 4
50521 /* Maximum number of immediate bits allowed in a window. */
50522 #define MAX_IMM_SIZE 128
50524 /* Maximum number of 32 bit immediates allowed in a window. */
50525 #define MAX_IMM_32 4
50527 /* Maximum number of 64 bit immediates allowed in a window. */
50528 #define MAX_IMM_64 2
50530 /* Maximum total of loads or prefetches allowed in a window. */
50531 #define MAX_LOAD 2
50533 /* Maximum total of stores allowed in a window. */
50534 #define MAX_STORE 1
50536 #undef BIG
50537 #define BIG 100
50540 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
50541 enum dispatch_group {
50542 disp_no_group = 0,
50543 disp_load,
50544 disp_store,
50545 disp_load_store,
50546 disp_prefetch,
50547 disp_imm,
50548 disp_imm_32,
50549 disp_imm_64,
50550 disp_branch,
50551 disp_cmp,
50552 disp_jcc,
50553 disp_last
50556 /* Number of allowable groups in a dispatch window. It is an array
50557 indexed by dispatch_group enum. 100 is used as a big number,
50558 because the number of these kinds of operations does not have any
50559 effect in a dispatch window, but we need them for other reasons in
50560 the table. */
50561 static unsigned int num_allowable_groups[disp_last] = {
50562 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
50565 char group_name[disp_last + 1][16] = {
50566 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
50567 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
50568 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
50571 /* Instruction path. */
50572 enum insn_path {
50573 no_path = 0,
50574 path_single, /* Single micro op. */
50575 path_double, /* Double micro op. */
50576 path_multi, /* Instructions with more than 2 micro ops. */
50577 last_path
50580 /* sched_insn_info defines a window to the instructions scheduled in
50581 the basic block. It contains a pointer to the insn_info table and
50582 the instruction scheduled.
50584 Windows are allocated for each basic block and are linked
50585 together. */
50586 typedef struct sched_insn_info_s {
50587 rtx insn;
50588 enum dispatch_group group;
50589 enum insn_path path;
50590 int byte_len;
50591 int imm_bytes;
50592 } sched_insn_info;
50594 /* Linked list of dispatch windows. This is a two way list of
50595 dispatch windows of a basic block. It contains information about
50596 the number of uops in the window and the total number of
50597 instructions and of bytes in the object code for this dispatch
50598 window. */
50599 typedef struct dispatch_windows_s {
50600 int num_insn; /* Number of insn in the window. */
50601 int num_uops; /* Number of uops in the window. */
50602 int window_size; /* Number of bytes in the window. */
50603 int window_num; /* Window number, either 0 or 1. */
50604 int num_imm; /* Number of immediates in an insn. */
50605 int num_imm_32; /* Number of 32 bit immediates in an insn. */
50606 int num_imm_64; /* Number of 64 bit immediates in an insn. */
50607 int imm_size; /* Total immediates in the window. */
50608 int num_loads; /* Total memory loads in the window. */
50609 int num_stores; /* Total memory stores in the window. */
50610 int violation; /* Violation exists in window. */
50611 sched_insn_info *window; /* Pointer to the window. */
50612 struct dispatch_windows_s *next;
50613 struct dispatch_windows_s *prev;
50614 } dispatch_windows;
50616 /* Immediate values used in an insn. */
50617 typedef struct imm_info_s
50619 int imm;
50620 int imm32;
50621 int imm64;
50622 } imm_info;
50624 static dispatch_windows *dispatch_window_list;
50625 static dispatch_windows *dispatch_window_list1;
50627 /* Get dispatch group of insn. */
50629 static enum dispatch_group
50630 get_mem_group (rtx_insn *insn)
50632 enum attr_memory memory;
50634 if (INSN_CODE (insn) < 0)
50635 return disp_no_group;
50636 memory = get_attr_memory (insn);
50637 if (memory == MEMORY_STORE)
50638 return disp_store;
50640 if (memory == MEMORY_LOAD)
50641 return disp_load;
50643 if (memory == MEMORY_BOTH)
50644 return disp_load_store;
50646 return disp_no_group;
50649 /* Return true if insn is a compare instruction. */
50651 static bool
50652 is_cmp (rtx_insn *insn)
50654 enum attr_type type;
50656 type = get_attr_type (insn);
50657 return (type == TYPE_TEST
50658 || type == TYPE_ICMP
50659 || type == TYPE_FCMP
50660 || GET_CODE (PATTERN (insn)) == COMPARE);
50663 /* Return true if a dispatch violation encountered. */
50665 static bool
50666 dispatch_violation (void)
50668 if (dispatch_window_list->next)
50669 return dispatch_window_list->next->violation;
50670 return dispatch_window_list->violation;
50673 /* Return true if insn is a branch instruction. */
50675 static bool
50676 is_branch (rtx_insn *insn)
50678 return (CALL_P (insn) || JUMP_P (insn));
50681 /* Return true if insn is a prefetch instruction. */
50683 static bool
50684 is_prefetch (rtx_insn *insn)
50686 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
50689 /* This function initializes a dispatch window and the list container holding a
50690 pointer to the window. */
50692 static void
50693 init_window (int window_num)
50695 int i;
50696 dispatch_windows *new_list;
50698 if (window_num == 0)
50699 new_list = dispatch_window_list;
50700 else
50701 new_list = dispatch_window_list1;
50703 new_list->num_insn = 0;
50704 new_list->num_uops = 0;
50705 new_list->window_size = 0;
50706 new_list->next = NULL;
50707 new_list->prev = NULL;
50708 new_list->window_num = window_num;
50709 new_list->num_imm = 0;
50710 new_list->num_imm_32 = 0;
50711 new_list->num_imm_64 = 0;
50712 new_list->imm_size = 0;
50713 new_list->num_loads = 0;
50714 new_list->num_stores = 0;
50715 new_list->violation = false;
50717 for (i = 0; i < MAX_INSN; i++)
50719 new_list->window[i].insn = NULL;
50720 new_list->window[i].group = disp_no_group;
50721 new_list->window[i].path = no_path;
50722 new_list->window[i].byte_len = 0;
50723 new_list->window[i].imm_bytes = 0;
50725 return;
50728 /* This function allocates and initializes a dispatch window and the
50729 list container holding a pointer to the window. */
50731 static dispatch_windows *
50732 allocate_window (void)
50734 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
50735 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
50737 return new_list;
50740 /* This routine initializes the dispatch scheduling information. It
50741 initiates building dispatch scheduler tables and constructs the
50742 first dispatch window. */
50744 static void
50745 init_dispatch_sched (void)
50747 /* Allocate a dispatch list and a window. */
50748 dispatch_window_list = allocate_window ();
50749 dispatch_window_list1 = allocate_window ();
50750 init_window (0);
50751 init_window (1);
50754 /* This function returns true if a branch is detected. End of a basic block
50755 does not have to be a branch, but here we assume only branches end a
50756 window. */
50758 static bool
50759 is_end_basic_block (enum dispatch_group group)
50761 return group == disp_branch;
50764 /* This function is called when the end of a window processing is reached. */
50766 static void
50767 process_end_window (void)
50769 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
50770 if (dispatch_window_list->next)
50772 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
50773 gcc_assert (dispatch_window_list->window_size
50774 + dispatch_window_list1->window_size <= 48);
50775 init_window (1);
50777 init_window (0);
50780 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
50781 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
50782 for 48 bytes of instructions. Note that these windows are not dispatch
50783 windows whose sizes are DISPATCH_WINDOW_SIZE. */
50785 static dispatch_windows *
50786 allocate_next_window (int window_num)
50788 if (window_num == 0)
50790 if (dispatch_window_list->next)
50791 init_window (1);
50792 init_window (0);
50793 return dispatch_window_list;
50796 dispatch_window_list->next = dispatch_window_list1;
50797 dispatch_window_list1->prev = dispatch_window_list;
50799 return dispatch_window_list1;
50802 /* Compute number of immediate operands of an instruction. */
50804 static void
50805 find_constant (rtx in_rtx, imm_info *imm_values)
50807 if (INSN_P (in_rtx))
50808 in_rtx = PATTERN (in_rtx);
50809 subrtx_iterator::array_type array;
50810 FOR_EACH_SUBRTX (iter, array, in_rtx, ALL)
50811 if (const_rtx x = *iter)
50812 switch (GET_CODE (x))
50814 case CONST:
50815 case SYMBOL_REF:
50816 case CONST_INT:
50817 (imm_values->imm)++;
50818 if (x86_64_immediate_operand (CONST_CAST_RTX (x), SImode))
50819 (imm_values->imm32)++;
50820 else
50821 (imm_values->imm64)++;
50822 break;
50824 case CONST_DOUBLE:
50825 case CONST_WIDE_INT:
50826 (imm_values->imm)++;
50827 (imm_values->imm64)++;
50828 break;
50830 case CODE_LABEL:
50831 if (LABEL_KIND (x) == LABEL_NORMAL)
50833 (imm_values->imm)++;
50834 (imm_values->imm32)++;
50836 break;
50838 default:
50839 break;
50843 /* Return total size of immediate operands of an instruction along with number
50844 of corresponding immediate-operands. It initializes its parameters to zero
50845 before calling FIND_CONSTANT.
50846 INSN is the input instruction. IMM is the total of immediates.
50847 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
50848 bit immediates. */
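/* For example, an insn with one 32-bit and one 64-bit immediate yields
   *IMM == 2, *IMM32 == 1, *IMM64 == 1 and a return value of 4 + 8 == 12.  */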
50850 static int
50851 get_num_immediates (rtx_insn *insn, int *imm, int *imm32, int *imm64)
50853 imm_info imm_values = {0, 0, 0};
50855 find_constant (insn, &imm_values);
50856 *imm = imm_values.imm;
50857 *imm32 = imm_values.imm32;
50858 *imm64 = imm_values.imm64;
50859 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
50862 /* This function indicates if an operand of an instruction is an
50863 immediate. */
50865 static bool
50866 has_immediate (rtx_insn *insn)
50868 int num_imm_operand;
50869 int num_imm32_operand;
50870 int num_imm64_operand;
50872 if (insn)
50873 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
50874 &num_imm64_operand);
50875 return false;
50878 /* Return single or double path for instructions. */
50880 static enum insn_path
50881 get_insn_path (rtx_insn *insn)
50883 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
50885 if ((int)path == 0)
50886 return path_single;
50888 if ((int)path == 1)
50889 return path_double;
50891 return path_multi;
50894 /* Return insn dispatch group. */
50896 static enum dispatch_group
50897 get_insn_group (rtx_insn *insn)
50899 enum dispatch_group group = get_mem_group (insn);
50900 if (group)
50901 return group;
50903 if (is_branch (insn))
50904 return disp_branch;
50906 if (is_cmp (insn))
50907 return disp_cmp;
50909 if (has_immediate (insn))
50910 return disp_imm;
50912 if (is_prefetch (insn))
50913 return disp_prefetch;
50915 return disp_no_group;
50918 /* Count number of GROUP restricted instructions in a dispatch
50919 window WINDOW_LIST. */
50921 static int
50922 count_num_restricted (rtx_insn *insn, dispatch_windows *window_list)
50924 enum dispatch_group group = get_insn_group (insn);
50925 int imm_size;
50926 int num_imm_operand;
50927 int num_imm32_operand;
50928 int num_imm64_operand;
50930 if (group == disp_no_group)
50931 return 0;
50933 if (group == disp_imm)
50935 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
50936 &num_imm64_operand);
50937 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
50938 || num_imm_operand + window_list->num_imm > MAX_IMM
50939 || (num_imm32_operand > 0
50940 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
50941 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
50942 || (num_imm64_operand > 0
50943 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
50944 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
50945 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
50946 && num_imm64_operand > 0
50947 && ((window_list->num_imm_64 > 0
50948 && window_list->num_insn >= 2)
50949 || window_list->num_insn >= 3)))
50950 return BIG;
50952 return 1;
50955 if ((group == disp_load_store
50956 && (window_list->num_loads >= MAX_LOAD
50957 || window_list->num_stores >= MAX_STORE))
50958 || ((group == disp_load
50959 || group == disp_prefetch)
50960 && window_list->num_loads >= MAX_LOAD)
50961 || (group == disp_store
50962 && window_list->num_stores >= MAX_STORE))
50963 return BIG;
50965 return 1;
50968 /* This function returns true if insn satisfies dispatch rules on the
50969 last window scheduled. */
50971 static bool
50972 fits_dispatch_window (rtx_insn *insn)
50974 dispatch_windows *window_list = dispatch_window_list;
50975 dispatch_windows *window_list_next = dispatch_window_list->next;
50976 unsigned int num_restrict;
50977 enum dispatch_group group = get_insn_group (insn);
50978 enum insn_path path = get_insn_path (insn);
50979 int sum;
50981 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
50982 instructions should be given the lowest priority in the
50983 scheduling process in Haifa scheduler to make sure they will be
50984 scheduled in the same dispatch window as the reference to them. */
50985 if (group == disp_jcc || group == disp_cmp)
50986 return false;
50988 /* Check nonrestricted. */
50989 if (group == disp_no_group || group == disp_branch)
50990 return true;
50992 /* Get last dispatch window. */
50993 if (window_list_next)
50994 window_list = window_list_next;
50996 if (window_list->window_num == 1)
50998 sum = window_list->prev->window_size + window_list->window_size;
51000 if (sum == 32
51001 || (min_insn_size (insn) + sum) >= 48)
51002 /* Window 1 is full. Go for next window. */
51003 return true;
51006 num_restrict = count_num_restricted (insn, window_list);
51008 if (num_restrict > num_allowable_groups[group])
51009 return false;
51011 /* See if it fits in the first window. */
51012 if (window_list->window_num == 0)
51014 /* The first window should have only single- and double-path
51015 uops. */
51016 if (path == path_double
51017 && (window_list->num_uops + 2) > MAX_INSN)
51018 return false;
51019 else if (path != path_single)
51020 return false;
51022 return true;
51025 /* Add an instruction INSN with NUM_UOPS micro-operations to the
51026 dispatch window WINDOW_LIST. */
51028 static void
51029 add_insn_window (rtx_insn *insn, dispatch_windows *window_list, int num_uops)
51031 int byte_len = min_insn_size (insn);
51032 int num_insn = window_list->num_insn;
51033 int imm_size;
51034 sched_insn_info *window = window_list->window;
51035 enum dispatch_group group = get_insn_group (insn);
51036 enum insn_path path = get_insn_path (insn);
51037 int num_imm_operand;
51038 int num_imm32_operand;
51039 int num_imm64_operand;
51041 if (!window_list->violation && group != disp_cmp
51042 && !fits_dispatch_window (insn))
51043 window_list->violation = true;
51045 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
51046 &num_imm64_operand);
51048 /* Initialize window with new instruction. */
51049 window[num_insn].insn = insn;
51050 window[num_insn].byte_len = byte_len;
51051 window[num_insn].group = group;
51052 window[num_insn].path = path;
51053 window[num_insn].imm_bytes = imm_size;
51055 window_list->window_size += byte_len;
51056 window_list->num_insn = num_insn + 1;
51057 window_list->num_uops = window_list->num_uops + num_uops;
51058 window_list->imm_size += imm_size;
51059 window_list->num_imm += num_imm_operand;
51060 window_list->num_imm_32 += num_imm32_operand;
51061 window_list->num_imm_64 += num_imm64_operand;
51063 if (group == disp_store)
51064 window_list->num_stores += 1;
51065 else if (group == disp_load
51066 || group == disp_prefetch)
51067 window_list->num_loads += 1;
51068 else if (group == disp_load_store)
51070 window_list->num_stores += 1;
51071 window_list->num_loads += 1;
51075 /* Adds a scheduled instruction, INSN, to the current dispatch window.
51076 If the total bytes of instructions or the number of instructions in
51077 the window exceed the allowed maximum, it allocates a new window. */
51079 static void
51080 add_to_dispatch_window (rtx_insn *insn)
51082 int byte_len;
51083 dispatch_windows *window_list;
51084 dispatch_windows *next_list;
51085 dispatch_windows *window0_list;
51086 enum insn_path path;
51087 enum dispatch_group insn_group;
51088 bool insn_fits;
51089 int num_insn;
51090 int num_uops;
51091 int window_num;
51092 int insn_num_uops;
51093 int sum;
51095 if (INSN_CODE (insn) < 0)
51096 return;
51098 byte_len = min_insn_size (insn);
51099 window_list = dispatch_window_list;
51100 next_list = window_list->next;
51101 path = get_insn_path (insn);
51102 insn_group = get_insn_group (insn);
51104 /* Get the last dispatch window. */
51105 if (next_list)
51106 window_list = dispatch_window_list->next;
51108 if (path == path_single)
51109 insn_num_uops = 1;
51110 else if (path == path_double)
51111 insn_num_uops = 2;
51112 else
51113 insn_num_uops = (int) path;
51115 /* If current window is full, get a new window.
51116 Window number zero is full, if MAX_INSN uops are scheduled in it.
51117 Window number one is full, if window zero's bytes plus window
51118 one's bytes is 32, or if the bytes of the new instruction added
51119 to the total makes it greater than 48, or it already has MAX_INSN
51120 instructions in it. */
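/* For instance, once window zero already holds MAX_INSN instructions (or
   adding this insn's uops would exceed MAX_INSN) the insn goes to window
   one, and once the two windows total 32 bytes, or would reach 48 bytes
   with the new insn, both are flushed via process_end_window.  */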
51121 num_insn = window_list->num_insn;
51122 num_uops = window_list->num_uops;
51123 window_num = window_list->window_num;
51124 insn_fits = fits_dispatch_window (insn);
51126 if (num_insn >= MAX_INSN
51127 || num_uops + insn_num_uops > MAX_INSN
51128 || !(insn_fits))
51130 window_num = ~window_num & 1;
51131 window_list = allocate_next_window (window_num);
51134 if (window_num == 0)
51136 add_insn_window (insn, window_list, insn_num_uops);
51137 if (window_list->num_insn >= MAX_INSN
51138 && insn_group == disp_branch)
51140 process_end_window ();
51141 return;
51144 else if (window_num == 1)
51146 window0_list = window_list->prev;
51147 sum = window0_list->window_size + window_list->window_size;
51148 if (sum == 32
51149 || (byte_len + sum) >= 48)
51151 process_end_window ();
51152 window_list = dispatch_window_list;
51155 add_insn_window (insn, window_list, insn_num_uops);
51157 else
51158 gcc_unreachable ();
51160 if (is_end_basic_block (insn_group))
51162 /* End of basic block is reached; do end-basic-block processing. */
51163 process_end_window ();
51164 return;
51168 /* Print the dispatch window, WINDOW_NUM, to FILE. */
51170 DEBUG_FUNCTION static void
51171 debug_dispatch_window_file (FILE *file, int window_num)
51173 dispatch_windows *list;
51174 int i;
51176 if (window_num == 0)
51177 list = dispatch_window_list;
51178 else
51179 list = dispatch_window_list1;
51181 fprintf (file, "Window #%d:\n", list->window_num);
51182 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
51183 list->num_insn, list->num_uops, list->window_size);
51184 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
51185 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
51187 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
51188 list->num_stores);
51189 fprintf (file, " insn info:\n");
51191 for (i = 0; i < MAX_INSN; i++)
51193 if (!list->window[i].insn)
51194 break;
51195 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
51196 i, group_name[list->window[i].group],
51197 i, (void *)list->window[i].insn,
51198 i, list->window[i].path,
51199 i, list->window[i].byte_len,
51200 i, list->window[i].imm_bytes);
51204 /* Print to stdout a dispatch window. */
51206 DEBUG_FUNCTION void
51207 debug_dispatch_window (int window_num)
51209 debug_dispatch_window_file (stdout, window_num);
51212 /* Print INSN dispatch information to FILE. */
51214 DEBUG_FUNCTION static void
51215 debug_insn_dispatch_info_file (FILE *file, rtx_insn *insn)
51217 int byte_len;
51218 enum insn_path path;
51219 enum dispatch_group group;
51220 int imm_size;
51221 int num_imm_operand;
51222 int num_imm32_operand;
51223 int num_imm64_operand;
51225 if (INSN_CODE (insn) < 0)
51226 return;
51228 byte_len = min_insn_size (insn);
51229 path = get_insn_path (insn);
51230 group = get_insn_group (insn);
51231 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
51232 &num_imm64_operand);
51234 fprintf (file, " insn info:\n");
51235 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
51236 group_name[group], path, byte_len);
51237 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
51238 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
51241 /* Print to STDERR the status of the ready list with respect to
51242 dispatch windows. */
51244 DEBUG_FUNCTION void
51245 debug_ready_dispatch (void)
51247 int i;
51248 int no_ready = number_in_ready ();
51250 fprintf (stdout, "Number of ready: %d\n", no_ready);
51252 for (i = 0; i < no_ready; i++)
51253 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
51256 /* This routine is the driver of the dispatch scheduler. */
51258 static void
51259 do_dispatch (rtx_insn *insn, int mode)
51261 if (mode == DISPATCH_INIT)
51262 init_dispatch_sched ();
51263 else if (mode == ADD_TO_DISPATCH_WINDOW)
51264 add_to_dispatch_window (insn);
51267 /* Return TRUE if Dispatch Scheduling is supported. */
51269 static bool
51270 has_dispatch (rtx_insn *insn, int action)
51272 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3
51273 || TARGET_BDVER4 || TARGET_ZNVER1) && flag_dispatch_scheduler)
51274 switch (action)
51276 default:
51277 return false;
51279 case IS_DISPATCH_ON:
51280 return true;
51282 case IS_CMP:
51283 return is_cmp (insn);
51285 case DISPATCH_VIOLATION:
51286 return dispatch_violation ();
51288 case FITS_DISPATCH_WINDOW:
51289 return fits_dispatch_window (insn);
51292 return false;
51295	/* Implementation of the reassociation_width target hook, used by the
51296	   reassoc phase to identify the level of parallelism available in a
51297	   reassociated tree.  The statement's tree_code is passed in OPC and
51298	   the type of its arguments in MODE.
51300	   Currently parallel reassociation is enabled for Atom processors
51301	   only, and we set the reassociation width to 2 because Atom may
51302	   issue up to 2 instructions per cycle.
51304	   The return value should be revisited if parallel reassociation is
51305	   enabled for other processors.  */
51307 static int
51308 ix86_reassociation_width (unsigned int, machine_mode mode)
51310 /* Vector part. */
51311 if (VECTOR_MODE_P (mode))
51313 if (TARGET_VECTOR_PARALLEL_EXECUTION)
51314 return 2;
51315 else
51316 return 1;
51319 /* Scalar part. */
51320 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
51321 return 2;
51322 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
51323 return ((TARGET_64BIT && ix86_tune == PROCESSOR_HASWELL)? 4 : 2);
51324 else
51325 return 1;
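/* For illustration (a sketch of the effect, not code from the reassoc
   pass itself): with a reassociation width of 2, a chain such as

       s = a + b + c + d;

   can be rewritten by the reassoc pass roughly as

       t0 = a + b;  t1 = c + d;  s = t0 + t1;

   so that the two partial sums may issue in parallel on a 2-wide core.  */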
51328 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
51329 place emms and femms instructions. */
51331 static machine_mode
51332 ix86_preferred_simd_mode (machine_mode mode)
51334 if (!TARGET_SSE)
51335 return word_mode;
51337 switch (mode)
51339 case QImode:
51340 return TARGET_AVX512BW ? V64QImode :
51341 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
51342 case HImode:
51343 return TARGET_AVX512BW ? V32HImode :
51344 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
51345 case SImode:
51346 return TARGET_AVX512F ? V16SImode :
51347 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
51348 case DImode:
51349 return TARGET_AVX512F ? V8DImode :
51350 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
51352 case SFmode:
51353 if (TARGET_AVX512F)
51354 return V16SFmode;
51355 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
51356 return V8SFmode;
51357 else
51358 return V4SFmode;
51360 case DFmode:
51361 if (TARGET_AVX512F)
51362 return V8DFmode;
51363 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
51364 return V4DFmode;
51365 else if (TARGET_SSE2)
51366 return V2DFmode;
51367 /* FALLTHRU */
51369 default:
51370 return word_mode;
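/* For example (assuming the corresponding ISA flags are enabled):
   with -mavx512f, SImode maps to V16SImode (a full zmm register);
   with plain -mavx and without -mprefer-avx128 it maps to V8SImode
   (ymm); otherwise SSE gives V4SImode (xmm).  DFmode follows the same
   pattern with V8DFmode / V4DFmode / V2DFmode.  */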
51374	/* If AVX is enabled then try vectorizing with both 256-bit and 128-bit
51375	   vectors.  If AVX512F is enabled then try vectorizing with 512-bit,
51376	   256-bit and 128-bit vectors.  */
51378 static unsigned int
51379 ix86_autovectorize_vector_sizes (void)
51381 return TARGET_AVX512F ? 64 | 32 | 16 :
51382 (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
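/* As a worked example: 64 | 32 | 16 == 0x70, i.e. the vectorizer is
   told it may also try 512-bit, 256-bit and 128-bit vectors, while a
   return value of 0 means that only the preferred SIMD mode is used.  */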
51385	/* Implementation of targetm.vectorize.get_mask_mode.  */
51387 static machine_mode
51388 ix86_get_mask_mode (unsigned nunits, unsigned vector_size)
51390 unsigned elem_size = vector_size / nunits;
51392 /* Scalar mask case. */
51393 if ((TARGET_AVX512F && vector_size == 64)
51394 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
51396 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
51397 return smallest_mode_for_size (nunits, MODE_INT);
51400 machine_mode elem_mode
51401 = smallest_mode_for_size (elem_size * BITS_PER_UNIT, MODE_INT);
51403 gcc_assert (elem_size * nunits == vector_size);
51405 return mode_for_vector (elem_mode, nunits);
51410 /* Return class of registers which could be used for pseudo of MODE
51411 and of class RCLASS for spilling instead of memory. Return NO_REGS
51412 if it is not possible or non-profitable. */
51414 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
51416 static reg_class_t
51417 ix86_spill_class (reg_class_t rclass, machine_mode mode)
51419 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
51420 && TARGET_SSE2
51421 && TARGET_INTER_UNIT_MOVES_TO_VEC
51422 && TARGET_INTER_UNIT_MOVES_FROM_VEC
51423 && (mode == SImode || (TARGET_64BIT && mode == DImode))
51424 && INTEGER_CLASS_P (rclass))
51425 return ALL_SSE_REGS;
51426 return NO_REGS;
51429 /* Implement TARGET_MAX_NOCE_IFCVT_SEQ_COST. Like the default implementation,
51430 but returns a lower bound. */
51432 static unsigned int
51433 ix86_max_noce_ifcvt_seq_cost (edge e)
51435 bool predictable_p = predictable_edge_p (e);
51437 enum compiler_param param
51438 = (predictable_p
51439 ? PARAM_MAX_RTL_IF_CONVERSION_PREDICTABLE_COST
51440 : PARAM_MAX_RTL_IF_CONVERSION_UNPREDICTABLE_COST);
51442 /* If we have a parameter set, use that, otherwise take a guess using
51443 BRANCH_COST. */
51444 if (global_options_set.x_param_values[param])
51445 return PARAM_VALUE (param);
51446 else
51447 return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
51450 /* Return true if SEQ is a good candidate as a replacement for the
51451 if-convertible sequence described in IF_INFO. */
51453 static bool
51454 ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
51456 if (TARGET_ONE_IF_CONV_INSN && if_info->speed_p)
51458 int cmov_cnt = 0;
51459 /* Punt if SEQ contains more than one CMOV or FCMOV instruction.
51460 Maybe we should allow even more conditional moves as long as they
51461 are used far enough not to stall the CPU, or also consider
51462 IF_INFO->TEST_BB succ edge probabilities. */
51463 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
51465 rtx set = single_set (insn);
51466 if (!set)
51467 continue;
51468 if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
51469 continue;
51470 rtx src = SET_SRC (set);
51471 machine_mode mode = GET_MODE (src);
51472 if (GET_MODE_CLASS (mode) != MODE_INT
51473 && GET_MODE_CLASS (mode) != MODE_FLOAT)
51474 continue;
51475 if ((!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
51476 || (!REG_P (XEXP (src, 2)) && !MEM_P (XEXP (src, 2))))
51477 continue;
51478 /* insn is CMOV or FCMOV. */
51479 if (++cmov_cnt > 1)
51480 return false;
51483 return default_noce_conversion_profitable_p (seq, if_info);
51486 /* Implement targetm.vectorize.init_cost. */
51488 static void *
51489 ix86_init_cost (struct loop *)
51491 unsigned *cost = XNEWVEC (unsigned, 3);
51492 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
51493 return cost;
51496 /* Implement targetm.vectorize.add_stmt_cost. */
51498 static unsigned
51499 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
51500 struct _stmt_vec_info *stmt_info, int misalign,
51501 enum vect_cost_model_location where)
51503 unsigned *cost = (unsigned *) data;
51504 unsigned retval = 0;
51506 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
51507 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
51509 /* Penalize DFmode vector operations for Bonnell. */
51510 if (TARGET_BONNELL && kind == vector_stmt
51511 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
51512 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
51514 /* Statements in an inner loop relative to the loop being
51515 vectorized are weighted more heavily. The value here is
51516 arbitrary and could potentially be improved with analysis. */
51517 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
51518 count *= 50; /* FIXME. */
51520 retval = (unsigned) (count * stmt_cost);
51522	  /* We need to multiply all vector stmt costs by 1.7 (estimated cost)
51523	     for Silvermont, as it has an out-of-order integer pipeline and can
51524	     execute 2 scalar instructions per tick, but an in-order SIMD pipeline.  */
51525 if ((TARGET_SILVERMONT || TARGET_INTEL)
51526 && stmt_info && stmt_info->stmt)
51528 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
51529 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
51530 retval = (retval * 17) / 10;
51533 cost[where] += retval;
51535 return retval;
51538 /* Implement targetm.vectorize.finish_cost. */
51540 static void
51541 ix86_finish_cost (void *data, unsigned *prologue_cost,
51542 unsigned *body_cost, unsigned *epilogue_cost)
51544 unsigned *cost = (unsigned *) data;
51545 *prologue_cost = cost[vect_prologue];
51546 *body_cost = cost[vect_body];
51547 *epilogue_cost = cost[vect_epilogue];
51550 /* Implement targetm.vectorize.destroy_cost_data. */
51552 static void
51553 ix86_destroy_cost_data (void *data)
51555 free (data);
51558 /* Validate target specific memory model bits in VAL. */
51560 static unsigned HOST_WIDE_INT
51561 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
51563 enum memmodel model = memmodel_from_int (val);
51564 bool strong;
51566 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
51567 |MEMMODEL_MASK)
51568 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
51570 warning (OPT_Winvalid_memory_model,
51571 "Unknown architecture specific memory model");
51572 return MEMMODEL_SEQ_CST;
51574 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
51575 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
51577 warning (OPT_Winvalid_memory_model,
51578 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
51579 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
51581 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
51583 warning (OPT_Winvalid_memory_model,
51584 "HLE_RELEASE not used with RELEASE or stronger memory model");
51585 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
51587 return val;
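/* For illustration, the HLE bits reach this hook from user code such as

       while (__atomic_exchange_n (&lock, 1,
                                   __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
         ;

   (an elided lock acquire), whereas combining __ATOMIC_RELAXED with
   __ATOMIC_HLE_ACQUIRE would trigger the second warning above and fall
   back to MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE.  */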
51590 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
51591 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
51592 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
51593 or number of vecsize_mangle variants that should be emitted. */
51595 static int
51596 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
51597 struct cgraph_simd_clone *clonei,
51598 tree base_type, int num)
51600 int ret = 1;
51602 if (clonei->simdlen
51603 && (clonei->simdlen < 2
51604 || clonei->simdlen > 1024
51605 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
51607 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51608 "unsupported simdlen %d", clonei->simdlen);
51609 return 0;
51612 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
51613 if (TREE_CODE (ret_type) != VOID_TYPE)
51614 switch (TYPE_MODE (ret_type))
51616 case QImode:
51617 case HImode:
51618 case SImode:
51619 case DImode:
51620 case SFmode:
51621 case DFmode:
51622 /* case SCmode: */
51623 /* case DCmode: */
51624 break;
51625 default:
51626 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51627 "unsupported return type %qT for simd\n", ret_type);
51628 return 0;
51631 tree t;
51632 int i;
51634 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
51635 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
51636 switch (TYPE_MODE (TREE_TYPE (t)))
51638 case QImode:
51639 case HImode:
51640 case SImode:
51641 case DImode:
51642 case SFmode:
51643 case DFmode:
51644 /* case SCmode: */
51645 /* case DCmode: */
51646 break;
51647 default:
51648 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51649 "unsupported argument type %qT for simd\n", TREE_TYPE (t));
51650 return 0;
51653 if (clonei->cilk_elemental)
51655	      /* Parse the processor clause here.  If not present, default to 'b'.  */
51656 clonei->vecsize_mangle = 'b';
51658 else if (!TREE_PUBLIC (node->decl))
51660 /* If the function isn't exported, we can pick up just one ISA
51661 for the clones. */
51662 if (TARGET_AVX512F)
51663 clonei->vecsize_mangle = 'e';
51664 else if (TARGET_AVX2)
51665 clonei->vecsize_mangle = 'd';
51666 else if (TARGET_AVX)
51667 clonei->vecsize_mangle = 'c';
51668 else
51669 clonei->vecsize_mangle = 'b';
51670 ret = 1;
51672 else
51674 clonei->vecsize_mangle = "bcde"[num];
51675 ret = 4;
51677 clonei->mask_mode = VOIDmode;
51678 switch (clonei->vecsize_mangle)
51680 case 'b':
51681 clonei->vecsize_int = 128;
51682 clonei->vecsize_float = 128;
51683 break;
51684 case 'c':
51685 clonei->vecsize_int = 128;
51686 clonei->vecsize_float = 256;
51687 break;
51688 case 'd':
51689 clonei->vecsize_int = 256;
51690 clonei->vecsize_float = 256;
51691 break;
51692 case 'e':
51693 clonei->vecsize_int = 512;
51694 clonei->vecsize_float = 512;
51695 if (TYPE_MODE (base_type) == QImode)
51696 clonei->mask_mode = DImode;
51697 else
51698 clonei->mask_mode = SImode;
51699 break;
51701 if (clonei->simdlen == 0)
51703 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
51704 clonei->simdlen = clonei->vecsize_int;
51705 else
51706 clonei->simdlen = clonei->vecsize_float;
51707 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
51709 else if (clonei->simdlen > 16)
51711	      /* For compatibility with ICC, use the same upper bounds
51712		 for simdlen.  In particular, for CTYPE below use the return type,
51713		 unless the function returns void, in which case use the
51714		 characteristic type.  If it is possible for the given SIMDLEN to
51715		 pass a CTYPE value in registers (8 [XYZ]MM* regs for 32-bit code,
51716		 16 [XYZ]MM* regs for 64-bit code), accept that SIMDLEN; otherwise
51717		 warn and don't emit the corresponding clone.  */
51718 tree ctype = ret_type;
51719 if (TREE_CODE (ret_type) == VOID_TYPE)
51720 ctype = base_type;
51721 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
51722 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
51723 cnt /= clonei->vecsize_int;
51724 else
51725 cnt /= clonei->vecsize_float;
51726 if (cnt > (TARGET_64BIT ? 16 : 8))
51728 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51729 "unsupported simdlen %d", clonei->simdlen);
51730 return 0;
51733 return ret;
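/* For example, an exported function declared as

       #pragma omp declare simd
       int f (int x);

   gets all four vecsize_mangle variants 'b', 'c', 'd' and 'e' (SSE2,
   AVX, AVX2 and AVX-512 clones, as selected in ix86_simd_clone_adjust),
   while a function that is not TREE_PUBLIC only gets the single best
   variant for the current ISA.  This is a sketch of the intent, not of
   the exact mangled clone names.  */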
51736 /* Add target attribute to SIMD clone NODE if needed. */
51738 static void
51739 ix86_simd_clone_adjust (struct cgraph_node *node)
51741 const char *str = NULL;
51742 gcc_assert (node->decl == cfun->decl);
51743 switch (node->simdclone->vecsize_mangle)
51745 case 'b':
51746 if (!TARGET_SSE2)
51747 str = "sse2";
51748 break;
51749 case 'c':
51750 if (!TARGET_AVX)
51751 str = "avx";
51752 break;
51753 case 'd':
51754 if (!TARGET_AVX2)
51755 str = "avx2";
51756 break;
51757 case 'e':
51758 if (!TARGET_AVX512F)
51759 str = "avx512f";
51760 break;
51761 default:
51762 gcc_unreachable ();
51764 if (str == NULL)
51765 return;
51766 push_cfun (NULL);
51767 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
51768 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
51769 gcc_assert (ok);
51770 pop_cfun ();
51771 ix86_reset_previous_fndecl ();
51772 ix86_set_current_function (node->decl);
51775	/* If SIMD clone NODE can't be used in a vectorized loop in the
51776	   current function, return -1; otherwise return the badness of using it
51777	   (0 if it is the most desirable from the vecsize_mangle point of view,
51778	   1 slightly less desirable, etc.).  */
51780 static int
51781 ix86_simd_clone_usable (struct cgraph_node *node)
51783 switch (node->simdclone->vecsize_mangle)
51785 case 'b':
51786 if (!TARGET_SSE2)
51787 return -1;
51788 if (!TARGET_AVX)
51789 return 0;
51790 return TARGET_AVX2 ? 2 : 1;
51791 case 'c':
51792 if (!TARGET_AVX)
51793 return -1;
51794 return TARGET_AVX2 ? 1 : 0;
51795 case 'd':
51796 if (!TARGET_AVX2)
51797 return -1;
51798 return 0;
51799 case 'e':
51800 if (!TARGET_AVX512F)
51801 return -1;
51802 return 0;
51803 default:
51804 gcc_unreachable ();
51808	/* This function adjusts the unroll factor based on
51809	   the hardware capabilities.  For example, bdver3 has
51810	   a loop buffer which makes unrolling of smaller
51811	   loops less important.  This function decides the
51812	   unroll factor using the number of memory references
51813	   (the value 32 is used) as a heuristic.  */
51815 static unsigned
51816 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
51818 basic_block *bbs;
51819 rtx_insn *insn;
51820 unsigned i;
51821 unsigned mem_count = 0;
51823 if (!TARGET_ADJUST_UNROLL)
51824 return nunroll;
51826 /* Count the number of memory references within the loop body.
51827 This value determines the unrolling factor for bdver3 and bdver4
51828 architectures. */
51829 subrtx_iterator::array_type array;
51830 bbs = get_loop_body (loop);
51831 for (i = 0; i < loop->num_nodes; i++)
51832 FOR_BB_INSNS (bbs[i], insn)
51833 if (NONDEBUG_INSN_P (insn))
51834 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
51835 if (const_rtx x = *iter)
51836 if (MEM_P (x))
51838 machine_mode mode = GET_MODE (x);
51839 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
51840 if (n_words > 4)
51841 mem_count += 2;
51842 else
51843 mem_count += 1;
51845 free (bbs);
51847	  if (mem_count && mem_count <= 32)
51848	    return 32 / mem_count;
51850 return nunroll;
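/* A worked example of the heuristic above: a loop body containing four
   word-sized memory references gives mem_count == 4, so the suggested
   unroll factor becomes 32 / 4 == 8 (a reference wider than four words
   counts double); with no memory references, or more than 32 of them,
   the caller's NUNROLL is returned unchanged.  */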
51854 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
51856 static bool
51857 ix86_float_exceptions_rounding_supported_p (void)
51859 /* For x87 floating point with standard excess precision handling,
51860 there is no adddf3 pattern (since x87 floating point only has
51861 XFmode operations) so the default hook implementation gets this
51862 wrong. */
51863 return TARGET_80387 || TARGET_SSE_MATH;
51866 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
51868 static void
51869 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
51871 if (!TARGET_80387 && !TARGET_SSE_MATH)
51872 return;
51873 tree exceptions_var = create_tmp_var_raw (integer_type_node);
51874 if (TARGET_80387)
51876 tree fenv_index_type = build_index_type (size_int (6));
51877 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
51878 tree fenv_var = create_tmp_var_raw (fenv_type);
51879 TREE_ADDRESSABLE (fenv_var) = 1;
51880 tree fenv_ptr = build_pointer_type (fenv_type);
51881 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
51882 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
51883 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
51884 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
51885 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
51886 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
51887 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
51888 tree hold_fnclex = build_call_expr (fnclex, 0);
51889 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
51890 NULL_TREE, NULL_TREE);
51891 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
51892 hold_fnclex);
51893 *clear = build_call_expr (fnclex, 0);
51894 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
51895 tree fnstsw_call = build_call_expr (fnstsw, 0);
51896 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
51897 sw_var, fnstsw_call);
51898 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
51899 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
51900 exceptions_var, exceptions_x87);
51901 *update = build2 (COMPOUND_EXPR, integer_type_node,
51902 sw_mod, update_mod);
51903 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
51904 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
51906 if (TARGET_SSE_MATH)
51908 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
51909 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
51910 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
51911 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
51912 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
51913 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
51914 mxcsr_orig_var, stmxcsr_hold_call);
51915 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
51916 mxcsr_orig_var,
51917 build_int_cst (unsigned_type_node, 0x1f80));
51918 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
51919 build_int_cst (unsigned_type_node, 0xffffffc0));
51920 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
51921 mxcsr_mod_var, hold_mod_val);
51922 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
51923 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
51924 hold_assign_orig, hold_assign_mod);
51925 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
51926 ldmxcsr_hold_call);
51927 if (*hold)
51928 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
51929 else
51930 *hold = hold_all;
51931 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
51932 if (*clear)
51933 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
51934 ldmxcsr_clear_call);
51935 else
51936 *clear = ldmxcsr_clear_call;
51937 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
51938 tree exceptions_sse = fold_convert (integer_type_node,
51939 stxmcsr_update_call);
51940 if (*update)
51942 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
51943 exceptions_var, exceptions_sse);
51944 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
51945 exceptions_var, exceptions_mod);
51946 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
51947 exceptions_assign);
51949 else
51950 *update = build2 (MODIFY_EXPR, integer_type_node,
51951 exceptions_var, exceptions_sse);
51952 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
51953 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
51954 ldmxcsr_update_call);
51956 tree atomic_feraiseexcept
51957 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
51958 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
51959 1, exceptions_var);
51960 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
51961 atomic_feraiseexcept_call);
51964	/* Return the mode to be used for bounds, or VOIDmode
51965	   if bounds are not supported.  */
51967 static machine_mode
51968 ix86_mpx_bound_mode ()
51970 /* Do not support pointer checker if MPX
51971 is not enabled. */
51972 if (!TARGET_MPX)
51974 if (flag_check_pointer_bounds)
51975 warning (0, "Pointer Checker requires MPX support on this target."
51976 " Use -mmpx options to enable MPX.");
51977 return VOIDmode;
51980 return BNDmode;
51983 /* Return constant used to statically initialize constant bounds.
51985 This function is used to create special bound values. For now
51986 only INIT bounds and NONE bounds are expected. More special
51987 values may be added later. */
51989 static tree
51990 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
51992 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
51993 : build_zero_cst (pointer_sized_int_node);
51994 tree high = ub ? build_zero_cst (pointer_sized_int_node)
51995 : build_minus_one_cst (pointer_sized_int_node);
51997 /* This function is supposed to be used to create INIT and
51998 NONE bounds only. */
51999 gcc_assert ((lb == 0 && ub == -1)
52000 || (lb == -1 && ub == 0));
52002 return build_complex (NULL, low, high);
52005 /* Generate a list of statements STMTS to initialize pointer bounds
52006 variable VAR with bounds LB and UB. Return the number of generated
52007 statements. */
52009 static int
52010 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
52012 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
52013 tree lhs, modify, var_p;
52015 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
52016 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
52018 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
52019 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
52020 append_to_statement_list (modify, stmts);
52022 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
52023 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
52024 TYPE_SIZE_UNIT (pointer_sized_int_node)));
52025 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
52026 append_to_statement_list (modify, stmts);
52028 return 2;
52031 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
52032	/* For i386, a common symbol is local only for non-PIE binaries.  For
52033	   x86-64, a common symbol is local only for non-PIE binaries or when
52034	   the linker supports copy relocs in PIE binaries.  */
52036 static bool
52037 ix86_binds_local_p (const_tree exp)
52039 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
52040 (!flag_pic
52041 || (TARGET_64BIT
52042 && HAVE_LD_PIE_COPYRELOC != 0)));
52044 #endif
52046	/* If MEM is in the form of [base+offset], extract the two parts of the
52047	   address into BASE and OFFSET; otherwise return false.  */
52049 static bool
52050 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
52052 rtx addr;
52054 gcc_assert (MEM_P (mem));
52056 addr = XEXP (mem, 0);
52058 if (GET_CODE (addr) == CONST)
52059 addr = XEXP (addr, 0);
52061 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
52063 *base = addr;
52064 *offset = const0_rtx;
52065 return true;
52068 if (GET_CODE (addr) == PLUS
52069 && (REG_P (XEXP (addr, 0))
52070 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
52071 && CONST_INT_P (XEXP (addr, 1)))
52073 *base = XEXP (addr, 0);
52074 *offset = XEXP (addr, 1);
52075 return true;
52078 return false;
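/* For example, a MEM whose address is

       (plus:DI (reg:DI 4 si) (const_int 8))

   is decomposed into BASE = (reg:DI 4 si) and OFFSET = (const_int 8),
   while a bare register or SYMBOL_REF address yields OFFSET = const0_rtx.
   The RTL above is only illustrative.  */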
52081	/* Given OPERANDS of consecutive load/store instructions, check if we
52082	   can merge them into a move-multiple.  LOAD is true if they are load
52083	   instructions.  MODE is the mode of the memory operands.  */
52085 bool
52086 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
52087 machine_mode mode)
52089 HOST_WIDE_INT offval_1, offval_2, msize;
52090 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
52092 if (load)
52094 mem_1 = operands[1];
52095 mem_2 = operands[3];
52096 reg_1 = operands[0];
52097 reg_2 = operands[2];
52099 else
52101 mem_1 = operands[0];
52102 mem_2 = operands[2];
52103 reg_1 = operands[1];
52104 reg_2 = operands[3];
52107 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
52109 if (REGNO (reg_1) != REGNO (reg_2))
52110 return false;
52112 /* Check if the addresses are in the form of [base+offset]. */
52113 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
52114 return false;
52115 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
52116 return false;
52118 /* Check if the bases are the same. */
52119 if (!rtx_equal_p (base_1, base_2))
52120 return false;
52122 offval_1 = INTVAL (offset_1);
52123 offval_2 = INTVAL (offset_2);
52124 msize = GET_MODE_SIZE (mode);
52125 /* Check if mem_1 is adjacent to mem_2 and mem_1 has lower address. */
52126 if (offval_1 + msize != offval_2)
52127 return false;
52129 return true;
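/* For instance, two DImode stores of the same source register to
   [base + 0] and [base + 8] (msize == 8) pass the checks above, since
   the second address is exactly MSIZE bytes past the first; swapping
   the two memory operands, or using different base registers, makes
   the function return false.  This merely restates the conditions
   checked above on a concrete example.  */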
52132 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
52134 static bool
52135 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
52136 optimization_type opt_type)
52138 switch (op)
52140 case asin_optab:
52141 case acos_optab:
52142 case log1p_optab:
52143 case exp_optab:
52144 case exp10_optab:
52145 case exp2_optab:
52146 case expm1_optab:
52147 case ldexp_optab:
52148 case scalb_optab:
52149 case round_optab:
52150 return opt_type == OPTIMIZE_FOR_SPEED;
52152 case rint_optab:
52153 if (SSE_FLOAT_MODE_P (mode1)
52154 && TARGET_SSE_MATH
52155 && !flag_trapping_math
52156 && !TARGET_ROUND)
52157 return opt_type == OPTIMIZE_FOR_SPEED;
52158 return true;
52160 case floor_optab:
52161 case ceil_optab:
52162 case btrunc_optab:
52163 if (SSE_FLOAT_MODE_P (mode1)
52164 && TARGET_SSE_MATH
52165 && !flag_trapping_math
52166 && TARGET_ROUND)
52167 return true;
52168 return opt_type == OPTIMIZE_FOR_SPEED;
52170 case rsqrt_optab:
52171 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
52173 default:
52174 return true;
52178 /* Address space support.
52180 This is not "far pointers" in the 16-bit sense, but an easy way
52181 to use %fs and %gs segment prefixes. Therefore:
52183 (a) All address spaces have the same modes,
52184	   (b) All address spaces have the same address forms,
52185 (c) While %fs and %gs are technically subsets of the generic
52186 address space, they are probably not subsets of each other.
52187 (d) Since we have no access to the segment base register values
52188 without resorting to a system call, we cannot convert a
52189 non-default address space to a default address space.
52190 Therefore we do not claim %fs or %gs are subsets of generic.
52192 Therefore we can (mostly) use the default hooks. */
52194 /* All use of segmentation is assumed to make address 0 valid. */
52196 static bool
52197 ix86_addr_space_zero_address_valid (addr_space_t as)
52199 return as != ADDR_SPACE_GENERIC;
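/* For example, with the GNU C named address spaces __seg_fs / __seg_gs
   (a sketch; see the x86-specific named address space documentation):

       extern struct gs_data __seg_gs *p;

   a dereference of P generates a %gs-prefixed access, and address 0
   relative to %gs may be a perfectly valid object, so the usual
   null-pointer-dereference assumptions must not be applied there.  */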
52202 static void
52203 ix86_init_libfuncs (void)
52205 if (TARGET_64BIT)
52207 set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
52208 set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
52210 else
52212 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
52213 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
52216 #if TARGET_MACHO
52217 darwin_rename_builtins ();
52218 #endif
52221 /* Generate call to __divmoddi4. */
52223 static void
52224 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
52225 rtx op0, rtx op1,
52226 rtx *quot_p, rtx *rem_p)
52228 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
52230 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
52231 mode, 3,
52232 op0, GET_MODE (op0),
52233 op1, GET_MODE (op1),
52234 XEXP (rem, 0), Pmode);
52235 *quot_p = quot;
52236 *rem_p = rem;
52239 /* Set the value of FLT_EVAL_METHOD in float.h. When using only the
52240 FPU, assume that the fpcw is set to extended precision; when using
52241 only SSE, rounding is correct; when using both SSE and the FPU,
52242 the rounding precision is indeterminate, since either may be chosen
52243 apparently at random. */
52245 static enum flt_eval_method
52246 ix86_excess_precision (enum excess_precision_type type)
52248 switch (type)
52250 case EXCESS_PRECISION_TYPE_FAST:
52251 /* The fastest type to promote to will always be the native type,
52252 whether that occurs with implicit excess precision or
52253 otherwise. */
52254 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
52255 case EXCESS_PRECISION_TYPE_STANDARD:
52256 case EXCESS_PRECISION_TYPE_IMPLICIT:
52257 /* Otherwise, the excess precision we want when we are
52258 in a standards compliant mode, and the implicit precision we
52259 provide would be identical were it not for the unpredictable
52260 cases. */
52261 if (!TARGET_80387)
52262 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
52263 else if (!TARGET_MIX_SSE_I387)
52265 if (!TARGET_SSE_MATH)
52266 return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
52267 else if (TARGET_SSE2)
52268 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
52271 /* If we are in standards compliant mode, but we know we will
52272 calculate in unpredictable precision, return
52273 FLT_EVAL_METHOD_FLOAT. There is no reason to introduce explicit
52274 excess precision if the target can't guarantee it will honor
52275 it. */
52276 return (type == EXCESS_PRECISION_TYPE_STANDARD
52277 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
52278 : FLT_EVAL_METHOD_UNPREDICTABLE);
52279 default:
52280 gcc_unreachable ();
52283 return FLT_EVAL_METHOD_UNPREDICTABLE;
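/* As a concrete illustration of the mapping above: for code using only
   the x87 FPU (-mfpmath=387, no SSE math), float and double operations
   are carried out in the 80-bit x87 registers, so FLT_EVAL_METHOD is 2
   (promote to long double); for -mfpmath=sse with SSE2, it is 0
   (evaluate in the nominal type).  */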
52286 /* Target-specific selftests. */
52288 #if CHECKING_P
52290 namespace selftest {
52292 /* Verify that hard regs are dumped as expected (in compact mode). */
52294 static void
52295 ix86_test_dumping_hard_regs ()
52297 ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
52298 ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
52301 /* Test dumping an insn with repeated references to the same SCRATCH,
52302 to verify the rtx_reuse code. */
52304 static void
52305 ix86_test_dumping_memory_blockage ()
52307 set_new_first_and_last_insn (NULL, NULL);
52309 rtx pat = gen_memory_blockage ();
52310 rtx_reuse_manager r;
52311 r.preprocess (pat);
52313	  /* Verify that the repeated references to the SCRATCH show the use of
52314	     reuse IDs.  The first should be prefixed with a reuse ID,
52315	     and the second should be dumped as a "reuse_rtx" of that ID.
52316	     The expected string assumes Pmode == DImode.  */
52317 if (Pmode == DImode)
52318 ASSERT_RTL_DUMP_EQ_WITH_REUSE
52319 ("(cinsn 1 (set (mem/v:BLK (0|scratch:DI) [0 A8])\n"
52320 " (unspec:BLK [\n"
52321 " (mem/v:BLK (reuse_rtx 0) [0 A8])\n"
52322 " ] UNSPEC_MEMORY_BLOCKAGE)))\n", pat, &r);
52325 /* Verify loading an RTL dump; specifically a dump of copying
52326 a param on x86_64 from a hard reg into the frame.
52327 This test is target-specific since the dump contains target-specific
52328 hard reg names. */
52330 static void
52331 ix86_test_loading_dump_fragment_1 ()
52333 rtl_dump_test t (SELFTEST_LOCATION,
52334 locate_file ("x86_64/copy-hard-reg-into-frame.rtl"));
52336 rtx_insn *insn = get_insn_by_uid (1);
52338 /* The block structure and indentation here is purely for
52339 readability; it mirrors the structure of the rtx. */
52340 tree mem_expr;
52342 rtx pat = PATTERN (insn);
52343 ASSERT_EQ (SET, GET_CODE (pat));
52345 rtx dest = SET_DEST (pat);
52346 ASSERT_EQ (MEM, GET_CODE (dest));
52347 /* Verify the "/c" was parsed. */
52348 ASSERT_TRUE (RTX_FLAG (dest, call));
52349 ASSERT_EQ (SImode, GET_MODE (dest));
52351 rtx addr = XEXP (dest, 0);
52352 ASSERT_EQ (PLUS, GET_CODE (addr));
52353 ASSERT_EQ (DImode, GET_MODE (addr));
52355 rtx lhs = XEXP (addr, 0);
52356 /* Verify that the "frame" REG was consolidated. */
52357 ASSERT_RTX_PTR_EQ (frame_pointer_rtx, lhs);
52360 rtx rhs = XEXP (addr, 1);
52361 ASSERT_EQ (CONST_INT, GET_CODE (rhs));
52362 ASSERT_EQ (-4, INTVAL (rhs));
52365 /* Verify the "[1 i+0 S4 A32]" was parsed. */
52366 ASSERT_EQ (1, MEM_ALIAS_SET (dest));
52367 /* "i" should have been handled by synthesizing a global int
52368 variable named "i". */
52369 mem_expr = MEM_EXPR (dest);
52370 ASSERT_NE (mem_expr, NULL);
52371 ASSERT_EQ (VAR_DECL, TREE_CODE (mem_expr));
52372 ASSERT_EQ (integer_type_node, TREE_TYPE (mem_expr));
52373 ASSERT_EQ (IDENTIFIER_NODE, TREE_CODE (DECL_NAME (mem_expr)));
52374 ASSERT_STREQ ("i", IDENTIFIER_POINTER (DECL_NAME (mem_expr)));
52375 /* "+0". */
52376 ASSERT_TRUE (MEM_OFFSET_KNOWN_P (dest));
52377 ASSERT_EQ (0, MEM_OFFSET (dest));
52378 /* "S4". */
52379 ASSERT_EQ (4, MEM_SIZE (dest));
52380 /* "A32. */
52381 ASSERT_EQ (32, MEM_ALIGN (dest));
52384 rtx src = SET_SRC (pat);
52385 ASSERT_EQ (REG, GET_CODE (src));
52386 ASSERT_EQ (SImode, GET_MODE (src));
52387 ASSERT_EQ (5, REGNO (src));
52388 tree reg_expr = REG_EXPR (src);
52389 /* "i" here should point to the same var as for the MEM_EXPR. */
52390 ASSERT_EQ (reg_expr, mem_expr);
52395 /* Verify that the RTL loader copes with a call_insn dump.
52396 This test is target-specific since the dump contains a target-specific
52397 hard reg name. */
52399 static void
52400 ix86_test_loading_call_insn ()
52402	  /* The test dump includes register "xmm0", which requires TARGET_SSE
52403	     to exist.  */
52404 if (!TARGET_SSE)
52405 return;
52407 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/call-insn.rtl"));
52409 rtx_insn *insn = get_insns ();
52410 ASSERT_EQ (CALL_INSN, GET_CODE (insn));
52412 /* "/j". */
52413 ASSERT_TRUE (RTX_FLAG (insn, jump));
52415 rtx pat = PATTERN (insn);
52416 ASSERT_EQ (CALL, GET_CODE (SET_SRC (pat)));
52418 /* Verify REG_NOTES. */
52420 /* "(expr_list:REG_CALL_DECL". */
52421 ASSERT_EQ (EXPR_LIST, GET_CODE (REG_NOTES (insn)));
52422 rtx_expr_list *note0 = as_a <rtx_expr_list *> (REG_NOTES (insn));
52423 ASSERT_EQ (REG_CALL_DECL, REG_NOTE_KIND (note0));
52425 /* "(expr_list:REG_EH_REGION (const_int 0 [0])". */
52426 rtx_expr_list *note1 = note0->next ();
52427 ASSERT_EQ (REG_EH_REGION, REG_NOTE_KIND (note1));
52429 ASSERT_EQ (NULL, note1->next ());
52432 /* Verify CALL_INSN_FUNCTION_USAGE. */
52434 /* "(expr_list:DF (use (reg:DF 21 xmm0))". */
52435 rtx_expr_list *usage
52436 = as_a <rtx_expr_list *> (CALL_INSN_FUNCTION_USAGE (insn));
52437 ASSERT_EQ (EXPR_LIST, GET_CODE (usage));
52438 ASSERT_EQ (DFmode, GET_MODE (usage));
52439 ASSERT_EQ (USE, GET_CODE (usage->element ()));
52440 ASSERT_EQ (NULL, usage->next ());
52444	/* Verify that the RTL loader copes with a dump from print_rtx_function.
52445 This test is target-specific since the dump contains target-specific
52446 hard reg names. */
52448 static void
52449 ix86_test_loading_full_dump ()
52451 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/times-two.rtl"));
52453 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
52455 rtx_insn *insn_1 = get_insn_by_uid (1);
52456 ASSERT_EQ (NOTE, GET_CODE (insn_1));
52458 rtx_insn *insn_7 = get_insn_by_uid (7);
52459 ASSERT_EQ (INSN, GET_CODE (insn_7));
52460 ASSERT_EQ (PARALLEL, GET_CODE (PATTERN (insn_7)));
52462 rtx_insn *insn_15 = get_insn_by_uid (15);
52463 ASSERT_EQ (INSN, GET_CODE (insn_15));
52464 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
52466 /* Verify crtl->return_rtx. */
52467 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
52468 ASSERT_EQ (0, REGNO (crtl->return_rtx));
52469 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
52472 /* Verify that the RTL loader copes with UNSPEC and UNSPEC_VOLATILE insns.
52473 In particular, verify that it correctly loads the 2nd operand.
52474 This test is target-specific since these are machine-specific
52475 operands (and enums). */
52477 static void
52478 ix86_test_loading_unspec ()
52480 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/unspec.rtl"));
52482 ASSERT_STREQ ("test_unspec", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
52484 ASSERT_TRUE (cfun);
52486 /* Test of an UNSPEC. */
52487 rtx_insn *insn = get_insns ();
52488 ASSERT_EQ (INSN, GET_CODE (insn));
52489 rtx set = single_set (insn);
52490 ASSERT_NE (NULL, set);
52491 rtx dst = SET_DEST (set);
52492 ASSERT_EQ (MEM, GET_CODE (dst));
52493 rtx src = SET_SRC (set);
52494 ASSERT_EQ (UNSPEC, GET_CODE (src));
52495 ASSERT_EQ (BLKmode, GET_MODE (src));
52496 ASSERT_EQ (UNSPEC_MEMORY_BLOCKAGE, XINT (src, 1));
52498 rtx v0 = XVECEXP (src, 0, 0);
52500 /* Verify that the two uses of the first SCRATCH have pointer
52501 equality. */
52502 rtx scratch_a = XEXP (dst, 0);
52503 ASSERT_EQ (SCRATCH, GET_CODE (scratch_a));
52505 rtx scratch_b = XEXP (v0, 0);
52506 ASSERT_EQ (SCRATCH, GET_CODE (scratch_b));
52508 ASSERT_EQ (scratch_a, scratch_b);
52510 /* Verify that the two mems are thus treated as equal. */
52511 ASSERT_TRUE (rtx_equal_p (dst, v0));
52513	  /* Verify that the insn is recognized.  */
52514 ASSERT_NE(-1, recog_memoized (insn));
52516 /* Test of an UNSPEC_VOLATILE, which has its own enum values. */
52517 insn = NEXT_INSN (insn);
52518 ASSERT_EQ (INSN, GET_CODE (insn));
52520 set = single_set (insn);
52521 ASSERT_NE (NULL, set);
52523 src = SET_SRC (set);
52524 ASSERT_EQ (UNSPEC_VOLATILE, GET_CODE (src));
52525 ASSERT_EQ (UNSPECV_RDTSCP, XINT (src, 1));
52528 /* Run all target-specific selftests. */
52530 static void
52531 ix86_run_selftests (void)
52533 ix86_test_dumping_hard_regs ();
52534 ix86_test_dumping_memory_blockage ();
52536 /* Various tests of loading RTL dumps, here because they contain
52537 ix86-isms (e.g. names of hard regs). */
52538 ix86_test_loading_dump_fragment_1 ();
52539 ix86_test_loading_call_insn ();
52540 ix86_test_loading_full_dump ();
52541 ix86_test_loading_unspec ();
52544 } // namespace selftest
52546 #endif /* CHECKING_P */
52548 /* Initialize the GCC target structure. */
52549 #undef TARGET_RETURN_IN_MEMORY
52550 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
52552 #undef TARGET_LEGITIMIZE_ADDRESS
52553 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
52555 #undef TARGET_ATTRIBUTE_TABLE
52556 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
52557 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
52558 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
52559 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
52560 # undef TARGET_MERGE_DECL_ATTRIBUTES
52561 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
52562 #endif
52564 #undef TARGET_COMP_TYPE_ATTRIBUTES
52565 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
52567 #undef TARGET_INIT_BUILTINS
52568 #define TARGET_INIT_BUILTINS ix86_init_builtins
52569 #undef TARGET_BUILTIN_DECL
52570 #define TARGET_BUILTIN_DECL ix86_builtin_decl
52571 #undef TARGET_EXPAND_BUILTIN
52572 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
52574 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
52575 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
52576 ix86_builtin_vectorized_function
52578 #undef TARGET_VECTORIZE_BUILTIN_GATHER
52579 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
52581 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
52582 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
52584 #undef TARGET_BUILTIN_RECIPROCAL
52585 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
52587 #undef TARGET_ASM_FUNCTION_EPILOGUE
52588 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
52590 #undef TARGET_ENCODE_SECTION_INFO
52591 #ifndef SUBTARGET_ENCODE_SECTION_INFO
52592 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
52593 #else
52594 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
52595 #endif
52597 #undef TARGET_ASM_OPEN_PAREN
52598 #define TARGET_ASM_OPEN_PAREN ""
52599 #undef TARGET_ASM_CLOSE_PAREN
52600 #define TARGET_ASM_CLOSE_PAREN ""
52602 #undef TARGET_ASM_BYTE_OP
52603 #define TARGET_ASM_BYTE_OP ASM_BYTE
52605 #undef TARGET_ASM_ALIGNED_HI_OP
52606 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
52607 #undef TARGET_ASM_ALIGNED_SI_OP
52608 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
52609 #ifdef ASM_QUAD
52610 #undef TARGET_ASM_ALIGNED_DI_OP
52611 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
52612 #endif
52614 #undef TARGET_PROFILE_BEFORE_PROLOGUE
52615 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
52617 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
52618 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
52620 #undef TARGET_ASM_UNALIGNED_HI_OP
52621 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
52622 #undef TARGET_ASM_UNALIGNED_SI_OP
52623 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
52624 #undef TARGET_ASM_UNALIGNED_DI_OP
52625 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
52627 #undef TARGET_PRINT_OPERAND
52628 #define TARGET_PRINT_OPERAND ix86_print_operand
52629 #undef TARGET_PRINT_OPERAND_ADDRESS
52630 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
52631 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
52632 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
52633 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
52634 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
52636 #undef TARGET_SCHED_INIT_GLOBAL
52637 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
52638 #undef TARGET_SCHED_ADJUST_COST
52639 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
52640 #undef TARGET_SCHED_ISSUE_RATE
52641 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
52642 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
52643 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
52644 ia32_multipass_dfa_lookahead
52645 #undef TARGET_SCHED_MACRO_FUSION_P
52646 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
52647 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
52648 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
52650 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
52651 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
52653 #undef TARGET_MEMMODEL_CHECK
52654 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
52656 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
52657 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
52659 #ifdef HAVE_AS_TLS
52660 #undef TARGET_HAVE_TLS
52661 #define TARGET_HAVE_TLS true
52662 #endif
52663 #undef TARGET_CANNOT_FORCE_CONST_MEM
52664 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
52665 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
52666 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
52668 #undef TARGET_DELEGITIMIZE_ADDRESS
52669 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
52671 #undef TARGET_MS_BITFIELD_LAYOUT_P
52672 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
52674 #if TARGET_MACHO
52675 #undef TARGET_BINDS_LOCAL_P
52676 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
52677 #else
52678 #undef TARGET_BINDS_LOCAL_P
52679 #define TARGET_BINDS_LOCAL_P ix86_binds_local_p
52680 #endif
52681 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
52682 #undef TARGET_BINDS_LOCAL_P
52683 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
52684 #endif
52686 #undef TARGET_ASM_OUTPUT_MI_THUNK
52687 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
52688 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
52689 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
52691 #undef TARGET_ASM_FILE_START
52692 #define TARGET_ASM_FILE_START x86_file_start
52694 #undef TARGET_OPTION_OVERRIDE
52695 #define TARGET_OPTION_OVERRIDE ix86_option_override
52697 #undef TARGET_REGISTER_MOVE_COST
52698 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
52699 #undef TARGET_MEMORY_MOVE_COST
52700 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
52701 #undef TARGET_RTX_COSTS
52702 #define TARGET_RTX_COSTS ix86_rtx_costs
52703 #undef TARGET_ADDRESS_COST
52704 #define TARGET_ADDRESS_COST ix86_address_cost
52706 #undef TARGET_FLAGS_REGNUM
52707 #define TARGET_FLAGS_REGNUM FLAGS_REG
52708 #undef TARGET_FIXED_CONDITION_CODE_REGS
52709 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
52710 #undef TARGET_CC_MODES_COMPATIBLE
52711 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
52713 #undef TARGET_MACHINE_DEPENDENT_REORG
52714 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
52716 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
52717 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
52719 #undef TARGET_BUILD_BUILTIN_VA_LIST
52720 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
52722 #undef TARGET_FOLD_BUILTIN
52723 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
52725 #undef TARGET_GIMPLE_FOLD_BUILTIN
52726 #define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin
52728 #undef TARGET_COMPARE_VERSION_PRIORITY
52729 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
52731 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
52732 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
52733 ix86_generate_version_dispatcher_body
52735 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
52736 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
52737 ix86_get_function_versions_dispatcher
52739 #undef TARGET_ENUM_VA_LIST_P
52740 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
52742 #undef TARGET_FN_ABI_VA_LIST
52743 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
52745 #undef TARGET_CANONICAL_VA_LIST_TYPE
52746 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
52748 #undef TARGET_EXPAND_BUILTIN_VA_START
52749 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
52751 #undef TARGET_MD_ASM_ADJUST
52752 #define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust
52754 #undef TARGET_C_EXCESS_PRECISION
52755 #define TARGET_C_EXCESS_PRECISION ix86_excess_precision
52756 #undef TARGET_PROMOTE_PROTOTYPES
52757 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
52758 #undef TARGET_SETUP_INCOMING_VARARGS
52759 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
52760 #undef TARGET_MUST_PASS_IN_STACK
52761 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
52762 #undef TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS
52763 #define TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS ix86_allocate_stack_slots_for_args
52764 #undef TARGET_FUNCTION_ARG_ADVANCE
52765 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
52766 #undef TARGET_FUNCTION_ARG
52767 #define TARGET_FUNCTION_ARG ix86_function_arg
52768 #undef TARGET_INIT_PIC_REG
52769 #define TARGET_INIT_PIC_REG ix86_init_pic_reg
52770 #undef TARGET_USE_PSEUDO_PIC_REG
52771 #define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
52772 #undef TARGET_FUNCTION_ARG_BOUNDARY
52773 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
52774 #undef TARGET_PASS_BY_REFERENCE
52775 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
52776 #undef TARGET_INTERNAL_ARG_POINTER
52777 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
52778 #undef TARGET_UPDATE_STACK_BOUNDARY
52779 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
52780 #undef TARGET_GET_DRAP_RTX
52781 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
52782 #undef TARGET_STRICT_ARGUMENT_NAMING
52783 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
52784 #undef TARGET_STATIC_CHAIN
52785 #define TARGET_STATIC_CHAIN ix86_static_chain
52786 #undef TARGET_TRAMPOLINE_INIT
52787 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
52788 #undef TARGET_RETURN_POPS_ARGS
52789 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
52791 #undef TARGET_WARN_FUNC_RETURN
52792 #define TARGET_WARN_FUNC_RETURN ix86_warn_func_return
52794 #undef TARGET_LEGITIMATE_COMBINED_INSN
52795 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
52797 #undef TARGET_ASAN_SHADOW_OFFSET
52798 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
52800 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
52801 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
52803 #undef TARGET_SCALAR_MODE_SUPPORTED_P
52804 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
52806 #undef TARGET_VECTOR_MODE_SUPPORTED_P
52807 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
52809 #undef TARGET_C_MODE_FOR_SUFFIX
52810 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
52812 #ifdef HAVE_AS_TLS
52813 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
52814 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
52815 #endif
52817 #ifdef SUBTARGET_INSERT_ATTRIBUTES
52818 #undef TARGET_INSERT_ATTRIBUTES
52819 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
52820 #endif
52822 #undef TARGET_MANGLE_TYPE
52823 #define TARGET_MANGLE_TYPE ix86_mangle_type
52825 #ifdef TARGET_THREAD_SSP_OFFSET
52826 #undef TARGET_STACK_PROTECT_GUARD
52827 #define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard
52828 #endif
52830 #if !TARGET_MACHO
52831 #undef TARGET_STACK_PROTECT_FAIL
52832 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
52833 #endif
52835 #undef TARGET_FUNCTION_VALUE
52836 #define TARGET_FUNCTION_VALUE ix86_function_value
52838 #undef TARGET_FUNCTION_VALUE_REGNO_P
52839 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
52841 #undef TARGET_PROMOTE_FUNCTION_MODE
52842 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
52844 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
52845 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change
52847 #undef TARGET_MEMBER_TYPE_FORCES_BLK
52848 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
52850 #undef TARGET_INSTANTIATE_DECLS
52851 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
52853 #undef TARGET_SECONDARY_RELOAD
52854 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
52856 #undef TARGET_CLASS_MAX_NREGS
52857 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
52859 #undef TARGET_PREFERRED_RELOAD_CLASS
52860 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
52861 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
52862 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
52863 #undef TARGET_CLASS_LIKELY_SPILLED_P
52864 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
52866 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
52867 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
52868 ix86_builtin_vectorization_cost
52869 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
52870 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
52871 ix86_vectorize_vec_perm_const_ok
52872 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
52873 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
52874 ix86_preferred_simd_mode
52875 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
52876 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
52877 ix86_autovectorize_vector_sizes
52878 #undef TARGET_VECTORIZE_GET_MASK_MODE
52879 #define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
52880 #undef TARGET_VECTORIZE_INIT_COST
52881 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
52882 #undef TARGET_VECTORIZE_ADD_STMT_COST
52883 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
52884 #undef TARGET_VECTORIZE_FINISH_COST
52885 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
52886 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
52887 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
52889 #undef TARGET_SET_CURRENT_FUNCTION
52890 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
52892 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
52893 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
52895 #undef TARGET_OPTION_SAVE
52896 #define TARGET_OPTION_SAVE ix86_function_specific_save
52898 #undef TARGET_OPTION_RESTORE
52899 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
52901 #undef TARGET_OPTION_POST_STREAM_IN
52902 #define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in
52904 #undef TARGET_OPTION_PRINT
52905 #define TARGET_OPTION_PRINT ix86_function_specific_print
52907 #undef TARGET_OPTION_FUNCTION_VERSIONS
52908 #define TARGET_OPTION_FUNCTION_VERSIONS common_function_versions
52910 #undef TARGET_CAN_INLINE_P
52911 #define TARGET_CAN_INLINE_P ix86_can_inline_p
52913 #undef TARGET_LEGITIMATE_ADDRESS_P
52914 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
52916 #undef TARGET_REGISTER_PRIORITY
52917 #define TARGET_REGISTER_PRIORITY ix86_register_priority
52919 #undef TARGET_REGISTER_USAGE_LEVELING_P
52920 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
52922 #undef TARGET_LEGITIMATE_CONSTANT_P
52923 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
52925 #undef TARGET_COMPUTE_FRAME_LAYOUT
52926 #define TARGET_COMPUTE_FRAME_LAYOUT ix86_compute_frame_layout
52928 #undef TARGET_FRAME_POINTER_REQUIRED
52929 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
52931 #undef TARGET_CAN_ELIMINATE
52932 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
52934 #undef TARGET_EXTRA_LIVE_ON_ENTRY
52935 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
52937 #undef TARGET_ASM_CODE_END
52938 #define TARGET_ASM_CODE_END ix86_code_end
52940 #undef TARGET_CONDITIONAL_REGISTER_USAGE
52941 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
52943 #undef TARGET_LOOP_UNROLL_ADJUST
52944 #define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
52946 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
52947 #undef TARGET_SPILL_CLASS
52948 #define TARGET_SPILL_CLASS ix86_spill_class
52950 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
52951 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
52952 ix86_simd_clone_compute_vecsize_and_simdlen
52954 #undef TARGET_SIMD_CLONE_ADJUST
52955 #define TARGET_SIMD_CLONE_ADJUST \
52956 ix86_simd_clone_adjust
52958 #undef TARGET_SIMD_CLONE_USABLE
52959 #define TARGET_SIMD_CLONE_USABLE \
52960 ix86_simd_clone_usable
52962 #undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
52963 #define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
52964 ix86_float_exceptions_rounding_supported_p
52966 #undef TARGET_MODE_EMIT
52967 #define TARGET_MODE_EMIT ix86_emit_mode_set
52969 #undef TARGET_MODE_NEEDED
52970 #define TARGET_MODE_NEEDED ix86_mode_needed
52972 #undef TARGET_MODE_AFTER
52973 #define TARGET_MODE_AFTER ix86_mode_after
52975 #undef TARGET_MODE_ENTRY
52976 #define TARGET_MODE_ENTRY ix86_mode_entry
52978 #undef TARGET_MODE_EXIT
52979 #define TARGET_MODE_EXIT ix86_mode_exit
52981 #undef TARGET_MODE_PRIORITY
52982 #define TARGET_MODE_PRIORITY ix86_mode_priority
52984 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
52985 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
52987 #undef TARGET_LOAD_BOUNDS_FOR_ARG
52988 #define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds
52990 #undef TARGET_STORE_BOUNDS_FOR_ARG
52991 #define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds
52993 #undef TARGET_LOAD_RETURNED_BOUNDS
52994 #define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds
52996 #undef TARGET_STORE_RETURNED_BOUNDS
52997 #define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds
52999 #undef TARGET_CHKP_BOUND_MODE
53000 #define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode
53002 #undef TARGET_BUILTIN_CHKP_FUNCTION
53003 #define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function
53005 #undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
53006 #define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds
53008 #undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
53009 #define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant
53011 #undef TARGET_CHKP_INITIALIZE_BOUNDS
53012 #define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds
53014 #undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
53015 #define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds
53017 #undef TARGET_OFFLOAD_OPTIONS
53018 #define TARGET_OFFLOAD_OPTIONS \
53019 ix86_offload_options
53021 #undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
53022 #define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512
53024 #undef TARGET_OPTAB_SUPPORTED_P
53025 #define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p
53027 #undef TARGET_HARD_REGNO_SCRATCH_OK
53028 #define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok
53030 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
53031 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1
53033 #undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
53034 #define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p
53036 #undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
53037 #define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid
53039 #undef TARGET_INIT_LIBFUNCS
53040 #define TARGET_INIT_LIBFUNCS ix86_init_libfuncs
53042 #undef TARGET_EXPAND_DIVMOD_LIBFUNC
53043 #define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc
53045 #undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
53046 #define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost
53048 #undef TARGET_NOCE_CONVERSION_PROFITABLE_P
53049 #define TARGET_NOCE_CONVERSION_PROFITABLE_P ix86_noce_conversion_profitable_p
53051 #if CHECKING_P
53052 #undef TARGET_RUN_TARGET_SELFTESTS
53053 #define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
53054 #endif /* #if CHECKING_P */
53056 struct gcc_target targetm = TARGET_INITIALIZER;
53058 #include "gt-i386.h"