[6/77] Make GET_MODE_WIDER return an opt_mode
[official-gcc.git] / gcc / config / i386 / i386.c
blob 88850bde786a9f9f29001d83c5e1cb85ab9a3e4b
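The series this file belongs to changes GET_MODE_WIDER_MODE so that it returns an opt_mode rather than a bare mode, and callers test the result for existence instead of comparing it against VOIDmode. A minimal sketch of the new calling convention follows; the exact helper names (`exists`, the out-parameter form) are assumptions based on the opt_mode API introduced by the series, not something shown in this blob:

/* Sketch only, not part of i386.c: step to the next wider mode using the
   opt_mode-returning GET_MODE_WIDER_MODE.  */
machine_mode wider;
if (GET_MODE_WIDER_MODE (mode).exists (&wider))
  {
    /* WIDER is a valid wider mode here; before this series the result
       was compared against VOIDmode instead.  */
  }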
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2017 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "backend.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "memmodel.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "cfgloop.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "stringpool.h"
33 #include "expmed.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "cgraph.h"
39 #include "diagnostic.h"
40 #include "cfgbuild.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "attribs.h"
44 #include "calls.h"
45 #include "stor-layout.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "insn-attr.h"
49 #include "flags.h"
50 #include "except.h"
51 #include "explow.h"
52 #include "expr.h"
53 #include "cfgrtl.h"
54 #include "common/common-target.h"
55 #include "langhooks.h"
56 #include "reload.h"
57 #include "gimplify.h"
58 #include "dwarf2.h"
59 #include "tm-constrs.h"
60 #include "params.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "tree-chkp.h"
75 #include "rtl-chkp.h"
76 #include "dbgcnt.h"
77 #include "case-cfn-macros.h"
78 #include "regrename.h"
79 #include "dojump.h"
80 #include "fold-const-call.h"
81 #include "tree-vrp.h"
82 #include "tree-ssanames.h"
83 #include "selftest.h"
84 #include "selftest-rtl.h"
85 #include "print-rtl.h"
86 #include "intl.h"
87 #include "ifcvt.h"
88 #include "symbol-summary.h"
89 #include "ipa-prop.h"
90 #include "ipa-fnsummary.h"
92 /* This file should be included last. */
93 #include "target-def.h"
95 static rtx legitimize_dllimport_symbol (rtx, bool);
96 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
97 static rtx legitimize_pe_coff_symbol (rtx, bool);
98 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
99 static bool ix86_save_reg (unsigned int, bool, bool);
100 static bool ix86_function_naked (const_tree);
102 #ifndef CHECK_STACK_LIMIT
103 #define CHECK_STACK_LIMIT (-1)
104 #endif
106 /* Return the index of the given mode in the multiply and divide cost tables. */
107 #define MODE_INDEX(mode) \
108 ((mode) == QImode ? 0 \
109 : (mode) == HImode ? 1 \
110 : (mode) == SImode ? 2 \
111 : (mode) == DImode ? 3 \
112 : 4)
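/* Illustration (a sketch, not part of this file): MODE_INDEX is used as the
   subscript into the per-mode cost arrays defined below, so, assuming the
   usual mult_init[]/divide[] fields of struct processor_costs, a DImode
   divide would be costed roughly as

       ix86_cost->divide[MODE_INDEX (DImode)]    ...i.e. divide[3]

   while any mode other than QI/HI/SI/DI falls into the final "other" slot
   at index 4.  */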
114 /* Processor costs (relative to an add) */
115 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
116 #define COSTS_N_BYTES(N) ((N) * 2)
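/* Worked example (follows directly from the two definitions above): with
   COSTS_N_INSNS (N) == (N) * 4 and an add taking 2 bytes, the size table
   below charges COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1) for an add,
   so byte counts and the speed tables end up on the same scale.  */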
118 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
120 static stringop_algs ix86_size_memcpy[2] = {
121 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
122 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
123 static stringop_algs ix86_size_memset[2] = {
124 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
125 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
127 const
128 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
129 COSTS_N_BYTES (2), /* cost of an add instruction */
130 COSTS_N_BYTES (3), /* cost of a lea instruction */
131 COSTS_N_BYTES (2), /* variable shift costs */
132 COSTS_N_BYTES (3), /* constant shift costs */
133 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
134 COSTS_N_BYTES (3), /* HI */
135 COSTS_N_BYTES (3), /* SI */
136 COSTS_N_BYTES (3), /* DI */
137 COSTS_N_BYTES (5)}, /* other */
138 0, /* cost of multiply per each bit set */
139 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
140 COSTS_N_BYTES (3), /* HI */
141 COSTS_N_BYTES (3), /* SI */
142 COSTS_N_BYTES (3), /* DI */
143 COSTS_N_BYTES (5)}, /* other */
144 COSTS_N_BYTES (3), /* cost of movsx */
145 COSTS_N_BYTES (3), /* cost of movzx */
146 0, /* "large" insn */
147 2, /* MOVE_RATIO */
148 2, /* cost for loading QImode using movzbl */
149 {2, 2, 2}, /* cost of loading integer registers
150 in QImode, HImode and SImode.
151 Relative to reg-reg move (2). */
152 {2, 2, 2}, /* cost of storing integer registers */
153 2, /* cost of reg,reg fld/fst */
154 {2, 2, 2}, /* cost of loading fp registers
155 in SFmode, DFmode and XFmode */
156 {2, 2, 2}, /* cost of storing fp registers
157 in SFmode, DFmode and XFmode */
158 3, /* cost of moving MMX register */
159 {3, 3}, /* cost of loading MMX registers
160 in SImode and DImode */
161 {3, 3}, /* cost of storing MMX registers
162 in SImode and DImode */
163 3, /* cost of moving SSE register */
164 {3, 3, 3}, /* cost of loading SSE registers
165 in SImode, DImode and TImode */
166 {3, 3, 3}, /* cost of storing SSE registers
167 in SImode, DImode and TImode */
168 3, /* MMX or SSE register to integer */
169 0, /* size of l1 cache */
170 0, /* size of l2 cache */
171 0, /* size of prefetch block */
172 0, /* number of parallel prefetches */
173 2, /* Branch cost */
174 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
175 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
176 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
177 COSTS_N_BYTES (2), /* cost of FABS instruction. */
178 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
179 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
180 ix86_size_memcpy,
181 ix86_size_memset,
182 1, /* scalar_stmt_cost. */
183 1, /* scalar load_cost. */
184 1, /* scalar_store_cost. */
185 1, /* vec_stmt_cost. */
186 1, /* vec_to_scalar_cost. */
187 1, /* scalar_to_vec_cost. */
188 1, /* vec_align_load_cost. */
189 1, /* vec_unalign_load_cost. */
190 1, /* vec_store_cost. */
191 1, /* cond_taken_branch_cost. */
192 1, /* cond_not_taken_branch_cost. */
195 /* Processor costs (relative to an add) */
196 static stringop_algs i386_memcpy[2] = {
197 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
198 DUMMY_STRINGOP_ALGS};
199 static stringop_algs i386_memset[2] = {
200 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
201 DUMMY_STRINGOP_ALGS};
203 static const
204 struct processor_costs i386_cost = { /* 386 specific costs */
205 COSTS_N_INSNS (1), /* cost of an add instruction */
206 COSTS_N_INSNS (1), /* cost of a lea instruction */
207 COSTS_N_INSNS (3), /* variable shift costs */
208 COSTS_N_INSNS (2), /* constant shift costs */
209 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
210 COSTS_N_INSNS (6), /* HI */
211 COSTS_N_INSNS (6), /* SI */
212 COSTS_N_INSNS (6), /* DI */
213 COSTS_N_INSNS (6)}, /* other */
214 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
215 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
216 COSTS_N_INSNS (23), /* HI */
217 COSTS_N_INSNS (23), /* SI */
218 COSTS_N_INSNS (23), /* DI */
219 COSTS_N_INSNS (23)}, /* other */
220 COSTS_N_INSNS (3), /* cost of movsx */
221 COSTS_N_INSNS (2), /* cost of movzx */
222 15, /* "large" insn */
223 3, /* MOVE_RATIO */
224 4, /* cost for loading QImode using movzbl */
225 {2, 4, 2}, /* cost of loading integer registers
226 in QImode, HImode and SImode.
227 Relative to reg-reg move (2). */
228 {2, 4, 2}, /* cost of storing integer registers */
229 2, /* cost of reg,reg fld/fst */
230 {8, 8, 8}, /* cost of loading fp registers
231 in SFmode, DFmode and XFmode */
232 {8, 8, 8}, /* cost of storing fp registers
233 in SFmode, DFmode and XFmode */
234 2, /* cost of moving MMX register */
235 {4, 8}, /* cost of loading MMX registers
236 in SImode and DImode */
237 {4, 8}, /* cost of storing MMX registers
238 in SImode and DImode */
239 2, /* cost of moving SSE register */
240 {4, 8, 16}, /* cost of loading SSE registers
241 in SImode, DImode and TImode */
242 {4, 8, 16}, /* cost of storing SSE registers
243 in SImode, DImode and TImode */
244 3, /* MMX or SSE register to integer */
245 0, /* size of l1 cache */
246 0, /* size of l2 cache */
247 0, /* size of prefetch block */
248 0, /* number of parallel prefetches */
249 1, /* Branch cost */
250 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
251 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
252 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
253 COSTS_N_INSNS (22), /* cost of FABS instruction. */
254 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
255 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
256 i386_memcpy,
257 i386_memset,
258 1, /* scalar_stmt_cost. */
259 1, /* scalar load_cost. */
260 1, /* scalar_store_cost. */
261 1, /* vec_stmt_cost. */
262 1, /* vec_to_scalar_cost. */
263 1, /* scalar_to_vec_cost. */
264 1, /* vec_align_load_cost. */
265 2, /* vec_unalign_load_cost. */
266 1, /* vec_store_cost. */
267 3, /* cond_taken_branch_cost. */
268 1, /* cond_not_taken_branch_cost. */
271 static stringop_algs i486_memcpy[2] = {
272 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
273 DUMMY_STRINGOP_ALGS};
274 static stringop_algs i486_memset[2] = {
275 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
276 DUMMY_STRINGOP_ALGS};
278 static const
279 struct processor_costs i486_cost = { /* 486 specific costs */
280 COSTS_N_INSNS (1), /* cost of an add instruction */
281 COSTS_N_INSNS (1), /* cost of a lea instruction */
282 COSTS_N_INSNS (3), /* variable shift costs */
283 COSTS_N_INSNS (2), /* constant shift costs */
284 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
285 COSTS_N_INSNS (12), /* HI */
286 COSTS_N_INSNS (12), /* SI */
287 COSTS_N_INSNS (12), /* DI */
288 COSTS_N_INSNS (12)}, /* other */
289 1, /* cost of multiply per each bit set */
290 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
291 COSTS_N_INSNS (40), /* HI */
292 COSTS_N_INSNS (40), /* SI */
293 COSTS_N_INSNS (40), /* DI */
294 COSTS_N_INSNS (40)}, /* other */
295 COSTS_N_INSNS (3), /* cost of movsx */
296 COSTS_N_INSNS (2), /* cost of movzx */
297 15, /* "large" insn */
298 3, /* MOVE_RATIO */
299 4, /* cost for loading QImode using movzbl */
300 {2, 4, 2}, /* cost of loading integer registers
301 in QImode, HImode and SImode.
302 Relative to reg-reg move (2). */
303 {2, 4, 2}, /* cost of storing integer registers */
304 2, /* cost of reg,reg fld/fst */
305 {8, 8, 8}, /* cost of loading fp registers
306 in SFmode, DFmode and XFmode */
307 {8, 8, 8}, /* cost of storing fp registers
308 in SFmode, DFmode and XFmode */
309 2, /* cost of moving MMX register */
310 {4, 8}, /* cost of loading MMX registers
311 in SImode and DImode */
312 {4, 8}, /* cost of storing MMX registers
313 in SImode and DImode */
314 2, /* cost of moving SSE register */
315 {4, 8, 16}, /* cost of loading SSE registers
316 in SImode, DImode and TImode */
317 {4, 8, 16}, /* cost of storing SSE registers
318 in SImode, DImode and TImode */
319 3, /* MMX or SSE register to integer */
320 4, /* size of l1 cache. 486 has 8kB cache
321 shared for code and data, so 4kB is
322 not really precise. */
323 4, /* size of l2 cache */
324 0, /* size of prefetch block */
325 0, /* number of parallel prefetches */
326 1, /* Branch cost */
327 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
328 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
329 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
330 COSTS_N_INSNS (3), /* cost of FABS instruction. */
331 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
332 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
333 i486_memcpy,
334 i486_memset,
335 1, /* scalar_stmt_cost. */
336 1, /* scalar load_cost. */
337 1, /* scalar_store_cost. */
338 1, /* vec_stmt_cost. */
339 1, /* vec_to_scalar_cost. */
340 1, /* scalar_to_vec_cost. */
341 1, /* vec_align_load_cost. */
342 2, /* vec_unalign_load_cost. */
343 1, /* vec_store_cost. */
344 3, /* cond_taken_branch_cost. */
345 1, /* cond_not_taken_branch_cost. */
348 static stringop_algs pentium_memcpy[2] = {
349 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
350 DUMMY_STRINGOP_ALGS};
351 static stringop_algs pentium_memset[2] = {
352 {libcall, {{-1, rep_prefix_4_byte, false}}},
353 DUMMY_STRINGOP_ALGS};
355 static const
356 struct processor_costs pentium_cost = {
357 COSTS_N_INSNS (1), /* cost of an add instruction */
358 COSTS_N_INSNS (1), /* cost of a lea instruction */
359 COSTS_N_INSNS (4), /* variable shift costs */
360 COSTS_N_INSNS (1), /* constant shift costs */
361 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
362 COSTS_N_INSNS (11), /* HI */
363 COSTS_N_INSNS (11), /* SI */
364 COSTS_N_INSNS (11), /* DI */
365 COSTS_N_INSNS (11)}, /* other */
366 0, /* cost of multiply per each bit set */
367 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
368 COSTS_N_INSNS (25), /* HI */
369 COSTS_N_INSNS (25), /* SI */
370 COSTS_N_INSNS (25), /* DI */
371 COSTS_N_INSNS (25)}, /* other */
372 COSTS_N_INSNS (3), /* cost of movsx */
373 COSTS_N_INSNS (2), /* cost of movzx */
374 8, /* "large" insn */
375 6, /* MOVE_RATIO */
376 6, /* cost for loading QImode using movzbl */
377 {2, 4, 2}, /* cost of loading integer registers
378 in QImode, HImode and SImode.
379 Relative to reg-reg move (2). */
380 {2, 4, 2}, /* cost of storing integer registers */
381 2, /* cost of reg,reg fld/fst */
382 {2, 2, 6}, /* cost of loading fp registers
383 in SFmode, DFmode and XFmode */
384 {4, 4, 6}, /* cost of storing fp registers
385 in SFmode, DFmode and XFmode */
386 8, /* cost of moving MMX register */
387 {8, 8}, /* cost of loading MMX registers
388 in SImode and DImode */
389 {8, 8}, /* cost of storing MMX registers
390 in SImode and DImode */
391 2, /* cost of moving SSE register */
392 {4, 8, 16}, /* cost of loading SSE registers
393 in SImode, DImode and TImode */
394 {4, 8, 16}, /* cost of storing SSE registers
395 in SImode, DImode and TImode */
396 3, /* MMX or SSE register to integer */
397 8, /* size of l1 cache. */
398 8, /* size of l2 cache */
399 0, /* size of prefetch block */
400 0, /* number of parallel prefetches */
401 2, /* Branch cost */
402 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
403 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
404 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
405 COSTS_N_INSNS (1), /* cost of FABS instruction. */
406 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
407 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
408 pentium_memcpy,
409 pentium_memset,
410 1, /* scalar_stmt_cost. */
411 1, /* scalar load_cost. */
412 1, /* scalar_store_cost. */
413 1, /* vec_stmt_cost. */
414 1, /* vec_to_scalar_cost. */
415 1, /* scalar_to_vec_cost. */
416 1, /* vec_align_load_cost. */
417 2, /* vec_unalign_load_cost. */
418 1, /* vec_store_cost. */
419 3, /* cond_taken_branch_cost. */
420 1, /* cond_not_taken_branch_cost. */
423 static const
424 struct processor_costs lakemont_cost = {
425 COSTS_N_INSNS (1), /* cost of an add instruction */
426 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
427 COSTS_N_INSNS (1), /* variable shift costs */
428 COSTS_N_INSNS (1), /* constant shift costs */
429 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
430 COSTS_N_INSNS (11), /* HI */
431 COSTS_N_INSNS (11), /* SI */
432 COSTS_N_INSNS (11), /* DI */
433 COSTS_N_INSNS (11)}, /* other */
434 0, /* cost of multiply per each bit set */
435 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
436 COSTS_N_INSNS (25), /* HI */
437 COSTS_N_INSNS (25), /* SI */
438 COSTS_N_INSNS (25), /* DI */
439 COSTS_N_INSNS (25)}, /* other */
440 COSTS_N_INSNS (3), /* cost of movsx */
441 COSTS_N_INSNS (2), /* cost of movzx */
442 8, /* "large" insn */
443 17, /* MOVE_RATIO */
444 6, /* cost for loading QImode using movzbl */
445 {2, 4, 2}, /* cost of loading integer registers
446 in QImode, HImode and SImode.
447 Relative to reg-reg move (2). */
448 {2, 4, 2}, /* cost of storing integer registers */
449 2, /* cost of reg,reg fld/fst */
450 {2, 2, 6}, /* cost of loading fp registers
451 in SFmode, DFmode and XFmode */
452 {4, 4, 6}, /* cost of storing fp registers
453 in SFmode, DFmode and XFmode */
454 8, /* cost of moving MMX register */
455 {8, 8}, /* cost of loading MMX registers
456 in SImode and DImode */
457 {8, 8}, /* cost of storing MMX registers
458 in SImode and DImode */
459 2, /* cost of moving SSE register */
460 {4, 8, 16}, /* cost of loading SSE registers
461 in SImode, DImode and TImode */
462 {4, 8, 16}, /* cost of storing SSE registers
463 in SImode, DImode and TImode */
464 3, /* MMX or SSE register to integer */
465 8, /* size of l1 cache. */
466 8, /* size of l2 cache */
467 0, /* size of prefetch block */
468 0, /* number of parallel prefetches */
469 2, /* Branch cost */
470 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
471 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
472 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
473 COSTS_N_INSNS (1), /* cost of FABS instruction. */
474 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
475 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
476 pentium_memcpy,
477 pentium_memset,
478 1, /* scalar_stmt_cost. */
479 1, /* scalar load_cost. */
480 1, /* scalar_store_cost. */
481 1, /* vec_stmt_cost. */
482 1, /* vec_to_scalar_cost. */
483 1, /* scalar_to_vec_cost. */
484 1, /* vec_align_load_cost. */
485 2, /* vec_unalign_load_cost. */
486 1, /* vec_store_cost. */
487 3, /* cond_taken_branch_cost. */
488 1, /* cond_not_taken_branch_cost. */
491 /* PentiumPro has optimized rep instructions for blocks aligned to 8 bytes
492 (we ensure the alignment). For small blocks an inline loop is still a
493 noticeable win; for bigger blocks either rep movsl or rep movsb is the
494 way to go. Rep movsb apparently has a more expensive startup time in the
495 CPU, but after 4K the difference is down in the noise. */
496 static stringop_algs pentiumpro_memcpy[2] = {
497 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
498 {8192, rep_prefix_4_byte, false},
499 {-1, rep_prefix_1_byte, false}}},
500 DUMMY_STRINGOP_ALGS};
501 static stringop_algs pentiumpro_memset[2] = {
502 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
503 {8192, rep_prefix_4_byte, false},
504 {-1, libcall, false}}},
505 DUMMY_STRINGOP_ALGS};
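/* How to read these tables (a sketch; the field layout follows the
   stringop_algs/stringop_strategy definitions in i386.h as I understand
   them, so treat the details as assumptions): the leading algorithm is
   used when the block size is unknown, and each {max, alg, noalign} entry
   selects ALG for blocks of at most MAX bytes, with -1 meaning no upper
   bound.  The two array elements are the 32-bit and 64-bit variants.  For
   pentiumpro_memcpy above, a known 512-byte copy would therefore use
   unrolled_loop (the {1024, unrolled_loop} bucket), while a copy of
   unknown size uses rep_prefix_4_byte.  */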
506 static const
507 struct processor_costs pentiumpro_cost = {
508 COSTS_N_INSNS (1), /* cost of an add instruction */
509 COSTS_N_INSNS (1), /* cost of a lea instruction */
510 COSTS_N_INSNS (1), /* variable shift costs */
511 COSTS_N_INSNS (1), /* constant shift costs */
512 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
513 COSTS_N_INSNS (4), /* HI */
514 COSTS_N_INSNS (4), /* SI */
515 COSTS_N_INSNS (4), /* DI */
516 COSTS_N_INSNS (4)}, /* other */
517 0, /* cost of multiply per each bit set */
518 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
519 COSTS_N_INSNS (17), /* HI */
520 COSTS_N_INSNS (17), /* SI */
521 COSTS_N_INSNS (17), /* DI */
522 COSTS_N_INSNS (17)}, /* other */
523 COSTS_N_INSNS (1), /* cost of movsx */
524 COSTS_N_INSNS (1), /* cost of movzx */
525 8, /* "large" insn */
526 6, /* MOVE_RATIO */
527 2, /* cost for loading QImode using movzbl */
528 {4, 4, 4}, /* cost of loading integer registers
529 in QImode, HImode and SImode.
530 Relative to reg-reg move (2). */
531 {2, 2, 2}, /* cost of storing integer registers */
532 2, /* cost of reg,reg fld/fst */
533 {2, 2, 6}, /* cost of loading fp registers
534 in SFmode, DFmode and XFmode */
535 {4, 4, 6}, /* cost of storing fp registers
536 in SFmode, DFmode and XFmode */
537 2, /* cost of moving MMX register */
538 {2, 2}, /* cost of loading MMX registers
539 in SImode and DImode */
540 {2, 2}, /* cost of storing MMX registers
541 in SImode and DImode */
542 2, /* cost of moving SSE register */
543 {2, 2, 8}, /* cost of loading SSE registers
544 in SImode, DImode and TImode */
545 {2, 2, 8}, /* cost of storing SSE registers
546 in SImode, DImode and TImode */
547 3, /* MMX or SSE register to integer */
548 8, /* size of l1 cache. */
549 256, /* size of l2 cache */
550 32, /* size of prefetch block */
551 6, /* number of parallel prefetches */
552 2, /* Branch cost */
553 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
554 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
555 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
556 COSTS_N_INSNS (2), /* cost of FABS instruction. */
557 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
558 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
559 pentiumpro_memcpy,
560 pentiumpro_memset,
561 1, /* scalar_stmt_cost. */
562 1, /* scalar load_cost. */
563 1, /* scalar_store_cost. */
564 1, /* vec_stmt_cost. */
565 1, /* vec_to_scalar_cost. */
566 1, /* scalar_to_vec_cost. */
567 1, /* vec_align_load_cost. */
568 2, /* vec_unalign_load_cost. */
569 1, /* vec_store_cost. */
570 3, /* cond_taken_branch_cost. */
571 1, /* cond_not_taken_branch_cost. */
574 static stringop_algs geode_memcpy[2] = {
575 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
576 DUMMY_STRINGOP_ALGS};
577 static stringop_algs geode_memset[2] = {
578 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
579 DUMMY_STRINGOP_ALGS};
580 static const
581 struct processor_costs geode_cost = {
582 COSTS_N_INSNS (1), /* cost of an add instruction */
583 COSTS_N_INSNS (1), /* cost of a lea instruction */
584 COSTS_N_INSNS (2), /* variable shift costs */
585 COSTS_N_INSNS (1), /* constant shift costs */
586 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
587 COSTS_N_INSNS (4), /* HI */
588 COSTS_N_INSNS (7), /* SI */
589 COSTS_N_INSNS (7), /* DI */
590 COSTS_N_INSNS (7)}, /* other */
591 0, /* cost of multiply per each bit set */
592 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
593 COSTS_N_INSNS (23), /* HI */
594 COSTS_N_INSNS (39), /* SI */
595 COSTS_N_INSNS (39), /* DI */
596 COSTS_N_INSNS (39)}, /* other */
597 COSTS_N_INSNS (1), /* cost of movsx */
598 COSTS_N_INSNS (1), /* cost of movzx */
599 8, /* "large" insn */
600 4, /* MOVE_RATIO */
601 1, /* cost for loading QImode using movzbl */
602 {1, 1, 1}, /* cost of loading integer registers
603 in QImode, HImode and SImode.
604 Relative to reg-reg move (2). */
605 {1, 1, 1}, /* cost of storing integer registers */
606 1, /* cost of reg,reg fld/fst */
607 {1, 1, 1}, /* cost of loading fp registers
608 in SFmode, DFmode and XFmode */
609 {4, 6, 6}, /* cost of storing fp registers
610 in SFmode, DFmode and XFmode */
612 2, /* cost of moving MMX register */
613 {2, 2}, /* cost of loading MMX registers
614 in SImode and DImode */
615 {2, 2}, /* cost of storing MMX registers
616 in SImode and DImode */
617 2, /* cost of moving SSE register */
618 {2, 2, 8}, /* cost of loading SSE registers
619 in SImode, DImode and TImode */
620 {2, 2, 8}, /* cost of storing SSE registers
621 in SImode, DImode and TImode */
622 3, /* MMX or SSE register to integer */
623 64, /* size of l1 cache. */
624 128, /* size of l2 cache. */
625 32, /* size of prefetch block */
626 1, /* number of parallel prefetches */
627 1, /* Branch cost */
628 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
629 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
630 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
631 COSTS_N_INSNS (1), /* cost of FABS instruction. */
632 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
633 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
634 geode_memcpy,
635 geode_memset,
636 1, /* scalar_stmt_cost. */
637 1, /* scalar load_cost. */
638 1, /* scalar_store_cost. */
639 1, /* vec_stmt_cost. */
640 1, /* vec_to_scalar_cost. */
641 1, /* scalar_to_vec_cost. */
642 1, /* vec_align_load_cost. */
643 2, /* vec_unalign_load_cost. */
644 1, /* vec_store_cost. */
645 3, /* cond_taken_branch_cost. */
646 1, /* cond_not_taken_branch_cost. */
649 static stringop_algs k6_memcpy[2] = {
650 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
651 DUMMY_STRINGOP_ALGS};
652 static stringop_algs k6_memset[2] = {
653 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
654 DUMMY_STRINGOP_ALGS};
655 static const
656 struct processor_costs k6_cost = {
657 COSTS_N_INSNS (1), /* cost of an add instruction */
658 COSTS_N_INSNS (2), /* cost of a lea instruction */
659 COSTS_N_INSNS (1), /* variable shift costs */
660 COSTS_N_INSNS (1), /* constant shift costs */
661 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
662 COSTS_N_INSNS (3), /* HI */
663 COSTS_N_INSNS (3), /* SI */
664 COSTS_N_INSNS (3), /* DI */
665 COSTS_N_INSNS (3)}, /* other */
666 0, /* cost of multiply per each bit set */
667 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
668 COSTS_N_INSNS (18), /* HI */
669 COSTS_N_INSNS (18), /* SI */
670 COSTS_N_INSNS (18), /* DI */
671 COSTS_N_INSNS (18)}, /* other */
672 COSTS_N_INSNS (2), /* cost of movsx */
673 COSTS_N_INSNS (2), /* cost of movzx */
674 8, /* "large" insn */
675 4, /* MOVE_RATIO */
676 3, /* cost for loading QImode using movzbl */
677 {4, 5, 4}, /* cost of loading integer registers
678 in QImode, HImode and SImode.
679 Relative to reg-reg move (2). */
680 {2, 3, 2}, /* cost of storing integer registers */
681 4, /* cost of reg,reg fld/fst */
682 {6, 6, 6}, /* cost of loading fp registers
683 in SFmode, DFmode and XFmode */
684 {4, 4, 4}, /* cost of storing fp registers
685 in SFmode, DFmode and XFmode */
686 2, /* cost of moving MMX register */
687 {2, 2}, /* cost of loading MMX registers
688 in SImode and DImode */
689 {2, 2}, /* cost of storing MMX registers
690 in SImode and DImode */
691 2, /* cost of moving SSE register */
692 {2, 2, 8}, /* cost of loading SSE registers
693 in SImode, DImode and TImode */
694 {2, 2, 8}, /* cost of storing SSE registers
695 in SImode, DImode and TImode */
696 6, /* MMX or SSE register to integer */
697 32, /* size of l1 cache. */
698 32, /* size of l2 cache. Some models
699 have integrated l2 cache, but
700 optimizing for k6 is not important
701 enough to worry about that. */
702 32, /* size of prefetch block */
703 1, /* number of parallel prefetches */
704 1, /* Branch cost */
705 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
706 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
707 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
708 COSTS_N_INSNS (2), /* cost of FABS instruction. */
709 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
710 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
711 k6_memcpy,
712 k6_memset,
713 1, /* scalar_stmt_cost. */
714 1, /* scalar load_cost. */
715 1, /* scalar_store_cost. */
716 1, /* vec_stmt_cost. */
717 1, /* vec_to_scalar_cost. */
718 1, /* scalar_to_vec_cost. */
719 1, /* vec_align_load_cost. */
720 2, /* vec_unalign_load_cost. */
721 1, /* vec_store_cost. */
722 3, /* cond_taken_branch_cost. */
723 1, /* cond_not_taken_branch_cost. */
726 /* For some reason, Athlon deals better with the REP prefix (relative to
727 loops) than K8 does. Alignment becomes important after 8 bytes for memcpy
728 and 128 bytes for memset. */
729 static stringop_algs athlon_memcpy[2] = {
730 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
731 DUMMY_STRINGOP_ALGS};
732 static stringop_algs athlon_memset[2] = {
733 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
734 DUMMY_STRINGOP_ALGS};
735 static const
736 struct processor_costs athlon_cost = {
737 COSTS_N_INSNS (1), /* cost of an add instruction */
738 COSTS_N_INSNS (2), /* cost of a lea instruction */
739 COSTS_N_INSNS (1), /* variable shift costs */
740 COSTS_N_INSNS (1), /* constant shift costs */
741 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
742 COSTS_N_INSNS (5), /* HI */
743 COSTS_N_INSNS (5), /* SI */
744 COSTS_N_INSNS (5), /* DI */
745 COSTS_N_INSNS (5)}, /* other */
746 0, /* cost of multiply per each bit set */
747 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
748 COSTS_N_INSNS (26), /* HI */
749 COSTS_N_INSNS (42), /* SI */
750 COSTS_N_INSNS (74), /* DI */
751 COSTS_N_INSNS (74)}, /* other */
752 COSTS_N_INSNS (1), /* cost of movsx */
753 COSTS_N_INSNS (1), /* cost of movzx */
754 8, /* "large" insn */
755 9, /* MOVE_RATIO */
756 4, /* cost for loading QImode using movzbl */
757 {3, 4, 3}, /* cost of loading integer registers
758 in QImode, HImode and SImode.
759 Relative to reg-reg move (2). */
760 {3, 4, 3}, /* cost of storing integer registers */
761 4, /* cost of reg,reg fld/fst */
762 {4, 4, 12}, /* cost of loading fp registers
763 in SFmode, DFmode and XFmode */
764 {6, 6, 8}, /* cost of storing fp registers
765 in SFmode, DFmode and XFmode */
766 2, /* cost of moving MMX register */
767 {4, 4}, /* cost of loading MMX registers
768 in SImode and DImode */
769 {4, 4}, /* cost of storing MMX registers
770 in SImode and DImode */
771 2, /* cost of moving SSE register */
772 {4, 4, 6}, /* cost of loading SSE registers
773 in SImode, DImode and TImode */
774 {4, 4, 5}, /* cost of storing SSE registers
775 in SImode, DImode and TImode */
776 5, /* MMX or SSE register to integer */
777 64, /* size of l1 cache. */
778 256, /* size of l2 cache. */
779 64, /* size of prefetch block */
780 6, /* number of parallel prefetches */
781 5, /* Branch cost */
782 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
783 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
784 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
785 COSTS_N_INSNS (2), /* cost of FABS instruction. */
786 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
787 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
788 athlon_memcpy,
789 athlon_memset,
790 1, /* scalar_stmt_cost. */
791 1, /* scalar load_cost. */
792 1, /* scalar_store_cost. */
793 1, /* vec_stmt_cost. */
794 1, /* vec_to_scalar_cost. */
795 1, /* scalar_to_vec_cost. */
796 1, /* vec_align_load_cost. */
797 2, /* vec_unalign_load_cost. */
798 1, /* vec_store_cost. */
799 3, /* cond_taken_branch_cost. */
800 1, /* cond_not_taken_branch_cost. */
803 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
804 small blocks it is better to use a loop. For large blocks, a libcall can
805 do nontemporal accesses and beat inline code considerably. */
806 static stringop_algs k8_memcpy[2] = {
807 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
808 {-1, rep_prefix_4_byte, false}}},
809 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
810 {-1, libcall, false}}}};
811 static stringop_algs k8_memset[2] = {
812 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
813 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
814 {libcall, {{48, unrolled_loop, false},
815 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
816 static const
817 struct processor_costs k8_cost = {
818 COSTS_N_INSNS (1), /* cost of an add instruction */
819 COSTS_N_INSNS (2), /* cost of a lea instruction */
820 COSTS_N_INSNS (1), /* variable shift costs */
821 COSTS_N_INSNS (1), /* constant shift costs */
822 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
823 COSTS_N_INSNS (4), /* HI */
824 COSTS_N_INSNS (3), /* SI */
825 COSTS_N_INSNS (4), /* DI */
826 COSTS_N_INSNS (5)}, /* other */
827 0, /* cost of multiply per each bit set */
828 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
829 COSTS_N_INSNS (26), /* HI */
830 COSTS_N_INSNS (42), /* SI */
831 COSTS_N_INSNS (74), /* DI */
832 COSTS_N_INSNS (74)}, /* other */
833 COSTS_N_INSNS (1), /* cost of movsx */
834 COSTS_N_INSNS (1), /* cost of movzx */
835 8, /* "large" insn */
836 9, /* MOVE_RATIO */
837 4, /* cost for loading QImode using movzbl */
838 {3, 4, 3}, /* cost of loading integer registers
839 in QImode, HImode and SImode.
840 Relative to reg-reg move (2). */
841 {3, 4, 3}, /* cost of storing integer registers */
842 4, /* cost of reg,reg fld/fst */
843 {4, 4, 12}, /* cost of loading fp registers
844 in SFmode, DFmode and XFmode */
845 {6, 6, 8}, /* cost of storing fp registers
846 in SFmode, DFmode and XFmode */
847 2, /* cost of moving MMX register */
848 {3, 3}, /* cost of loading MMX registers
849 in SImode and DImode */
850 {4, 4}, /* cost of storing MMX registers
851 in SImode and DImode */
852 2, /* cost of moving SSE register */
853 {4, 3, 6}, /* cost of loading SSE registers
854 in SImode, DImode and TImode */
855 {4, 4, 5}, /* cost of storing SSE registers
856 in SImode, DImode and TImode */
857 5, /* MMX or SSE register to integer */
858 64, /* size of l1 cache. */
859 512, /* size of l2 cache. */
860 64, /* size of prefetch block */
861 /* New AMD processors never drop prefetches; if they cannot be performed
862 immediately, they are queued. We set the number of simultaneous prefetches
863 to a large constant to reflect this (it is probably not a good idea not
864 to limit the number of prefetches at all, as their execution also takes
865 some time). */
866 100, /* number of parallel prefetches */
867 3, /* Branch cost */
868 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
869 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
870 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
871 COSTS_N_INSNS (2), /* cost of FABS instruction. */
872 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
873 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
875 k8_memcpy,
876 k8_memset,
877 4, /* scalar_stmt_cost. */
878 2, /* scalar load_cost. */
879 2, /* scalar_store_cost. */
880 5, /* vec_stmt_cost. */
881 0, /* vec_to_scalar_cost. */
882 2, /* scalar_to_vec_cost. */
883 2, /* vec_align_load_cost. */
884 3, /* vec_unalign_load_cost. */
885 3, /* vec_store_cost. */
886 3, /* cond_taken_branch_cost. */
887 2, /* cond_not_taken_branch_cost. */
890 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
891 very small blocks it is better to use a loop. For large blocks, a libcall can
892 do nontemporal accesses and beat inline code considerably. */
893 static stringop_algs amdfam10_memcpy[2] = {
894 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
895 {-1, rep_prefix_4_byte, false}}},
896 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
897 {-1, libcall, false}}}};
898 static stringop_algs amdfam10_memset[2] = {
899 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
900 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
901 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
902 {-1, libcall, false}}}};
903 struct processor_costs amdfam10_cost = {
904 COSTS_N_INSNS (1), /* cost of an add instruction */
905 COSTS_N_INSNS (2), /* cost of a lea instruction */
906 COSTS_N_INSNS (1), /* variable shift costs */
907 COSTS_N_INSNS (1), /* constant shift costs */
908 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
909 COSTS_N_INSNS (4), /* HI */
910 COSTS_N_INSNS (3), /* SI */
911 COSTS_N_INSNS (4), /* DI */
912 COSTS_N_INSNS (5)}, /* other */
913 0, /* cost of multiply per each bit set */
914 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
915 COSTS_N_INSNS (35), /* HI */
916 COSTS_N_INSNS (51), /* SI */
917 COSTS_N_INSNS (83), /* DI */
918 COSTS_N_INSNS (83)}, /* other */
919 COSTS_N_INSNS (1), /* cost of movsx */
920 COSTS_N_INSNS (1), /* cost of movzx */
921 8, /* "large" insn */
922 9, /* MOVE_RATIO */
923 4, /* cost for loading QImode using movzbl */
924 {3, 4, 3}, /* cost of loading integer registers
925 in QImode, HImode and SImode.
926 Relative to reg-reg move (2). */
927 {3, 4, 3}, /* cost of storing integer registers */
928 4, /* cost of reg,reg fld/fst */
929 {4, 4, 12}, /* cost of loading fp registers
930 in SFmode, DFmode and XFmode */
931 {6, 6, 8}, /* cost of storing fp registers
932 in SFmode, DFmode and XFmode */
933 2, /* cost of moving MMX register */
934 {3, 3}, /* cost of loading MMX registers
935 in SImode and DImode */
936 {4, 4}, /* cost of storing MMX registers
937 in SImode and DImode */
938 2, /* cost of moving SSE register */
939 {4, 4, 3}, /* cost of loading SSE registers
940 in SImode, DImode and TImode */
941 {4, 4, 5}, /* cost of storing SSE registers
942 in SImode, DImode and TImode */
943 3, /* MMX or SSE register to integer */
944 /* On K8:
945 MOVD reg64, xmmreg Double FSTORE 4
946 MOVD reg32, xmmreg Double FSTORE 4
947 On AMDFAM10:
948 MOVD reg64, xmmreg Double FADD 3
949 1/1 1/1
950 MOVD reg32, xmmreg Double FADD 3
951 1/1 1/1 */
952 64, /* size of l1 cache. */
953 512, /* size of l2 cache. */
954 64, /* size of prefetch block */
955 /* New AMD processors never drop prefetches; if they cannot be performed
956 immediately, they are queued. We set the number of simultaneous prefetches
957 to a large constant to reflect this (it is probably not a good idea not
958 to limit the number of prefetches at all, as their execution also takes
959 some time). */
960 100, /* number of parallel prefetches */
961 2, /* Branch cost */
962 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
963 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
964 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
965 COSTS_N_INSNS (2), /* cost of FABS instruction. */
966 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
967 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
969 amdfam10_memcpy,
970 amdfam10_memset,
971 4, /* scalar_stmt_cost. */
972 2, /* scalar load_cost. */
973 2, /* scalar_store_cost. */
974 6, /* vec_stmt_cost. */
975 0, /* vec_to_scalar_cost. */
976 2, /* scalar_to_vec_cost. */
977 2, /* vec_align_load_cost. */
978 2, /* vec_unalign_load_cost. */
979 2, /* vec_store_cost. */
980 2, /* cond_taken_branch_cost. */
981 1, /* cond_not_taken_branch_cost. */
984 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
985 very small blocks it is better to use a loop. For large blocks, a libcall
986 can do nontemporal accesses and beat inline code considerably. */
987 static stringop_algs bdver1_memcpy[2] = {
988 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
989 {-1, rep_prefix_4_byte, false}}},
990 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
991 {-1, libcall, false}}}};
992 static stringop_algs bdver1_memset[2] = {
993 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
994 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
995 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
996 {-1, libcall, false}}}};
998 const struct processor_costs bdver1_cost = {
999 COSTS_N_INSNS (1), /* cost of an add instruction */
1000 COSTS_N_INSNS (1), /* cost of a lea instruction */
1001 COSTS_N_INSNS (1), /* variable shift costs */
1002 COSTS_N_INSNS (1), /* constant shift costs */
1003 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1004 COSTS_N_INSNS (4), /* HI */
1005 COSTS_N_INSNS (4), /* SI */
1006 COSTS_N_INSNS (6), /* DI */
1007 COSTS_N_INSNS (6)}, /* other */
1008 0, /* cost of multiply per each bit set */
1009 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1010 COSTS_N_INSNS (35), /* HI */
1011 COSTS_N_INSNS (51), /* SI */
1012 COSTS_N_INSNS (83), /* DI */
1013 COSTS_N_INSNS (83)}, /* other */
1014 COSTS_N_INSNS (1), /* cost of movsx */
1015 COSTS_N_INSNS (1), /* cost of movzx */
1016 8, /* "large" insn */
1017 9, /* MOVE_RATIO */
1018 4, /* cost for loading QImode using movzbl */
1019 {5, 5, 4}, /* cost of loading integer registers
1020 in QImode, HImode and SImode.
1021 Relative to reg-reg move (2). */
1022 {4, 4, 4}, /* cost of storing integer registers */
1023 2, /* cost of reg,reg fld/fst */
1024 {5, 5, 12}, /* cost of loading fp registers
1025 in SFmode, DFmode and XFmode */
1026 {4, 4, 8}, /* cost of storing fp registers
1027 in SFmode, DFmode and XFmode */
1028 2, /* cost of moving MMX register */
1029 {4, 4}, /* cost of loading MMX registers
1030 in SImode and DImode */
1031 {4, 4}, /* cost of storing MMX registers
1032 in SImode and DImode */
1033 2, /* cost of moving SSE register */
1034 {4, 4, 4}, /* cost of loading SSE registers
1035 in SImode, DImode and TImode */
1036 {4, 4, 4}, /* cost of storing SSE registers
1037 in SImode, DImode and TImode */
1038 2, /* MMX or SSE register to integer */
1039 /* On K8:
1040 MOVD reg64, xmmreg Double FSTORE 4
1041 MOVD reg32, xmmreg Double FSTORE 4
1042 On AMDFAM10:
1043 MOVD reg64, xmmreg Double FADD 3
1044 1/1 1/1
1045 MOVD reg32, xmmreg Double FADD 3
1046 1/1 1/1 */
1047 16, /* size of l1 cache. */
1048 2048, /* size of l2 cache. */
1049 64, /* size of prefetch block */
1050 /* New AMD processors never drop prefetches; if they cannot be performed
1051 immediately, they are queued. We set the number of simultaneous prefetches
1052 to a large constant to reflect this (it is probably not a good idea not
1053 to limit the number of prefetches at all, as their execution also takes
1054 some time). */
1055 100, /* number of parallel prefetches */
1056 2, /* Branch cost */
1057 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1058 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1059 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1060 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1061 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1062 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1064 bdver1_memcpy,
1065 bdver1_memset,
1066 6, /* scalar_stmt_cost. */
1067 4, /* scalar load_cost. */
1068 4, /* scalar_store_cost. */
1069 6, /* vec_stmt_cost. */
1070 0, /* vec_to_scalar_cost. */
1071 2, /* scalar_to_vec_cost. */
1072 4, /* vec_align_load_cost. */
1073 4, /* vec_unalign_load_cost. */
1074 4, /* vec_store_cost. */
1075 4, /* cond_taken_branch_cost. */
1076 2, /* cond_not_taken_branch_cost. */
1079 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
1080 very small blocks it is better to use a loop. For large blocks, a libcall
1081 can do nontemporal accesses and beat inline code considerably. */
1083 static stringop_algs bdver2_memcpy[2] = {
1084 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1085 {-1, rep_prefix_4_byte, false}}},
1086 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1087 {-1, libcall, false}}}};
1088 static stringop_algs bdver2_memset[2] = {
1089 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1090 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1091 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1092 {-1, libcall, false}}}};
1094 const struct processor_costs bdver2_cost = {
1095 COSTS_N_INSNS (1), /* cost of an add instruction */
1096 COSTS_N_INSNS (1), /* cost of a lea instruction */
1097 COSTS_N_INSNS (1), /* variable shift costs */
1098 COSTS_N_INSNS (1), /* constant shift costs */
1099 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1100 COSTS_N_INSNS (4), /* HI */
1101 COSTS_N_INSNS (4), /* SI */
1102 COSTS_N_INSNS (6), /* DI */
1103 COSTS_N_INSNS (6)}, /* other */
1104 0, /* cost of multiply per each bit set */
1105 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1106 COSTS_N_INSNS (35), /* HI */
1107 COSTS_N_INSNS (51), /* SI */
1108 COSTS_N_INSNS (83), /* DI */
1109 COSTS_N_INSNS (83)}, /* other */
1110 COSTS_N_INSNS (1), /* cost of movsx */
1111 COSTS_N_INSNS (1), /* cost of movzx */
1112 8, /* "large" insn */
1113 9, /* MOVE_RATIO */
1114 4, /* cost for loading QImode using movzbl */
1115 {5, 5, 4}, /* cost of loading integer registers
1116 in QImode, HImode and SImode.
1117 Relative to reg-reg move (2). */
1118 {4, 4, 4}, /* cost of storing integer registers */
1119 2, /* cost of reg,reg fld/fst */
1120 {5, 5, 12}, /* cost of loading fp registers
1121 in SFmode, DFmode and XFmode */
1122 {4, 4, 8}, /* cost of storing fp registers
1123 in SFmode, DFmode and XFmode */
1124 2, /* cost of moving MMX register */
1125 {4, 4}, /* cost of loading MMX registers
1126 in SImode and DImode */
1127 {4, 4}, /* cost of storing MMX registers
1128 in SImode and DImode */
1129 2, /* cost of moving SSE register */
1130 {4, 4, 4}, /* cost of loading SSE registers
1131 in SImode, DImode and TImode */
1132 {4, 4, 4}, /* cost of storing SSE registers
1133 in SImode, DImode and TImode */
1134 2, /* MMX or SSE register to integer */
1135 /* On K8:
1136 MOVD reg64, xmmreg Double FSTORE 4
1137 MOVD reg32, xmmreg Double FSTORE 4
1138 On AMDFAM10:
1139 MOVD reg64, xmmreg Double FADD 3
1140 1/1 1/1
1141 MOVD reg32, xmmreg Double FADD 3
1142 1/1 1/1 */
1143 16, /* size of l1 cache. */
1144 2048, /* size of l2 cache. */
1145 64, /* size of prefetch block */
1146 /* New AMD processors never drop prefetches; if they cannot be performed
1147 immediately, they are queued. We set the number of simultaneous prefetches
1148 to a large constant to reflect this (it is probably not a good idea not
1149 to limit the number of prefetches at all, as their execution also takes
1150 some time). */
1151 100, /* number of parallel prefetches */
1152 2, /* Branch cost */
1153 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1154 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1155 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1156 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1157 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1158 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1160 bdver2_memcpy,
1161 bdver2_memset,
1162 6, /* scalar_stmt_cost. */
1163 4, /* scalar load_cost. */
1164 4, /* scalar_store_cost. */
1165 6, /* vec_stmt_cost. */
1166 0, /* vec_to_scalar_cost. */
1167 2, /* scalar_to_vec_cost. */
1168 4, /* vec_align_load_cost. */
1169 4, /* vec_unalign_load_cost. */
1170 4, /* vec_store_cost. */
1171 4, /* cond_taken_branch_cost. */
1172 2, /* cond_not_taken_branch_cost. */
1176 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1177 very small blocks it is better to use a loop. For large blocks, a libcall
1178 can do nontemporal accesses and beat inline code considerably. */
1179 static stringop_algs bdver3_memcpy[2] = {
1180 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1181 {-1, rep_prefix_4_byte, false}}},
1182 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1183 {-1, libcall, false}}}};
1184 static stringop_algs bdver3_memset[2] = {
1185 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1186 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1187 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1188 {-1, libcall, false}}}};
1189 struct processor_costs bdver3_cost = {
1190 COSTS_N_INSNS (1), /* cost of an add instruction */
1191 COSTS_N_INSNS (1), /* cost of a lea instruction */
1192 COSTS_N_INSNS (1), /* variable shift costs */
1193 COSTS_N_INSNS (1), /* constant shift costs */
1194 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1195 COSTS_N_INSNS (4), /* HI */
1196 COSTS_N_INSNS (4), /* SI */
1197 COSTS_N_INSNS (6), /* DI */
1198 COSTS_N_INSNS (6)}, /* other */
1199 0, /* cost of multiply per each bit set */
1200 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1201 COSTS_N_INSNS (35), /* HI */
1202 COSTS_N_INSNS (51), /* SI */
1203 COSTS_N_INSNS (83), /* DI */
1204 COSTS_N_INSNS (83)}, /* other */
1205 COSTS_N_INSNS (1), /* cost of movsx */
1206 COSTS_N_INSNS (1), /* cost of movzx */
1207 8, /* "large" insn */
1208 9, /* MOVE_RATIO */
1209 4, /* cost for loading QImode using movzbl */
1210 {5, 5, 4}, /* cost of loading integer registers
1211 in QImode, HImode and SImode.
1212 Relative to reg-reg move (2). */
1213 {4, 4, 4}, /* cost of storing integer registers */
1214 2, /* cost of reg,reg fld/fst */
1215 {5, 5, 12}, /* cost of loading fp registers
1216 in SFmode, DFmode and XFmode */
1217 {4, 4, 8}, /* cost of storing fp registers
1218 in SFmode, DFmode and XFmode */
1219 2, /* cost of moving MMX register */
1220 {4, 4}, /* cost of loading MMX registers
1221 in SImode and DImode */
1222 {4, 4}, /* cost of storing MMX registers
1223 in SImode and DImode */
1224 2, /* cost of moving SSE register */
1225 {4, 4, 4}, /* cost of loading SSE registers
1226 in SImode, DImode and TImode */
1227 {4, 4, 4}, /* cost of storing SSE registers
1228 in SImode, DImode and TImode */
1229 2, /* MMX or SSE register to integer */
1230 16, /* size of l1 cache. */
1231 2048, /* size of l2 cache. */
1232 64, /* size of prefetch block */
1233 /* New AMD processors never drop prefetches; if they cannot be performed
1234 immediately, they are queued. We set the number of simultaneous prefetches
1235 to a large constant to reflect this (it is probably not a good idea not
1236 to limit the number of prefetches at all, as their execution also takes
1237 some time). */
1238 100, /* number of parallel prefetches */
1239 2, /* Branch cost */
1240 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1241 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1242 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1243 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1244 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1245 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1247 bdver3_memcpy,
1248 bdver3_memset,
1249 6, /* scalar_stmt_cost. */
1250 4, /* scalar load_cost. */
1251 4, /* scalar_store_cost. */
1252 6, /* vec_stmt_cost. */
1253 0, /* vec_to_scalar_cost. */
1254 2, /* scalar_to_vec_cost. */
1255 4, /* vec_align_load_cost. */
1256 4, /* vec_unalign_load_cost. */
1257 4, /* vec_store_cost. */
1258 4, /* cond_taken_branch_cost. */
1259 2, /* cond_not_taken_branch_cost. */
1262 /* BDVER4 has an optimized REP instruction for medium-sized blocks, but for
1263 very small blocks it is better to use a loop. For large blocks, a libcall
1264 can do nontemporal accesses and beat inline code considerably. */
1265 static stringop_algs bdver4_memcpy[2] = {
1266 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1267 {-1, rep_prefix_4_byte, false}}},
1268 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1269 {-1, libcall, false}}}};
1270 static stringop_algs bdver4_memset[2] = {
1271 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1272 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1273 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1274 {-1, libcall, false}}}};
1275 struct processor_costs bdver4_cost = {
1276 COSTS_N_INSNS (1), /* cost of an add instruction */
1277 COSTS_N_INSNS (1), /* cost of a lea instruction */
1278 COSTS_N_INSNS (1), /* variable shift costs */
1279 COSTS_N_INSNS (1), /* constant shift costs */
1280 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1281 COSTS_N_INSNS (4), /* HI */
1282 COSTS_N_INSNS (4), /* SI */
1283 COSTS_N_INSNS (6), /* DI */
1284 COSTS_N_INSNS (6)}, /* other */
1285 0, /* cost of multiply per each bit set */
1286 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1287 COSTS_N_INSNS (35), /* HI */
1288 COSTS_N_INSNS (51), /* SI */
1289 COSTS_N_INSNS (83), /* DI */
1290 COSTS_N_INSNS (83)}, /* other */
1291 COSTS_N_INSNS (1), /* cost of movsx */
1292 COSTS_N_INSNS (1), /* cost of movzx */
1293 8, /* "large" insn */
1294 9, /* MOVE_RATIO */
1295 4, /* cost for loading QImode using movzbl */
1296 {5, 5, 4}, /* cost of loading integer registers
1297 in QImode, HImode and SImode.
1298 Relative to reg-reg move (2). */
1299 {4, 4, 4}, /* cost of storing integer registers */
1300 2, /* cost of reg,reg fld/fst */
1301 {5, 5, 12}, /* cost of loading fp registers
1302 in SFmode, DFmode and XFmode */
1303 {4, 4, 8}, /* cost of storing fp registers
1304 in SFmode, DFmode and XFmode */
1305 2, /* cost of moving MMX register */
1306 {4, 4}, /* cost of loading MMX registers
1307 in SImode and DImode */
1308 {4, 4}, /* cost of storing MMX registers
1309 in SImode and DImode */
1310 2, /* cost of moving SSE register */
1311 {4, 4, 4}, /* cost of loading SSE registers
1312 in SImode, DImode and TImode */
1313 {4, 4, 4}, /* cost of storing SSE registers
1314 in SImode, DImode and TImode */
1315 2, /* MMX or SSE register to integer */
1316 16, /* size of l1 cache. */
1317 2048, /* size of l2 cache. */
1318 64, /* size of prefetch block */
1319 /* New AMD processors never drop prefetches; if they cannot be performed
1320 immediately, they are queued. We set the number of simultaneous prefetches
1321 to a large constant to reflect this (it is probably not a good idea not
1322 to limit the number of prefetches at all, as their execution also takes
1323 some time). */
1324 100, /* number of parallel prefetches */
1325 2, /* Branch cost */
1326 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1327 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1328 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1329 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1330 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1331 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1333 bdver4_memcpy,
1334 bdver4_memset,
1335 6, /* scalar_stmt_cost. */
1336 4, /* scalar load_cost. */
1337 4, /* scalar_store_cost. */
1338 6, /* vec_stmt_cost. */
1339 0, /* vec_to_scalar_cost. */
1340 2, /* scalar_to_vec_cost. */
1341 4, /* vec_align_load_cost. */
1342 4, /* vec_unalign_load_cost. */
1343 4, /* vec_store_cost. */
1344 4, /* cond_taken_branch_cost. */
1345 2, /* cond_not_taken_branch_cost. */
1349 /* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
1350 very small blocks it is better to use a loop. For large blocks, a libcall
1351 can do nontemporal accesses and beat inline code considerably. */
1352 static stringop_algs znver1_memcpy[2] = {
1353 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1354 {-1, rep_prefix_4_byte, false}}},
1355 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1356 {-1, libcall, false}}}};
1357 static stringop_algs znver1_memset[2] = {
1358 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1359 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1360 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1361 {-1, libcall, false}}}};
1362 struct processor_costs znver1_cost = {
1363 COSTS_N_INSNS (1), /* cost of an add instruction. */
1364 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1365 COSTS_N_INSNS (1), /* variable shift costs. */
1366 COSTS_N_INSNS (1), /* constant shift costs. */
1367 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1368 COSTS_N_INSNS (3), /* HI. */
1369 COSTS_N_INSNS (3), /* SI. */
1370 COSTS_N_INSNS (4), /* DI. */
1371 COSTS_N_INSNS (4)}, /* other. */
1372 0, /* cost of multiply per each bit
1373 set. */
1374 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI. */
1375 COSTS_N_INSNS (35), /* HI. */
1376 COSTS_N_INSNS (51), /* SI. */
1377 COSTS_N_INSNS (83), /* DI. */
1378 COSTS_N_INSNS (83)}, /* other. */
1379 COSTS_N_INSNS (1), /* cost of movsx. */
1380 COSTS_N_INSNS (1), /* cost of movzx. */
1381 8, /* "large" insn. */
1382 9, /* MOVE_RATIO. */
1383 4, /* cost for loading QImode using
1384 movzbl. */
1385 {5, 5, 4}, /* cost of loading integer registers
1386 in QImode, HImode and SImode.
1387 Relative to reg-reg move (2). */
1388 {4, 4, 4}, /* cost of storing integer
1389 registers. */
1390 2, /* cost of reg,reg fld/fst. */
1391 {5, 5, 12}, /* cost of loading fp registers
1392 in SFmode, DFmode and XFmode. */
1393 {4, 4, 8}, /* cost of storing fp registers
1394 in SFmode, DFmode and XFmode. */
1395 2, /* cost of moving MMX register. */
1396 {4, 4}, /* cost of loading MMX registers
1397 in SImode and DImode. */
1398 {4, 4}, /* cost of storing MMX registers
1399 in SImode and DImode. */
1400 2, /* cost of moving SSE register. */
1401 {4, 4, 4}, /* cost of loading SSE registers
1402 in SImode, DImode and TImode. */
1403 {4, 4, 4}, /* cost of storing SSE registers
1404 in SImode, DImode and TImode. */
1405 2, /* MMX or SSE register to integer. */
1406 32, /* size of l1 cache. */
1407 512, /* size of l2 cache. */
1408 64, /* size of prefetch block. */
1409 /* New AMD processors never drop prefetches; if they cannot be performed
1410 immediately, they are queued. We set the number of simultaneous prefetches
1411 to a large constant to reflect this (it is probably not a good idea not
1412 to limit the number of prefetches at all, as their execution also takes
1413 some time). */
1414 100, /* number of parallel prefetches. */
1415 2, /* Branch cost. */
1416 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1417 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1418 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1419 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1420 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1421 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1423 znver1_memcpy,
1424 znver1_memset,
1425 6, /* scalar_stmt_cost. */
1426 4, /* scalar load_cost. */
1427 4, /* scalar_store_cost. */
1428 6, /* vec_stmt_cost. */
1429 0, /* vec_to_scalar_cost. */
1430 2, /* scalar_to_vec_cost. */
1431 4, /* vec_align_load_cost. */
1432 4, /* vec_unalign_load_cost. */
1433 4, /* vec_store_cost. */
1434 4, /* cond_taken_branch_cost. */
1435 2, /* cond_not_taken_branch_cost. */
1438 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1439 very small blocks it is better to use a loop. For large blocks, a libcall
1440 can do non-temporal accesses and beat inline code considerably. */
1441 static stringop_algs btver1_memcpy[2] = {
1442 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1443 {-1, rep_prefix_4_byte, false}}},
1444 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1445 {-1, libcall, false}}}};
1446 static stringop_algs btver1_memset[2] = {
1447 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1448 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1449 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1450 {-1, libcall, false}}}};
1451 const struct processor_costs btver1_cost = {
1452 COSTS_N_INSNS (1), /* cost of an add instruction */
1453 COSTS_N_INSNS (2), /* cost of a lea instruction */
1454 COSTS_N_INSNS (1), /* variable shift costs */
1455 COSTS_N_INSNS (1), /* constant shift costs */
1456 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1457 COSTS_N_INSNS (4), /* HI */
1458 COSTS_N_INSNS (3), /* SI */
1459 COSTS_N_INSNS (4), /* DI */
1460 COSTS_N_INSNS (5)}, /* other */
1461 0, /* cost of multiply per each bit set */
1462 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1463 COSTS_N_INSNS (35), /* HI */
1464 COSTS_N_INSNS (51), /* SI */
1465 COSTS_N_INSNS (83), /* DI */
1466 COSTS_N_INSNS (83)}, /* other */
1467 COSTS_N_INSNS (1), /* cost of movsx */
1468 COSTS_N_INSNS (1), /* cost of movzx */
1469 8, /* "large" insn */
1470 9, /* MOVE_RATIO */
1471 4, /* cost for loading QImode using movzbl */
1472 {3, 4, 3}, /* cost of loading integer registers
1473 in QImode, HImode and SImode.
1474 Relative to reg-reg move (2). */
1475 {3, 4, 3}, /* cost of storing integer registers */
1476 4, /* cost of reg,reg fld/fst */
1477 {4, 4, 12}, /* cost of loading fp registers
1478 in SFmode, DFmode and XFmode */
1479 {6, 6, 8}, /* cost of storing fp registers
1480 in SFmode, DFmode and XFmode */
1481 2, /* cost of moving MMX register */
1482 {3, 3}, /* cost of loading MMX registers
1483 in SImode and DImode */
1484 {4, 4}, /* cost of storing MMX registers
1485 in SImode and DImode */
1486 2, /* cost of moving SSE register */
1487 {4, 4, 3}, /* cost of loading SSE registers
1488 in SImode, DImode and TImode */
1489 {4, 4, 5}, /* cost of storing SSE registers
1490 in SImode, DImode and TImode */
1491 3, /* MMX or SSE register to integer */
1492 /* On K8:
1493 MOVD reg64, xmmreg Double FSTORE 4
1494 MOVD reg32, xmmreg Double FSTORE 4
1495 On AMDFAM10:
1496 MOVD reg64, xmmreg Double FADD 3
1497 1/1 1/1
1498 MOVD reg32, xmmreg Double FADD 3
1499 1/1 1/1 */
1500 32, /* size of l1 cache. */
1501 512, /* size of l2 cache. */
1502 64, /* size of prefetch block */
1503 100, /* number of parallel prefetches */
1504 2, /* Branch cost */
1505 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1506 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1507 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1508 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1509 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1510 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1512 btver1_memcpy,
1513 btver1_memset,
1514 4, /* scalar_stmt_cost. */
1515 2, /* scalar load_cost. */
1516 2, /* scalar_store_cost. */
1517 6, /* vec_stmt_cost. */
1518 0, /* vec_to_scalar_cost. */
1519 2, /* scalar_to_vec_cost. */
1520 2, /* vec_align_load_cost. */
1521 2, /* vec_unalign_load_cost. */
1522 2, /* vec_store_cost. */
1523 2, /* cond_taken_branch_cost. */
1524 1, /* cond_not_taken_branch_cost. */
1527 static stringop_algs btver2_memcpy[2] = {
1528 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1529 {-1, rep_prefix_4_byte, false}}},
1530 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1531 {-1, libcall, false}}}};
1532 static stringop_algs btver2_memset[2] = {
1533 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1534 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1535 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1536 {-1, libcall, false}}}};
1537 const struct processor_costs btver2_cost = {
1538 COSTS_N_INSNS (1), /* cost of an add instruction */
1539 COSTS_N_INSNS (2), /* cost of a lea instruction */
1540 COSTS_N_INSNS (1), /* variable shift costs */
1541 COSTS_N_INSNS (1), /* constant shift costs */
1542 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1543 COSTS_N_INSNS (4), /* HI */
1544 COSTS_N_INSNS (3), /* SI */
1545 COSTS_N_INSNS (4), /* DI */
1546 COSTS_N_INSNS (5)}, /* other */
1547 0, /* cost of multiply per each bit set */
1548 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1549 COSTS_N_INSNS (35), /* HI */
1550 COSTS_N_INSNS (51), /* SI */
1551 COSTS_N_INSNS (83), /* DI */
1552 COSTS_N_INSNS (83)}, /* other */
1553 COSTS_N_INSNS (1), /* cost of movsx */
1554 COSTS_N_INSNS (1), /* cost of movzx */
1555 8, /* "large" insn */
1556 9, /* MOVE_RATIO */
1557 4, /* cost for loading QImode using movzbl */
1558 {3, 4, 3}, /* cost of loading integer registers
1559 in QImode, HImode and SImode.
1560 Relative to reg-reg move (2). */
1561 {3, 4, 3}, /* cost of storing integer registers */
1562 4, /* cost of reg,reg fld/fst */
1563 {4, 4, 12}, /* cost of loading fp registers
1564 in SFmode, DFmode and XFmode */
1565 {6, 6, 8}, /* cost of storing fp registers
1566 in SFmode, DFmode and XFmode */
1567 2, /* cost of moving MMX register */
1568 {3, 3}, /* cost of loading MMX registers
1569 in SImode and DImode */
1570 {4, 4}, /* cost of storing MMX registers
1571 in SImode and DImode */
1572 2, /* cost of moving SSE register */
1573 {4, 4, 3}, /* cost of loading SSE registers
1574 in SImode, DImode and TImode */
1575 {4, 4, 5}, /* cost of storing SSE registers
1576 in SImode, DImode and TImode */
1577 3, /* MMX or SSE register to integer */
1578 /* On K8:
1579 MOVD reg64, xmmreg Double FSTORE 4
1580 MOVD reg32, xmmreg Double FSTORE 4
1581 On AMDFAM10:
1582 MOVD reg64, xmmreg Double FADD 3
1583 1/1 1/1
1584 MOVD reg32, xmmreg Double FADD 3
1585 1/1 1/1 */
1586 32, /* size of l1 cache. */
1587 2048, /* size of l2 cache. */
1588 64, /* size of prefetch block */
1589 100, /* number of parallel prefetches */
1590 2, /* Branch cost */
1591 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1592 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1593 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1594 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1595 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1596 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1597 btver2_memcpy,
1598 btver2_memset,
1599 4, /* scalar_stmt_cost. */
1600 2, /* scalar load_cost. */
1601 2, /* scalar_store_cost. */
1602 6, /* vec_stmt_cost. */
1603 0, /* vec_to_scalar_cost. */
1604 2, /* scalar_to_vec_cost. */
1605 2, /* vec_align_load_cost. */
1606 2, /* vec_unalign_load_cost. */
1607 2, /* vec_store_cost. */
1608 2, /* cond_taken_branch_cost. */
1609 1, /* cond_not_taken_branch_cost. */
1612 static stringop_algs pentium4_memcpy[2] = {
1613 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1614 DUMMY_STRINGOP_ALGS};
1615 static stringop_algs pentium4_memset[2] = {
1616 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1617 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1618 DUMMY_STRINGOP_ALGS};
1620 static const
1621 struct processor_costs pentium4_cost = {
1622 COSTS_N_INSNS (1), /* cost of an add instruction */
1623 COSTS_N_INSNS (3), /* cost of a lea instruction */
1624 COSTS_N_INSNS (4), /* variable shift costs */
1625 COSTS_N_INSNS (4), /* constant shift costs */
1626 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1627 COSTS_N_INSNS (15), /* HI */
1628 COSTS_N_INSNS (15), /* SI */
1629 COSTS_N_INSNS (15), /* DI */
1630 COSTS_N_INSNS (15)}, /* other */
1631 0, /* cost of multiply per each bit set */
1632 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1633 COSTS_N_INSNS (56), /* HI */
1634 COSTS_N_INSNS (56), /* SI */
1635 COSTS_N_INSNS (56), /* DI */
1636 COSTS_N_INSNS (56)}, /* other */
1637 COSTS_N_INSNS (1), /* cost of movsx */
1638 COSTS_N_INSNS (1), /* cost of movzx */
1639 16, /* "large" insn */
1640 6, /* MOVE_RATIO */
1641 2, /* cost for loading QImode using movzbl */
1642 {4, 5, 4}, /* cost of loading integer registers
1643 in QImode, HImode and SImode.
1644 Relative to reg-reg move (2). */
1645 {2, 3, 2}, /* cost of storing integer registers */
1646 2, /* cost of reg,reg fld/fst */
1647 {2, 2, 6}, /* cost of loading fp registers
1648 in SFmode, DFmode and XFmode */
1649 {4, 4, 6}, /* cost of storing fp registers
1650 in SFmode, DFmode and XFmode */
1651 2, /* cost of moving MMX register */
1652 {2, 2}, /* cost of loading MMX registers
1653 in SImode and DImode */
1654 {2, 2}, /* cost of storing MMX registers
1655 in SImode and DImode */
1656 12, /* cost of moving SSE register */
1657 {12, 12, 12}, /* cost of loading SSE registers
1658 in SImode, DImode and TImode */
1659 {2, 2, 8}, /* cost of storing SSE registers
1660 in SImode, DImode and TImode */
1661 10, /* MMX or SSE register to integer */
1662 8, /* size of l1 cache. */
1663 256, /* size of l2 cache. */
1664 64, /* size of prefetch block */
1665 6, /* number of parallel prefetches */
1666 2, /* Branch cost */
1667 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1668 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1669 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1670 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1671 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1672 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1673 pentium4_memcpy,
1674 pentium4_memset,
1675 1, /* scalar_stmt_cost. */
1676 1, /* scalar load_cost. */
1677 1, /* scalar_store_cost. */
1678 1, /* vec_stmt_cost. */
1679 1, /* vec_to_scalar_cost. */
1680 1, /* scalar_to_vec_cost. */
1681 1, /* vec_align_load_cost. */
1682 2, /* vec_unalign_load_cost. */
1683 1, /* vec_store_cost. */
1684 3, /* cond_taken_branch_cost. */
1685 1, /* cond_not_taken_branch_cost. */
1688 static stringop_algs nocona_memcpy[2] = {
1689 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1690 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1691 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1693 static stringop_algs nocona_memset[2] = {
1694 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1695 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1696 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1697 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1699 static const
1700 struct processor_costs nocona_cost = {
1701 COSTS_N_INSNS (1), /* cost of an add instruction */
1702 COSTS_N_INSNS (1), /* cost of a lea instruction */
1703 COSTS_N_INSNS (1), /* variable shift costs */
1704 COSTS_N_INSNS (1), /* constant shift costs */
1705 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1706 COSTS_N_INSNS (10), /* HI */
1707 COSTS_N_INSNS (10), /* SI */
1708 COSTS_N_INSNS (10), /* DI */
1709 COSTS_N_INSNS (10)}, /* other */
1710 0, /* cost of multiply per each bit set */
1711 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1712 COSTS_N_INSNS (66), /* HI */
1713 COSTS_N_INSNS (66), /* SI */
1714 COSTS_N_INSNS (66), /* DI */
1715 COSTS_N_INSNS (66)}, /* other */
1716 COSTS_N_INSNS (1), /* cost of movsx */
1717 COSTS_N_INSNS (1), /* cost of movzx */
1718 16, /* "large" insn */
1719 17, /* MOVE_RATIO */
1720 4, /* cost for loading QImode using movzbl */
1721 {4, 4, 4}, /* cost of loading integer registers
1722 in QImode, HImode and SImode.
1723 Relative to reg-reg move (2). */
1724 {4, 4, 4}, /* cost of storing integer registers */
1725 3, /* cost of reg,reg fld/fst */
1726 {12, 12, 12}, /* cost of loading fp registers
1727 in SFmode, DFmode and XFmode */
1728 {4, 4, 4}, /* cost of storing fp registers
1729 in SFmode, DFmode and XFmode */
1730 6, /* cost of moving MMX register */
1731 {12, 12}, /* cost of loading MMX registers
1732 in SImode and DImode */
1733 {12, 12}, /* cost of storing MMX registers
1734 in SImode and DImode */
1735 6, /* cost of moving SSE register */
1736 {12, 12, 12}, /* cost of loading SSE registers
1737 in SImode, DImode and TImode */
1738 {12, 12, 12}, /* cost of storing SSE registers
1739 in SImode, DImode and TImode */
1740 8, /* MMX or SSE register to integer */
1741 8, /* size of l1 cache. */
1742 1024, /* size of l2 cache. */
1743 64, /* size of prefetch block */
1744 8, /* number of parallel prefetches */
1745 1, /* Branch cost */
1746 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1747 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1748 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1749 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1750 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1751 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1752 nocona_memcpy,
1753 nocona_memset,
1754 1, /* scalar_stmt_cost. */
1755 1, /* scalar load_cost. */
1756 1, /* scalar_store_cost. */
1757 1, /* vec_stmt_cost. */
1758 1, /* vec_to_scalar_cost. */
1759 1, /* scalar_to_vec_cost. */
1760 1, /* vec_align_load_cost. */
1761 2, /* vec_unalign_load_cost. */
1762 1, /* vec_store_cost. */
1763 3, /* cond_taken_branch_cost. */
1764 1, /* cond_not_taken_branch_cost. */
1767 static stringop_algs atom_memcpy[2] = {
1768 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1769 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1770 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1771 static stringop_algs atom_memset[2] = {
1772 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1773 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1774 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1775 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1776 static const
1777 struct processor_costs atom_cost = {
1778 COSTS_N_INSNS (1), /* cost of an add instruction */
1779 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1780 COSTS_N_INSNS (1), /* variable shift costs */
1781 COSTS_N_INSNS (1), /* constant shift costs */
1782 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1783 COSTS_N_INSNS (4), /* HI */
1784 COSTS_N_INSNS (3), /* SI */
1785 COSTS_N_INSNS (4), /* DI */
1786 COSTS_N_INSNS (2)}, /* other */
1787 0, /* cost of multiply per each bit set */
1788 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1789 COSTS_N_INSNS (26), /* HI */
1790 COSTS_N_INSNS (42), /* SI */
1791 COSTS_N_INSNS (74), /* DI */
1792 COSTS_N_INSNS (74)}, /* other */
1793 COSTS_N_INSNS (1), /* cost of movsx */
1794 COSTS_N_INSNS (1), /* cost of movzx */
1795 8, /* "large" insn */
1796 17, /* MOVE_RATIO */
1797 4, /* cost for loading QImode using movzbl */
1798 {4, 4, 4}, /* cost of loading integer registers
1799 in QImode, HImode and SImode.
1800 Relative to reg-reg move (2). */
1801 {4, 4, 4}, /* cost of storing integer registers */
1802 4, /* cost of reg,reg fld/fst */
1803 {12, 12, 12}, /* cost of loading fp registers
1804 in SFmode, DFmode and XFmode */
1805 {6, 6, 8}, /* cost of storing fp registers
1806 in SFmode, DFmode and XFmode */
1807 2, /* cost of moving MMX register */
1808 {8, 8}, /* cost of loading MMX registers
1809 in SImode and DImode */
1810 {8, 8}, /* cost of storing MMX registers
1811 in SImode and DImode */
1812 2, /* cost of moving SSE register */
1813 {8, 8, 8}, /* cost of loading SSE registers
1814 in SImode, DImode and TImode */
1815 {8, 8, 8}, /* cost of storing SSE registers
1816 in SImode, DImode and TImode */
1817 5, /* MMX or SSE register to integer */
1818 32, /* size of l1 cache. */
1819 256, /* size of l2 cache. */
1820 64, /* size of prefetch block */
1821 6, /* number of parallel prefetches */
1822 3, /* Branch cost */
1823 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1824 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1825 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1826 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1827 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1828 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1829 atom_memcpy,
1830 atom_memset,
1831 1, /* scalar_stmt_cost. */
1832 1, /* scalar load_cost. */
1833 1, /* scalar_store_cost. */
1834 1, /* vec_stmt_cost. */
1835 1, /* vec_to_scalar_cost. */
1836 1, /* scalar_to_vec_cost. */
1837 1, /* vec_align_load_cost. */
1838 2, /* vec_unalign_load_cost. */
1839 1, /* vec_store_cost. */
1840 3, /* cond_taken_branch_cost. */
1841 1, /* cond_not_taken_branch_cost. */
1844 static stringop_algs slm_memcpy[2] = {
1845 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1846 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1847 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1848 static stringop_algs slm_memset[2] = {
1849 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1850 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1851 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1852 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1853 static const
1854 struct processor_costs slm_cost = {
1855 COSTS_N_INSNS (1), /* cost of an add instruction */
1856 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1857 COSTS_N_INSNS (1), /* variable shift costs */
1858 COSTS_N_INSNS (1), /* constant shift costs */
1859 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1860 COSTS_N_INSNS (3), /* HI */
1861 COSTS_N_INSNS (3), /* SI */
1862 COSTS_N_INSNS (4), /* DI */
1863 COSTS_N_INSNS (2)}, /* other */
1864 0, /* cost of multiply per each bit set */
1865 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1866 COSTS_N_INSNS (26), /* HI */
1867 COSTS_N_INSNS (42), /* SI */
1868 COSTS_N_INSNS (74), /* DI */
1869 COSTS_N_INSNS (74)}, /* other */
1870 COSTS_N_INSNS (1), /* cost of movsx */
1871 COSTS_N_INSNS (1), /* cost of movzx */
1872 8, /* "large" insn */
1873 17, /* MOVE_RATIO */
1874 4, /* cost for loading QImode using movzbl */
1875 {4, 4, 4}, /* cost of loading integer registers
1876 in QImode, HImode and SImode.
1877 Relative to reg-reg move (2). */
1878 {4, 4, 4}, /* cost of storing integer registers */
1879 4, /* cost of reg,reg fld/fst */
1880 {12, 12, 12}, /* cost of loading fp registers
1881 in SFmode, DFmode and XFmode */
1882 {6, 6, 8}, /* cost of storing fp registers
1883 in SFmode, DFmode and XFmode */
1884 2, /* cost of moving MMX register */
1885 {8, 8}, /* cost of loading MMX registers
1886 in SImode and DImode */
1887 {8, 8}, /* cost of storing MMX registers
1888 in SImode and DImode */
1889 2, /* cost of moving SSE register */
1890 {8, 8, 8}, /* cost of loading SSE registers
1891 in SImode, DImode and TImode */
1892 {8, 8, 8}, /* cost of storing SSE registers
1893 in SImode, DImode and TImode */
1894 5, /* MMX or SSE register to integer */
1895 32, /* size of l1 cache. */
1896 256, /* size of l2 cache. */
1897 64, /* size of prefetch block */
1898 6, /* number of parallel prefetches */
1899 3, /* Branch cost */
1900 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1901 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1902 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1903 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1904 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1905 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1906 slm_memcpy,
1907 slm_memset,
1908 1, /* scalar_stmt_cost. */
1909 1, /* scalar load_cost. */
1910 1, /* scalar_store_cost. */
1911 1, /* vec_stmt_cost. */
1912 4, /* vec_to_scalar_cost. */
1913 1, /* scalar_to_vec_cost. */
1914 1, /* vec_align_load_cost. */
1915 2, /* vec_unalign_load_cost. */
1916 1, /* vec_store_cost. */
1917 3, /* cond_taken_branch_cost. */
1918 1, /* cond_not_taken_branch_cost. */
1921 static stringop_algs intel_memcpy[2] = {
1922 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1923 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1924 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1925 static stringop_algs intel_memset[2] = {
1926 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1927 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1928 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1929 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1930 static const
1931 struct processor_costs intel_cost = {
1932 COSTS_N_INSNS (1), /* cost of an add instruction */
1933 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1934 COSTS_N_INSNS (1), /* variable shift costs */
1935 COSTS_N_INSNS (1), /* constant shift costs */
1936 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1937 COSTS_N_INSNS (3), /* HI */
1938 COSTS_N_INSNS (3), /* SI */
1939 COSTS_N_INSNS (4), /* DI */
1940 COSTS_N_INSNS (2)}, /* other */
1941 0, /* cost of multiply per each bit set */
1942 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1943 COSTS_N_INSNS (26), /* HI */
1944 COSTS_N_INSNS (42), /* SI */
1945 COSTS_N_INSNS (74), /* DI */
1946 COSTS_N_INSNS (74)}, /* other */
1947 COSTS_N_INSNS (1), /* cost of movsx */
1948 COSTS_N_INSNS (1), /* cost of movzx */
1949 8, /* "large" insn */
1950 17, /* MOVE_RATIO */
1951 4, /* cost for loading QImode using movzbl */
1952 {4, 4, 4}, /* cost of loading integer registers
1953 in QImode, HImode and SImode.
1954 Relative to reg-reg move (2). */
1955 {4, 4, 4}, /* cost of storing integer registers */
1956 4, /* cost of reg,reg fld/fst */
1957 {12, 12, 12}, /* cost of loading fp registers
1958 in SFmode, DFmode and XFmode */
1959 {6, 6, 8}, /* cost of storing fp registers
1960 in SFmode, DFmode and XFmode */
1961 2, /* cost of moving MMX register */
1962 {8, 8}, /* cost of loading MMX registers
1963 in SImode and DImode */
1964 {8, 8}, /* cost of storing MMX registers
1965 in SImode and DImode */
1966 2, /* cost of moving SSE register */
1967 {8, 8, 8}, /* cost of loading SSE registers
1968 in SImode, DImode and TImode */
1969 {8, 8, 8}, /* cost of storing SSE registers
1970 in SImode, DImode and TImode */
1971 5, /* MMX or SSE register to integer */
1972 32, /* size of l1 cache. */
1973 256, /* size of l2 cache. */
1974 64, /* size of prefetch block */
1975 6, /* number of parallel prefetches */
1976 3, /* Branch cost */
1977 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1978 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1979 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1980 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1981 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1982 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1983 intel_memcpy,
1984 intel_memset,
1985 1, /* scalar_stmt_cost. */
1986 1, /* scalar load_cost. */
1987 1, /* scalar_store_cost. */
1988 1, /* vec_stmt_cost. */
1989 4, /* vec_to_scalar_cost. */
1990 1, /* scalar_to_vec_cost. */
1991 1, /* vec_align_load_cost. */
1992 2, /* vec_unalign_load_cost. */
1993 1, /* vec_store_cost. */
1994 3, /* cond_taken_branch_cost. */
1995 1, /* cond_not_taken_branch_cost. */
1998 /* Generic should produce code tuned for Core-i7 (and newer chips)
1999 and btver1 (and newer chips). */
2001 static stringop_algs generic_memcpy[2] = {
2002 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2003 {-1, libcall, false}}},
2004 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2005 {-1, libcall, false}}}};
2006 static stringop_algs generic_memset[2] = {
2007 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2008 {-1, libcall, false}}},
2009 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2010 {-1, libcall, false}}}};
2011 static const
2012 struct processor_costs generic_cost = {
2013 COSTS_N_INSNS (1), /* cost of an add instruction */
2014 /* On all chips taken into consideration, lea is 2 cycles or more. With
2015 this cost, however, our current implementation of synth_mult results in
2016 the use of unnecessary temporary registers, causing regressions on several
2017 SPECfp benchmarks. */
2018 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2019 COSTS_N_INSNS (1), /* variable shift costs */
2020 COSTS_N_INSNS (1), /* constant shift costs */
2021 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2022 COSTS_N_INSNS (4), /* HI */
2023 COSTS_N_INSNS (3), /* SI */
2024 COSTS_N_INSNS (4), /* DI */
2025 COSTS_N_INSNS (2)}, /* other */
2026 0, /* cost of multiply per each bit set */
2027 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2028 COSTS_N_INSNS (26), /* HI */
2029 COSTS_N_INSNS (42), /* SI */
2030 COSTS_N_INSNS (74), /* DI */
2031 COSTS_N_INSNS (74)}, /* other */
2032 COSTS_N_INSNS (1), /* cost of movsx */
2033 COSTS_N_INSNS (1), /* cost of movzx */
2034 8, /* "large" insn */
2035 17, /* MOVE_RATIO */
2036 4, /* cost for loading QImode using movzbl */
2037 {4, 4, 4}, /* cost of loading integer registers
2038 in QImode, HImode and SImode.
2039 Relative to reg-reg move (2). */
2040 {4, 4, 4}, /* cost of storing integer registers */
2041 4, /* cost of reg,reg fld/fst */
2042 {12, 12, 12}, /* cost of loading fp registers
2043 in SFmode, DFmode and XFmode */
2044 {6, 6, 8}, /* cost of storing fp registers
2045 in SFmode, DFmode and XFmode */
2046 2, /* cost of moving MMX register */
2047 {8, 8}, /* cost of loading MMX registers
2048 in SImode and DImode */
2049 {8, 8}, /* cost of storing MMX registers
2050 in SImode and DImode */
2051 2, /* cost of moving SSE register */
2052 {8, 8, 8}, /* cost of loading SSE registers
2053 in SImode, DImode and TImode */
2054 {8, 8, 8}, /* cost of storing SSE registers
2055 in SImode, DImode and TImode */
2056 5, /* MMX or SSE register to integer */
2057 32, /* size of l1 cache. */
2058 512, /* size of l2 cache. */
2059 64, /* size of prefetch block */
2060 6, /* number of parallel prefetches */
2061 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
2062 value is increased to the perhaps more appropriate value of 5. */
2063 3, /* Branch cost */
2064 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2065 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2066 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2067 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2068 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2069 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2070 generic_memcpy,
2071 generic_memset,
2072 1, /* scalar_stmt_cost. */
2073 1, /* scalar load_cost. */
2074 1, /* scalar_store_cost. */
2075 1, /* vec_stmt_cost. */
2076 1, /* vec_to_scalar_cost. */
2077 1, /* scalar_to_vec_cost. */
2078 1, /* vec_align_load_cost. */
2079 2, /* vec_unalign_load_cost. */
2080 1, /* vec_store_cost. */
2081 3, /* cond_taken_branch_cost. */
2082 1, /* cond_not_taken_branch_cost. */
2085 /* core_cost should produce code tuned for the Core family of CPUs. */
2086 static stringop_algs core_memcpy[2] = {
2087 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2088 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2089 {-1, libcall, false}}}};
2090 static stringop_algs core_memset[2] = {
2091 {libcall, {{6, loop_1_byte, true},
2092 {24, loop, true},
2093 {8192, rep_prefix_4_byte, true},
2094 {-1, libcall, false}}},
2095 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2096 {-1, libcall, false}}}};
2098 static const
2099 struct processor_costs core_cost = {
2100 COSTS_N_INSNS (1), /* cost of an add instruction */
2101 /* On all chips taken into consideration, lea is 2 cycles or more. With
2102 this cost, however, our current implementation of synth_mult results in
2103 the use of unnecessary temporary registers, causing regressions on several
2104 SPECfp benchmarks. */
2105 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2106 COSTS_N_INSNS (1), /* variable shift costs */
2107 COSTS_N_INSNS (1), /* constant shift costs */
2108 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2109 COSTS_N_INSNS (4), /* HI */
2110 COSTS_N_INSNS (3), /* SI */
2111 COSTS_N_INSNS (4), /* DI */
2112 COSTS_N_INSNS (2)}, /* other */
2113 0, /* cost of multiply per each bit set */
2114 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2115 COSTS_N_INSNS (26), /* HI */
2116 COSTS_N_INSNS (42), /* SI */
2117 COSTS_N_INSNS (74), /* DI */
2118 COSTS_N_INSNS (74)}, /* other */
2119 COSTS_N_INSNS (1), /* cost of movsx */
2120 COSTS_N_INSNS (1), /* cost of movzx */
2121 8, /* "large" insn */
2122 17, /* MOVE_RATIO */
2123 4, /* cost for loading QImode using movzbl */
2124 {4, 4, 4}, /* cost of loading integer registers
2125 in QImode, HImode and SImode.
2126 Relative to reg-reg move (2). */
2127 {4, 4, 4}, /* cost of storing integer registers */
2128 4, /* cost of reg,reg fld/fst */
2129 {12, 12, 12}, /* cost of loading fp registers
2130 in SFmode, DFmode and XFmode */
2131 {6, 6, 8}, /* cost of storing fp registers
2132 in SFmode, DFmode and XFmode */
2133 2, /* cost of moving MMX register */
2134 {8, 8}, /* cost of loading MMX registers
2135 in SImode and DImode */
2136 {8, 8}, /* cost of storing MMX registers
2137 in SImode and DImode */
2138 2, /* cost of moving SSE register */
2139 {8, 8, 8}, /* cost of loading SSE registers
2140 in SImode, DImode and TImode */
2141 {8, 8, 8}, /* cost of storing SSE registers
2142 in SImode, DImode and TImode */
2143 5, /* MMX or SSE register to integer */
2144 64, /* size of l1 cache. */
2145 512, /* size of l2 cache. */
2146 64, /* size of prefetch block */
2147 6, /* number of parallel prefetches */
2148 /* FIXME perhaps more appropriate value is 5. */
2149 3, /* Branch cost */
2150 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2151 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2152 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2153 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2154 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2155 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2156 core_memcpy,
2157 core_memset,
2158 1, /* scalar_stmt_cost. */
2159 1, /* scalar load_cost. */
2160 1, /* scalar_store_cost. */
2161 1, /* vec_stmt_cost. */
2162 1, /* vec_to_scalar_cost. */
2163 1, /* scalar_to_vec_cost. */
2164 1, /* vec_align_load_cost. */
2165 2, /* vec_unalign_load_cost. */
2166 1, /* vec_store_cost. */
2167 3, /* cond_taken_branch_cost. */
2168 1, /* cond_not_taken_branch_cost. */
2172 /* Set by -mtune. */
2173 const struct processor_costs *ix86_tune_cost = &pentium_cost;
2175 /* Set by -mtune or -Os. */
2176 const struct processor_costs *ix86_cost = &pentium_cost;
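/* For example (illustration only): -mtune=znver1 makes both pointers refer
   to &znver1_cost via processor_target_table below, while -Os roughly
   selects a size-oriented cost table for ix86_cost instead.  */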
2178 /* Processor feature/optimization bitmasks. */
2179 #define m_386 (1U<<PROCESSOR_I386)
2180 #define m_486 (1U<<PROCESSOR_I486)
2181 #define m_PENT (1U<<PROCESSOR_PENTIUM)
2182 #define m_LAKEMONT (1U<<PROCESSOR_LAKEMONT)
2183 #define m_PPRO (1U<<PROCESSOR_PENTIUMPRO)
2184 #define m_PENT4 (1U<<PROCESSOR_PENTIUM4)
2185 #define m_NOCONA (1U<<PROCESSOR_NOCONA)
2186 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
2187 #define m_CORE2 (1U<<PROCESSOR_CORE2)
2188 #define m_NEHALEM (1U<<PROCESSOR_NEHALEM)
2189 #define m_SANDYBRIDGE (1U<<PROCESSOR_SANDYBRIDGE)
2190 #define m_HASWELL (1U<<PROCESSOR_HASWELL)
2191 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
2192 #define m_BONNELL (1U<<PROCESSOR_BONNELL)
2193 #define m_SILVERMONT (1U<<PROCESSOR_SILVERMONT)
2194 #define m_KNL (1U<<PROCESSOR_KNL)
2195 #define m_SKYLAKE_AVX512 (1U<<PROCESSOR_SKYLAKE_AVX512)
2196 #define m_INTEL (1U<<PROCESSOR_INTEL)
2198 #define m_GEODE (1U<<PROCESSOR_GEODE)
2199 #define m_K6 (1U<<PROCESSOR_K6)
2200 #define m_K6_GEODE (m_K6 | m_GEODE)
2201 #define m_K8 (1U<<PROCESSOR_K8)
2202 #define m_ATHLON (1U<<PROCESSOR_ATHLON)
2203 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
2204 #define m_AMDFAM10 (1U<<PROCESSOR_AMDFAM10)
2205 #define m_BDVER1 (1U<<PROCESSOR_BDVER1)
2206 #define m_BDVER2 (1U<<PROCESSOR_BDVER2)
2207 #define m_BDVER3 (1U<<PROCESSOR_BDVER3)
2208 #define m_BDVER4 (1U<<PROCESSOR_BDVER4)
2209 #define m_ZNVER1 (1U<<PROCESSOR_ZNVER1)
2210 #define m_BTVER1 (1U<<PROCESSOR_BTVER1)
2211 #define m_BTVER2 (1U<<PROCESSOR_BTVER2)
2212 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
2213 #define m_BTVER (m_BTVER1 | m_BTVER2)
2214 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
2215 | m_ZNVER1)
2217 #define m_GENERIC (1U<<PROCESSOR_GENERIC)
2219 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
2220 #undef DEF_TUNE
2221 #define DEF_TUNE(tune, name, selector) name,
2222 #include "x86-tune.def"
2223 #undef DEF_TUNE
2226 /* Feature tests against the various tunings. */
2227 unsigned char ix86_tune_features[X86_TUNE_LAST];
2229 /* Feature tests against the various tunings used to create ix86_tune_features
2230 based on the processor mask. */
2231 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
2232 #undef DEF_TUNE
2233 #define DEF_TUNE(tune, name, selector) selector,
2234 #include "x86-tune.def"
2235 #undef DEF_TUNE
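/* As an illustration (hypothetical entry): a line in x86-tune.def such as
     DEF_TUNE (X86_TUNE_EXAMPLE, "example", m_K8 | m_GENERIC)
   contributes the string "example" to ix86_tune_feature_names above and the
   selector mask (m_K8 | m_GENERIC) to initial_ix86_tune_features here.  */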
2238 /* Feature tests against the various architecture variations. */
2239 unsigned char ix86_arch_features[X86_ARCH_LAST];
2241 /* Feature tests against the various architecture variations, used to create
2242 ix86_arch_features based on the processor mask. */
2243 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2244 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2245 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
2247 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2248 ~m_386,
2250 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2251 ~(m_386 | m_486),
2253 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2254 ~m_386,
2256 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2257 ~m_386,
2260 /* In case the average insn count for a single function invocation is
2261 lower than this constant, emit fast (but longer) prologue and
2262 epilogue code. */
2263 #define FAST_PROLOGUE_INSN_COUNT 20
2265 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2266 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2267 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2268 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2270 /* Array of the smallest class containing reg number REGNO, indexed by
2271 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2273 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2275 /* ax, dx, cx, bx */
2276 AREG, DREG, CREG, BREG,
2277 /* si, di, bp, sp */
2278 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2279 /* FP registers */
2280 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2281 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2282 /* arg pointer */
2283 NON_Q_REGS,
2284 /* flags, fpsr, fpcr, frame */
2285 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2286 /* SSE registers */
2287 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2288 SSE_REGS, SSE_REGS,
2289 /* MMX registers */
2290 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2291 MMX_REGS, MMX_REGS,
2292 /* REX registers */
2293 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2294 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2295 /* SSE REX registers */
2296 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2297 SSE_REGS, SSE_REGS,
2298 /* AVX-512 SSE registers */
2299 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2300 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2301 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2302 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2303 /* Mask registers. */
2304 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2305 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2306 /* MPX bound registers */
2307 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
2310 /* The "default" register map used in 32bit mode. */
2312 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2314 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2315 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2316 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2317 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2318 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2319 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2320 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2321 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2322 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2323 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2324 101, 102, 103, 104, /* bound registers */
2327 /* The "default" register map used in 64bit mode. */
2329 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2331 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2332 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2333 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2334 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2335 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2336 8,9,10,11,12,13,14,15, /* extended integer registers */
2337 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2338 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
2339 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
2340 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
2341 126, 127, 128, 129, /* bound registers */
2344 /* Define the register numbers to be used in Dwarf debugging information.
2345 The SVR4 reference port C compiler uses the following register numbers
2346 in its Dwarf output code:
2347 0 for %eax (gcc regno = 0)
2348 1 for %ecx (gcc regno = 2)
2349 2 for %edx (gcc regno = 1)
2350 3 for %ebx (gcc regno = 3)
2351 4 for %esp (gcc regno = 7)
2352 5 for %ebp (gcc regno = 6)
2353 6 for %esi (gcc regno = 4)
2354 7 for %edi (gcc regno = 5)
2355 The following three DWARF register numbers are never generated by
2356 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2357 believes these numbers have these meanings.
2358 8 for %eip (no gcc equivalent)
2359 9 for %eflags (gcc regno = 17)
2360 10 for %trapno (no gcc equivalent)
2361 It is not at all clear how we should number the FP stack registers
2362 for the x86 architecture. If the version of SDB on x86/svr4 were
2363 a bit less brain dead with respect to floating-point then we would
2364 have a precedent to follow with respect to DWARF register numbers
2365 for x86 FP registers, but the SDB on x86/svr4 is so completely
2366 broken with respect to FP registers that it is hardly worth thinking
2367 of it as something to strive for compatibility with.
2368 The version of x86/svr4 SDB I have at the moment does (partially)
2369 seem to believe that DWARF register number 11 is associated with
2370 the x86 register %st(0), but that's about all. Higher DWARF
2371 register numbers don't seem to be associated with anything in
2372 particular, and even for DWARF regno 11, SDB only seems to under-
2373 stand that it should say that a variable lives in %st(0) (when
2374 asked via an `=' command) if we said it was in DWARF regno 11,
2375 but SDB still prints garbage when asked for the value of the
2376 variable in question (via a `/' command).
2377 (Also note that the labels SDB prints for various FP stack regs
2378 when doing an `x' command are all wrong.)
2379 Note that these problems generally don't affect the native SVR4
2380 C compiler because it doesn't allow the use of -O with -g and
2381 because when it is *not* optimizing, it allocates a memory
2382 location for each floating-point variable, and the memory
2383 location is what gets described in the DWARF AT_location
2384 attribute for the variable in question.
2385 Regardless of the severe mental illness of the x86/svr4 SDB, we
2386 do something sensible here and we use the following DWARF
2387 register numbers. Note that these are all stack-top-relative
2388 numbers.
2389 11 for %st(0) (gcc regno = 8)
2390 12 for %st(1) (gcc regno = 9)
2391 13 for %st(2) (gcc regno = 10)
2392 14 for %st(3) (gcc regno = 11)
2393 15 for %st(4) (gcc regno = 12)
2394 16 for %st(5) (gcc regno = 13)
2395 17 for %st(6) (gcc regno = 14)
2396 18 for %st(7) (gcc regno = 15)
2397 */
2398 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2400 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2401 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2402 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2403 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2404 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2405 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2406 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2407 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2408 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2409 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2410 101, 102, 103, 104, /* bound registers */
2413 /* Define parameter passing and return registers. */
2415 static int const x86_64_int_parameter_registers[6] =
2417 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2420 static int const x86_64_ms_abi_int_parameter_registers[4] =
2422 CX_REG, DX_REG, R8_REG, R9_REG
2425 static int const x86_64_int_return_registers[4] =
2427 AX_REG, DX_REG, DI_REG, SI_REG
2430 /* Additional registers that are clobbered by SYSV calls. */
2432 #define NUM_X86_64_MS_CLOBBERED_REGS 12
2433 static int const x86_64_ms_sysv_extra_clobbered_registers
2434 [NUM_X86_64_MS_CLOBBERED_REGS] =
2436 SI_REG, DI_REG,
2437 XMM6_REG, XMM7_REG,
2438 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2439 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
2442 enum xlogue_stub {
2443 XLOGUE_STUB_SAVE,
2444 XLOGUE_STUB_RESTORE,
2445 XLOGUE_STUB_RESTORE_TAIL,
2446 XLOGUE_STUB_SAVE_HFP,
2447 XLOGUE_STUB_RESTORE_HFP,
2448 XLOGUE_STUB_RESTORE_HFP_TAIL,
2450 XLOGUE_STUB_COUNT
2453 enum xlogue_stub_sets {
2454 XLOGUE_SET_ALIGNED,
2455 XLOGUE_SET_ALIGNED_PLUS_8,
2456 XLOGUE_SET_HFP_ALIGNED_OR_REALIGN,
2457 XLOGUE_SET_HFP_ALIGNED_PLUS_8,
2459 XLOGUE_SET_COUNT
2462 /* Register save/restore layout used by out-of-line stubs. */
2463 class xlogue_layout {
2464 public:
2465 struct reginfo
2467 unsigned regno;
2468 HOST_WIDE_INT offset; /* Offset used by stub base pointer (rax or
2469 rsi) to where each register is stored. */
2472 unsigned get_nregs () const {return m_nregs;}
2473 HOST_WIDE_INT get_stack_align_off_in () const {return m_stack_align_off_in;}
2475 const reginfo &get_reginfo (unsigned reg) const
2477 gcc_assert (reg < m_nregs);
2478 return m_regs[reg];
2481 static const char *get_stub_name (enum xlogue_stub stub,
2482 unsigned n_extra_args);
2484 /* Returns an rtx for the stub's symbol based upon
2485 1.) the specified stub (save, restore or restore_ret) and
2486 2.) the value of cfun->machine->call_ms2sysv_extra_regs and
2487 3.) whether or not stack alignment is being performed. */
2488 static rtx get_stub_rtx (enum xlogue_stub stub);
2490 /* Returns the amount of stack space (including padding) that the stub
2491 needs to store registers based upon data in the machine_function. */
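/* As a rough worked example: with call_ms2sysv_extra_regs == 0 in the
   aligned, no-frame-pointer layout, last_reg indexes DI_REG, whose stored
   offset is 0xb0 - STUB_INDEX_OFFSET, so the stub uses 0xb0 (176) bytes;
   compare the DI_REG row in REG_ORDER below.  */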
2492 HOST_WIDE_INT get_stack_space_used () const
2494 const struct machine_function *m = cfun->machine;
2495 unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1;
2497 gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS);
2498 return m_regs[last_reg].offset + STUB_INDEX_OFFSET;
2501 /* Returns the offset for the base pointer used by the stub. */
2502 HOST_WIDE_INT get_stub_ptr_offset () const
2504 return STUB_INDEX_OFFSET + m_stack_align_off_in;
2507 static const struct xlogue_layout &get_instance ();
2508 static unsigned count_stub_managed_regs ();
2509 static bool is_stub_managed_reg (unsigned regno, unsigned count);
2511 static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70;
2512 static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS;
2513 static const unsigned MAX_REGS = 18;
2514 static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS;
2515 static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1;
2516 static const unsigned STUB_NAME_MAX_LEN = 16;
2517 static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT];
2518 static const unsigned REG_ORDER[MAX_REGS];
2519 static const unsigned REG_ORDER_REALIGN[MAX_REGS];
2521 private:
2522 xlogue_layout ();
2523 xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp);
2524 xlogue_layout (const xlogue_layout &);
2526 /* True if hard frame pointer is used. */
2527 bool m_hfp;
2529 /* Max number of registers this layout manages. */
2530 unsigned m_nregs;
2532 /* Incoming offset from 16-byte alignment. */
2533 HOST_WIDE_INT m_stack_align_off_in;
2535 /* Register order and offsets. */
2536 struct reginfo m_regs[MAX_REGS];
2538 /* Lazy-inited cache of symbol names for stubs. */
2539 static char s_stub_names[XLOGUE_STUB_COUNT][VARIANT_COUNT]
2540 [STUB_NAME_MAX_LEN];
2542 static const xlogue_layout s_instances[XLOGUE_SET_COUNT];
2545 const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
2546 "savms64",
2547 "resms64",
2548 "resms64x",
2549 "savms64f",
2550 "resms64f",
2551 "resms64fx"
2554 const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
2555 /* The offset values below are where each register is stored for the layout
2556 relative to the incoming stack pointer. The value of each m_regs[].offset
2557 will be relative to the incoming base pointer (rax or rsi) used by the stub.
2559 s_instances: 0 1 2 3
2560 Offset: realigned or aligned + 8
2561 Register aligned aligned + 8 aligned w/HFP w/HFP */
2562 XMM15_REG, /* 0x10 0x18 0x10 0x18 */
2563 XMM14_REG, /* 0x20 0x28 0x20 0x28 */
2564 XMM13_REG, /* 0x30 0x38 0x30 0x38 */
2565 XMM12_REG, /* 0x40 0x48 0x40 0x48 */
2566 XMM11_REG, /* 0x50 0x58 0x50 0x58 */
2567 XMM10_REG, /* 0x60 0x68 0x60 0x68 */
2568 XMM9_REG, /* 0x70 0x78 0x70 0x78 */
2569 XMM8_REG, /* 0x80 0x88 0x80 0x88 */
2570 XMM7_REG, /* 0x90 0x98 0x90 0x98 */
2571 XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */
2572 SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */
2573 DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */
2574 BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */
2575 BP_REG, /* 0xc0 0xc8 N/A N/A */
2576 R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */
2577 R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */
2578 R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */
2579 R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */
2582 /* Instantiate static const values. */
2583 const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
2584 const unsigned xlogue_layout::MIN_REGS;
2585 const unsigned xlogue_layout::MAX_REGS;
2586 const unsigned xlogue_layout::MAX_EXTRA_REGS;
2587 const unsigned xlogue_layout::VARIANT_COUNT;
2588 const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
2590 /* Initialize xlogue_layout::s_stub_names to zero. */
2591 char xlogue_layout::s_stub_names[XLOGUE_STUB_COUNT][VARIANT_COUNT]
2592 [STUB_NAME_MAX_LEN];
2594 /* Instantiates all xlogue_layout instances. */
2595 const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
2596 xlogue_layout (0, false),
2597 xlogue_layout (8, false),
2598 xlogue_layout (0, true),
2599 xlogue_layout (8, true)
2602 /* Return an appropriate const instance of xlogue_layout based upon values
2603 in cfun->machine and crtl. */
2604 const struct xlogue_layout &
2605 xlogue_layout::get_instance ()
2607 enum xlogue_stub_sets stub_set;
2608 bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
2610 if (stack_realign_fp)
2611 stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
2612 else if (frame_pointer_needed)
2613 stub_set = aligned_plus_8
2614 ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
2615 : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
2616 else
2617 stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
2619 return s_instances[stub_set];
2622 /* Determine how many clobbered registers can be saved by the stub.
2623 Returns the count of registers the stub will save and restore. */
2624 unsigned
2625 xlogue_layout::count_stub_managed_regs ()
2627 bool hfp = frame_pointer_needed || stack_realign_fp;
2628 unsigned i, count;
2629 unsigned regno;
2631 for (count = i = MIN_REGS; i < MAX_REGS; ++i)
2633 regno = REG_ORDER[i];
2634 if (regno == BP_REG && hfp)
2635 continue;
2636 if (!ix86_save_reg (regno, false, false))
2637 break;
2638 ++count;
2640 return count;
2643 /* Determine if register REGNO is a stub managed register given the
2644 total COUNT of stub managed registers. */
2645 bool
2646 xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
2648 bool hfp = frame_pointer_needed || stack_realign_fp;
2649 unsigned i;
2651 for (i = 0; i < count; ++i)
2653 gcc_assert (i < MAX_REGS);
2654 if (REG_ORDER[i] == BP_REG && hfp)
2655 ++count;
2656 else if (REG_ORDER[i] == regno)
2657 return true;
2659 return false;
2662 /* Constructor for xlogue_layout. */
2663 xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
2664 : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
2665 m_stack_align_off_in (stack_align_off_in)
2667 HOST_WIDE_INT offset = stack_align_off_in;
2668 unsigned i, j;
2670 for (i = j = 0; i < MAX_REGS; ++i)
2672 unsigned regno = REG_ORDER[i];
2674 if (regno == BP_REG && hfp)
2675 continue;
2676 if (SSE_REGNO_P (regno))
2678 offset += 16;
2679 /* Verify that SSE regs are always aligned. */
2680 gcc_assert (!((stack_align_off_in + offset) & 15));
2682 else
2683 offset += 8;
2685 m_regs[j].regno = regno;
2686 m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
2688 gcc_assert (j == m_nregs);
2689 }
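/* Return the (lazily built) symbol name for STUB when N_EXTRA_REGS extra
   integer registers are handled.  For example, XLOGUE_STUB_SAVE with
   n_extra_regs == 2 yields "__savms64_14", since MIN_REGS is 12 and the
   base name is "savms64".  */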
2691 const char *
2692 xlogue_layout::get_stub_name (enum xlogue_stub stub,
2693 unsigned n_extra_regs)
2695 char *name = s_stub_names[stub][n_extra_regs];
2697 /* Lazy init */
2698 if (!*name)
2700 int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%u",
2701 STUB_BASE_NAMES[stub], MIN_REGS + n_extra_regs);
2702 gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
2705 return name;
2708 /* Return rtx of a symbol ref for the entry point (based upon
2709 cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
2710 rtx
2711 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
2713 const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
2714 gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
2715 gcc_assert (stub < XLOGUE_STUB_COUNT);
2716 gcc_assert (crtl->stack_realign_finalized);
2718 return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
2721 /* Define the structure for the machine field in struct function. */
2723 struct GTY(()) stack_local_entry {
2724 unsigned short mode;
2725 unsigned short n;
2726 rtx rtl;
2727 struct stack_local_entry *next;
2730 /* Which cpu are we scheduling for. */
2731 enum attr_cpu ix86_schedule;
2733 /* Which cpu are we optimizing for. */
2734 enum processor_type ix86_tune;
2736 /* Which instruction set architecture to use. */
2737 enum processor_type ix86_arch;
2739 /* True if processor has SSE prefetch instruction. */
2740 unsigned char x86_prefetch_sse;
2742 /* -mstackrealign option */
2743 static const char ix86_force_align_arg_pointer_string[]
2744 = "force_align_arg_pointer";
2746 static rtx (*ix86_gen_leave) (void);
2747 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2748 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2749 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2750 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2751 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2752 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
2753 static rtx (*ix86_gen_clzero) (rtx);
2754 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2755 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2756 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2757 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2758 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2759 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2761 /* Preferred alignment for stack boundary in bits. */
2762 unsigned int ix86_preferred_stack_boundary;
2764 /* Alignment for incoming stack boundary in bits specified at
2765 command line. */
2766 static unsigned int ix86_user_incoming_stack_boundary;
2768 /* Default alignment for incoming stack boundary in bits. */
2769 static unsigned int ix86_default_incoming_stack_boundary;
2771 /* Alignment for incoming stack boundary in bits. */
2772 unsigned int ix86_incoming_stack_boundary;
2774 /* Calling abi specific va_list type nodes. */
2775 static GTY(()) tree sysv_va_list_type_node;
2776 static GTY(()) tree ms_va_list_type_node;
2778 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2779 char internal_label_prefix[16];
2780 int internal_label_prefix_len;
2782 /* Fence to use after loop using movnt. */
2783 tree x86_mfence;
2785 /* Register class used for passing a given 64-bit part of the argument.
2786 These represent the classes documented by the psABI, with the exception
2787 of the SSESF and SSEDF classes, which are basically the SSE class; GCC
2788 just uses SFmode or DFmode moves instead of DImode to avoid reformatting
2790 penalties. Similarly, we play games with INTEGERSI_CLASS to use cheaper
2791 SImode moves whenever possible (the upper half does contain padding). */
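/* Roughly, per the psABI rules implemented later in this file (examples for
   illustration only): a lone `int` argument classifies as
   X86_64_INTEGERSI_CLASS, a lone `double` as X86_64_SSEDF_CLASS, and an
   aggregate too large for registers as X86_64_MEMORY_CLASS.  */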
2792 enum x86_64_reg_class
2794 X86_64_NO_CLASS,
2795 X86_64_INTEGER_CLASS,
2796 X86_64_INTEGERSI_CLASS,
2797 X86_64_SSE_CLASS,
2798 X86_64_SSESF_CLASS,
2799 X86_64_SSEDF_CLASS,
2800 X86_64_SSEUP_CLASS,
2801 X86_64_X87_CLASS,
2802 X86_64_X87UP_CLASS,
2803 X86_64_COMPLEX_X87_CLASS,
2804 X86_64_MEMORY_CLASS
2807 #define MAX_CLASSES 8
2809 /* Table of constants used by fldpi, fldln2, etc.... */
2810 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2811 static bool ext_80387_constants_init;
2814 static struct machine_function * ix86_init_machine_status (void);
2815 static rtx ix86_function_value (const_tree, const_tree, bool);
2816 static bool ix86_function_value_regno_p (const unsigned int);
2817 static unsigned int ix86_function_arg_boundary (machine_mode,
2818 const_tree);
2819 static rtx ix86_static_chain (const_tree, bool);
2820 static int ix86_function_regparm (const_tree, const_tree);
2821 static void ix86_compute_frame_layout (void);
2822 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
2823 rtx, rtx, int);
2824 static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
2825 static tree ix86_canonical_va_list_type (tree);
2826 static void predict_jump (int);
2827 static unsigned int split_stack_prologue_scratch_regno (void);
2828 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2830 enum ix86_function_specific_strings
2832 IX86_FUNCTION_SPECIFIC_ARCH,
2833 IX86_FUNCTION_SPECIFIC_TUNE,
2834 IX86_FUNCTION_SPECIFIC_MAX
2837 static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
2838 const char *, const char *, enum fpmath_unit,
2839 bool);
2840 static void ix86_function_specific_save (struct cl_target_option *,
2841 struct gcc_options *opts);
2842 static void ix86_function_specific_restore (struct gcc_options *opts,
2843 struct cl_target_option *);
2844 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
2845 static void ix86_function_specific_print (FILE *, int,
2846 struct cl_target_option *);
2847 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2848 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2849 struct gcc_options *,
2850 struct gcc_options *,
2851 struct gcc_options *);
2852 static bool ix86_can_inline_p (tree, tree);
2853 static void ix86_set_current_function (tree);
2854 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2856 static enum calling_abi ix86_function_abi (const_tree);
2859 #ifndef SUBTARGET32_DEFAULT_CPU
2860 #define SUBTARGET32_DEFAULT_CPU "i386"
2861 #endif
2863 /* Whether -mtune= or -march= were specified */
2864 static int ix86_tune_defaulted;
2865 static int ix86_arch_specified;
2867 /* Vectorization library interface and handlers. */
2868 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
2870 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
2871 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
2873 /* Processor target table, indexed by processor number */
2874 struct ptt
2876 const char *const name; /* processor name */
2877 const struct processor_costs *cost; /* Processor costs */
2878 const int align_loop; /* Default alignments. */
2879 const int align_loop_max_skip;
2880 const int align_jump;
2881 const int align_jump_max_skip;
2882 const int align_func;
2885 /* This table must be in sync with enum processor_type in i386.h. */
2886 static const struct ptt processor_target_table[PROCESSOR_max] =
2888 {"generic", &generic_cost, 16, 10, 16, 10, 16},
2889 {"i386", &i386_cost, 4, 3, 4, 3, 4},
2890 {"i486", &i486_cost, 16, 15, 16, 15, 16},
2891 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
2892 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
2893 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
2894 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
2895 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
2896 {"core2", &core_cost, 16, 10, 16, 10, 16},
2897 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
2898 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
2899 {"haswell", &core_cost, 16, 10, 16, 10, 16},
2900 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
2901 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
2902 {"knl", &slm_cost, 16, 15, 16, 7, 16},
2903 {"skylake-avx512", &core_cost, 16, 10, 16, 10, 16},
2904 {"intel", &intel_cost, 16, 15, 16, 7, 16},
2905 {"geode", &geode_cost, 0, 0, 0, 0, 0},
2906 {"k6", &k6_cost, 32, 7, 32, 7, 32},
2907 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
2908 {"k8", &k8_cost, 16, 7, 16, 7, 16},
2909 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
2910 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
2911 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
2912 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
2913 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
2914 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
2915 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
2916 {"znver1", &znver1_cost, 16, 15, 16, 15, 16}
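/* Worker for the vzeroupper insertion pass; called from
   pass_insert_vzeroupper::execute below.  */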
2919 static unsigned int
2920 rest_of_handle_insert_vzeroupper (void)
2922 int i;
2924 /* vzeroupper instructions are inserted immediately after reload to
2925 account for possible spills from 256-bit registers.  The pass
2926 reuses the mode switching infrastructure by re-running the mode
2927 switching pass, so disable entities that have already been processed. */
2928 for (i = 0; i < MAX_386_ENTITIES; i++)
2929 ix86_optimize_mode_switching[i] = 0;
2931 ix86_optimize_mode_switching[AVX_U128] = 1;
2933 /* Call optimize_mode_switching. */
2934 g->get_passes ()->execute_pass_mode_switching ();
2935 return 0;
2938 /* Return true if INSN uses or defines a hard register.
2939 Hard register uses in a memory address are ignored.
2940 Clobbers and flags definitions are ignored. */
2942 static bool
2943 has_non_address_hard_reg (rtx_insn *insn)
2945 df_ref ref;
2946 FOR_EACH_INSN_DEF (ref, insn)
2947 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
2948 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
2949 && DF_REF_REGNO (ref) != FLAGS_REG)
2950 return true;
2952 FOR_EACH_INSN_USE (ref, insn)
2953 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
2954 return true;
2956 return false;
2959 /* Check if comparison INSN may be transformed
2960 into vector comparison. Currently we transform
2961 zero checks only which look like:
2963 (set (reg:CCZ 17 flags)
2964 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
2965 (subreg:SI (reg:DI x) 0))
2966 (const_int 0 [0]))) */
2968 static bool
2969 convertible_comparison_p (rtx_insn *insn)
2971 if (!TARGET_SSE4_1)
2972 return false;
2974 rtx def_set = single_set (insn);
2976 gcc_assert (def_set);
2978 rtx src = SET_SRC (def_set);
2979 rtx dst = SET_DEST (def_set);
2981 gcc_assert (GET_CODE (src) == COMPARE);
2983 if (GET_CODE (dst) != REG
2984 || REGNO (dst) != FLAGS_REG
2985 || GET_MODE (dst) != CCZmode)
2986 return false;
2988 rtx op1 = XEXP (src, 0);
2989 rtx op2 = XEXP (src, 1);
2991 if (op2 != CONST0_RTX (GET_MODE (op2)))
2992 return false;
2994 if (GET_CODE (op1) != IOR)
2995 return false;
2997 op2 = XEXP (op1, 1);
2998 op1 = XEXP (op1, 0);
3000 if (!SUBREG_P (op1)
3001 || !SUBREG_P (op2)
3002 || GET_MODE (op1) != SImode
3003 || GET_MODE (op2) != SImode
3004 || ((SUBREG_BYTE (op1) != 0
3005 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
3006 && (SUBREG_BYTE (op2) != 0
3007 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
3008 return false;
3010 op1 = SUBREG_REG (op1);
3011 op2 = SUBREG_REG (op2);
3013 if (op1 != op2
3014 || !REG_P (op1)
3015 || GET_MODE (op1) != DImode)
3016 return false;
3018 return true;
3021 /* The DImode version of scalar_to_vector_candidate_p. */
3023 static bool
3024 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
3026 rtx def_set = single_set (insn);
3028 if (!def_set)
3029 return false;
3031 if (has_non_address_hard_reg (insn))
3032 return false;
3034 rtx src = SET_SRC (def_set);
3035 rtx dst = SET_DEST (def_set);
3037 if (GET_CODE (src) == COMPARE)
3038 return convertible_comparison_p (insn);
3040 /* We are interested in DImode promotion only. */
3041 if ((GET_MODE (src) != DImode
3042 && !CONST_INT_P (src))
3043 || GET_MODE (dst) != DImode)
3044 return false;
3046 if (!REG_P (dst) && !MEM_P (dst))
3047 return false;
3049 switch (GET_CODE (src))
3051 case ASHIFTRT:
3052 if (!TARGET_AVX512VL)
3053 return false;
3054 /* FALLTHRU */
3056 case ASHIFT:
3057 case LSHIFTRT:
3058 if (!REG_P (XEXP (src, 1))
3059 && (!SUBREG_P (XEXP (src, 1))
3060 || SUBREG_BYTE (XEXP (src, 1)) != 0
3061 || !REG_P (SUBREG_REG (XEXP (src, 1))))
3062 && (!CONST_INT_P (XEXP (src, 1))
3063 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)))
3064 return false;
3066 if (GET_MODE (XEXP (src, 1)) != QImode
3067 && !CONST_INT_P (XEXP (src, 1)))
3068 return false;
3069 break;
3071 case PLUS:
3072 case MINUS:
3073 case IOR:
3074 case XOR:
3075 case AND:
3076 if (!REG_P (XEXP (src, 1))
3077 && !MEM_P (XEXP (src, 1))
3078 && !CONST_INT_P (XEXP (src, 1)))
3079 return false;
3081 if (GET_MODE (XEXP (src, 1)) != DImode
3082 && !CONST_INT_P (XEXP (src, 1)))
3083 return false;
3084 break;
3086 case NEG:
3087 case NOT:
3088 break;
3090 case REG:
3091 return true;
3093 case MEM:
3094 case CONST_INT:
3095 return REG_P (dst);
3097 default:
3098 return false;
3101 if (!REG_P (XEXP (src, 0))
3102 && !MEM_P (XEXP (src, 0))
3103 && !CONST_INT_P (XEXP (src, 0))
3104 /* Check for andnot case. */
3105 && (GET_CODE (src) != AND
3106 || GET_CODE (XEXP (src, 0)) != NOT
3107 || !REG_P (XEXP (XEXP (src, 0), 0))))
3108 return false;
3110 if (GET_MODE (XEXP (src, 0)) != DImode
3111 && !CONST_INT_P (XEXP (src, 0)))
3112 return false;
3114 return true;
3117 /* The TImode version of scalar_to_vector_candidate_p. */
3119 static bool
3120 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
3122 rtx def_set = single_set (insn);
3124 if (!def_set)
3125 return false;
3127 if (has_non_address_hard_reg (insn))
3128 return false;
3130 rtx src = SET_SRC (def_set);
3131 rtx dst = SET_DEST (def_set);
3133 /* Only TImode load and store are allowed. */
3134 if (GET_MODE (dst) != TImode)
3135 return false;
3137 if (MEM_P (dst))
3139 /* Check for a store.  The memory must be aligned, or unaligned
3140 stores must be optimal.  Only stores from a register, a standard SSE
3141 constant or a CONST_WIDE_INT generated from a piecewise store are supported.
3143 ??? Verify performance impact before enabling CONST_INT for
3144 __int128 store. */
3145 if (misaligned_operand (dst, TImode)
3146 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
3147 return false;
3149 switch (GET_CODE (src))
3151 default:
3152 return false;
3154 case REG:
3155 case CONST_WIDE_INT:
3156 return true;
3158 case CONST_INT:
3159 return standard_sse_constant_p (src, TImode);
3162 else if (MEM_P (src))
3164 /* Check for a load.  The memory must be aligned, or unaligned
3165 loads must be optimal. */
3166 return (REG_P (dst)
3167 && (!misaligned_operand (src, TImode)
3168 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
3171 return false;
3174 /* Return true if INSN may be converted into a vector
3175 instruction. */
3177 static bool
3178 scalar_to_vector_candidate_p (rtx_insn *insn)
3180 if (TARGET_64BIT)
3181 return timode_scalar_to_vector_candidate_p (insn);
3182 else
3183 return dimode_scalar_to_vector_candidate_p (insn);
3186 /* The DImode version of remove_non_convertible_regs. */
3188 static void
3189 dimode_remove_non_convertible_regs (bitmap candidates)
3191 bitmap_iterator bi;
3192 unsigned id;
3193 bitmap regs = BITMAP_ALLOC (NULL);
3195 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
3197 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
3198 rtx reg = SET_DEST (def_set);
3200 if (!REG_P (reg)
3201 || bitmap_bit_p (regs, REGNO (reg))
3202 || HARD_REGISTER_P (reg))
3203 continue;
3205 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
3206 def;
3207 def = DF_REF_NEXT_REG (def))
3209 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3211 if (dump_file)
3212 fprintf (dump_file,
3213 "r%d has non convertible definition in insn %d\n",
3214 REGNO (reg), DF_REF_INSN_UID (def));
3216 bitmap_set_bit (regs, REGNO (reg));
3217 break;
3222 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
3224 for (df_ref def = DF_REG_DEF_CHAIN (id);
3225 def;
3226 def = DF_REF_NEXT_REG (def))
3227 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3229 if (dump_file)
3230 fprintf (dump_file, "Removing insn %d from candidates list\n",
3231 DF_REF_INSN_UID (def));
3233 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
3237 BITMAP_FREE (regs);
3240 /* For a register REGNO, scan instructions for its defs and uses.
3241 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
3243 static void
3244 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
3245 unsigned int regno)
3247 for (df_ref def = DF_REG_DEF_CHAIN (regno);
3248 def;
3249 def = DF_REF_NEXT_REG (def))
3251 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3253 if (dump_file)
3254 fprintf (dump_file,
3255 "r%d has non convertible def in insn %d\n",
3256 regno, DF_REF_INSN_UID (def));
3258 bitmap_set_bit (regs, regno);
3259 break;
3263 for (df_ref ref = DF_REG_USE_CHAIN (regno);
3264 ref;
3265 ref = DF_REF_NEXT_REG (ref))
3267 /* Debug instructions are skipped. */
3268 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
3269 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3271 if (dump_file)
3272 fprintf (dump_file,
3273 "r%d has non convertible use in insn %d\n",
3274 regno, DF_REF_INSN_UID (ref));
3276 bitmap_set_bit (regs, regno);
3277 break;
3282 /* The TImode version of remove_non_convertible_regs. */
3284 static void
3285 timode_remove_non_convertible_regs (bitmap candidates)
3287 bitmap_iterator bi;
3288 unsigned id;
3289 bitmap regs = BITMAP_ALLOC (NULL);
3291 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
3293 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
3294 rtx dest = SET_DEST (def_set);
3295 rtx src = SET_SRC (def_set);
3297 if ((!REG_P (dest)
3298 || bitmap_bit_p (regs, REGNO (dest))
3299 || HARD_REGISTER_P (dest))
3300 && (!REG_P (src)
3301 || bitmap_bit_p (regs, REGNO (src))
3302 || HARD_REGISTER_P (src)))
3303 continue;
3305 if (REG_P (dest))
3306 timode_check_non_convertible_regs (candidates, regs,
3307 REGNO (dest));
3309 if (REG_P (src))
3310 timode_check_non_convertible_regs (candidates, regs,
3311 REGNO (src));
3314 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
3316 for (df_ref def = DF_REG_DEF_CHAIN (id);
3317 def;
3318 def = DF_REF_NEXT_REG (def))
3319 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3321 if (dump_file)
3322 fprintf (dump_file, "Removing insn %d from candidates list\n",
3323 DF_REF_INSN_UID (def));
3325 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
3328 for (df_ref ref = DF_REG_USE_CHAIN (id);
3329 ref;
3330 ref = DF_REF_NEXT_REG (ref))
3331 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3333 if (dump_file)
3334 fprintf (dump_file, "Removing insn %d from candidates list\n",
3335 DF_REF_INSN_UID (ref));
3337 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
3341 BITMAP_FREE (regs);
3344 /* For a given bitmap of insn UIDs, scan all instructions and
3345 remove an insn from CANDIDATES if it has both convertible
3346 and non-convertible definitions.
3348 All insns in a bitmap are conversion candidates according to
3349 scalar_to_vector_candidate_p. Currently it implies all insns
3350 are single_set. */
3352 static void
3353 remove_non_convertible_regs (bitmap candidates)
3355 if (TARGET_64BIT)
3356 timode_remove_non_convertible_regs (candidates);
3357 else
3358 dimode_remove_non_convertible_regs (candidates);
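/* A scalar chain is a connected group of candidate instructions, linked
   through the pseudo registers they define and use, that is analyzed and
   converted to vector mode as a whole.  dimode_scalar_chain and
   timode_scalar_chain below implement the DImode -> V2DImode (32-bit)
   and TImode -> V1TImode (64-bit) variants of the conversion.  */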
3361 class scalar_chain
3363 public:
3364 scalar_chain ();
3365 virtual ~scalar_chain ();
3367 static unsigned max_id;
3369 /* ID of a chain. */
3370 unsigned int chain_id;
3371 /* A queue of instructions to be included into a chain. */
3372 bitmap queue;
3373 /* Instructions included into a chain. */
3374 bitmap insns;
3375 /* All registers defined by a chain. */
3376 bitmap defs;
3377 /* Registers used in both vector and scalar modes. */
3378 bitmap defs_conv;
3380 void build (bitmap candidates, unsigned insn_uid);
3381 virtual int compute_convert_gain () = 0;
3382 int convert ();
3384 protected:
3385 void add_to_queue (unsigned insn_uid);
3386 void emit_conversion_insns (rtx insns, rtx_insn *pos);
3388 private:
3389 void add_insn (bitmap candidates, unsigned insn_uid);
3390 void analyze_register_chain (bitmap candidates, df_ref ref);
3391 virtual void mark_dual_mode_def (df_ref def) = 0;
3392 virtual void convert_insn (rtx_insn *insn) = 0;
3393 virtual void convert_registers () = 0;
3396 class dimode_scalar_chain : public scalar_chain
3398 public:
3399 int compute_convert_gain ();
3400 private:
3401 void mark_dual_mode_def (df_ref def);
3402 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
3403 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
3404 void convert_insn (rtx_insn *insn);
3405 void convert_op (rtx *op, rtx_insn *insn);
3406 void convert_reg (unsigned regno);
3407 void make_vector_copies (unsigned regno);
3408 void convert_registers ();
3409 int vector_const_cost (rtx exp);
3412 class timode_scalar_chain : public scalar_chain
3414 public:
3415 /* Converting from TImode to V1TImode is always faster. */
3416 int compute_convert_gain () { return 1; }
3418 private:
3419 void mark_dual_mode_def (df_ref def);
3420 void fix_debug_reg_uses (rtx reg);
3421 void convert_insn (rtx_insn *insn);
3422 /* We don't convert registers to a different size. */
3423 void convert_registers () {}
3426 unsigned scalar_chain::max_id = 0;
3428 /* Initialize new chain. */
3430 scalar_chain::scalar_chain ()
3432 chain_id = ++max_id;
3434 if (dump_file)
3435 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
3437 bitmap_obstack_initialize (NULL);
3438 insns = BITMAP_ALLOC (NULL);
3439 defs = BITMAP_ALLOC (NULL);
3440 defs_conv = BITMAP_ALLOC (NULL);
3441 queue = NULL;
3444 /* Free chain's data. */
3446 scalar_chain::~scalar_chain ()
3448 BITMAP_FREE (insns);
3449 BITMAP_FREE (defs);
3450 BITMAP_FREE (defs_conv);
3451 bitmap_obstack_release (NULL);
3454 /* Add an instruction to the chain's queue. */
3456 void
3457 scalar_chain::add_to_queue (unsigned insn_uid)
3459 if (bitmap_bit_p (insns, insn_uid)
3460 || bitmap_bit_p (queue, insn_uid))
3461 return;
3463 if (dump_file)
3464 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
3465 insn_uid, chain_id);
3466 bitmap_set_bit (queue, insn_uid);
3469 /* For DImode conversion, mark register defined by DEF as requiring
3470 conversion. */
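/* A register recorded in defs_conv ends up live in both modes:
   convert_reg emits a scalar copy of the vector value for uses outside
   the chain, and make_vector_copies emits a vector copy of definitions
   made outside the chain.  */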
3472 void
3473 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
3475 gcc_assert (DF_REF_REG_DEF_P (def));
3477 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
3478 return;
3480 if (dump_file)
3481 fprintf (dump_file,
3482 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
3483 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
3485 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
3488 /* For TImode conversion, it is unused. */
3490 void
3491 timode_scalar_chain::mark_dual_mode_def (df_ref)
3493 gcc_unreachable ();
3496 /* Check REF's chain to add new insns into a queue
3497 and find registers requiring conversion. */
3499 void
3500 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
3502 df_link *chain;
3504 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
3505 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
3506 add_to_queue (DF_REF_INSN_UID (ref));
3508 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
3510 unsigned uid = DF_REF_INSN_UID (chain->ref);
3512 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
3513 continue;
3515 if (!DF_REF_REG_MEM_P (chain->ref))
3517 if (bitmap_bit_p (insns, uid))
3518 continue;
3520 if (bitmap_bit_p (candidates, uid))
3522 add_to_queue (uid);
3523 continue;
3527 if (DF_REF_REG_DEF_P (chain->ref))
3529 if (dump_file)
3530 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
3531 DF_REF_REGNO (chain->ref), uid);
3532 mark_dual_mode_def (chain->ref);
3534 else
3536 if (dump_file)
3537 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
3538 DF_REF_REGNO (chain->ref), uid);
3539 mark_dual_mode_def (ref);
3544 /* Add an instruction to the chain. */
3546 void
3547 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
3549 if (bitmap_bit_p (insns, insn_uid))
3550 return;
3552 if (dump_file)
3553 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
3555 bitmap_set_bit (insns, insn_uid);
3557 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3558 rtx def_set = single_set (insn);
3559 if (def_set && REG_P (SET_DEST (def_set))
3560 && !HARD_REGISTER_P (SET_DEST (def_set)))
3561 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
3563 df_ref ref;
3564 df_ref def;
3565 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3566 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
3567 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
3568 def;
3569 def = DF_REF_NEXT_REG (def))
3570 analyze_register_chain (candidates, def);
3571 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3572 if (!DF_REF_REG_MEM_P (ref))
3573 analyze_register_chain (candidates, ref);
3576 /* Build a new chain starting from insn INSN_UID, recursively
3577 adding all dependent uses and definitions. */
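/* This is a simple worklist algorithm: the seed insn is put into QUEUE,
   and each insn taken from the queue is added to the chain via add_insn,
   which in turn queues further related candidate insns through
   analyze_register_chain.  */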
3579 void
3580 scalar_chain::build (bitmap candidates, unsigned insn_uid)
3582 queue = BITMAP_ALLOC (NULL);
3583 bitmap_set_bit (queue, insn_uid);
3585 if (dump_file)
3586 fprintf (dump_file, "Building chain #%d...\n", chain_id);
3588 while (!bitmap_empty_p (queue))
3590 insn_uid = bitmap_first_set_bit (queue);
3591 bitmap_clear_bit (queue, insn_uid);
3592 bitmap_clear_bit (candidates, insn_uid);
3593 add_insn (candidates, insn_uid);
3596 if (dump_file)
3598 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
3599 fprintf (dump_file, " insns: ");
3600 dump_bitmap (dump_file, insns);
3601 if (!bitmap_empty_p (defs_conv))
3603 bitmap_iterator bi;
3604 unsigned id;
3605 const char *comma = "";
3606 fprintf (dump_file, " defs to convert: ");
3607 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
3609 fprintf (dump_file, "%sr%d", comma, id);
3610 comma = ", ";
3612 fprintf (dump_file, "\n");
3616 BITMAP_FREE (queue);
3619 /* Return the cost of building a vector constant
3620 instead of using a scalar one. */
3623 dimode_scalar_chain::vector_const_cost (rtx exp)
3625 gcc_assert (CONST_INT_P (exp));
3627 if (standard_sse_constant_p (exp, V2DImode))
3628 return COSTS_N_INSNS (1);
3629 return ix86_cost->sse_load[1];
3632 /* Compute a gain for chain conversion. */
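/* The gain is accumulated per instruction as the estimated scalar cost
   minus the estimated vector cost; the cost of moving each dual-mode
   register between the integer and SSE units is then subtracted.  A chain
   is converted only if the resulting total is positive (see
   convert_scalars_to_vector).  */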
3635 dimode_scalar_chain::compute_convert_gain ()
3637 bitmap_iterator bi;
3638 unsigned insn_uid;
3639 int gain = 0;
3640 int cost = 0;
3642 if (dump_file)
3643 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
3645 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
3647 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3648 rtx def_set = single_set (insn);
3649 rtx src = SET_SRC (def_set);
3650 rtx dst = SET_DEST (def_set);
3652 if (REG_P (src) && REG_P (dst))
3653 gain += COSTS_N_INSNS (2) - ix86_cost->sse_move;
3654 else if (REG_P (src) && MEM_P (dst))
3655 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3656 else if (MEM_P (src) && REG_P (dst))
3657 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
3658 else if (GET_CODE (src) == ASHIFT
3659 || GET_CODE (src) == ASHIFTRT
3660 || GET_CODE (src) == LSHIFTRT)
3662 if (CONST_INT_P (XEXP (src, 0)))
3663 gain -= vector_const_cost (XEXP (src, 0));
3664 if (CONST_INT_P (XEXP (src, 1)))
3666 gain += ix86_cost->shift_const;
3667 if (INTVAL (XEXP (src, 1)) >= 32)
3668 gain -= COSTS_N_INSNS (1);
3670 else
3671 /* Additional gain for omitting two CMOVs. */
3672 gain += ix86_cost->shift_var + COSTS_N_INSNS (2);
3674 else if (GET_CODE (src) == PLUS
3675 || GET_CODE (src) == MINUS
3676 || GET_CODE (src) == IOR
3677 || GET_CODE (src) == XOR
3678 || GET_CODE (src) == AND)
3680 gain += ix86_cost->add;
3681 /* Additional gain for andnot for targets without BMI. */
3682 if (GET_CODE (XEXP (src, 0)) == NOT
3683 && !TARGET_BMI)
3684 gain += 2 * ix86_cost->add;
3686 if (CONST_INT_P (XEXP (src, 0)))
3687 gain -= vector_const_cost (XEXP (src, 0));
3688 if (CONST_INT_P (XEXP (src, 1)))
3689 gain -= vector_const_cost (XEXP (src, 1));
3691 else if (GET_CODE (src) == NEG
3692 || GET_CODE (src) == NOT)
3693 gain += ix86_cost->add - COSTS_N_INSNS (1);
3694 else if (GET_CODE (src) == COMPARE)
3696 /* Assume comparison cost is the same. */
3698 else if (CONST_INT_P (src))
3700 if (REG_P (dst))
3701 gain += COSTS_N_INSNS (2);
3702 else if (MEM_P (dst))
3703 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3704 gain -= vector_const_cost (src);
3706 else
3707 gcc_unreachable ();
3710 if (dump_file)
3711 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
3713 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
3714 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
3716 if (dump_file)
3717 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
3719 gain -= cost;
3721 if (dump_file)
3722 fprintf (dump_file, " Total gain: %d\n", gain);
3724 return gain;
3727 /* Replace REG in X with a V2DI subreg of NEW_REG. */
3730 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
3732 if (x == reg)
3733 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
3735 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
3736 int i, j;
3737 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
3739 if (fmt[i] == 'e')
3740 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
3741 else if (fmt[i] == 'E')
3742 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
3743 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
3744 reg, new_reg);
3747 return x;
3750 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
3752 void
3753 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
3754 rtx reg, rtx new_reg)
3756 replace_with_subreg (single_set (insn), reg, new_reg);
3759 /* Insert the generated conversion instruction sequence INSNS
3760 after instruction AFTER.  A new BB may be required in case the
3761 instruction has an EH region attached. */
3763 void
3764 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
3766 if (!control_flow_insn_p (after))
3768 emit_insn_after (insns, after);
3769 return;
3772 basic_block bb = BLOCK_FOR_INSN (after);
3773 edge e = find_fallthru_edge (bb->succs);
3774 gcc_assert (e);
3776 basic_block new_bb = split_edge (e);
3777 emit_insn_after (insns, BB_HEAD (new_bb));
3780 /* Make vector copies for all definitions of register REGNO
3781 and replace its uses in the chain. */
3783 void
3784 dimode_scalar_chain::make_vector_copies (unsigned regno)
3786 rtx reg = regno_reg_rtx[regno];
3787 rtx vreg = gen_reg_rtx (DImode);
3788 bool count_reg = false;
3789 df_ref ref;
3791 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3792 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3794 df_ref use;
3796 /* Detect the count register of a shift instruction. */
3797 for (use = DF_REG_USE_CHAIN (regno); use; use = DF_REF_NEXT_REG (use))
3798 if (bitmap_bit_p (insns, DF_REF_INSN_UID (use)))
3800 rtx_insn *insn = DF_REF_INSN (use);
3801 rtx def_set = single_set (insn);
3803 gcc_assert (def_set);
3805 rtx src = SET_SRC (def_set);
3807 if ((GET_CODE (src) == ASHIFT
3808 || GET_CODE (src) == ASHIFTRT
3809 || GET_CODE (src) == LSHIFTRT)
3810 && !CONST_INT_P (XEXP (src, 1))
3811 && reg_or_subregno (XEXP (src, 1)) == regno)
3812 count_reg = true;
3815 start_sequence ();
3816 if (count_reg)
3818 rtx qreg = gen_lowpart (QImode, reg);
3819 rtx tmp = gen_reg_rtx (SImode);
3821 if (TARGET_ZERO_EXTEND_WITH_AND
3822 && optimize_function_for_speed_p (cfun))
3824 emit_move_insn (tmp, const0_rtx);
3825 emit_insn (gen_movstrictqi
3826 (gen_lowpart (QImode, tmp), qreg));
3828 else
3829 emit_insn (gen_rtx_SET
3830 (tmp, gen_rtx_ZERO_EXTEND (SImode, qreg)));
3832 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
3834 rtx slot = assign_386_stack_local (SImode, SLOT_STV_TEMP);
3835 emit_move_insn (slot, tmp);
3836 tmp = copy_rtx (slot);
3839 emit_insn (gen_zero_extendsidi2 (vreg, tmp));
3841 else if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
3843 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
3844 emit_move_insn (adjust_address (tmp, SImode, 0),
3845 gen_rtx_SUBREG (SImode, reg, 0));
3846 emit_move_insn (adjust_address (tmp, SImode, 4),
3847 gen_rtx_SUBREG (SImode, reg, 4));
3848 emit_move_insn (vreg, tmp);
3850 else if (TARGET_SSE4_1)
3852 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3853 CONST0_RTX (V4SImode),
3854 gen_rtx_SUBREG (SImode, reg, 0)));
3855 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
3856 gen_rtx_SUBREG (V4SImode, vreg, 0),
3857 gen_rtx_SUBREG (SImode, reg, 4),
3858 GEN_INT (2)));
3860 else
3862 rtx tmp = gen_reg_rtx (DImode);
3863 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3864 CONST0_RTX (V4SImode),
3865 gen_rtx_SUBREG (SImode, reg, 0)));
3866 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
3867 CONST0_RTX (V4SImode),
3868 gen_rtx_SUBREG (SImode, reg, 4)));
3869 emit_insn (gen_vec_interleave_lowv4si
3870 (gen_rtx_SUBREG (V4SImode, vreg, 0),
3871 gen_rtx_SUBREG (V4SImode, vreg, 0),
3872 gen_rtx_SUBREG (V4SImode, tmp, 0)));
3874 rtx_insn *seq = get_insns ();
3875 end_sequence ();
3876 rtx_insn *insn = DF_REF_INSN (ref);
3877 emit_conversion_insns (seq, insn);
3879 if (dump_file)
3880 fprintf (dump_file,
3881 " Copied r%d to a vector register r%d for insn %d\n",
3882 regno, REGNO (vreg), INSN_UID (insn));
3885 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3886 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3888 rtx_insn *insn = DF_REF_INSN (ref);
3889 if (count_reg)
3891 rtx def_set = single_set (insn);
3892 gcc_assert (def_set);
3894 rtx src = SET_SRC (def_set);
3896 if ((GET_CODE (src) == ASHIFT
3897 || GET_CODE (src) == ASHIFTRT
3898 || GET_CODE (src) == LSHIFTRT)
3899 && !CONST_INT_P (XEXP (src, 1))
3900 && reg_or_subregno (XEXP (src, 1)) == regno)
3901 XEXP (src, 1) = vreg;
3903 else
3904 replace_with_subreg_in_insn (insn, reg, vreg);
3906 if (dump_file)
3907 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
3908 regno, REGNO (vreg), INSN_UID (insn));
3912 /* Convert all definitions of register REGNO
3913 and fix its uses. Scalar copies may be created
3914 in case the register is used in a non-convertible insn. */
3916 void
3917 dimode_scalar_chain::convert_reg (unsigned regno)
3919 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
3920 rtx reg = regno_reg_rtx[regno];
3921 rtx scopy = NULL_RTX;
3922 df_ref ref;
3923 bitmap conv;
3925 conv = BITMAP_ALLOC (NULL);
3926 bitmap_copy (conv, insns);
3928 if (scalar_copy)
3929 scopy = gen_reg_rtx (DImode);
3931 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3933 rtx_insn *insn = DF_REF_INSN (ref);
3934 rtx def_set = single_set (insn);
3935 rtx src = SET_SRC (def_set);
3936 rtx reg = DF_REF_REG (ref);
3938 if (!MEM_P (src))
3940 replace_with_subreg_in_insn (insn, reg, reg);
3941 bitmap_clear_bit (conv, INSN_UID (insn));
3944 if (scalar_copy)
3946 start_sequence ();
3947 if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
3949 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
3950 emit_move_insn (tmp, reg);
3951 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3952 adjust_address (tmp, SImode, 0));
3953 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3954 adjust_address (tmp, SImode, 4));
3956 else if (TARGET_SSE4_1)
3958 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
3959 emit_insn
3960 (gen_rtx_SET
3961 (gen_rtx_SUBREG (SImode, scopy, 0),
3962 gen_rtx_VEC_SELECT (SImode,
3963 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
3965 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
3966 emit_insn
3967 (gen_rtx_SET
3968 (gen_rtx_SUBREG (SImode, scopy, 4),
3969 gen_rtx_VEC_SELECT (SImode,
3970 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
3972 else
3974 rtx vcopy = gen_reg_rtx (V2DImode);
3975 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
3976 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3977 gen_rtx_SUBREG (SImode, vcopy, 0));
3978 emit_move_insn (vcopy,
3979 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
3980 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3981 gen_rtx_SUBREG (SImode, vcopy, 0));
3983 rtx_insn *seq = get_insns ();
3984 end_sequence ();
3985 emit_conversion_insns (seq, insn);
3987 if (dump_file)
3988 fprintf (dump_file,
3989 " Copied r%d to a scalar register r%d for insn %d\n",
3990 regno, REGNO (scopy), INSN_UID (insn));
3994 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3995 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3997 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
3999 rtx_insn *insn = DF_REF_INSN (ref);
4001 rtx def_set = single_set (insn);
4002 gcc_assert (def_set);
4004 rtx src = SET_SRC (def_set);
4005 rtx dst = SET_DEST (def_set);
4007 if ((GET_CODE (src) == ASHIFT
4008 || GET_CODE (src) == ASHIFTRT
4009 || GET_CODE (src) == LSHIFTRT)
4010 && !CONST_INT_P (XEXP (src, 1))
4011 && reg_or_subregno (XEXP (src, 1)) == regno)
4013 rtx tmp2 = gen_reg_rtx (V2DImode);
4015 start_sequence ();
4017 if (TARGET_SSE4_1)
4018 emit_insn (gen_sse4_1_zero_extendv2qiv2di2
4019 (tmp2, gen_rtx_SUBREG (V16QImode, reg, 0)));
4020 else
4022 rtx vec_cst
4023 = gen_rtx_CONST_VECTOR (V2DImode,
4024 gen_rtvec (2, GEN_INT (0xff),
4025 const0_rtx));
4026 vec_cst
4027 = validize_mem (force_const_mem (V2DImode, vec_cst));
4029 emit_insn (gen_rtx_SET
4030 (tmp2,
4031 gen_rtx_AND (V2DImode,
4032 gen_rtx_SUBREG (V2DImode, reg, 0),
4033 vec_cst)));
4035 rtx_insn *seq = get_insns ();
4036 end_sequence ();
4038 emit_insn_before (seq, insn);
4040 XEXP (src, 1) = gen_rtx_SUBREG (DImode, tmp2, 0);
4042 else if (!MEM_P (dst) || !REG_P (src))
4043 replace_with_subreg_in_insn (insn, reg, reg);
4045 bitmap_clear_bit (conv, INSN_UID (insn));
4048 /* Skip debug insns and uninitialized uses. */
4049 else if (DF_REF_CHAIN (ref)
4050 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
4052 gcc_assert (scopy);
4053 replace_rtx (DF_REF_INSN (ref), reg, scopy);
4054 df_insn_rescan (DF_REF_INSN (ref));
4057 BITMAP_FREE (conv);
4060 /* Convert operand OP in INSN.  Only memory operands
4061 and uninitialized registers are handled here.
4062 All other register uses are converted during
4063 register conversion. */
4065 void
4066 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
4068 *op = copy_rtx_if_shared (*op);
4070 if (GET_CODE (*op) == NOT)
4072 convert_op (&XEXP (*op, 0), insn);
4073 PUT_MODE (*op, V2DImode);
4075 else if (MEM_P (*op))
4077 rtx tmp = gen_reg_rtx (DImode);
4079 emit_insn_before (gen_move_insn (tmp, *op), insn);
4080 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
4082 if (dump_file)
4083 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
4084 INSN_UID (insn), REGNO (tmp));
4086 else if (REG_P (*op))
4088 /* The register use may not have been converted in case
4089 this register has no definition.  Otherwise it
4090 should have been converted in convert_reg. */
4091 df_ref ref;
4092 FOR_EACH_INSN_USE (ref, insn)
4093 if (DF_REF_REGNO (ref) == REGNO (*op))
4095 gcc_assert (!DF_REF_CHAIN (ref));
4096 break;
4098 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
4100 else if (CONST_INT_P (*op))
4102 rtx vec_cst;
4103 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
4105 /* Prefer all ones vector in case of -1. */
4106 if (constm1_operand (*op, GET_MODE (*op)))
4107 vec_cst = CONSTM1_RTX (V2DImode);
4108 else
4109 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
4110 gen_rtvec (2, *op, const0_rtx));
4112 if (!standard_sse_constant_p (vec_cst, V2DImode))
4114 start_sequence ();
4115 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
4116 rtx_insn *seq = get_insns ();
4117 end_sequence ();
4118 emit_insn_before (seq, insn);
4121 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
4122 *op = tmp;
4124 else
4126 gcc_assert (SUBREG_P (*op));
4127 gcc_assert (GET_MODE (*op) == V2DImode);
4131 /* Convert INSN to vector mode. */
4133 void
4134 dimode_scalar_chain::convert_insn (rtx_insn *insn)
4136 rtx def_set = single_set (insn);
4137 rtx src = SET_SRC (def_set);
4138 rtx dst = SET_DEST (def_set);
4139 rtx subreg;
4141 if (MEM_P (dst) && !REG_P (src))
4143 /* There are no scalar integer instructions and therefore
4144 temporary register usage is required. */
4145 rtx tmp = gen_reg_rtx (DImode);
4146 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
4147 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
4150 switch (GET_CODE (src))
4152 case ASHIFT:
4153 case ASHIFTRT:
4154 case LSHIFTRT:
4155 convert_op (&XEXP (src, 0), insn);
4156 PUT_MODE (src, V2DImode);
4157 break;
4159 case PLUS:
4160 case MINUS:
4161 case IOR:
4162 case XOR:
4163 case AND:
4164 convert_op (&XEXP (src, 0), insn);
4165 convert_op (&XEXP (src, 1), insn);
4166 PUT_MODE (src, V2DImode);
4167 break;
4169 case NEG:
4170 src = XEXP (src, 0);
4171 convert_op (&src, insn);
4172 subreg = gen_reg_rtx (V2DImode);
4173 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
4174 src = gen_rtx_MINUS (V2DImode, subreg, src);
4175 break;
4177 case NOT:
4178 src = XEXP (src, 0);
4179 convert_op (&src, insn);
4180 subreg = gen_reg_rtx (V2DImode);
4181 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
4182 src = gen_rtx_XOR (V2DImode, src, subreg);
4183 break;
4185 case MEM:
4186 if (!REG_P (dst))
4187 convert_op (&src, insn);
4188 break;
4190 case REG:
4191 if (!MEM_P (dst))
4192 convert_op (&src, insn);
4193 break;
4195 case SUBREG:
4196 gcc_assert (GET_MODE (src) == V2DImode);
4197 break;
4199 case COMPARE:
4200 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
4202 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
4203 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
4205 if (REG_P (src))
4206 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
4207 else
4208 subreg = copy_rtx_if_shared (src);
4209 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
4210 copy_rtx_if_shared (subreg),
4211 copy_rtx_if_shared (subreg)),
4212 insn);
4213 dst = gen_rtx_REG (CCmode, FLAGS_REG);
4214 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
4215 copy_rtx_if_shared (src)),
4216 UNSPEC_PTEST);
4217 break;
4219 case CONST_INT:
4220 convert_op (&src, insn);
4221 break;
4223 default:
4224 gcc_unreachable ();
4227 SET_SRC (def_set) = src;
4228 SET_DEST (def_set) = dst;
4230 /* Drop possible dead definitions. */
4231 PATTERN (insn) = def_set;
4233 INSN_CODE (insn) = -1;
4234 recog_memoized (insn);
4235 df_insn_rescan (insn);
4238 /* Fix uses of converted REG in debug insns. */
4240 void
4241 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
4243 if (!flag_var_tracking)
4244 return;
4246 df_ref ref, next;
4247 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
4249 rtx_insn *insn = DF_REF_INSN (ref);
4250 /* Make sure the next ref is for a different instruction,
4251 so that we're not affected by the rescan. */
4252 next = DF_REF_NEXT_REG (ref);
4253 while (next && DF_REF_INSN (next) == insn)
4254 next = DF_REF_NEXT_REG (next);
4256 if (DEBUG_INSN_P (insn))
4258 /* It may be a debug insn with a TImode variable in
4259 register. */
4260 bool changed = false;
4261 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
4263 rtx *loc = DF_REF_LOC (ref);
4264 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
4266 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
4267 changed = true;
4270 if (changed)
4271 df_insn_rescan (insn);
4276 /* Convert INSN from TImode to V1TImode. */
4278 void
4279 timode_scalar_chain::convert_insn (rtx_insn *insn)
4281 rtx def_set = single_set (insn);
4282 rtx src = SET_SRC (def_set);
4283 rtx dst = SET_DEST (def_set);
4285 switch (GET_CODE (dst))
4287 case REG:
4289 rtx tmp = find_reg_equal_equiv_note (insn);
4290 if (tmp)
4291 PUT_MODE (XEXP (tmp, 0), V1TImode);
4292 PUT_MODE (dst, V1TImode);
4293 fix_debug_reg_uses (dst);
4295 break;
4296 case MEM:
4297 PUT_MODE (dst, V1TImode);
4298 break;
4300 default:
4301 gcc_unreachable ();
4304 switch (GET_CODE (src))
4306 case REG:
4307 PUT_MODE (src, V1TImode);
4308 /* Call fix_debug_reg_uses only if SRC is never defined. */
4309 if (!DF_REG_DEF_CHAIN (REGNO (src)))
4310 fix_debug_reg_uses (src);
4311 break;
4313 case MEM:
4314 PUT_MODE (src, V1TImode);
4315 break;
4317 case CONST_WIDE_INT:
4318 if (NONDEBUG_INSN_P (insn))
4320 /* Since there are no instructions to store a 128-bit constant,
4321 a temporary register is required. */
4322 rtx tmp = gen_reg_rtx (V1TImode);
4323 start_sequence ();
4324 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
4325 src = validize_mem (force_const_mem (V1TImode, src));
4326 rtx_insn *seq = get_insns ();
4327 end_sequence ();
4328 if (seq)
4329 emit_insn_before (seq, insn);
4330 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
4331 dst = tmp;
4333 break;
4335 case CONST_INT:
4336 switch (standard_sse_constant_p (src, TImode))
4338 case 1:
4339 src = CONST0_RTX (GET_MODE (dst));
4340 break;
4341 case 2:
4342 src = CONSTM1_RTX (GET_MODE (dst));
4343 break;
4344 default:
4345 gcc_unreachable ();
4347 if (NONDEBUG_INSN_P (insn))
4349 rtx tmp = gen_reg_rtx (V1TImode);
4350 /* Since there are no instructions to store a standard SSE
4351 constant, a temporary register is required. */
4352 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
4353 dst = tmp;
4355 break;
4357 default:
4358 gcc_unreachable ();
4361 SET_SRC (def_set) = src;
4362 SET_DEST (def_set) = dst;
4364 /* Drop possible dead definitions. */
4365 PATTERN (insn) = def_set;
4367 INSN_CODE (insn) = -1;
4368 recog_memoized (insn);
4369 df_insn_rescan (insn);
4372 void
4373 dimode_scalar_chain::convert_registers ()
4375 bitmap_iterator bi;
4376 unsigned id;
4378 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
4379 convert_reg (id);
4381 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
4382 make_vector_copies (id);
4385 /* Convert the whole chain, creating the required register
4386 conversions and copies. */
4389 scalar_chain::convert ()
4391 bitmap_iterator bi;
4392 unsigned id;
4393 int converted_insns = 0;
4395 if (!dbg_cnt (stv_conversion))
4396 return 0;
4398 if (dump_file)
4399 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
4401 convert_registers ();
4403 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
4405 convert_insn (DF_INSN_UID_GET (id)->insn);
4406 converted_insns++;
4409 return converted_insns;
4412 /* Main STV pass function. Find and convert scalar
4413 instructions into vector mode when profitable. */
4415 static unsigned int
4416 convert_scalars_to_vector ()
4418 basic_block bb;
4419 bitmap candidates;
4420 int converted_insns = 0;
4422 bitmap_obstack_initialize (NULL);
4423 candidates = BITMAP_ALLOC (NULL);
4425 calculate_dominance_info (CDI_DOMINATORS);
4426 df_set_flags (DF_DEFER_INSN_RESCAN);
4427 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
4428 df_md_add_problem ();
4429 df_analyze ();
4431 /* Find all instructions we want to convert into vector mode. */
4432 if (dump_file)
4433 fprintf (dump_file, "Searching for mode conversion candidates...\n");
4435 FOR_EACH_BB_FN (bb, cfun)
4437 rtx_insn *insn;
4438 FOR_BB_INSNS (bb, insn)
4439 if (scalar_to_vector_candidate_p (insn))
4441 if (dump_file)
4442 fprintf (dump_file, " insn %d is marked as a candidate\n",
4443 INSN_UID (insn));
4445 bitmap_set_bit (candidates, INSN_UID (insn));
4449 remove_non_convertible_regs (candidates);
4451 if (bitmap_empty_p (candidates))
4452 if (dump_file)
4453 fprintf (dump_file, "There are no candidates for optimization.\n");
4455 while (!bitmap_empty_p (candidates))
4457 unsigned uid = bitmap_first_set_bit (candidates);
4458 scalar_chain *chain;
4460 if (TARGET_64BIT)
4461 chain = new timode_scalar_chain;
4462 else
4463 chain = new dimode_scalar_chain;
4465 /* Find instructions chain we want to convert to vector mode.
4466 Check all uses and definitions to estimate all required
4467 conversions. */
4468 chain->build (candidates, uid);
4470 if (chain->compute_convert_gain () > 0)
4471 converted_insns += chain->convert ();
4472 else
4473 if (dump_file)
4474 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
4475 chain->chain_id);
4477 delete chain;
4480 if (dump_file)
4481 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
4483 BITMAP_FREE (candidates);
4484 bitmap_obstack_release (NULL);
4485 df_process_deferred_rescans ();
4487 /* Conversion means we may have 128-bit register spills/fills
4488 which require an aligned stack. */
4489 if (converted_insns)
4491 if (crtl->stack_alignment_needed < 128)
4492 crtl->stack_alignment_needed = 128;
4493 if (crtl->stack_alignment_estimated < 128)
4494 crtl->stack_alignment_estimated = 128;
4495 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
4496 if (TARGET_64BIT)
4497 for (tree parm = DECL_ARGUMENTS (current_function_decl);
4498 parm; parm = DECL_CHAIN (parm))
4500 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
4501 continue;
4502 if (DECL_RTL_SET_P (parm)
4503 && GET_MODE (DECL_RTL (parm)) == V1TImode)
4505 rtx r = DECL_RTL (parm);
4506 if (REG_P (r))
4507 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
4509 if (DECL_INCOMING_RTL (parm)
4510 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
4512 rtx r = DECL_INCOMING_RTL (parm);
4513 if (REG_P (r))
4514 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
4519 return 0;
4522 namespace {
4524 const pass_data pass_data_insert_vzeroupper =
4526 RTL_PASS, /* type */
4527 "vzeroupper", /* name */
4528 OPTGROUP_NONE, /* optinfo_flags */
4529 TV_MACH_DEP, /* tv_id */
4530 0, /* properties_required */
4531 0, /* properties_provided */
4532 0, /* properties_destroyed */
4533 0, /* todo_flags_start */
4534 TODO_df_finish, /* todo_flags_finish */
4537 class pass_insert_vzeroupper : public rtl_opt_pass
4539 public:
4540 pass_insert_vzeroupper(gcc::context *ctxt)
4541 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
4544 /* opt_pass methods: */
4545 virtual bool gate (function *)
4547 return TARGET_AVX && !TARGET_AVX512F
4548 && TARGET_VZEROUPPER && flag_expensive_optimizations
4549 && !optimize_size;
4552 virtual unsigned int execute (function *)
4554 return rest_of_handle_insert_vzeroupper ();
4557 }; // class pass_insert_vzeroupper
4559 const pass_data pass_data_stv =
4561 RTL_PASS, /* type */
4562 "stv", /* name */
4563 OPTGROUP_NONE, /* optinfo_flags */
4564 TV_MACH_DEP, /* tv_id */
4565 0, /* properties_required */
4566 0, /* properties_provided */
4567 0, /* properties_destroyed */
4568 0, /* todo_flags_start */
4569 TODO_df_finish, /* todo_flags_finish */
4572 class pass_stv : public rtl_opt_pass
4574 public:
4575 pass_stv (gcc::context *ctxt)
4576 : rtl_opt_pass (pass_data_stv, ctxt),
4577 timode_p (false)
4580 /* opt_pass methods: */
4581 virtual bool gate (function *)
4583 return (timode_p == !!TARGET_64BIT
4584 && TARGET_STV && TARGET_SSE2 && optimize > 1);
4587 virtual unsigned int execute (function *)
4589 return convert_scalars_to_vector ();
4592 opt_pass *clone ()
4594 return new pass_stv (m_ctxt);
4597 void set_pass_param (unsigned int n, bool param)
4599 gcc_assert (n == 0);
4600 timode_p = param;
4603 private:
4604 bool timode_p;
4605 }; // class pass_stv
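/* More than one instance of this pass may be created; set_pass_param
   selects the TImode or DImode variant for each instance, and the gate
   above ensures that the TImode variant only runs for 64-bit targets and
   the DImode variant only for 32-bit targets.  */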
4607 } // anon namespace
4609 rtl_opt_pass *
4610 make_pass_insert_vzeroupper (gcc::context *ctxt)
4612 return new pass_insert_vzeroupper (ctxt);
4615 rtl_opt_pass *
4616 make_pass_stv (gcc::context *ctxt)
4618 return new pass_stv (ctxt);
4621 /* Return true if a red-zone is in use. */
4623 bool
4624 ix86_using_red_zone (void)
4626 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
4629 /* Return a string that documents the current -m options. The caller is
4630 responsible for freeing the string. */
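/* The options are emitted in the order -march=, -mtune=, ABI, ISA options,
   flag options and finally -mfpmath=.  A purely illustrative result might
   look like "-march=haswell -mtune=haswell -m64 -mavx2 -mfpmath=sse".  */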
4632 static char *
4633 ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
4634 int flags, int flags2,
4635 const char *arch, const char *tune,
4636 enum fpmath_unit fpmath, bool add_nl_p)
4638 struct ix86_target_opts
4640 const char *option; /* option string */
4641 HOST_WIDE_INT mask; /* isa mask options */
4644 /* This table is ordered so that options like -msse4.2 that imply other
4645 ISAs come first. Target string will be displayed in the same order. */
4646 static struct ix86_target_opts isa2_opts[] =
4648 { "-mrdpid", OPTION_MASK_ISA_RDPID },
4649 { "-msgx", OPTION_MASK_ISA_SGX },
4650 { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
4651 { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
4652 { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ }
4654 static struct ix86_target_opts isa_opts[] =
4656 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
4657 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
4658 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
4659 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
4660 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
4661 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
4662 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
4663 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
4664 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
4665 { "-mavx2", OPTION_MASK_ISA_AVX2 },
4666 { "-mfma", OPTION_MASK_ISA_FMA },
4667 { "-mxop", OPTION_MASK_ISA_XOP },
4668 { "-mfma4", OPTION_MASK_ISA_FMA4 },
4669 { "-mf16c", OPTION_MASK_ISA_F16C },
4670 { "-mavx", OPTION_MASK_ISA_AVX },
4671 /* { "-msse4" OPTION_MASK_ISA_SSE4 }, */
4672 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
4673 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
4674 { "-msse4a", OPTION_MASK_ISA_SSE4A },
4675 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
4676 { "-msse3", OPTION_MASK_ISA_SSE3 },
4677 { "-maes", OPTION_MASK_ISA_AES },
4678 { "-msha", OPTION_MASK_ISA_SHA },
4679 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
4680 { "-msse2", OPTION_MASK_ISA_SSE2 },
4681 { "-msse", OPTION_MASK_ISA_SSE },
4682 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
4683 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
4684 { "-mmmx", OPTION_MASK_ISA_MMX },
4685 { "-mrtm", OPTION_MASK_ISA_RTM },
4686 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
4687 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
4688 { "-madx", OPTION_MASK_ISA_ADX },
4689 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
4690 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
4691 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
4692 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
4693 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
4694 { "-mxsave", OPTION_MASK_ISA_XSAVE },
4695 { "-mabm", OPTION_MASK_ISA_ABM },
4696 { "-mbmi", OPTION_MASK_ISA_BMI },
4697 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
4698 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
4699 { "-mtbm", OPTION_MASK_ISA_TBM },
4700 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
4701 { "-mcx16", OPTION_MASK_ISA_CX16 },
4702 { "-msahf", OPTION_MASK_ISA_SAHF },
4703 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
4704 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
4705 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
4706 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
4707 { "-mmwaitx", OPTION_MASK_ISA_MWAITX },
4708 { "-mclzero", OPTION_MASK_ISA_CLZERO },
4709 { "-mpku", OPTION_MASK_ISA_PKU },
4710 { "-mlwp", OPTION_MASK_ISA_LWP },
4711 { "-mhle", OPTION_MASK_ISA_HLE },
4712 { "-mfxsr", OPTION_MASK_ISA_FXSR },
4713 { "-mmpx", OPTION_MASK_ISA_MPX },
4714 { "-mclwb", OPTION_MASK_ISA_CLWB }
4717 /* Flag options. */
4718 static struct ix86_target_opts flag_opts[] =
4720 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
4721 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
4722 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
4723 { "-m80387", MASK_80387 },
4724 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
4725 { "-malign-double", MASK_ALIGN_DOUBLE },
4726 { "-mcld", MASK_CLD },
4727 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
4728 { "-mieee-fp", MASK_IEEE_FP },
4729 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
4730 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
4731 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
4732 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
4733 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
4734 { "-mno-push-args", MASK_NO_PUSH_ARGS },
4735 { "-mno-red-zone", MASK_NO_RED_ZONE },
4736 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
4737 { "-mrecip", MASK_RECIP },
4738 { "-mrtd", MASK_RTD },
4739 { "-msseregparm", MASK_SSEREGPARM },
4740 { "-mstack-arg-probe", MASK_STACK_PROBE },
4741 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
4742 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
4743 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
4744 { "-mvzeroupper", MASK_VZEROUPPER },
4745 { "-mstv", MASK_STV },
4746 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
4747 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE },
4748 { "-mprefer-avx128", MASK_PREFER_AVX128 },
4749 { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES }
4752 /* Additional flag options. */
4753 static struct ix86_target_opts flag2_opts[] =
4755 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY },
4758 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts)
4759 + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2];
4761 char isa_other[40];
4762 char isa2_other[40];
4763 char flags_other[40];
4764 char flags2_other[40];
4765 unsigned num = 0;
4766 unsigned i, j;
4767 char *ret;
4768 char *ptr;
4769 size_t len;
4770 size_t line_len;
4771 size_t sep_len;
4772 const char *abi;
4774 memset (opts, '\0', sizeof (opts));
4776 /* Add -march= option. */
4777 if (arch)
4779 opts[num][0] = "-march=";
4780 opts[num++][1] = arch;
4783 /* Add -mtune= option. */
4784 if (tune)
4786 opts[num][0] = "-mtune=";
4787 opts[num++][1] = tune;
4790 /* Add -m32/-m64/-mx32. */
4791 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
4793 if ((isa & OPTION_MASK_ABI_64) != 0)
4794 abi = "-m64";
4795 else
4796 abi = "-mx32";
4797 isa &= ~ (OPTION_MASK_ISA_64BIT
4798 | OPTION_MASK_ABI_64
4799 | OPTION_MASK_ABI_X32);
4801 else
4802 abi = "-m32";
4803 opts[num++][0] = abi;
4805 /* Pick out the options in isa2 options. */
4806 for (i = 0; i < ARRAY_SIZE (isa2_opts); i++)
4808 if ((isa2 & isa2_opts[i].mask) != 0)
4810 opts[num++][0] = isa2_opts[i].option;
4811 isa2 &= ~ isa2_opts[i].mask;
4815 if (isa2 && add_nl_p)
4817 opts[num++][0] = isa2_other;
4818 sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2);
4821 /* Pick out the options in isa options. */
4822 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
4824 if ((isa & isa_opts[i].mask) != 0)
4826 opts[num++][0] = isa_opts[i].option;
4827 isa &= ~ isa_opts[i].mask;
4831 if (isa && add_nl_p)
4833 opts[num++][0] = isa_other;
4834 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa);
4837 /* Add flag options. */
4838 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
4840 if ((flags & flag_opts[i].mask) != 0)
4842 opts[num++][0] = flag_opts[i].option;
4843 flags &= ~ flag_opts[i].mask;
4847 if (flags && add_nl_p)
4849 opts[num++][0] = flags_other;
4850 sprintf (flags_other, "(other flags: %#x)", flags);
4853 /* Add additional flag options. */
4854 for (i = 0; i < ARRAY_SIZE (flag2_opts); i++)
4856 if ((flags2 & flag2_opts[i].mask) != 0)
4858 opts[num++][0] = flag2_opts[i].option;
4859 flags2 &= ~ flag2_opts[i].mask;
4863 if (flags2 && add_nl_p)
4865 opts[num++][0] = flags2_other;
4866 sprintf (flags2_other, "(other flags2: %#x)", flags2);
4869 /* Add -fpmath= option. */
4870 if (fpmath)
4872 opts[num][0] = "-mfpmath=";
4873 switch ((int) fpmath)
4875 case FPMATH_387:
4876 opts[num++][1] = "387";
4877 break;
4879 case FPMATH_SSE:
4880 opts[num++][1] = "sse";
4881 break;
4883 case FPMATH_387 | FPMATH_SSE:
4884 opts[num++][1] = "sse+387";
4885 break;
4887 default:
4888 gcc_unreachable ();
4892 /* Any options? */
4893 if (num == 0)
4894 return NULL;
4896 gcc_assert (num < ARRAY_SIZE (opts));
4898 /* Size the string. */
4899 len = 0;
4900 sep_len = (add_nl_p) ? 3 : 1;
4901 for (i = 0; i < num; i++)
4903 len += sep_len;
4904 for (j = 0; j < 2; j++)
4905 if (opts[i][j])
4906 len += strlen (opts[i][j]);
4909 /* Build the string. */
4910 ret = ptr = (char *) xmalloc (len);
4911 line_len = 0;
4913 for (i = 0; i < num; i++)
4915 size_t len2[2];
4917 for (j = 0; j < 2; j++)
4918 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
4920 if (i != 0)
4922 *ptr++ = ' ';
4923 line_len++;
4925 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
4927 *ptr++ = '\\';
4928 *ptr++ = '\n';
4929 line_len = 0;
4933 for (j = 0; j < 2; j++)
4934 if (opts[i][j])
4936 memcpy (ptr, opts[i][j], len2[j]);
4937 ptr += len2[j];
4938 line_len += len2[j];
4942 *ptr = '\0';
4943 gcc_assert (ret + len >= ptr);
4945 return ret;
4948 /* Return true if profiling code should be emitted before
4949 the prologue, and false otherwise.
4950 Note: for x86 this matters for "hotfix" (hot-patching) support. */
4951 static bool
4952 ix86_profile_before_prologue (void)
4954 return flag_fentry != 0;
4957 /* Function that is callable from the debugger to print the current
4958 options. */
4959 void ATTRIBUTE_UNUSED
4960 ix86_debug_options (void)
4962 char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
4963 target_flags, ix86_target_flags,
4964 ix86_arch_string,ix86_tune_string,
4965 ix86_fpmath, true);
4967 if (opts)
4969 fprintf (stderr, "%s\n\n", opts);
4970 free (opts);
4972 else
4973 fputs ("<no options>\n\n", stderr);
4975 return;
4978 /* Return true if T is one of the bytes we should avoid with
4979 -fmitigate-rop. */
4981 static bool
4982 ix86_rop_should_change_byte_p (int t)
4984 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
4987 static const char *stringop_alg_names[] = {
4988 #define DEF_ENUM
4989 #define DEF_ALG(alg, name) #name,
4990 #include "stringop.def"
4991 #undef DEF_ENUM
4992 #undef DEF_ALG
4995 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
4996 The string is of the following form (or comma separated list of it):
4998 strategy_alg:max_size:[align|noalign]
5000 where the full size range for the strategy is either [0, max_size] or
5001 [min_size, max_size], in which min_size is the max_size + 1 of the
5002 preceding range. The last size range must have max_size == -1.
5004 Examples:
5007 -mmemcpy-strategy=libcall:-1:noalign
5009 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
5013 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
5015 This is to tell the compiler to use the following strategy for memset
5016 1) when the expected size is between [1, 16], use rep_8byte strategy;
5017 2) when the size is between [17, 2048], use vector_loop;
5018 3) when the size is > 2048, use libcall. */
5020 struct stringop_size_range
5022 int max;
5023 stringop_alg alg;
5024 bool noalign;
5027 static void
5028 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
5030 const struct stringop_algs *default_algs;
5031 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
5032 char *curr_range_str, *next_range_str;
5033 const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=";
5034 int i = 0, n = 0;
5036 if (is_memset)
5037 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
5038 else
5039 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
5041 curr_range_str = strategy_str;
5045 int maxs;
5046 char alg_name[128];
5047 char align[16];
5048 next_range_str = strchr (curr_range_str, ',');
5049 if (next_range_str)
5050 *next_range_str++ = '\0';
5052 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
5053 alg_name, &maxs, align))
5055 error ("wrong argument %qs to option %qs", curr_range_str, opt);
5056 return;
5059 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
5061 error ("size ranges of option %qs should be increasing", opt);
5062 return;
5065 for (i = 0; i < last_alg; i++)
5066 if (!strcmp (alg_name, stringop_alg_names[i]))
5067 break;
5069 if (i == last_alg)
5071 error ("wrong strategy name %qs specified for option %qs",
5072 alg_name, opt);
5074 auto_vec <const char *> candidates;
5075 for (i = 0; i < last_alg; i++)
5076 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
5077 candidates.safe_push (stringop_alg_names[i]);
5079 char *s;
5080 const char *hint
5081 = candidates_list_and_hint (alg_name, s, candidates);
5082 if (hint)
5083 inform (input_location,
5084 "valid arguments to %qs are: %s; did you mean %qs?",
5085 opt, s, hint);
5086 else
5087 inform (input_location, "valid arguments to %qs are: %s",
5088 opt, s);
5089 XDELETEVEC (s);
5090 return;
5093 if ((stringop_alg) i == rep_prefix_8_byte
5094 && !TARGET_64BIT)
5096 /* rep; movq isn't available in 32-bit code. */
5097 error ("strategy name %qs specified for option %qs "
5098 "not supported for 32-bit code", alg_name, opt);
5099 return;
5102 input_ranges[n].max = maxs;
5103 input_ranges[n].alg = (stringop_alg) i;
5104 if (!strcmp (align, "align"))
5105 input_ranges[n].noalign = false;
5106 else if (!strcmp (align, "noalign"))
5107 input_ranges[n].noalign = true;
5108 else
5110 error ("unknown alignment %qs specified for option %qs", align, opt);
5111 return;
5113 n++;
5114 curr_range_str = next_range_str;
5116 while (curr_range_str);
5118 if (input_ranges[n - 1].max != -1)
5120 error ("the max value for the last size range should be -1"
5121 " for option %qs", opt);
5122 return;
5125 if (n > MAX_STRINGOP_ALGS)
5127 error ("too many size ranges specified in option %qs", opt);
5128 return;
5131 /* Now override the default algs array. */
5132 for (i = 0; i < n; i++)
5134 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
5135 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
5136 = input_ranges[i].alg;
5137 *const_cast<int *>(&default_algs->size[i].noalign)
5138 = input_ranges[i].noalign;
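/* A rough sketch of what the parsing above produces: the memset example
   from the comment, "rep_8byte:16:noalign,vector_loop:2048:align,
   libcall:-1:noalign", yields three input_ranges[] entries - { max = 16,
   alg matching "rep_8byte", noalign = true }, { max = 2048, alg matching
   "vector_loop", noalign = false } and { max = -1, alg matching "libcall",
   noalign = true } - which the final loop then copies over
   default_algs->size[].  */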
5143 /* parse -mtune-ctrl= option. When DUMP is true,
5144 print the features that are explicitly set. */
5146 static void
5147 parse_mtune_ctrl_str (bool dump)
5149 if (!ix86_tune_ctrl_string)
5150 return;
5152 char *next_feature_string = NULL;
5153 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
5154 char *orig = curr_feature_string;
5155 int i;
5158 bool clear = false;
5160 next_feature_string = strchr (curr_feature_string, ',');
5161 if (next_feature_string)
5162 *next_feature_string++ = '\0';
5163 if (*curr_feature_string == '^')
5165 curr_feature_string++;
5166 clear = true;
5168 for (i = 0; i < X86_TUNE_LAST; i++)
5170 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
5172 ix86_tune_features[i] = !clear;
5173 if (dump)
5174 fprintf (stderr, "Explicitly %s feature %s\n",
5175 clear ? "clear" : "set", ix86_tune_feature_names[i]);
5176 break;
5179 if (i == X86_TUNE_LAST)
5180 error ("Unknown parameter to option -mtune-ctrl: %s",
5181 clear ? curr_feature_string - 1 : curr_feature_string);
5182 curr_feature_string = next_feature_string;
5184 while (curr_feature_string);
5185 free (orig);
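/* For illustration, a string such as "-mtune-ctrl=use_leave,^partial_reg_stall"
   (assuming both names are present in ix86_tune_feature_names[]) sets the
   first feature and clears the second, since a leading '^' inverts the
   sense of an entry.  */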
5188 /* Helper function to set ix86_tune_features. IX86_TUNE is the
5189 processor type. */
5191 static void
5192 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
5194 unsigned int ix86_tune_mask = 1u << ix86_tune;
5195 int i;
5197 for (i = 0; i < X86_TUNE_LAST; ++i)
5199 if (ix86_tune_no_default)
5200 ix86_tune_features[i] = 0;
5201 else
5202 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
5205 if (dump)
5207 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
5208 for (i = 0; i < X86_TUNE_LAST; i++)
5209 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
5210 ix86_tune_features[i] ? "on" : "off");
5213 parse_mtune_ctrl_str (dump);
5217 /* Default align_* from the processor table. */
5219 static void
5220 ix86_default_align (struct gcc_options *opts)
5222 if (opts->x_align_loops == 0)
5224 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
5225 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
5227 if (opts->x_align_jumps == 0)
5229 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
5230 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
5232 if (opts->x_align_functions == 0)
5234 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
5238 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
5240 static void
5241 ix86_override_options_after_change (void)
5243 ix86_default_align (&global_options);
5246 /* Override various settings based on options. If MAIN_ARGS_P, the
5247 options are from the command line, otherwise they are from
5248 attributes. Return true if there's an error related to march
5249 option. */
5251 static bool
5252 ix86_option_override_internal (bool main_args_p,
5253 struct gcc_options *opts,
5254 struct gcc_options *opts_set)
5256 int i;
5257 unsigned int ix86_arch_mask;
5258 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
5260 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
5261 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
5262 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
5263 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
5264 #define PTA_AES (HOST_WIDE_INT_1 << 4)
5265 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
5266 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
5267 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
5268 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
5269 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
5270 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
5271 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
5272 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
5273 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
5274 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
5275 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
5276 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
5277 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
5278 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
5279 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
5280 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
5281 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
5282 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
5283 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
5284 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
5285 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
5286 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
5287 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
5288 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
5289 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
5290 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
5291 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
5292 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
5293 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
5294 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
5295 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
5296 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
5297 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
5298 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
5299 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
5300 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
5301 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
5302 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
5303 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
5304 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
5305 #define PTA_SHA (HOST_WIDE_INT_1 << 45)
5306 #define PTA_PREFETCHWT1 (HOST_WIDE_INT_1 << 46)
5307 #define PTA_CLFLUSHOPT (HOST_WIDE_INT_1 << 47)
5308 #define PTA_XSAVEC (HOST_WIDE_INT_1 << 48)
5309 #define PTA_XSAVES (HOST_WIDE_INT_1 << 49)
5310 #define PTA_AVX512DQ (HOST_WIDE_INT_1 << 50)
5311 #define PTA_AVX512BW (HOST_WIDE_INT_1 << 51)
5312 #define PTA_AVX512VL (HOST_WIDE_INT_1 << 52)
5313 #define PTA_AVX512IFMA (HOST_WIDE_INT_1 << 53)
5314 #define PTA_AVX512VBMI (HOST_WIDE_INT_1 << 54)
5315 #define PTA_CLWB (HOST_WIDE_INT_1 << 55)
5316 #define PTA_MWAITX (HOST_WIDE_INT_1 << 56)
5317 #define PTA_CLZERO (HOST_WIDE_INT_1 << 57)
5318 #define PTA_NO_80387 (HOST_WIDE_INT_1 << 58)
5319 #define PTA_PKU (HOST_WIDE_INT_1 << 59)
5320 #define PTA_AVX5124VNNIW (HOST_WIDE_INT_1 << 60)
5321 #define PTA_AVX5124FMAPS (HOST_WIDE_INT_1 << 61)
5322 #define PTA_AVX512VPOPCNTDQ (HOST_WIDE_INT_1 << 62)
5323 #define PTA_SGX (HOST_WIDE_INT_1 << 63)
5325 #define PTA_CORE2 \
5326 (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \
5327 | PTA_CX16 | PTA_FXSR)
5328 #define PTA_NEHALEM \
5329 (PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_POPCNT)
5330 #define PTA_WESTMERE \
5331 (PTA_NEHALEM | PTA_AES | PTA_PCLMUL)
5332 #define PTA_SANDYBRIDGE \
5333 (PTA_WESTMERE | PTA_AVX | PTA_XSAVE | PTA_XSAVEOPT)
5334 #define PTA_IVYBRIDGE \
5335 (PTA_SANDYBRIDGE | PTA_FSGSBASE | PTA_RDRND | PTA_F16C)
5336 #define PTA_HASWELL \
5337 (PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_LZCNT \
5338 | PTA_FMA | PTA_MOVBE | PTA_HLE)
5339 #define PTA_BROADWELL \
5340 (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
5341 #define PTA_SKYLAKE \
5342 (PTA_BROADWELL | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES)
5343 #define PTA_SKYLAKE_AVX512 \
5344 (PTA_SKYLAKE | PTA_AVX512F | PTA_AVX512CD | PTA_AVX512VL \
5345 | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU)
5346 #define PTA_KNL \
5347 (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
5348 #define PTA_BONNELL \
5349 (PTA_CORE2 | PTA_MOVBE)
5350 #define PTA_SILVERMONT \
5351 (PTA_WESTMERE | PTA_MOVBE)
5353 /* if this reaches 64, need to widen struct pta flags below */
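/* With PTA_SGX occupying bit 63, all 64 bits of the flags word are in use,
   so any further PTA_* value would indeed require widening the flags field
   of struct pta below.  */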
5355 static struct pta
5357 const char *const name; /* processor name or nickname. */
5358 const enum processor_type processor;
5359 const enum attr_cpu schedule;
5360 const unsigned HOST_WIDE_INT flags;
5362 const processor_alias_table[] =
5364 {"i386", PROCESSOR_I386, CPU_NONE, 0},
5365 {"i486", PROCESSOR_I486, CPU_NONE, 0},
5366 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
5367 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
5368 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
5369 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
5370 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
5371 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5372 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5373 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5374 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5375 PTA_MMX | PTA_SSE | PTA_FXSR},
5376 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5377 PTA_MMX | PTA_SSE | PTA_FXSR},
5378 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5379 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5380 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5381 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5382 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
5383 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
5384 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
5385 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5386 PTA_MMX | PTA_SSE | PTA_FXSR},
5387 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5388 PTA_MMX | PTA_SSE | PTA_FXSR},
5389 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5390 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
5391 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
5392 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
5393 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
5394 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
5395 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
5396 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5397 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
5398 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5399 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
5400 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
5401 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
5402 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
5403 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
5404 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5405 PTA_SANDYBRIDGE},
5406 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5407 PTA_SANDYBRIDGE},
5408 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5409 PTA_IVYBRIDGE},
5410 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5411 PTA_IVYBRIDGE},
5412 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
5413 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
5414 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
5415 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
5416 {"skylake-avx512", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE_AVX512},
5417 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
5418 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
5419 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
5420 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
5421 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
5422 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
5423 {"geode", PROCESSOR_GEODE, CPU_GEODE,
5424 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5425 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
5426 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
5427 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
5428 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
5429 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5430 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
5431 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5432 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
5433 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5434 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
5435 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5436 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
5437 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5438 {"x86-64", PROCESSOR_K8, CPU_K8,
5439 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5440 {"eden-x2", PROCESSOR_K8, CPU_K8,
5441 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5442 {"nano", PROCESSOR_K8, CPU_K8,
5443 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5444 | PTA_SSSE3 | PTA_FXSR},
5445 {"nano-1000", PROCESSOR_K8, CPU_K8,
5446 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5447 | PTA_SSSE3 | PTA_FXSR},
5448 {"nano-2000", PROCESSOR_K8, CPU_K8,
5449 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5450 | PTA_SSSE3 | PTA_FXSR},
5451 {"nano-3000", PROCESSOR_K8, CPU_K8,
5452 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5453 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5454 {"nano-x2", PROCESSOR_K8, CPU_K8,
5455 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5456 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5457 {"eden-x4", PROCESSOR_K8, CPU_K8,
5458 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5459 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5460 {"nano-x4", PROCESSOR_K8, CPU_K8,
5461 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5462 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5463 {"k8", PROCESSOR_K8, CPU_K8,
5464 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5465 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5466 {"k8-sse3", PROCESSOR_K8, CPU_K8,
5467 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5468 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5469 {"opteron", PROCESSOR_K8, CPU_K8,
5470 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5471 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5472 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
5473 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5474 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5475 {"athlon64", PROCESSOR_K8, CPU_K8,
5476 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5477 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5478 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
5479 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5480 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5481 {"athlon-fx", PROCESSOR_K8, CPU_K8,
5482 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5483 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5484 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
5485 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
5486 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
5487 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
5488 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
5489 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
5490 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
5491 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5492 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5493 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5494 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
5495 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
5496 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5497 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5498 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5499 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
5500 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
5501 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
5502 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5503 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5504 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5505 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
5506 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
5507 | PTA_XSAVEOPT | PTA_FSGSBASE},
5508 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
5509 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5510 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5511 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5512 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
5513 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
5514 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
5515 | PTA_MOVBE | PTA_MWAITX},
5516 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
5517 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5518 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5519 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5520 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
5521 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
5522 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
5523 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
5524 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
5525 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
5526 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5527 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
5528 | PTA_FXSR | PTA_XSAVE},
5529 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
5530 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5531 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
5532 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
5533 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
5534 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
5536 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
5537 PTA_64BIT
5538 | PTA_HLE /* flags are only used for -march switch. */ },
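/* A new CPU would be added as one more initializer of the same shape, e.g.
   (with purely hypothetical names)
     {"examplecpu", PROCESSOR_GENERIC, CPU_GENERIC,
      PTA_64BIT | PTA_SSE2 | PTA_CX16 | PTA_FXSR},
   pairing the -march=/-mtune= string with a scheduling model and the PTA_*
   capabilities it implies.  */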
5541 /* -mrecip options. */
5542 static struct
5544 const char *string; /* option name */
5545 unsigned int mask; /* mask bits to set */
5547 const recip_options[] =
5549 { "all", RECIP_MASK_ALL },
5550 { "none", RECIP_MASK_NONE },
5551 { "div", RECIP_MASK_DIV },
5552 { "sqrt", RECIP_MASK_SQRT },
5553 { "vec-div", RECIP_MASK_VEC_DIV },
5554 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
5557 int const pta_size = ARRAY_SIZE (processor_alias_table);
5559 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
5560 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
5561 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5562 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
5563 #ifdef TARGET_BI_ARCH
5564 else
5566 #if TARGET_BI_ARCH == 1
5567 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
5568 is on and OPTION_MASK_ABI_X32 is off. We turn off
5569 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
5570 -mx32. */
5571 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5572 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5573 #else
5574 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
5575 on and OPTION_MASK_ABI_64 is off. We turn off
5576 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
5577 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
5578 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
5579 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
5580 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5581 #endif
5582 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5583 && TARGET_IAMCU_P (opts->x_target_flags))
5584 sorry ("Intel MCU psABI isn%'t supported in %s mode",
5585 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
5587 #endif
5589 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5591 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5592 OPTION_MASK_ABI_64 for TARGET_X32. */
5593 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5594 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5596 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
5597 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
5598 | OPTION_MASK_ABI_X32
5599 | OPTION_MASK_ABI_64);
5600 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
5602 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5603 OPTION_MASK_ABI_X32 for TARGET_LP64. */
5604 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5605 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5608 #ifdef SUBTARGET_OVERRIDE_OPTIONS
5609 SUBTARGET_OVERRIDE_OPTIONS;
5610 #endif
5612 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
5613 SUBSUBTARGET_OVERRIDE_OPTIONS;
5614 #endif
5616 /* -fPIC is the default for x86_64. */
5617 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
5618 opts->x_flag_pic = 2;
5620 /* Need to check -mtune=generic first. */
5621 if (opts->x_ix86_tune_string)
5623 /* As special support for cross compilers we read -mtune=native
5624 as -mtune=generic. With native compilers we won't see the
5625 -mtune=native, as it was changed by the driver. */
5626 if (!strcmp (opts->x_ix86_tune_string, "native"))
5628 opts->x_ix86_tune_string = "generic";
5630 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5631 warning (OPT_Wdeprecated,
5632 main_args_p
5633 ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
5634 "or %<-mtune=generic%> instead as appropriate")
5635 : G_("%<target(\"tune=x86-64\")%> is deprecated; use "
5636 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%>"
5637 " instead as appropriate"));
5639 else
5641 if (opts->x_ix86_arch_string)
5642 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
5643 if (!opts->x_ix86_tune_string)
5645 opts->x_ix86_tune_string
5646 = processor_target_table[TARGET_CPU_DEFAULT].name;
5647 ix86_tune_defaulted = 1;
5650 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
5651 or defaulted. We need to use a sensible tune option. */
5652 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5654 opts->x_ix86_tune_string = "generic";
5658 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
5659 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5661 /* rep; movq isn't available in 32-bit code. */
5662 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
5663 opts->x_ix86_stringop_alg = no_stringop;
5666 if (!opts->x_ix86_arch_string)
5667 opts->x_ix86_arch_string
5668 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
5669 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
5670 else
5671 ix86_arch_specified = 1;
5673 if (opts_set->x_ix86_pmode)
5675 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
5676 && opts->x_ix86_pmode == PMODE_SI)
5677 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5678 && opts->x_ix86_pmode == PMODE_DI))
5679 error ("address mode %qs not supported in the %s bit mode",
5680 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
5681 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
5683 else
5684 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
5685 ? PMODE_DI : PMODE_SI;
5687 if (!opts_set->x_ix86_abi)
5688 opts->x_ix86_abi = DEFAULT_ABI;
5690 if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags))
5691 error ("-mabi=ms not supported with X32 ABI");
5692 gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI);
5694 /* For targets using the MS ABI, enable ms-extensions if not
5695 explicitly turned off. For non-MS ABI targets we turn this
5696 option off. */
5697 if (!opts_set->x_flag_ms_extensions)
5698 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
5700 if (opts_set->x_ix86_cmodel)
5702 switch (opts->x_ix86_cmodel)
5704 case CM_SMALL:
5705 case CM_SMALL_PIC:
5706 if (opts->x_flag_pic)
5707 opts->x_ix86_cmodel = CM_SMALL_PIC;
5708 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5709 error ("code model %qs not supported in the %s bit mode",
5710 "small", "32");
5711 break;
5713 case CM_MEDIUM:
5714 case CM_MEDIUM_PIC:
5715 if (opts->x_flag_pic)
5716 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
5717 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5718 error ("code model %qs not supported in the %s bit mode",
5719 "medium", "32");
5720 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5721 error ("code model %qs not supported in x32 mode",
5722 "medium");
5723 break;
5725 case CM_LARGE:
5726 case CM_LARGE_PIC:
5727 if (opts->x_flag_pic)
5728 opts->x_ix86_cmodel = CM_LARGE_PIC;
5729 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5730 error ("code model %qs not supported in the %s bit mode",
5731 "large", "32");
5732 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5733 error ("code model %qs not supported in x32 mode",
5734 "large");
5735 break;
5737 case CM_32:
5738 if (opts->x_flag_pic)
5739 error ("code model %s does not support PIC mode", "32");
5740 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5741 error ("code model %qs not supported in the %s bit mode",
5742 "32", "64");
5743 break;
5745 case CM_KERNEL:
5746 if (opts->x_flag_pic)
5748 error ("code model %s does not support PIC mode", "kernel");
5749 opts->x_ix86_cmodel = CM_32;
5751 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5752 error ("code model %qs not supported in the %s bit mode",
5753 "kernel", "32");
5754 break;
5756 default:
5757 gcc_unreachable ();
5760 else
5762 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
5763 use of rip-relative addressing. This eliminates fixups that
5764 would otherwise be needed if this object is to be placed in a
5765 DLL, and is essentially just as efficient as direct addressing. */
5766 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5767 && (TARGET_RDOS || TARGET_PECOFF))
5768 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
5769 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5770 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
5771 else
5772 opts->x_ix86_cmodel = CM_32;
5774 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
5776 error ("-masm=intel not supported in this configuration");
5777 opts->x_ix86_asm_dialect = ASM_ATT;
5779 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
5780 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
5781 sorry ("%i-bit mode not compiled in",
5782 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
5784 for (i = 0; i < pta_size; i++)
5785 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
5787 if (!strcmp (opts->x_ix86_arch_string, "generic"))
5789 error (main_args_p
5790 ? G_("%<generic%> CPU can be used only for %<-mtune=%> "
5791 "switch")
5792 : G_("%<generic%> CPU can be used only for "
5793 "%<target(\"tune=\")%> attribute"));
5794 return false;
5796 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
5798 error (main_args_p
5799 ? G_("%<intel%> CPU can be used only for %<-mtune=%> "
5800 "switch")
5801 : G_("%<intel%> CPU can be used only for "
5802 "%<target(\"tune=\")%> attribute"));
5803 return false;
5806 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5807 && !(processor_alias_table[i].flags & PTA_64BIT))
5809 error ("CPU you selected does not support x86-64 "
5810 "instruction set");
5811 return false;
5814 ix86_schedule = processor_alias_table[i].schedule;
5815 ix86_arch = processor_alias_table[i].processor;
5816 /* Default cpu tuning to the architecture. */
5817 ix86_tune = ix86_arch;
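/* Each PTA_* capability implied by the selected -march entry enables the
   corresponding OPTION_MASK_ISA_* bit below, but only when the user has not
   already set that ISA bit explicitly on the command line.  */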
5819 if (processor_alias_table[i].flags & PTA_MMX
5820 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
5821 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
5822 if (processor_alias_table[i].flags & PTA_3DNOW
5823 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
5824 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
5825 if (processor_alias_table[i].flags & PTA_3DNOW_A
5826 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
5827 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
5828 if (processor_alias_table[i].flags & PTA_SSE
5829 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
5830 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
5831 if (processor_alias_table[i].flags & PTA_SSE2
5832 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
5833 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
5834 if (processor_alias_table[i].flags & PTA_SSE3
5835 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
5836 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
5837 if (processor_alias_table[i].flags & PTA_SSSE3
5838 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
5839 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
5840 if (processor_alias_table[i].flags & PTA_SSE4_1
5841 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
5842 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
5843 if (processor_alias_table[i].flags & PTA_SSE4_2
5844 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
5845 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
5846 if (processor_alias_table[i].flags & PTA_AVX
5847 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
5848 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
5849 if (processor_alias_table[i].flags & PTA_AVX2
5850 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
5851 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
5852 if (processor_alias_table[i].flags & PTA_FMA
5853 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
5854 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
5855 if (processor_alias_table[i].flags & PTA_SSE4A
5856 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
5857 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
5858 if (processor_alias_table[i].flags & PTA_FMA4
5859 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
5860 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
5861 if (processor_alias_table[i].flags & PTA_XOP
5862 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
5863 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
5864 if (processor_alias_table[i].flags & PTA_LWP
5865 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
5866 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
5867 if (processor_alias_table[i].flags & PTA_ABM
5868 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
5869 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
5870 if (processor_alias_table[i].flags & PTA_BMI
5871 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
5872 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
5873 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
5874 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
5875 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
5876 if (processor_alias_table[i].flags & PTA_TBM
5877 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
5878 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
5879 if (processor_alias_table[i].flags & PTA_BMI2
5880 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
5881 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
5882 if (processor_alias_table[i].flags & PTA_CX16
5883 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
5884 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
5885 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
5886 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
5887 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
5888 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
5889 && (processor_alias_table[i].flags & PTA_NO_SAHF))
5890 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
5891 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
5892 if (processor_alias_table[i].flags & PTA_MOVBE
5893 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
5894 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
5895 if (processor_alias_table[i].flags & PTA_AES
5896 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
5897 ix86_isa_flags |= OPTION_MASK_ISA_AES;
5898 if (processor_alias_table[i].flags & PTA_SHA
5899 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
5900 ix86_isa_flags |= OPTION_MASK_ISA_SHA;
5901 if (processor_alias_table[i].flags & PTA_PCLMUL
5902 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
5903 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
5904 if (processor_alias_table[i].flags & PTA_FSGSBASE
5905 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
5906 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
5907 if (processor_alias_table[i].flags & PTA_RDRND
5908 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
5909 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
5910 if (processor_alias_table[i].flags & PTA_F16C
5911 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
5912 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
5913 if (processor_alias_table[i].flags & PTA_RTM
5914 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
5915 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
5916 if (processor_alias_table[i].flags & PTA_HLE
5917 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
5918 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
5919 if (processor_alias_table[i].flags & PTA_PRFCHW
5920 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
5921 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
5922 if (processor_alias_table[i].flags & PTA_RDSEED
5923 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
5924 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
5925 if (processor_alias_table[i].flags & PTA_ADX
5926 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
5927 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
5928 if (processor_alias_table[i].flags & PTA_FXSR
5929 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
5930 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
5931 if (processor_alias_table[i].flags & PTA_XSAVE
5932 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
5933 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
5934 if (processor_alias_table[i].flags & PTA_XSAVEOPT
5935 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
5936 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
5937 if (processor_alias_table[i].flags & PTA_AVX512F
5938 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
5939 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
5940 if (processor_alias_table[i].flags & PTA_AVX512ER
5941 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
5942 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
5943 if (processor_alias_table[i].flags & PTA_AVX512PF
5944 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
5945 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
5946 if (processor_alias_table[i].flags & PTA_AVX512CD
5947 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
5948 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
5949 if (processor_alias_table[i].flags & PTA_PREFETCHWT1
5950 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
5951 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
5952 if (processor_alias_table[i].flags & PTA_CLWB
5953 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
5954 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
5955 if (processor_alias_table[i].flags & PTA_CLFLUSHOPT
5956 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
5957 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
5958 if (processor_alias_table[i].flags & PTA_CLZERO
5959 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLZERO))
5960 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLZERO;
5961 if (processor_alias_table[i].flags & PTA_XSAVEC
5962 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
5963 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
5964 if (processor_alias_table[i].flags & PTA_XSAVES
5965 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
5966 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
5967 if (processor_alias_table[i].flags & PTA_AVX512DQ
5968 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
5969 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
5970 if (processor_alias_table[i].flags & PTA_AVX512BW
5971 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
5972 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
5973 if (processor_alias_table[i].flags & PTA_AVX512VL
5974 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
5975 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
5976 if (processor_alias_table[i].flags & PTA_MPX
5977 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MPX))
5978 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MPX;
5979 if (processor_alias_table[i].flags & PTA_AVX512VBMI
5980 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
5981 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
5982 if (processor_alias_table[i].flags & PTA_AVX512IFMA
5983 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
5984 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
5986 if (processor_alias_table[i].flags & PTA_AVX5124VNNIW
5987 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124VNNIW))
5988 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
5989 if (processor_alias_table[i].flags & PTA_AVX5124FMAPS
5990 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124FMAPS))
5991 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
5992 if (processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ
5993 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
5994 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
5995 if (processor_alias_table[i].flags & PTA_SGX
5996 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
5997 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
5999 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
6000 x86_prefetch_sse = true;
6001 if (processor_alias_table[i].flags & PTA_MWAITX
6002 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MWAITX))
6003 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MWAITX;
6004 if (processor_alias_table[i].flags & PTA_PKU
6005 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
6006 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
6008 /* Don't enable x87 instructions if only
6009 general registers are allowed. */
6010 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
6011 && !(opts_set->x_target_flags & MASK_80387))
6013 if (processor_alias_table[i].flags & PTA_NO_80387)
6014 opts->x_target_flags &= ~MASK_80387;
6015 else
6016 opts->x_target_flags |= MASK_80387;
6018 break;
6021 if (TARGET_X32 && (opts->x_ix86_isa_flags & OPTION_MASK_ISA_MPX))
6022 error ("Intel MPX does not support x32");
6024 if (TARGET_X32 && (ix86_isa_flags & OPTION_MASK_ISA_MPX))
6025 error ("Intel MPX does not support x32");
6027 if (i == pta_size)
6029 error (main_args_p
6030 ? G_("bad value (%qs) for %<-march=%> switch")
6031 : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"),
6032 opts->x_ix86_arch_string);
6034 auto_vec <const char *> candidates;
6035 for (i = 0; i < pta_size; i++)
6036 if (strcmp (processor_alias_table[i].name, "generic")
6037 && strcmp (processor_alias_table[i].name, "intel")
6038 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
6039 || (processor_alias_table[i].flags & PTA_64BIT)))
6040 candidates.safe_push (processor_alias_table[i].name);
6042 char *s;
6043 const char *hint
6044 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
6045 if (hint)
6046 inform (input_location,
6047 main_args_p
6048 ? G_("valid arguments to %<-march=%> switch are: "
6049 "%s; did you mean %qs?")
6050 : G_("valid arguments to %<target(\"arch=\")%> attribute are: "
6051 "%s; did you mean %qs?"), s, hint);
6052 else
6053 inform (input_location,
6054 main_args_p
6055 ? G_("valid arguments to %<-march=%> switch are: %s")
6056 : G_("valid arguments to %<target(\"arch=\")%> attribute "
6057 "are: %s"), s);
6058 XDELETEVEC (s);
6061 ix86_arch_mask = 1u << ix86_arch;
6062 for (i = 0; i < X86_ARCH_LAST; ++i)
6063 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
6065 for (i = 0; i < pta_size; i++)
6066 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
6068 ix86_schedule = processor_alias_table[i].schedule;
6069 ix86_tune = processor_alias_table[i].processor;
6070 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6072 if (!(processor_alias_table[i].flags & PTA_64BIT))
6074 if (ix86_tune_defaulted)
6076 opts->x_ix86_tune_string = "x86-64";
6077 for (i = 0; i < pta_size; i++)
6078 if (! strcmp (opts->x_ix86_tune_string,
6079 processor_alias_table[i].name))
6080 break;
6081 ix86_schedule = processor_alias_table[i].schedule;
6082 ix86_tune = processor_alias_table[i].processor;
6084 else
6085 error ("CPU you selected does not support x86-64 "
6086 "instruction set");
6089 /* Intel CPUs have always interpreted SSE prefetch instructions as
6090 NOPs; so, we can enable SSE prefetch instructions even when
6091 -mtune (rather than -march) points us to a processor that has them.
6092 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
6093 higher processors. */
6094 if (TARGET_CMOV
6095 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
6096 x86_prefetch_sse = true;
6097 break;
6100 if (ix86_tune_specified && i == pta_size)
6102 error (main_args_p
6103 ? G_("bad value (%qs) for %<-mtune=%> switch")
6104 : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"),
6105 opts->x_ix86_tune_string);
6107 auto_vec <const char *> candidates;
6108 for (i = 0; i < pta_size; i++)
6109 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
6110 || (processor_alias_table[i].flags & PTA_64BIT))
6111 candidates.safe_push (processor_alias_table[i].name);
6113 char *s;
6114 const char *hint
6115 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
6116 if (hint)
6117 inform (input_location,
6118 main_args_p
6119 ? G_("valid arguments to %<-mtune=%> switch are: "
6120 "%s; did you mean %qs?")
6121 : G_("valid arguments to %<target(\"tune=\")%> attribute are: "
6122 "%s; did you mean %qs?"), s, hint);
6123 else
6124 inform (input_location,
6125 main_args_p
6126 ? G_("valid arguments to %<-mtune=%> switch are: %s")
6127 : G_("valid arguments to %<target(\"tune=\")%> attribute "
6128 "are: %s"), s);
6129 XDELETEVEC (s);
6132 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
6134 #ifndef USE_IX86_FRAME_POINTER
6135 #define USE_IX86_FRAME_POINTER 0
6136 #endif
6138 #ifndef USE_X86_64_FRAME_POINTER
6139 #define USE_X86_64_FRAME_POINTER 0
6140 #endif
6142 /* Set the default values for switches whose default depends on TARGET_64BIT
6143 in case they weren't overwritten by command line options. */
6144 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6146 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
6147 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
6148 if (opts->x_flag_asynchronous_unwind_tables
6149 && !opts_set->x_flag_unwind_tables
6150 && TARGET_64BIT_MS_ABI)
6151 opts->x_flag_unwind_tables = 1;
6152 if (opts->x_flag_asynchronous_unwind_tables == 2)
6153 opts->x_flag_unwind_tables
6154 = opts->x_flag_asynchronous_unwind_tables = 1;
6155 if (opts->x_flag_pcc_struct_return == 2)
6156 opts->x_flag_pcc_struct_return = 0;
6158 else
6160 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
6161 opts->x_flag_omit_frame_pointer
6162 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
6163 if (opts->x_flag_asynchronous_unwind_tables == 2)
6164 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
6165 if (opts->x_flag_pcc_struct_return == 2)
6167 /* Intel MCU psABI specifies that -freg-struct-return should
6168 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
6169 we check -miamcu so that -freg-struct-return is always
6170 turned on if -miamcu is used. */
6171 if (TARGET_IAMCU_P (opts->x_target_flags))
6172 opts->x_flag_pcc_struct_return = 0;
6173 else
6174 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
6178 ix86_tune_cost = processor_target_table[ix86_tune].cost;
6179 /* TODO: ix86_cost should be chosen at instruction or function granularity,
6180 so that for cold code we can use size_cost even in !optimize_size compilation. */
6181 if (opts->x_optimize_size)
6182 ix86_cost = &ix86_size_cost;
6183 else
6184 ix86_cost = ix86_tune_cost;
6186 /* Arrange to set up i386_stack_locals for all functions. */
6187 init_machine_status = ix86_init_machine_status;
6189 /* Validate -mregparm= value. */
6190 if (opts_set->x_ix86_regparm)
6192 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6193 warning (0, "-mregparm is ignored in 64-bit mode");
6194 else if (TARGET_IAMCU_P (opts->x_target_flags))
6195 warning (0, "-mregparm is ignored for Intel MCU psABI");
6196 if (opts->x_ix86_regparm > REGPARM_MAX)
6198 error ("-mregparm=%d is not between 0 and %d",
6199 opts->x_ix86_regparm, REGPARM_MAX);
6200 opts->x_ix86_regparm = 0;
6203 if (TARGET_IAMCU_P (opts->x_target_flags)
6204 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
6205 opts->x_ix86_regparm = REGPARM_MAX;
6207 /* Default align_* from the processor table. */
6208 ix86_default_align (opts);
6210 /* Provide default for -mbranch-cost= value. */
6211 if (!opts_set->x_ix86_branch_cost)
6212 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
6214 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6216 opts->x_target_flags
6217 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
6219 /* Enable by default the SSE and MMX builtins. Do allow the user to
6220 explicitly disable any of these. In particular, disabling SSE and
6221 MMX for kernel code is extremely useful. */
6222 if (!ix86_arch_specified)
6223 opts->x_ix86_isa_flags
6224 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
6225 | TARGET_SUBTARGET64_ISA_DEFAULT)
6226 & ~opts->x_ix86_isa_flags_explicit);
6228 if (TARGET_RTD_P (opts->x_target_flags))
6229 warning (0,
6230 main_args_p
6231 ? G_("%<-mrtd%> is ignored in 64bit mode")
6232 : G_("%<target(\"rtd\")%> is ignored in 64bit mode"));
6234 else
6236 opts->x_target_flags
6237 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
6239 if (!ix86_arch_specified)
6240 opts->x_ix86_isa_flags
6241 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
6243 /* The i386 ABI does not specify a red zone. It still makes sense to use it
6244 when the programmer takes care to keep the stack from being destroyed. */
6245 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
6246 opts->x_target_flags |= MASK_NO_RED_ZONE;
6249 /* Keep nonleaf frame pointers. */
6250 if (opts->x_flag_omit_frame_pointer)
6251 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
6252 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
6253 opts->x_flag_omit_frame_pointer = 1;
6255 /* If we're doing fast math, we don't care about comparison order
6256 wrt NaNs. This lets us use a shorter comparison sequence. */
6257 if (opts->x_flag_finite_math_only)
6258 opts->x_target_flags &= ~MASK_IEEE_FP;
6260 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
6261 since the insns won't need emulation. */
6262 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
6263 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
6265 /* Likewise, if the target doesn't have a 387, or we've specified
6266 software floating point, don't use 387 inline intrinsics. */
6267 if (!TARGET_80387_P (opts->x_target_flags))
6268 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
6270 /* Turn on MMX builtins for -msse. */
6271 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
6272 opts->x_ix86_isa_flags
6273 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
6275 /* Enable SSE prefetch. */
6276 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
6277 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
6278 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
6279 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
6280 x86_prefetch_sse = true;
6282 /* Enable popcnt instruction for -msse4.2 or -mabm. */
6283 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
6284 || TARGET_ABM_P (opts->x_ix86_isa_flags))
6285 opts->x_ix86_isa_flags
6286 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
6288 /* Enable lzcnt instruction for -mabm. */
6289 if (TARGET_ABM_P (opts->x_ix86_isa_flags))
6290 opts->x_ix86_isa_flags
6291 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
6293 /* Disable BMI, BMI2 and TBM instructions for -m16. */
6294 if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
6295 opts->x_ix86_isa_flags
6296 &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM)
6297 & ~opts->x_ix86_isa_flags_explicit);
6299 /* Validate -mpreferred-stack-boundary= value or default it to
6300 PREFERRED_STACK_BOUNDARY_DEFAULT. */
6301 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
6302 if (opts_set->x_ix86_preferred_stack_boundary_arg)
6304 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
6305 int max = TARGET_SEH ? 4 : 12;
6307 if (opts->x_ix86_preferred_stack_boundary_arg < min
6308 || opts->x_ix86_preferred_stack_boundary_arg > max)
6310 if (min == max)
6311 error ("-mpreferred-stack-boundary is not supported "
6312 "for this target");
6313 else
6314 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
6315 opts->x_ix86_preferred_stack_boundary_arg, min, max);
6317 else
6318 ix86_preferred_stack_boundary
6319 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
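/* For example, -mpreferred-stack-boundary=4 gives (1 << 4) * BITS_PER_UNIT
   = 16 bytes = 128 bits, the usual SSE-friendly alignment.  */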
6322 /* Set the default value for -mstackrealign. */
6323 if (!opts_set->x_ix86_force_align_arg_pointer)
6324 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
6326 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
6328 /* Validate -mincoming-stack-boundary= value or default it to
6329 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
6330 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
6331 if (opts_set->x_ix86_incoming_stack_boundary_arg)
6333 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
6335 if (opts->x_ix86_incoming_stack_boundary_arg < min
6336 || opts->x_ix86_incoming_stack_boundary_arg > 12)
6337 error ("-mincoming-stack-boundary=%d is not between %d and 12",
6338 opts->x_ix86_incoming_stack_boundary_arg, min);
6339 else
6341 ix86_user_incoming_stack_boundary
6342 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
6343 ix86_incoming_stack_boundary
6344 = ix86_user_incoming_stack_boundary;
6348 #ifndef NO_PROFILE_COUNTERS
6349 if (flag_nop_mcount)
6350 error ("-mnop-mcount is not compatible with this target");
6351 #endif
6352 if (flag_nop_mcount && flag_pic)
6353 error ("-mnop-mcount is not implemented for -fPIC");
6355 /* Accept -msseregparm only if at least SSE support is enabled. */
6356 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
6357 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
6358 error (main_args_p
6359 ? G_("%<-msseregparm%> used without SSE enabled")
6360 : G_("%<target(\"sseregparm\")%> used without SSE enabled"));
6362 if (opts_set->x_ix86_fpmath)
6364 if (opts->x_ix86_fpmath & FPMATH_SSE)
6366 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
6368 if (TARGET_80387_P (opts->x_target_flags))
6370 warning (0, "SSE instruction set disabled, using 387 arithmetics");
6371 opts->x_ix86_fpmath = FPMATH_387;
6374 else if ((opts->x_ix86_fpmath & FPMATH_387)
6375 && !TARGET_80387_P (opts->x_target_flags))
6377 warning (0, "387 instruction set disabled, using SSE arithmetics");
6378 opts->x_ix86_fpmath = FPMATH_SSE;
6382 /* For all chips supporting SSE2, -mfpmath=sse performs better than
6383 fpmath=387. The second is however the default on many targets since the
6384 extra 80-bit precision of temporaries is considered to be part of the ABI.
6385 Override the default at least for -ffast-math.
6386 TODO: -mfpmath=both seems to produce equally performing code with slightly
6387 smaller binaries. It is however not clear whether register allocation is
6388 ready for this setting.
6389 Also -mfpmath=387 is overall a lot more compact (about 4-5%) than SSE
6390 codegen. We may switch to 387 with -ffast-math for size-optimized
6391 functions. */
6392 else if (fast_math_flags_set_p (&global_options)
6393 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
6394 opts->x_ix86_fpmath = FPMATH_SSE;
6395 else
6396 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
6398 /* Use an external vectorized math library when vectorizing intrinsic calls. */
6399 if (opts_set->x_ix86_veclibabi_type)
6400 switch (opts->x_ix86_veclibabi_type)
6402 case ix86_veclibabi_type_svml:
6403 ix86_veclib_handler = ix86_veclibabi_svml;
6404 break;
6406 case ix86_veclibabi_type_acml:
6407 ix86_veclib_handler = ix86_veclibabi_acml;
6408 break;
6410 default:
6411 gcc_unreachable ();
6414 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
6415 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6416 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6418 /* If stack probes are required, the space used for large function
6419 arguments on the stack must also be probed, so enable
6420 -maccumulate-outgoing-args so this happens in the prologue. */
6421 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
6422 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6424 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
6425 warning (0,
6426 main_args_p
6427 ? G_("stack probing requires %<-maccumulate-outgoing-args%> "
6428 "for correctness")
6429 : G_("stack probing requires "
6430 "%<target(\"accumulate-outgoing-args\")%> for "
6431 "correctness"));
6432 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6435 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
6436 so enable -maccumulate-outgoing-args when %ebp is fixed. */
6437 if (fixed_regs[BP_REG]
6438 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6440 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
6441 warning (0,
6442 main_args_p
6443 ? G_("fixed ebp register requires "
6444 "%<-maccumulate-outgoing-args%>")
6445 : G_("fixed ebp register requires "
6446 "%<target(\"accumulate-outgoing-args\")%>"));
6447 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6450 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
6452 char *p;
6453 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
6454 p = strchr (internal_label_prefix, 'X');
6455 internal_label_prefix_len = p - internal_label_prefix;
6456 *p = '\0';
6459 /* When a scheduling description is not available, disable the scheduler pass
6460 so it won't slow down compilation or make x87 code slower. */
6461 if (!TARGET_SCHEDULE)
6462 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
6464 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
6465 ix86_tune_cost->simultaneous_prefetches,
6466 opts->x_param_values,
6467 opts_set->x_param_values);
6468 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
6469 ix86_tune_cost->prefetch_block,
6470 opts->x_param_values,
6471 opts_set->x_param_values);
6472 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
6473 ix86_tune_cost->l1_cache_size,
6474 opts->x_param_values,
6475 opts_set->x_param_values);
6476 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
6477 ix86_tune_cost->l2_cache_size,
6478 opts->x_param_values,
6479 opts_set->x_param_values);
6481 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
6482 if (opts->x_flag_prefetch_loop_arrays < 0
6483 && HAVE_prefetch
6484 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
6485 && !opts->x_optimize_size
6486 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
6487 opts->x_flag_prefetch_loop_arrays = 1;
6489 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
6490 can be optimized to ap = __builtin_next_arg (0). */
6491 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
6492 targetm.expand_builtin_va_start = NULL;
6494 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6496 ix86_gen_leave = gen_leave_rex64;
6497 if (Pmode == DImode)
6499 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
6500 ix86_gen_tls_local_dynamic_base_64
6501 = gen_tls_local_dynamic_base_64_di;
6503 else
6505 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
6506 ix86_gen_tls_local_dynamic_base_64
6507 = gen_tls_local_dynamic_base_64_si;
6510 else
6511 ix86_gen_leave = gen_leave;
6513 if (Pmode == DImode)
6515 ix86_gen_add3 = gen_adddi3;
6516 ix86_gen_sub3 = gen_subdi3;
6517 ix86_gen_sub3_carry = gen_subdi3_carry;
6518 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
6519 ix86_gen_andsp = gen_anddi3;
6520 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
6521 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
6522 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
6523 ix86_gen_monitor = gen_sse3_monitor_di;
6524 ix86_gen_monitorx = gen_monitorx_di;
6525 ix86_gen_clzero = gen_clzero_di;
6527 else
6529 ix86_gen_add3 = gen_addsi3;
6530 ix86_gen_sub3 = gen_subsi3;
6531 ix86_gen_sub3_carry = gen_subsi3_carry;
6532 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
6533 ix86_gen_andsp = gen_andsi3;
6534 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
6535 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
6536 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
6537 ix86_gen_monitor = gen_sse3_monitor_si;
6538 ix86_gen_monitorx = gen_monitorx_si;
6539 ix86_gen_clzero = gen_clzero_si;
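/* Note that the SImode variants are also what an x32 compilation
   normally uses: with the x32 ABI, Pmode is SImode unless
   -maddress-mode=long is given, even though TARGET_64BIT is true.  */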
6542 #ifdef USE_IX86_CLD
6543 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
6544 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
6545 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
6546 #endif
6548 /* Set the default value for -mfentry. */
6549 if (!opts_set->x_flag_fentry)
6550 opts->x_flag_fentry = TARGET_SEH;
6551 else
6553 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic
6554 && opts->x_flag_fentry)
6555 sorry ("-mfentry isn%'t supported for 32-bit in combination "
6556 "with -fpic");
6557 else if (TARGET_SEH && !opts->x_flag_fentry)
6558 sorry ("-mno-fentry isn%'t compatible with SEH");
6561 if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES)
6562 sorry ("-mcall-ms2sysv-xlogues isn%'t currently supported with SEH");
6564 if (!(opts_set->x_target_flags & MASK_VZEROUPPER))
6565 opts->x_target_flags |= MASK_VZEROUPPER;
6566 if (!(opts_set->x_target_flags & MASK_STV))
6567 opts->x_target_flags |= MASK_STV;
6568 /* Disable STV if -mpreferred-stack-boundary={2,3} or
6569 -mincoming-stack-boundary={2,3} or -mstackrealign - the needed
6570 stack realignment is an extra cost the pass doesn't take into
6571 account, and the pass can't realign the stack. */
6572 if (ix86_preferred_stack_boundary < 128
6573 || ix86_incoming_stack_boundary < 128
6574 || opts->x_ix86_force_align_arg_pointer)
6575 opts->x_target_flags &= ~MASK_STV;
6576 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
6577 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
6578 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
6579 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
6580 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
6581 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
6582 /* Enable 128-bit AVX instruction generation
6583 for the auto-vectorizer. */
6584 if (TARGET_AVX128_OPTIMAL
6585 && !(opts_set->x_target_flags & MASK_PREFER_AVX128))
6586 opts->x_target_flags |= MASK_PREFER_AVX128;
6588 if (opts->x_ix86_recip_name)
6590 char *p = ASTRDUP (opts->x_ix86_recip_name);
6591 char *q;
6592 unsigned int mask, i;
6593 bool invert;
6595 while ((q = strtok (p, ",")) != NULL)
6597 p = NULL;
6598 if (*q == '!')
6600 invert = true;
6601 q++;
6603 else
6604 invert = false;
6606 if (!strcmp (q, "default"))
6607 mask = RECIP_MASK_ALL;
6608 else
6610 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
6611 if (!strcmp (q, recip_options[i].string))
6613 mask = recip_options[i].mask;
6614 break;
6617 if (i == ARRAY_SIZE (recip_options))
6619 error ("unknown option for -mrecip=%s", q);
6620 invert = false;
6621 mask = RECIP_MASK_NONE;
6625 opts->x_recip_mask_explicit |= mask;
6626 if (invert)
6627 opts->x_recip_mask &= ~mask;
6628 else
6629 opts->x_recip_mask |= mask;
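/* Example of the syntax handled above (see the -mrecip= documentation
   for the full list of names): "-mrecip=all,!sqrt" enables every
   reciprocal approximation except the scalar square-root one, and a
   leading '!' always inverts the named option.  */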
6633 if (TARGET_RECIP_P (opts->x_target_flags))
6634 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
6635 else if (opts_set->x_target_flags & MASK_RECIP)
6636 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
6638 /* Default long double to 64-bit for 32-bit Bionic and to __float128
6639 for 64-bit Bionic. Also default long double to 64-bit for Intel
6640 MCU psABI. */
6641 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
6642 && !(opts_set->x_target_flags
6643 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
6644 opts->x_target_flags |= (TARGET_64BIT
6645 ? MASK_LONG_DOUBLE_128
6646 : MASK_LONG_DOUBLE_64);
6648 /* Only one of them can be active. */
6649 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
6650 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
6652 /* Handle stack protector.  */
6653 if (!opts_set->x_ix86_stack_protector_guard)
6654 opts->x_ix86_stack_protector_guard
6655 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
6657 #ifdef TARGET_THREAD_SSP_OFFSET
6658 ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET;
6659 #endif
6661 if (global_options_set.x_ix86_stack_protector_guard_offset_str)
6663 char *endp;
6664 const char *str = ix86_stack_protector_guard_offset_str;
6666 errno = 0;
6667 int64_t offset;
6669 #if defined(INT64_T_IS_LONG)
6670 offset = strtol (str, &endp, 0);
6671 #else
6672 offset = strtoll (str, &endp, 0);
6673 #endif
6675 if (!*str || *endp || errno)
6676 error ("%qs is not a valid number "
6677 "in -mstack-protector-guard-offset=", str);
6679 if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000),
6680 HOST_WIDE_INT_C (0x7fffffff)))
6681 error ("%qs is not a valid offset "
6682 "in -mstack-protector-guard-offset=", str);
6684 ix86_stack_protector_guard_offset = offset;
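/* A typical use is the Linux kernel, which builds with something like
   -mstack-protector-guard=tls -mstack-protector-guard-reg=gs
   -mstack-protector-guard-offset=40 so that the canary is loaded from
   a fixed offset in the per-CPU segment rather than from ordinary TLS
   (values shown are illustrative).  */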
6687 ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG;
6689 /* The kernel uses a different segment register for performance
6690 reasons; a system call would not have to trash the userspace
6691 segment register, which would be expensive. */
6692 if (ix86_cmodel == CM_KERNEL)
6693 ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS;
6695 if (global_options_set.x_ix86_stack_protector_guard_reg_str)
6697 const char *str = ix86_stack_protector_guard_reg_str;
6698 addr_space_t seg = ADDR_SPACE_GENERIC;
6700 /* Discard optional register prefix. */
6701 if (str[0] == '%')
6702 str++;
6704 if (strlen (str) == 2 && str[1] == 's')
6706 if (str[0] == 'f')
6707 seg = ADDR_SPACE_SEG_FS;
6708 else if (str[0] == 'g')
6709 seg = ADDR_SPACE_SEG_GS;
6712 if (seg == ADDR_SPACE_GENERIC)
6713 error ("%qs is not a valid base register "
6714 "in -mstack-protector-guard-reg=",
6715 ix86_stack_protector_guard_reg_str);
6717 ix86_stack_protector_guard_reg = seg;
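/* E.g. "-mstack-protector-guard-reg=gs" (or "%gs") selects the GS
   segment; only "fs" and "gs" are accepted, anything else is rejected
   by the error above.  */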
6720 /* Handle -mmemcpy-strategy= and -mmemset-strategy=.  */
6721 if (opts->x_ix86_tune_memcpy_strategy)
6723 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
6724 ix86_parse_stringop_strategy_string (str, false);
6725 free (str);
6728 if (opts->x_ix86_tune_memset_strategy)
6730 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
6731 ix86_parse_stringop_strategy_string (str, true);
6732 free (str);
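/* The strategy strings are comma-separated alg:max_size:dest_align
   triples, for instance a hypothetical
     -mmemcpy-strategy=rep_8byte:1024:align,libcall:-1:noalign
   meaning: use 8-byte "rep" moves for blocks up to 1024 bytes and a
   library call for anything larger (-1 stands for "no upper bound");
   the third field states whether the destination may be assumed
   aligned.  See ix86_parse_stringop_strategy_string for the accepted
   names.  */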
6735 /* Save the initial options in case the user uses function-specific
6736 options. */
6737 if (main_args_p)
6738 target_option_default_node = target_option_current_node
6739 = build_target_option_node (opts);
6741 return true;
6744 /* Implement the TARGET_OPTION_OVERRIDE hook. */
6746 static void
6747 ix86_option_override (void)
6749 ix86_option_override_internal (true, &global_options, &global_options_set);
6752 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
6753 static char *
6754 ix86_offload_options (void)
6756 if (TARGET_LP64)
6757 return xstrdup ("-foffload-abi=lp64");
6758 return xstrdup ("-foffload-abi=ilp32");
6761 /* Update register usage after having seen the compiler flags. */
6763 static void
6764 ix86_conditional_register_usage (void)
6766 int i, c_mask;
6768 /* If there are no caller-saved registers, preserve all registers
6769 except fixed_regs and registers used for the function return value,
6770 since aggregate_value_p checks call_used_regs[regno] on the return
6771 value. */
6772 if (cfun && cfun->machine->no_caller_saved_registers)
6773 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6774 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
6775 call_used_regs[i] = 0;
6777 /* For 32-bit targets, squash the REX registers. */
6778 if (! TARGET_64BIT)
6780 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
6781 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6782 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
6783 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6784 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6785 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6788 /* See the definition of CALL_USED_REGISTERS in i386.h. */
6789 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
6791 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
6793 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6795 /* Set/reset conditionally defined registers from
6796 CALL_USED_REGISTERS initializer. */
6797 if (call_used_regs[i] > 1)
6798 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
6800 /* Compute the CLOBBERED_REGS register set as the call-used
6801 registers from the GENERAL_REGS register set. */
6802 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
6803 && call_used_regs[i])
6804 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
6807 /* If MMX is disabled, squash the registers. */
6808 if (! TARGET_MMX)
6809 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6810 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
6811 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6813 /* If SSE is disabled, squash the registers. */
6814 if (! TARGET_SSE)
6815 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6816 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
6817 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6819 /* If the FPU is disabled, squash the registers. */
6820 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
6821 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6822 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
6823 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6825 /* If AVX512F is disabled, squash the registers. */
6826 if (! TARGET_AVX512F)
6828 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6829 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6831 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
6832 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6835 /* If MPX is disabled, squash the registers. */
6836 if (! TARGET_MPX)
6837 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
6838 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6842 /* Save the current options.  */
6844 static void
6845 ix86_function_specific_save (struct cl_target_option *ptr,
6846 struct gcc_options *opts)
6848 ptr->arch = ix86_arch;
6849 ptr->schedule = ix86_schedule;
6850 ptr->prefetch_sse = x86_prefetch_sse;
6851 ptr->tune = ix86_tune;
6852 ptr->branch_cost = ix86_branch_cost;
6853 ptr->tune_defaulted = ix86_tune_defaulted;
6854 ptr->arch_specified = ix86_arch_specified;
6855 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
6856 ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
6857 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
6858 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
6859 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
6860 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
6861 ptr->x_ix86_abi = opts->x_ix86_abi;
6862 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
6863 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
6864 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
6865 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
6866 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
6867 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
6868 ptr->x_ix86_pmode = opts->x_ix86_pmode;
6869 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
6870 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
6871 ptr->x_ix86_regparm = opts->x_ix86_regparm;
6872 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
6873 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
6874 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
6875 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
6876 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
6877 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
6878 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
6879 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
6880 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
6881 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
6883 /* The fields are char but the variables are not; make sure the
6884 values fit in the fields. */
6885 gcc_assert (ptr->arch == ix86_arch);
6886 gcc_assert (ptr->schedule == ix86_schedule);
6887 gcc_assert (ptr->tune == ix86_tune);
6888 gcc_assert (ptr->branch_cost == ix86_branch_cost);
6891 /* Restore the current options.  */
6893 static void
6894 ix86_function_specific_restore (struct gcc_options *opts,
6895 struct cl_target_option *ptr)
6897 enum processor_type old_tune = ix86_tune;
6898 enum processor_type old_arch = ix86_arch;
6899 unsigned int ix86_arch_mask;
6900 int i;
6902 /* We don't change -fPIC. */
6903 opts->x_flag_pic = flag_pic;
6905 ix86_arch = (enum processor_type) ptr->arch;
6906 ix86_schedule = (enum attr_cpu) ptr->schedule;
6907 ix86_tune = (enum processor_type) ptr->tune;
6908 x86_prefetch_sse = ptr->prefetch_sse;
6909 opts->x_ix86_branch_cost = ptr->branch_cost;
6910 ix86_tune_defaulted = ptr->tune_defaulted;
6911 ix86_arch_specified = ptr->arch_specified;
6912 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
6913 opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
6914 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
6915 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
6916 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
6917 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
6918 opts->x_ix86_abi = ptr->x_ix86_abi;
6919 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
6920 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
6921 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
6922 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
6923 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
6924 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
6925 opts->x_ix86_pmode = ptr->x_ix86_pmode;
6926 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
6927 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
6928 opts->x_ix86_regparm = ptr->x_ix86_regparm;
6929 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
6930 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
6931 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
6932 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
6933 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
6934 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
6935 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
6936 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
6937 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
6938 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
6939 ix86_tune_cost = processor_target_table[ix86_tune].cost;
6940 /* TODO: ix86_cost should be chosen at instruction or function granularity,
6941 so for cold code we use size_cost even in !optimize_size compilation. */
6942 if (opts->x_optimize_size)
6943 ix86_cost = &ix86_size_cost;
6944 else
6945 ix86_cost = ix86_tune_cost;
6947 /* Recreate the arch feature tests if the arch changed */
6948 if (old_arch != ix86_arch)
6950 ix86_arch_mask = 1u << ix86_arch;
6951 for (i = 0; i < X86_ARCH_LAST; ++i)
6952 ix86_arch_features[i]
6953 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
6956 /* Recreate the tune optimization tests */
6957 if (old_tune != ix86_tune)
6958 set_ix86_tune_features (ix86_tune, false);
6961 /* Adjust target options after streaming them in. This is mainly about
6962 reconciling them with global options. */
6964 static void
6965 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
6967 /* flag_pic is a global option, but ix86_cmodel is a target-saved option
6968 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
6969 for PIC, or error out. */
6970 if (flag_pic)
6971 switch (ptr->x_ix86_cmodel)
6973 case CM_SMALL:
6974 ptr->x_ix86_cmodel = CM_SMALL_PIC;
6975 break;
6977 case CM_MEDIUM:
6978 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
6979 break;
6981 case CM_LARGE:
6982 ptr->x_ix86_cmodel = CM_LARGE_PIC;
6983 break;
6985 case CM_KERNEL:
6986 error ("code model %s does not support PIC mode", "kernel");
6987 break;
6989 default:
6990 break;
6992 else
6993 switch (ptr->x_ix86_cmodel)
6995 case CM_SMALL_PIC:
6996 ptr->x_ix86_cmodel = CM_SMALL;
6997 break;
6999 case CM_MEDIUM_PIC:
7000 ptr->x_ix86_cmodel = CM_MEDIUM;
7001 break;
7003 case CM_LARGE_PIC:
7004 ptr->x_ix86_cmodel = CM_LARGE;
7005 break;
7007 default:
7008 break;
7012 /* Print the current options.  */
7014 static void
7015 ix86_function_specific_print (FILE *file, int indent,
7016 struct cl_target_option *ptr)
7018 char *target_string
7019 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
7020 ptr->x_target_flags, ptr->x_ix86_target_flags,
7021 NULL, NULL, ptr->x_ix86_fpmath, false);
7023 gcc_assert (ptr->arch < PROCESSOR_max);
7024 fprintf (file, "%*sarch = %d (%s)\n",
7025 indent, "",
7026 ptr->arch, processor_target_table[ptr->arch].name);
7028 gcc_assert (ptr->tune < PROCESSOR_max);
7029 fprintf (file, "%*stune = %d (%s)\n",
7030 indent, "",
7031 ptr->tune, processor_target_table[ptr->tune].name);
7033 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
7035 if (target_string)
7037 fprintf (file, "%*s%s\n", indent, "", target_string);
7038 free (target_string);
7043 /* Inner function to process the attribute((target(...))), take an argument and
7044 set the current options from the argument. If we have a list, recursively go
7045 over the list. */
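/* The strings accepted here mirror the -m command-line options: a
   comma-separated list in which a "no-" prefix negates an ISA or flag
   option and string/enum options carry their argument after '=', e.g.
   a hypothetical
     __attribute__((target ("arch=haswell,no-avx2,fpmath=sse"))).  */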
7047 static bool
7048 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
7049 struct gcc_options *opts,
7050 struct gcc_options *opts_set,
7051 struct gcc_options *enum_opts_set)
7053 char *next_optstr;
7054 bool ret = true;
7056 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
7057 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
7058 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
7059 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
7060 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
7062 enum ix86_opt_type
7064 ix86_opt_unknown,
7065 ix86_opt_yes,
7066 ix86_opt_no,
7067 ix86_opt_str,
7068 ix86_opt_enum,
7069 ix86_opt_isa
7072 static const struct
7074 const char *string;
7075 size_t len;
7076 enum ix86_opt_type type;
7077 int opt;
7078 int mask;
7079 } attrs[] = {
7080 /* isa options */
7081 IX86_ATTR_ISA ("sgx", OPT_msgx),
7082 IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
7083 IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
7084 IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq),
7086 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
7087 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
7088 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
7089 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
7090 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
7091 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
7092 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
7093 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
7094 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
7095 IX86_ATTR_ISA ("avx2", OPT_mavx2),
7096 IX86_ATTR_ISA ("fma", OPT_mfma),
7097 IX86_ATTR_ISA ("xop", OPT_mxop),
7098 IX86_ATTR_ISA ("fma4", OPT_mfma4),
7099 IX86_ATTR_ISA ("f16c", OPT_mf16c),
7100 IX86_ATTR_ISA ("avx", OPT_mavx),
7101 IX86_ATTR_ISA ("sse4", OPT_msse4),
7102 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
7103 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
7104 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
7105 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
7106 IX86_ATTR_ISA ("sse3", OPT_msse3),
7107 IX86_ATTR_ISA ("aes", OPT_maes),
7108 IX86_ATTR_ISA ("sha", OPT_msha),
7109 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
7110 IX86_ATTR_ISA ("sse2", OPT_msse2),
7111 IX86_ATTR_ISA ("sse", OPT_msse),
7112 IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa),
7113 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
7114 IX86_ATTR_ISA ("mmx", OPT_mmmx),
7115 IX86_ATTR_ISA ("rtm", OPT_mrtm),
7116 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
7117 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
7118 IX86_ATTR_ISA ("adx", OPT_madx),
7119 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
7120 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
7121 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
7122 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
7123 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
7124 IX86_ATTR_ISA ("xsave", OPT_mxsave),
7125 IX86_ATTR_ISA ("abm", OPT_mabm),
7126 IX86_ATTR_ISA ("bmi", OPT_mbmi),
7127 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
7128 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
7129 IX86_ATTR_ISA ("tbm", OPT_mtbm),
7130 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
7131 IX86_ATTR_ISA ("cx16", OPT_mcx16),
7132 IX86_ATTR_ISA ("sahf", OPT_msahf),
7133 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
7134 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
7135 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
7136 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
7137 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
7138 IX86_ATTR_ISA ("clzero", OPT_mclzero),
7139 IX86_ATTR_ISA ("pku", OPT_mpku),
7140 IX86_ATTR_ISA ("lwp", OPT_mlwp),
7141 IX86_ATTR_ISA ("hle", OPT_mhle),
7142 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
7143 IX86_ATTR_ISA ("mpx", OPT_mmpx),
7144 IX86_ATTR_ISA ("clwb", OPT_mclwb),
7145 IX86_ATTR_ISA ("rdpid", OPT_mrdpid),
7147 /* enum options */
7148 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
7150 /* string options */
7151 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
7152 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
7154 /* flag options */
7155 IX86_ATTR_YES ("cld",
7156 OPT_mcld,
7157 MASK_CLD),
7159 IX86_ATTR_NO ("fancy-math-387",
7160 OPT_mfancy_math_387,
7161 MASK_NO_FANCY_MATH_387),
7163 IX86_ATTR_YES ("ieee-fp",
7164 OPT_mieee_fp,
7165 MASK_IEEE_FP),
7167 IX86_ATTR_YES ("inline-all-stringops",
7168 OPT_minline_all_stringops,
7169 MASK_INLINE_ALL_STRINGOPS),
7171 IX86_ATTR_YES ("inline-stringops-dynamically",
7172 OPT_minline_stringops_dynamically,
7173 MASK_INLINE_STRINGOPS_DYNAMICALLY),
7175 IX86_ATTR_NO ("align-stringops",
7176 OPT_mno_align_stringops,
7177 MASK_NO_ALIGN_STRINGOPS),
7179 IX86_ATTR_YES ("recip",
7180 OPT_mrecip,
7181 MASK_RECIP),
7185 /* If this is a list, recurse to get the options. */
7186 if (TREE_CODE (args) == TREE_LIST)
7188 bool ret = true;
7190 for (; args; args = TREE_CHAIN (args))
7191 if (TREE_VALUE (args)
7192 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
7193 p_strings, opts, opts_set,
7194 enum_opts_set))
7195 ret = false;
7197 return ret;
7200 else if (TREE_CODE (args) != STRING_CST)
7202 error ("attribute %<target%> argument not a string");
7203 return false;
7206 /* Handle multiple arguments separated by commas. */
7207 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
7209 while (next_optstr && *next_optstr != '\0')
7211 char *p = next_optstr;
7212 char *orig_p = p;
7213 char *comma = strchr (next_optstr, ',');
7214 const char *opt_string;
7215 size_t len, opt_len;
7216 int opt;
7217 bool opt_set_p;
7218 char ch;
7219 unsigned i;
7220 enum ix86_opt_type type = ix86_opt_unknown;
7221 int mask = 0;
7223 if (comma)
7225 *comma = '\0';
7226 len = comma - next_optstr;
7227 next_optstr = comma + 1;
7229 else
7231 len = strlen (p);
7232 next_optstr = NULL;
7235 /* Recognize no-xxx. */
7236 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
7238 opt_set_p = false;
7239 p += 3;
7240 len -= 3;
7242 else
7243 opt_set_p = true;
7245 /* Find the option. */
7246 ch = *p;
7247 opt = N_OPTS;
7248 for (i = 0; i < ARRAY_SIZE (attrs); i++)
7250 type = attrs[i].type;
7251 opt_len = attrs[i].len;
7252 if (ch == attrs[i].string[0]
7253 && ((type != ix86_opt_str && type != ix86_opt_enum)
7254 ? len == opt_len
7255 : len > opt_len)
7256 && memcmp (p, attrs[i].string, opt_len) == 0)
7258 opt = attrs[i].opt;
7259 mask = attrs[i].mask;
7260 opt_string = attrs[i].string;
7261 break;
7265 /* Process the option. */
7266 if (opt == N_OPTS)
7268 error ("attribute(target(\"%s\")) is unknown", orig_p);
7269 ret = false;
7272 else if (type == ix86_opt_isa)
7274 struct cl_decoded_option decoded;
7276 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
7277 ix86_handle_option (opts, opts_set,
7278 &decoded, input_location);
7281 else if (type == ix86_opt_yes || type == ix86_opt_no)
7283 if (type == ix86_opt_no)
7284 opt_set_p = !opt_set_p;
7286 if (opt_set_p)
7287 opts->x_target_flags |= mask;
7288 else
7289 opts->x_target_flags &= ~mask;
7292 else if (type == ix86_opt_str)
7294 if (p_strings[opt])
7296 error ("option(\"%s\") was already specified", opt_string);
7297 ret = false;
7299 else
7300 p_strings[opt] = xstrdup (p + opt_len);
7303 else if (type == ix86_opt_enum)
7305 bool arg_ok;
7306 int value;
7308 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
7309 if (arg_ok)
7310 set_option (opts, enum_opts_set, opt, value,
7311 p + opt_len, DK_UNSPECIFIED, input_location,
7312 global_dc);
7313 else
7315 error ("attribute(target(\"%s\")) is unknown", orig_p);
7316 ret = false;
7320 else
7321 gcc_unreachable ();
7324 return ret;
7327 /* Release allocated strings. */
7328 static void
7329 release_options_strings (char **option_strings)
7331 /* Free up memory allocated to hold the strings.  */
7332 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
7333 free (option_strings[i]);
7336 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
7338 tree
7339 ix86_valid_target_attribute_tree (tree args,
7340 struct gcc_options *opts,
7341 struct gcc_options *opts_set)
7343 const char *orig_arch_string = opts->x_ix86_arch_string;
7344 const char *orig_tune_string = opts->x_ix86_tune_string;
7345 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
7346 int orig_tune_defaulted = ix86_tune_defaulted;
7347 int orig_arch_specified = ix86_arch_specified;
7348 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
7349 tree t = NULL_TREE;
7350 struct cl_target_option *def
7351 = TREE_TARGET_OPTION (target_option_default_node);
7352 struct gcc_options enum_opts_set;
7354 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
7356 /* Process each of the options on the chain. */
7357 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
7358 opts_set, &enum_opts_set))
7359 return error_mark_node;
7361 /* If the changed options are different from the default, rerun
7362 ix86_option_override_internal, and then save the options away.
7363 The string options are attribute options, and will be undone
7364 when we copy the save structure. */
7365 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
7366 || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
7367 || opts->x_target_flags != def->x_target_flags
7368 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
7369 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
7370 || enum_opts_set.x_ix86_fpmath)
7372 /* If we are using the default tune= or arch=, undo the string assigned,
7373 and use the default. */
7374 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
7376 opts->x_ix86_arch_string
7377 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
7379 /* If arch= is set, clear all bits in x_ix86_isa_flags,
7380 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
7381 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
7382 | OPTION_MASK_ABI_64
7383 | OPTION_MASK_ABI_X32
7384 | OPTION_MASK_CODE16);
7385 opts->x_ix86_isa_flags2 = 0;
7387 else if (!orig_arch_specified)
7388 opts->x_ix86_arch_string = NULL;
7390 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
7391 opts->x_ix86_tune_string
7392 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
7393 else if (orig_tune_defaulted)
7394 opts->x_ix86_tune_string = NULL;
7396 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
7397 if (enum_opts_set.x_ix86_fpmath)
7398 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
7400 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
7401 bool r = ix86_option_override_internal (false, opts, opts_set);
7402 if (!r)
7404 release_options_strings (option_strings);
7405 return error_mark_node;
7408 /* Add any builtin functions with the new isa if any. */
7409 ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
7411 /* Save the current options unless we are validating options for
7412 #pragma. */
7413 t = build_target_option_node (opts);
7415 opts->x_ix86_arch_string = orig_arch_string;
7416 opts->x_ix86_tune_string = orig_tune_string;
7417 opts_set->x_ix86_fpmath = orig_fpmath_set;
7419 release_options_strings (option_strings);
7422 return t;
7425 /* Hook to validate attribute((target("string"))). */
7427 static bool
7428 ix86_valid_target_attribute_p (tree fndecl,
7429 tree ARG_UNUSED (name),
7430 tree args,
7431 int ARG_UNUSED (flags))
7433 struct gcc_options func_options;
7434 tree new_target, new_optimize;
7435 bool ret = true;
7437 /* attribute((target("default"))) does nothing, beyond
7438 affecting multi-versioning. */
7439 if (TREE_VALUE (args)
7440 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
7441 && TREE_CHAIN (args) == NULL_TREE
7442 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
7443 return true;
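/* In function multi-versioning the "default" version is written next
   to the ISA-specific ones, along the lines of
     __attribute__((target ("default"))) int foo (void);
     __attribute__((target ("avx2")))    int foo (void);
   and only the non-"default" strings go through the validation
   below.  */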
7445 tree old_optimize = build_optimization_node (&global_options);
7447 /* Get the optimization options of the current function. */
7448 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
7450 if (!func_optimize)
7451 func_optimize = old_optimize;
7453 /* Init func_options. */
7454 memset (&func_options, 0, sizeof (func_options));
7455 init_options_struct (&func_options, NULL);
7456 lang_hooks.init_options_struct (&func_options);
7458 cl_optimization_restore (&func_options,
7459 TREE_OPTIMIZATION (func_optimize));
7461 /* Initialize func_options to the default before its target options can
7462 be set. */
7463 cl_target_option_restore (&func_options,
7464 TREE_TARGET_OPTION (target_option_default_node));
7466 new_target = ix86_valid_target_attribute_tree (args, &func_options,
7467 &global_options_set);
7469 new_optimize = build_optimization_node (&func_options);
7471 if (new_target == error_mark_node)
7472 ret = false;
7474 else if (fndecl && new_target)
7476 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
7478 if (old_optimize != new_optimize)
7479 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
7482 finalize_options_struct (&func_options);
7484 return ret;
7488 /* Hook to determine if one function can safely inline another. */
7490 static bool
7491 ix86_can_inline_p (tree caller, tree callee)
7493 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
7494 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
7495 if (!callee_tree)
7496 callee_tree = target_option_default_node;
7497 if (!caller_tree)
7498 caller_tree = target_option_default_node;
7499 if (callee_tree == caller_tree)
7500 return true;
7502 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
7503 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
7504 bool ret = false;
7506 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4
7507 function can inline an SSE2 function but an SSE2 function can't inline
7508 an SSE4 function. */
7509 if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
7510 != callee_opts->x_ix86_isa_flags)
7511 || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
7512 != callee_opts->x_ix86_isa_flags2))
7513 ret = false;
7515 /* See if we have the same non-isa options. */
7516 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
7517 ret = false;
7519 /* See if arch, tune, etc. are the same. */
7520 else if (caller_opts->arch != callee_opts->arch)
7521 ret = false;
7523 else if (caller_opts->tune != callee_opts->tune)
7524 ret = false;
7526 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
7527 /* If the callee doesn't use FP expressions, differences in
7528 ix86_fpmath can be ignored. We are called from FEs
7529 for multi-versioning call optimization, so beware of
7530 ipa_fn_summaries not available. */
7531 && (! ipa_fn_summaries
7532 || ipa_fn_summaries->get
7533 (cgraph_node::get (callee))->fp_expressions))
7534 ret = false;
7536 else if (caller_opts->branch_cost != callee_opts->branch_cost)
7537 ret = false;
7539 else
7540 ret = true;
7542 return ret;
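/* For example, a callee declared with __attribute__((target ("avx2")))
   is not inlined into a caller compiled with only -mavx, because the
   callee's ISA flags are not a subset of the caller's.  */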
7546 /* Remember the last target of ix86_set_current_function. */
7547 static GTY(()) tree ix86_previous_fndecl;
7549 /* Set the target globals to the default (or current #pragma GCC target
7550 if active). Invalidate ix86_previous_fndecl cache. */
7552 void
7553 ix86_reset_previous_fndecl (void)
7555 tree new_tree = target_option_current_node;
7556 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7557 if (TREE_TARGET_GLOBALS (new_tree))
7558 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7559 else if (new_tree == target_option_default_node)
7560 restore_target_globals (&default_target_globals);
7561 else
7562 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7563 ix86_previous_fndecl = NULL_TREE;
7566 /* Set the func_type field from the function FNDECL. */
7568 static void
7569 ix86_set_func_type (tree fndecl)
7571 if (cfun->machine->func_type == TYPE_UNKNOWN)
7573 if (lookup_attribute ("interrupt",
7574 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7576 if (ix86_function_naked (fndecl))
7577 error_at (DECL_SOURCE_LOCATION (fndecl),
7578 "interrupt and naked attributes are not compatible");
7580 int nargs = 0;
7581 for (tree arg = DECL_ARGUMENTS (fndecl);
7582 arg;
7583 arg = TREE_CHAIN (arg))
7584 nargs++;
7585 cfun->machine->no_caller_saved_registers = true;
7586 cfun->machine->func_type
7587 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
7589 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
7591 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
7592 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
7593 sorry ("Only DWARF debug format is supported for interrupt "
7594 "service routine.");
7596 else
7598 cfun->machine->func_type = TYPE_NORMAL;
7599 if (lookup_attribute ("no_caller_saved_registers",
7600 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7601 cfun->machine->no_caller_saved_registers = true;
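/* The user-level shape of these handlers is roughly
     void __attribute__((interrupt)) isr (struct interrupt_frame *frame);
     void __attribute__((interrupt)) fault (struct interrupt_frame *frame,
					     uword_t error_code);
   (see the GCC manual for the exact contract); the extra error-code
   argument is what makes nargs == 2 select TYPE_EXCEPTION above.  */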
7606 /* Establish appropriate back-end context for processing the function
7607 FNDECL. The argument might be NULL to indicate processing at top
7608 level, outside of any function scope. */
7609 static void
7610 ix86_set_current_function (tree fndecl)
7612 /* Only change the context if the function changes. This hook is called
7613 several times in the course of compiling a function, and we don't want to
7614 slow things down too much or call target_reinit when it isn't safe. */
7615 if (fndecl == ix86_previous_fndecl)
7617 /* There may be 2 function bodies for the same function FNDECL,
7618 one is extern inline and one isn't. Call ix86_set_func_type
7619 to set the func_type field. */
7620 if (fndecl != NULL_TREE)
7621 ix86_set_func_type (fndecl);
7622 return;
7625 tree old_tree;
7626 if (ix86_previous_fndecl == NULL_TREE)
7627 old_tree = target_option_current_node;
7628 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
7629 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
7630 else
7631 old_tree = target_option_default_node;
7633 if (fndecl == NULL_TREE)
7635 if (old_tree != target_option_current_node)
7636 ix86_reset_previous_fndecl ();
7637 return;
7640 ix86_set_func_type (fndecl);
7642 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
7643 if (new_tree == NULL_TREE)
7644 new_tree = target_option_default_node;
7646 if (old_tree != new_tree)
7648 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7649 if (TREE_TARGET_GLOBALS (new_tree))
7650 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7651 else if (new_tree == target_option_default_node)
7652 restore_target_globals (&default_target_globals);
7653 else
7654 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7656 ix86_previous_fndecl = fndecl;
7658 static bool prev_no_caller_saved_registers;
7660 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
7661 Avoid expensive re-initialization of init_regs each time we switch
7662 function context. */
7663 if (TARGET_64BIT
7664 && (call_used_regs[SI_REG]
7665 == (cfun->machine->call_abi == MS_ABI)))
7666 reinit_regs ();
7667 /* Need to re-initialize init_regs if caller-saved registers are
7668 changed. */
7669 else if (prev_no_caller_saved_registers
7670 != cfun->machine->no_caller_saved_registers)
7671 reinit_regs ();
7673 if (cfun->machine->func_type != TYPE_NORMAL
7674 || cfun->machine->no_caller_saved_registers)
7676 /* Don't allow MPX, SSE, MMX, or x87 instructions since they
7677 may change processor state. */
7678 const char *isa;
7679 if (TARGET_MPX)
7680 isa = "MPX";
7681 else if (TARGET_SSE)
7682 isa = "SSE";
7683 else if (TARGET_MMX)
7684 isa = "MMX/3Dnow";
7685 else if (TARGET_80387)
7686 isa = "80387";
7687 else
7688 isa = NULL;
7689 if (isa != NULL)
7691 if (cfun->machine->func_type != TYPE_NORMAL)
7692 sorry ("%s instructions aren't allowed in %s service routine",
7693 isa, (cfun->machine->func_type == TYPE_EXCEPTION
7694 ? "exception" : "interrupt"));
7695 else
7696 sorry ("%s instructions aren't allowed in function with "
7697 "no_caller_saved_registers attribute", isa);
7698 /* Don't issue the same error twice. */
7699 cfun->machine->func_type = TYPE_NORMAL;
7700 cfun->machine->no_caller_saved_registers = false;
7704 prev_no_caller_saved_registers
7705 = cfun->machine->no_caller_saved_registers;
7709 /* Return true if this goes in large data/bss. */
7711 static bool
7712 ix86_in_large_data_p (tree exp)
7714 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
7715 return false;
7717 if (exp == NULL_TREE)
7718 return false;
7720 /* Functions are never large data. */
7721 if (TREE_CODE (exp) == FUNCTION_DECL)
7722 return false;
7724 /* Automatic variables are never large data. */
7725 if (VAR_P (exp) && !is_global_var (exp))
7726 return false;
7728 if (VAR_P (exp) && DECL_SECTION_NAME (exp))
7730 const char *section = DECL_SECTION_NAME (exp);
7731 if (strcmp (section, ".ldata") == 0
7732 || strcmp (section, ".lbss") == 0)
7733 return true;
7734 return false;
7736 else
7738 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
7740 /* If this is an incomplete type with size 0, then we can't put it
7741 in data because it might be too big when completed. Also,
7742 int_size_in_bytes returns -1 if size can vary or is larger than
7743 an integer, in which case it is also safer to assume that it goes in
7744 large data. */
7745 if (size <= 0 || size > ix86_section_threshold)
7746 return true;
7749 return false;
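/* In other words, under -mcmodel=medium an object larger than the
   -mlarge-data-threshold= value (ix86_section_threshold, 65536 by
   default), or whose size cannot be determined, is treated as large
   data and ends up in .ldata/.lbss.  */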
7752 /* i386-specific section flag to mark large sections. */
7753 #define SECTION_LARGE SECTION_MACH_DEP
7755 /* Switch to the appropriate section for output of DECL.
7756 DECL is either a `VAR_DECL' node or a constant of some sort.
7757 RELOC indicates whether forming the initial value of DECL requires
7758 link-time relocations. */
7760 ATTRIBUTE_UNUSED static section *
7761 x86_64_elf_select_section (tree decl, int reloc,
7762 unsigned HOST_WIDE_INT align)
7764 if (ix86_in_large_data_p (decl))
7766 const char *sname = NULL;
7767 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
7768 switch (categorize_decl_for_section (decl, reloc))
7770 case SECCAT_DATA:
7771 sname = ".ldata";
7772 break;
7773 case SECCAT_DATA_REL:
7774 sname = ".ldata.rel";
7775 break;
7776 case SECCAT_DATA_REL_LOCAL:
7777 sname = ".ldata.rel.local";
7778 break;
7779 case SECCAT_DATA_REL_RO:
7780 sname = ".ldata.rel.ro";
7781 break;
7782 case SECCAT_DATA_REL_RO_LOCAL:
7783 sname = ".ldata.rel.ro.local";
7784 break;
7785 case SECCAT_BSS:
7786 sname = ".lbss";
7787 flags |= SECTION_BSS;
7788 break;
7789 case SECCAT_RODATA:
7790 case SECCAT_RODATA_MERGE_STR:
7791 case SECCAT_RODATA_MERGE_STR_INIT:
7792 case SECCAT_RODATA_MERGE_CONST:
7793 sname = ".lrodata";
7794 flags &= ~SECTION_WRITE;
7795 break;
7796 case SECCAT_SRODATA:
7797 case SECCAT_SDATA:
7798 case SECCAT_SBSS:
7799 gcc_unreachable ();
7800 case SECCAT_TEXT:
7801 case SECCAT_TDATA:
7802 case SECCAT_TBSS:
7803 /* We don't split these for the medium model. Place them into
7804 default sections and hope for the best. */
7805 break;
7807 if (sname)
7809 /* We might get called with string constants, but get_named_section
7810 doesn't like them as they are not DECLs. Also, we need to set
7811 flags in that case. */
7812 if (!DECL_P (decl))
7813 return get_section (sname, flags, NULL);
7814 return get_named_section (decl, sname, reloc);
7817 return default_elf_select_section (decl, reloc, align);
7820 /* Select a set of attributes for section NAME based on the properties
7821 of DECL and whether or not RELOC indicates that DECL's initializer
7822 might contain runtime relocations. */
7824 static unsigned int ATTRIBUTE_UNUSED
7825 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
7827 unsigned int flags = default_section_type_flags (decl, name, reloc);
7829 if (ix86_in_large_data_p (decl))
7830 flags |= SECTION_LARGE;
7832 if (decl == NULL_TREE
7833 && (strcmp (name, ".ldata.rel.ro") == 0
7834 || strcmp (name, ".ldata.rel.ro.local") == 0))
7835 flags |= SECTION_RELRO;
7837 if (strcmp (name, ".lbss") == 0
7838 || strncmp (name, ".lbss.", 5) == 0
7839 || strncmp (name, ".gnu.linkonce.lb.", 16) == 0)
7840 flags |= SECTION_BSS;
7842 return flags;
7845 /* Build up a unique section name, expressed as a
7846 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
7847 RELOC indicates whether the initial value of EXP requires
7848 link-time relocations. */
7850 static void ATTRIBUTE_UNUSED
7851 x86_64_elf_unique_section (tree decl, int reloc)
7853 if (ix86_in_large_data_p (decl))
7855 const char *prefix = NULL;
7856 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
7857 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
7859 switch (categorize_decl_for_section (decl, reloc))
7861 case SECCAT_DATA:
7862 case SECCAT_DATA_REL:
7863 case SECCAT_DATA_REL_LOCAL:
7864 case SECCAT_DATA_REL_RO:
7865 case SECCAT_DATA_REL_RO_LOCAL:
7866 prefix = one_only ? ".ld" : ".ldata";
7867 break;
7868 case SECCAT_BSS:
7869 prefix = one_only ? ".lb" : ".lbss";
7870 break;
7871 case SECCAT_RODATA:
7872 case SECCAT_RODATA_MERGE_STR:
7873 case SECCAT_RODATA_MERGE_STR_INIT:
7874 case SECCAT_RODATA_MERGE_CONST:
7875 prefix = one_only ? ".lr" : ".lrodata";
7876 break;
7877 case SECCAT_SRODATA:
7878 case SECCAT_SDATA:
7879 case SECCAT_SBSS:
7880 gcc_unreachable ();
7881 case SECCAT_TEXT:
7882 case SECCAT_TDATA:
7883 case SECCAT_TBSS:
7884 /* We don't split these for the medium model. Place them into
7885 default sections and hope for the best. */
7886 break;
7888 if (prefix)
7890 const char *name, *linkonce;
7891 char *string;
7893 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
7894 name = targetm.strip_name_encoding (name);
7896 /* If we're using one_only, then there needs to be a .gnu.linkonce
7897 prefix to the section name. */
7898 linkonce = one_only ? ".gnu.linkonce" : "";
7900 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
7902 set_decl_section_name (decl, string);
7903 return;
7906 default_unique_section (decl, reloc);
7909 #ifdef COMMON_ASM_OP
7911 #ifndef LARGECOMM_SECTION_ASM_OP
7912 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
7913 #endif
7915 /* This says how to output assembler code to declare an
7916 uninitialized external linkage data object.
7918 For medium model x86-64 we need to use the LARGECOMM_SECTION_ASM_OP directive for
7919 large objects. */
7920 void
7921 x86_elf_aligned_decl_common (FILE *file, tree decl,
7922 const char *name, unsigned HOST_WIDE_INT size,
7923 int align)
7925 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7926 && size > (unsigned int)ix86_section_threshold)
7928 switch_to_section (get_named_section (decl, ".lbss", 0));
7929 fputs (LARGECOMM_SECTION_ASM_OP, file);
7931 else
7932 fputs (COMMON_ASM_OP, file);
7933 assemble_name (file, name);
7934 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
7935 size, align / BITS_PER_UNIT);
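/* For example, a 128 KiB zero-initialized object under -mcmodel=medium
   would be announced roughly as
	.largecomm	buf,131072,32
   while small objects keep using the ordinary .comm directive
   (numbers shown are illustrative).  */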
7937 #endif
7939 /* Utility function for targets to use in implementing
7940 ASM_OUTPUT_ALIGNED_BSS. */
7942 void
7943 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
7944 unsigned HOST_WIDE_INT size, int align)
7946 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7947 && size > (unsigned int)ix86_section_threshold)
7948 switch_to_section (get_named_section (decl, ".lbss", 0));
7949 else
7950 switch_to_section (bss_section);
7951 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
7952 #ifdef ASM_DECLARE_OBJECT_NAME
7953 last_assemble_variable_decl = decl;
7954 ASM_DECLARE_OBJECT_NAME (file, name, decl);
7955 #else
7956 /* Standard thing is just output label for the object. */
7957 ASM_OUTPUT_LABEL (file, name);
7958 #endif /* ASM_DECLARE_OBJECT_NAME */
7959 ASM_OUTPUT_SKIP (file, size ? size : 1);
7962 /* Decide whether we must probe the stack before any space allocation
7963 on this target. It's essentially TARGET_STACK_PROBE except when
7964 -fstack-check causes the stack to be already probed differently. */
7966 bool
7967 ix86_target_stack_probe (void)
7969 /* Do not probe the stack twice if static stack checking is enabled. */
7970 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7971 return false;
7973 return TARGET_STACK_PROBE;
7976 /* Decide whether we can make a sibling call to a function. DECL is the
7977 declaration of the function being targeted by the call and EXP is the
7978 CALL_EXPR representing the call. */
7980 static bool
7981 ix86_function_ok_for_sibcall (tree decl, tree exp)
7983 tree type, decl_or_type;
7984 rtx a, b;
7985 bool bind_global = decl && !targetm.binds_local_p (decl);
7987 if (ix86_function_naked (current_function_decl))
7988 return false;
7990 /* Sibling call isn't OK if there are no caller-saved registers
7991 since all registers must be preserved before return. */
7992 if (cfun->machine->no_caller_saved_registers)
7993 return false;
7995 /* If we are generating position-independent code, we cannot sibcall
7996 optimize direct calls to global functions, as the PLT requires
7997 %ebx be live. (Darwin does not have a PLT.) */
7998 if (!TARGET_MACHO
7999 && !TARGET_64BIT
8000 && flag_pic
8001 && flag_plt
8002 && bind_global)
8003 return false;
8005 /* If we need to align the outgoing stack, then sibcalling would
8006 unalign the stack, which may break the called function. */
8007 if (ix86_minimum_incoming_stack_boundary (true)
8008 < PREFERRED_STACK_BOUNDARY)
8009 return false;
8011 if (decl)
8013 decl_or_type = decl;
8014 type = TREE_TYPE (decl);
8016 else
8018 /* We're looking at the CALL_EXPR, we need the type of the function. */
8019 type = CALL_EXPR_FN (exp); /* pointer expression */
8020 type = TREE_TYPE (type); /* pointer type */
8021 type = TREE_TYPE (type); /* function type */
8022 decl_or_type = type;
8025 /* Check that the return value locations are the same. For example,
8026 if we are returning floats on the 80387 register stack, we cannot
8027 make a sibcall from a function that doesn't return a float to a
8028 function that does or, conversely, from a function that does return
8029 a float to a function that doesn't; the necessary stack adjustment
8030 would not be executed. This is also the place we notice
8031 differences in the return value ABI. Note that it is ok for one
8032 of the functions to have void return type as long as the return
8033 value of the other is passed in a register. */
8034 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
8035 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
8036 cfun->decl, false);
8037 if (STACK_REG_P (a) || STACK_REG_P (b))
8039 if (!rtx_equal_p (a, b))
8040 return false;
8042 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
8044 else if (!rtx_equal_p (a, b))
8045 return false;
8047 if (TARGET_64BIT)
8049 /* The SYSV ABI has more call-clobbered registers;
8050 disallow sibcalls from MS to SYSV. */
8051 if (cfun->machine->call_abi == MS_ABI
8052 && ix86_function_type_abi (type) == SYSV_ABI)
8053 return false;
8055 else
8057 /* If this call is indirect, we'll need to be able to use a
8058 call-clobbered register for the address of the target function.
8059 Make sure that all such registers are not used for passing
8060 parameters. Note that DLLIMPORT functions and calls to global
8061 functions via a GOT slot are indirect.
8062 if (!decl
8063 || (bind_global && flag_pic && !flag_plt)
8064 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
8066 /* Check if regparm >= 3 since arg_reg_available is set to
8067 false if regparm == 0. If regparm is 1 or 2, there is
8068 always a call-clobbered register available.
8070 ??? The symbol indirect call doesn't need a call-clobbered
8071 register. But we don't know if this is a symbol indirect
8072 call or not here. */
8073 if (ix86_function_regparm (type, NULL) >= 3
8074 && !cfun->machine->arg_reg_available)
8075 return false;
8079 /* Otherwise okay. That also includes certain types of indirect calls. */
8080 return true;
8083 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
8084 and "sseregparm" calling convention attributes;
8085 arguments as in struct attribute_spec.handler. */
8087 static tree
8088 ix86_handle_cconv_attribute (tree *node, tree name,
8089 tree args,
8090 int,
8091 bool *no_add_attrs)
8093 if (TREE_CODE (*node) != FUNCTION_TYPE
8094 && TREE_CODE (*node) != METHOD_TYPE
8095 && TREE_CODE (*node) != FIELD_DECL
8096 && TREE_CODE (*node) != TYPE_DECL)
8098 warning (OPT_Wattributes, "%qE attribute only applies to functions",
8099 name);
8100 *no_add_attrs = true;
8101 return NULL_TREE;
8104 /* Can combine regparm with all attributes but fastcall, and thiscall. */
8105 if (is_attribute_p ("regparm", name))
8107 tree cst;
8109 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8111 error ("fastcall and regparm attributes are not compatible");
8114 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8116 error ("regparam and thiscall attributes are not compatible");
8119 cst = TREE_VALUE (args);
8120 if (TREE_CODE (cst) != INTEGER_CST)
8122 warning (OPT_Wattributes,
8123 "%qE attribute requires an integer constant argument",
8124 name);
8125 *no_add_attrs = true;
8127 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
8129 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
8130 name, REGPARM_MAX);
8131 *no_add_attrs = true;
8134 return NULL_TREE;
8137 if (TARGET_64BIT)
8139 /* Do not warn when emulating the MS ABI. */
8140 if ((TREE_CODE (*node) != FUNCTION_TYPE
8141 && TREE_CODE (*node) != METHOD_TYPE)
8142 || ix86_function_type_abi (*node) != MS_ABI)
8143 warning (OPT_Wattributes, "%qE attribute ignored",
8144 name);
8145 *no_add_attrs = true;
8146 return NULL_TREE;
8149 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
8150 if (is_attribute_p ("fastcall", name))
8152 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
8154 error ("fastcall and cdecl attributes are not compatible");
8156 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
8158 error ("fastcall and stdcall attributes are not compatible");
8160 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
8162 error ("fastcall and regparm attributes are not compatible");
8164 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8166 error ("fastcall and thiscall attributes are not compatible");
8170 /* Can combine stdcall with fastcall (redundant), regparm and
8171 sseregparm. */
8172 else if (is_attribute_p ("stdcall", name))
8174 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
8176 error ("stdcall and cdecl attributes are not compatible");
8178 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8180 error ("stdcall and fastcall attributes are not compatible");
8182 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8184 error ("stdcall and thiscall attributes are not compatible");
8188 /* Can combine cdecl with regparm and sseregparm. */
8189 else if (is_attribute_p ("cdecl", name))
8191 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
8193 error ("stdcall and cdecl attributes are not compatible");
8195 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8197 error ("fastcall and cdecl attributes are not compatible");
8199 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8201 error ("cdecl and thiscall attributes are not compatible");
8204 else if (is_attribute_p ("thiscall", name))
8206 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
8207 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
8208 name);
8209 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
8211 error ("stdcall and thiscall attributes are not compatible");
8213 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8215 error ("fastcall and thiscall attributes are not compatible");
8217 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
8219 error ("cdecl and thiscall attributes are not compatible");
8223 /* Can combine sseregparm with all attributes. */
8225 return NULL_TREE;
8228 /* The transactional memory builtins are implicitly regparm or fastcall
8229 depending on the ABI. Override the generic do-nothing attribute that
8230 these builtins were declared with, and replace it with one of the two
8231 attributes that we expect elsewhere. */
8233 static tree
8234 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
8235 int flags, bool *no_add_attrs)
8237 tree alt;
8239 /* In no case do we want to add the placeholder attribute. */
8240 *no_add_attrs = true;
8242 /* The 64-bit ABI is unchanged for transactional memory. */
8243 if (TARGET_64BIT)
8244 return NULL_TREE;
8246 /* ??? Is there a better way to validate 32-bit windows? We have
8247 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
8248 if (CHECK_STACK_LIMIT > 0)
8249 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
8250 else
8252 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
8253 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
8255 decl_attributes (node, alt, flags);
8257 return NULL_TREE;
8260 /* This function determines from TYPE the calling-convention. */
8262 unsigned int
8263 ix86_get_callcvt (const_tree type)
8265 unsigned int ret = 0;
8266 bool is_stdarg;
8267 tree attrs;
8269 if (TARGET_64BIT)
8270 return IX86_CALLCVT_CDECL;
8272 attrs = TYPE_ATTRIBUTES (type);
8273 if (attrs != NULL_TREE)
8275 if (lookup_attribute ("cdecl", attrs))
8276 ret |= IX86_CALLCVT_CDECL;
8277 else if (lookup_attribute ("stdcall", attrs))
8278 ret |= IX86_CALLCVT_STDCALL;
8279 else if (lookup_attribute ("fastcall", attrs))
8280 ret |= IX86_CALLCVT_FASTCALL;
8281 else if (lookup_attribute ("thiscall", attrs))
8282 ret |= IX86_CALLCVT_THISCALL;
8284 /* Regparm isn't allowed for thiscall and fastcall. */
8285 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
8287 if (lookup_attribute ("regparm", attrs))
8288 ret |= IX86_CALLCVT_REGPARM;
8289 if (lookup_attribute ("sseregparm", attrs))
8290 ret |= IX86_CALLCVT_SSEREGPARM;
8293 if (IX86_BASE_CALLCVT(ret) != 0)
8294 return ret;
8297 is_stdarg = stdarg_p (type);
8298 if (TARGET_RTD && !is_stdarg)
8299 return IX86_CALLCVT_STDCALL | ret;
8301 if (ret != 0
8302 || is_stdarg
8303 || TREE_CODE (type) != METHOD_TYPE
8304 || ix86_function_type_abi (type) != MS_ABI)
8305 return IX86_CALLCVT_CDECL | ret;
8307 return IX86_CALLCVT_THISCALL;
8310 /* Return 0 if the attributes for two types are incompatible, 1 if they
8311 are compatible, and 2 if they are nearly compatible (which causes a
8312 warning to be generated). */
8314 static int
8315 ix86_comp_type_attributes (const_tree type1, const_tree type2)
8317 unsigned int ccvt1, ccvt2;
8319 if (TREE_CODE (type1) != FUNCTION_TYPE
8320 && TREE_CODE (type1) != METHOD_TYPE)
8321 return 1;
8323 ccvt1 = ix86_get_callcvt (type1);
8324 ccvt2 = ix86_get_callcvt (type2);
8325 if (ccvt1 != ccvt2)
8326 return 0;
8327 if (ix86_function_regparm (type1, NULL)
8328 != ix86_function_regparm (type2, NULL))
8329 return 0;
8331 return 1;
8334 /* Return the regparm value for a function with the indicated TYPE and DECL.
8335 DECL may be NULL when calling function indirectly
8336 or considering a libcall. */
8338 static int
8339 ix86_function_regparm (const_tree type, const_tree decl)
8341 tree attr;
8342 int regparm;
8343 unsigned int ccvt;
8345 if (TARGET_64BIT)
8346 return (ix86_function_type_abi (type) == SYSV_ABI
8347 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
8348 ccvt = ix86_get_callcvt (type);
8349 regparm = ix86_regparm;
8351 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
8353 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
8354 if (attr)
8356 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
8357 return regparm;
8360 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
8361 return 2;
8362 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
8363 return 1;
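/* fastcall passes the first two integer arguments in ECX and EDX,
   hence 2; thiscall passes only the `this' pointer in ECX, hence 1.  */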
8365 /* Use register calling convention for local functions when possible. */
8366 if (decl
8367 && TREE_CODE (decl) == FUNCTION_DECL)
8369 cgraph_node *target = cgraph_node::get (decl);
8370 if (target)
8371 target = target->function_symbol ();
8373 /* Caller and callee must agree on the calling convention, so
8374 checking just the `optimize' flag here would mean that with
8375 __attribute__((optimize (...))) the caller could use the regparm
8376 convention and the callee not, or vice versa.  Instead look at
8377 whether the callee is optimized or not.  */
8378 if (target && opt_for_fn (target->decl, optimize)
8379 && !(profile_flag && !flag_fentry))
8381 cgraph_local_info *i = &target->local;
8382 if (i && i->local && i->can_change_signature)
8384 int local_regparm, globals = 0, regno;
8386 /* Make sure no regparm register is taken by a
8387 fixed register variable. */
8388 for (local_regparm = 0; local_regparm < REGPARM_MAX;
8389 local_regparm++)
8390 if (fixed_regs[local_regparm])
8391 break;
8393 /* We don't want to use regparm(3) for nested functions as
8394 these use a static chain pointer in the third argument. */
8395 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
8396 local_regparm = 2;
8398 /* Save a register for the split stack. */
8399 if (flag_split_stack)
8401 if (local_regparm == 3)
8402 local_regparm = 2;
8403 else if (local_regparm == 2
8404 && DECL_STATIC_CHAIN (target->decl))
8405 local_regparm = 1;
8408 /* Each fixed register usage increases register pressure,
8409 so fewer registers should be used for argument passing.
8410 This functionality can be overridden by an explicit
8411 regparm value.  */
8412 for (regno = AX_REG; regno <= DI_REG; regno++)
8413 if (fixed_regs[regno])
8414 globals++;
8416 local_regparm
8417 = globals < local_regparm ? local_regparm - globals : 0;
8419 if (local_regparm > regparm)
8420 regparm = local_regparm;
8425 return regparm;
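/* Illustrative sketch (added for exposition; not part of the original file):
   on a 32-bit target, a local function such as

     static int __attribute__ ((noinline)) add (int a, int b) { return a + b; }

   compiled with optimization may have its regparm value bumped up to 3 by the
   code above, so that A and B arrive in registers, provided no regparm
   register is fixed and the function needs neither a static chain nor a
   split-stack scratch register.  */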
8428 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
8429 DFmode (2) arguments in SSE registers for a function with the
8430 indicated TYPE and DECL.  DECL may be NULL when calling the function
8431 indirectly or considering a libcall.  Return -1 if any FP parameter
8432 should be rejected with an error.  This is used in situations where we
8433 imply the SSE calling convention but the function is called from
8434 another function with SSE disabled.  Otherwise return 0.  */
8436 static int
8437 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
8439 gcc_assert (!TARGET_64BIT);
8441 /* Use SSE registers to pass SFmode and DFmode arguments if requested
8442 by the sseregparm attribute. */
8443 if (TARGET_SSEREGPARM
8444 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
8446 if (!TARGET_SSE)
8448 if (warn)
8450 if (decl)
8451 error ("calling %qD with attribute sseregparm without "
8452 "SSE/SSE2 enabled", decl);
8453 else
8454 error ("calling %qT with attribute sseregparm without "
8455 "SSE/SSE2 enabled", type);
8457 return 0;
8460 return 2;
8463 if (!decl)
8464 return 0;
8466 cgraph_node *target = cgraph_node::get (decl);
8467 if (target)
8468 target = target->function_symbol ();
8470 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
8471 (and DFmode for SSE2) arguments in SSE registers. */
8472 if (target
8473 /* TARGET_SSE_MATH */
8474 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
8475 && opt_for_fn (target->decl, optimize)
8476 && !(profile_flag && !flag_fentry))
8478 cgraph_local_info *i = &target->local;
8479 if (i && i->local && i->can_change_signature)
8481 /* Refuse to produce wrong code when local function with SSE enabled
8482 is called from SSE disabled function.
8483 FIXME: We need a way to detect these cases cross-ltrans partition
8484 and avoid using SSE calling conventions on local functions called
8485 from function with SSE disabled. For now at least delay the
8486 warning until we know we are going to produce wrong code.
8487 See PR66047 */
8488 if (!TARGET_SSE && warn)
8489 return -1;
8490 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
8491 ->x_ix86_isa_flags) ? 2 : 1;
8495 return 0;
8498 /* Return true if EAX is live at the start of the function. Used by
8499 ix86_expand_prologue to determine if we need special help before
8500 calling allocate_stack_worker. */
8502 static bool
8503 ix86_eax_live_at_start_p (void)
8505 /* Cheat. Don't bother working forward from ix86_function_regparm
8506 to the function type to whether an actual argument is located in
8507 eax. Instead just look at cfg info, which is still close enough
8508 to correct at this point. This gives false positives for broken
8509 functions that might use uninitialized data that happens to be
8510 allocated in eax, but who cares? */
8511 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
8514 static bool
8515 ix86_keep_aggregate_return_pointer (tree fntype)
8517 tree attr;
8519 if (!TARGET_64BIT)
8521 attr = lookup_attribute ("callee_pop_aggregate_return",
8522 TYPE_ATTRIBUTES (fntype));
8523 if (attr)
8524 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
8526 /* For 32-bit MS-ABI the default is to keep aggregate
8527 return pointer. */
8528 if (ix86_function_type_abi (fntype) == MS_ABI)
8529 return true;
8531 return KEEP_AGGREGATE_RETURN_POINTER != 0;
8534 /* Value is the number of bytes of arguments automatically
8535 popped when returning from a subroutine call.
8536 FUNDECL is the declaration node of the function (as a tree),
8537 FUNTYPE is the data type of the function (as a tree),
8538 or for a library call it is an identifier node for the subroutine name.
8539 SIZE is the number of bytes of arguments passed on the stack.
8541 On the 80386, the RTD insn may be used to pop them if the number
8542 of args is fixed, but if the number is variable then the caller
8543 must pop them all. RTD can't be used for library calls now
8544 because the library is compiled with the Unix compiler.
8545 Use of RTD is a selectable option, since it is incompatible with
8546 standard Unix calling sequences. If the option is not selected,
8547 the caller must always pop the args.
8549 The attribute stdcall is equivalent to RTD on a per module basis. */
8551 static int
8552 ix86_return_pops_args (tree fundecl, tree funtype, int size)
8554 unsigned int ccvt;
8556 /* None of the 64-bit ABIs pop arguments. */
8557 if (TARGET_64BIT)
8558 return 0;
8560 ccvt = ix86_get_callcvt (funtype);
8562 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
8563 | IX86_CALLCVT_THISCALL)) != 0
8564 && ! stdarg_p (funtype))
8565 return size;
8567 /* Lose any fake structure return argument if it is passed on the stack. */
8568 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
8569 && !ix86_keep_aggregate_return_pointer (funtype))
8571 int nregs = ix86_function_regparm (funtype, fundecl);
8572 if (nregs == 0)
8573 return GET_MODE_SIZE (Pmode);
8576 return 0;
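/* Illustrative sketch (added for exposition; not part of the original file):
   a 32-bit function declared as

     void __attribute__ ((stdcall)) f (int, int);

   takes 8 bytes of stack arguments, so ix86_return_pops_args returns 8 and
   the callee is expected to return with `ret $8'; a cdecl function returns 0
   here and leaves the cleanup to the caller.  */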
8579 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
8581 static bool
8582 ix86_legitimate_combined_insn (rtx_insn *insn)
8584 int i;
8586 /* Check operand constraints in case hard registers were propagated
8587 into insn pattern. This check prevents combine pass from
8588 generating insn patterns with invalid hard register operands.
8589 These invalid insns can eventually confuse reload into erroring out
8590 with a spill failure.  See also PRs 46829 and 46843.  */
8592 gcc_assert (INSN_CODE (insn) >= 0);
8594 extract_insn (insn);
8595 preprocess_constraints (insn);
8597 int n_operands = recog_data.n_operands;
8598 int n_alternatives = recog_data.n_alternatives;
8599 for (i = 0; i < n_operands; i++)
8601 rtx op = recog_data.operand[i];
8602 machine_mode mode = GET_MODE (op);
8603 const operand_alternative *op_alt;
8604 int offset = 0;
8605 bool win;
8606 int j;
8608 /* A unary operator may be accepted by the predicate, but it
8609 is irrelevant for matching constraints. */
8610 if (UNARY_P (op))
8611 op = XEXP (op, 0);
8613 if (SUBREG_P (op))
8615 if (REG_P (SUBREG_REG (op))
8616 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
8617 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
8618 GET_MODE (SUBREG_REG (op)),
8619 SUBREG_BYTE (op),
8620 GET_MODE (op));
8621 op = SUBREG_REG (op);
8624 if (!(REG_P (op) && HARD_REGISTER_P (op)))
8625 continue;
8627 op_alt = recog_op_alt;
8629 /* Operand has no constraints, anything is OK. */
8630 win = !n_alternatives;
8632 alternative_mask preferred = get_preferred_alternatives (insn);
8633 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
8635 if (!TEST_BIT (preferred, j))
8636 continue;
8637 if (op_alt[i].anything_ok
8638 || (op_alt[i].matches != -1
8639 && operands_match_p
8640 (recog_data.operand[i],
8641 recog_data.operand[op_alt[i].matches]))
8642 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
8644 win = true;
8645 break;
8649 if (!win)
8650 return false;
8653 return true;
8656 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
8658 static unsigned HOST_WIDE_INT
8659 ix86_asan_shadow_offset (void)
8661 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
8662 : HOST_WIDE_INT_C (0x7fff8000))
8663 : (HOST_WIDE_INT_1 << 29);
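/* Illustrative note (added for exposition; not part of the original file):
   AddressSanitizer computes the shadow address roughly as
   (addr >> 3) + ix86_asan_shadow_offset (), so the constants above place the
   shadow region at the conventional location for the LP64 Linux, LP64 Mach-O
   and 32-bit/x32 layouts respectively.  */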
8666 /* Argument support functions. */
8668 /* Return true when register may be used to pass function parameters. */
8669 bool
8670 ix86_function_arg_regno_p (int regno)
8672 int i;
8673 enum calling_abi call_abi;
8674 const int *parm_regs;
8676 if (TARGET_MPX && BND_REGNO_P (regno))
8677 return true;
8679 if (!TARGET_64BIT)
8681 if (TARGET_MACHO)
8682 return (regno < REGPARM_MAX
8683 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
8684 else
8685 return (regno < REGPARM_MAX
8686 || (TARGET_MMX && MMX_REGNO_P (regno)
8687 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
8688 || (TARGET_SSE && SSE_REGNO_P (regno)
8689 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
8692 if (TARGET_SSE && SSE_REGNO_P (regno)
8693 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
8694 return true;
8696 /* TODO: The function should depend on current function ABI but
8697 builtins.c would need updating then. Therefore we use the
8698 default ABI. */
8699 call_abi = ix86_cfun_abi ();
8701 /* RAX is used as hidden argument to va_arg functions. */
8702 if (call_abi == SYSV_ABI && regno == AX_REG)
8703 return true;
8705 if (call_abi == MS_ABI)
8706 parm_regs = x86_64_ms_abi_int_parameter_registers;
8707 else
8708 parm_regs = x86_64_int_parameter_registers;
8710 for (i = 0; i < (call_abi == MS_ABI
8711 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
8712 if (regno == parm_regs[i])
8713 return true;
8714 return false;
8717 /* Return true if we do not know how to pass TYPE solely in registers.  */
8719 static bool
8720 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
8722 if (must_pass_in_stack_var_size_or_pad (mode, type))
8723 return true;
8725 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
8726 The layout_type routine is crafty and tries to trick us into passing
8727 currently unsupported vector types on the stack by using TImode. */
8728 return (!TARGET_64BIT && mode == TImode
8729 && type && TREE_CODE (type) != VECTOR_TYPE);
8732 /* Return the size, in bytes, of the area reserved for arguments passed
8733 in registers for the function represented by FNDECL, depending on the
8734 ABI in use.  */
8735 int
8736 ix86_reg_parm_stack_space (const_tree fndecl)
8738 enum calling_abi call_abi = SYSV_ABI;
8739 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
8740 call_abi = ix86_function_abi (fndecl);
8741 else
8742 call_abi = ix86_function_type_abi (fndecl);
8743 if (TARGET_64BIT && call_abi == MS_ABI)
8744 return 32;
8745 return 0;
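/* Illustrative note (added for exposition; not part of the original file):
   the 32 bytes returned above for a 64-bit MS-ABI function correspond to the
   "home" (shadow) area that the caller must reserve on the stack for the four
   register parameters passed in RCX, RDX, R8 and R9.  */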
8748 /* We add this as a workaround in order to use libc_has_function
8749 hook in i386.md. */
8750 bool
8751 ix86_libc_has_function (enum function_class fn_class)
8753 return targetm.libc_has_function (fn_class);
8756 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE,
8757 specifying the calling ABI used.  */
8758 enum calling_abi
8759 ix86_function_type_abi (const_tree fntype)
8761 enum calling_abi abi = ix86_abi;
8763 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
8764 return abi;
8766 if (abi == SYSV_ABI
8767 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
8769 static int warned;
8770 if (TARGET_X32 && !warned)
8772 error ("X32 does not support ms_abi attribute");
8773 warned = 1;
8776 abi = MS_ABI;
8778 else if (abi == MS_ABI
8779 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
8780 abi = SYSV_ABI;
8782 return abi;
8785 static enum calling_abi
8786 ix86_function_abi (const_tree fndecl)
8788 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
8791 /* Return SYSV_ABI or MS_ABI, depending on cfun,
8792 specifying the calling ABI used.  */
8793 enum calling_abi
8794 ix86_cfun_abi (void)
8796 return cfun ? cfun->machine->call_abi : ix86_abi;
8799 static bool
8800 ix86_function_ms_hook_prologue (const_tree fn)
8802 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
8804 if (decl_function_context (fn) != NULL_TREE)
8805 error_at (DECL_SOURCE_LOCATION (fn),
8806 "ms_hook_prologue is not compatible with nested function");
8807 else
8808 return true;
8810 return false;
8813 static bool
8814 ix86_function_naked (const_tree fn)
8816 if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn)))
8817 return true;
8819 return false;
8822 /* Write the extra assembler code needed to declare a function properly. */
8824 void
8825 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
8826 tree decl)
8828 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
8830 if (is_ms_hook)
8832 int i, filler_count = (TARGET_64BIT ? 32 : 16);
8833 unsigned int filler_cc = 0xcccccccc;
8835 for (i = 0; i < filler_count; i += 4)
8836 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
8839 #ifdef SUBTARGET_ASM_UNWIND_INIT
8840 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
8841 #endif
8843 ASM_OUTPUT_LABEL (asm_out_file, fname);
8845 /* Output magic byte marker, if hot-patch attribute is set. */
8846 if (is_ms_hook)
8848 if (TARGET_64BIT)
8850 /* leaq [%rsp + 0], %rsp */
8851 fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n",
8852 asm_out_file);
8854 else
8856 /* movl.s %edi, %edi
8857 push %ebp
8858 movl.s %esp, %ebp */
8859 fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file);
8864 /* Implementation of the call ABI switching target hook.  The register
8865 sets specific to FNDECL's calling ABI are selected.  See also
8866 ix86_conditional_register_usage for more details.  */
8867 void
8868 ix86_call_abi_override (const_tree fndecl)
8870 cfun->machine->call_abi = ix86_function_abi (fndecl);
8873 /* Return true if a pseudo register should be created and used to hold
8874 the GOT address for PIC code.  */
8875 bool
8876 ix86_use_pseudo_pic_reg (void)
8878 if ((TARGET_64BIT
8879 && (ix86_cmodel == CM_SMALL_PIC
8880 || TARGET_PECOFF))
8881 || !flag_pic)
8882 return false;
8883 return true;
8886 /* Initialize large model PIC register. */
8888 static void
8889 ix86_init_large_pic_reg (unsigned int tmp_regno)
8891 rtx_code_label *label;
8892 rtx tmp_reg;
8894 gcc_assert (Pmode == DImode);
8895 label = gen_label_rtx ();
8896 emit_label (label);
8897 LABEL_PRESERVE_P (label) = 1;
8898 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
8899 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
8900 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
8901 label));
8902 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
8903 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
8904 pic_offset_table_rtx, tmp_reg));
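/* Illustrative note (added for exposition; not part of the original file):
   the sequence emitted above materializes the GOT address for
   -mcmodel=large PIC code, roughly

     label: lea    label(%rip), %pic_reg
            movabs $_GLOBAL_OFFSET_TABLE_ - label, %tmp_reg
            add    %tmp_reg, %pic_reg

   where %tmp_reg is the scratch register selected by TMP_REGNO.  */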
8907 /* Create and initialize PIC register if required. */
8908 static void
8909 ix86_init_pic_reg (void)
8911 edge entry_edge;
8912 rtx_insn *seq;
8914 if (!ix86_use_pseudo_pic_reg ())
8915 return;
8917 start_sequence ();
8919 if (TARGET_64BIT)
8921 if (ix86_cmodel == CM_LARGE_PIC)
8922 ix86_init_large_pic_reg (R11_REG);
8923 else
8924 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
8926 else
8928 /* If there is a future mcount call in the function, it is more profitable
8929 to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM.  */
8930 rtx reg = crtl->profile
8931 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
8932 : pic_offset_table_rtx;
8933 rtx_insn *insn = emit_insn (gen_set_got (reg));
8934 RTX_FRAME_RELATED_P (insn) = 1;
8935 if (crtl->profile)
8936 emit_move_insn (pic_offset_table_rtx, reg);
8937 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
8940 seq = get_insns ();
8941 end_sequence ();
8943 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8944 insert_insn_on_edge (seq, entry_edge);
8945 commit_one_edge_insertion (entry_edge);
8948 /* Initialize a variable CUM of type CUMULATIVE_ARGS
8949 for a call to a function whose data type is FNTYPE.
8950 For a library call, FNTYPE is 0. */
8952 void
8953 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
8954 tree fntype, /* tree ptr for function decl */
8955 rtx libname, /* SYMBOL_REF of library name or 0 */
8956 tree fndecl,
8957 int caller)
8959 struct cgraph_local_info *i = NULL;
8960 struct cgraph_node *target = NULL;
8962 memset (cum, 0, sizeof (*cum));
8964 if (fndecl)
8966 target = cgraph_node::get (fndecl);
8967 if (target)
8969 target = target->function_symbol ();
8970 i = cgraph_node::local_info (target->decl);
8971 cum->call_abi = ix86_function_abi (target->decl);
8973 else
8974 cum->call_abi = ix86_function_abi (fndecl);
8976 else
8977 cum->call_abi = ix86_function_type_abi (fntype);
8979 cum->caller = caller;
8981 /* Set up the number of registers to use for passing arguments. */
8982 cum->nregs = ix86_regparm;
8983 if (TARGET_64BIT)
8985 cum->nregs = (cum->call_abi == SYSV_ABI
8986 ? X86_64_REGPARM_MAX
8987 : X86_64_MS_REGPARM_MAX);
8989 if (TARGET_SSE)
8991 cum->sse_nregs = SSE_REGPARM_MAX;
8992 if (TARGET_64BIT)
8994 cum->sse_nregs = (cum->call_abi == SYSV_ABI
8995 ? X86_64_SSE_REGPARM_MAX
8996 : X86_64_MS_SSE_REGPARM_MAX);
8999 if (TARGET_MMX)
9000 cum->mmx_nregs = MMX_REGPARM_MAX;
9001 cum->warn_avx512f = true;
9002 cum->warn_avx = true;
9003 cum->warn_sse = true;
9004 cum->warn_mmx = true;
9006 /* Because types might mismatch between caller and callee, we need to
9007 use the actual type of the function for local calls.
9008 FIXME: cgraph_analyze can be told to actually record if the function
9009 uses va_start, so for local functions maybe_vaarg can be made more
9010 aggressive, helping K&R code.
9011 FIXME: once the type system is fixed, we won't need this code anymore.  */
9012 if (i && i->local && i->can_change_signature)
9013 fntype = TREE_TYPE (target->decl);
9014 cum->stdarg = stdarg_p (fntype);
9015 cum->maybe_vaarg = (fntype
9016 ? (!prototype_p (fntype) || stdarg_p (fntype))
9017 : !libname);
9019 cum->bnd_regno = FIRST_BND_REG;
9020 cum->bnds_in_bt = 0;
9021 cum->force_bnd_pass = 0;
9022 cum->decl = fndecl;
9024 if (!TARGET_64BIT)
9026 /* If there are variable arguments, then we won't pass anything
9027 in registers in 32-bit mode. */
9028 if (stdarg_p (fntype))
9030 cum->nregs = 0;
9031 /* Since in 32-bit mode variable arguments are always passed on
9032 the stack, there is a scratch register available for an indirect
9033 sibcall.  */
9034 cfun->machine->arg_reg_available = true;
9035 cum->sse_nregs = 0;
9036 cum->mmx_nregs = 0;
9037 cum->warn_avx512f = false;
9038 cum->warn_avx = false;
9039 cum->warn_sse = false;
9040 cum->warn_mmx = false;
9041 return;
9044 /* Use ecx and edx registers if function has fastcall attribute,
9045 else look for regparm information. */
9046 if (fntype)
9048 unsigned int ccvt = ix86_get_callcvt (fntype);
9049 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
9051 cum->nregs = 1;
9052 cum->fastcall = 1; /* Same first register as in fastcall. */
9054 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
9056 cum->nregs = 2;
9057 cum->fastcall = 1;
9059 else
9060 cum->nregs = ix86_function_regparm (fntype, fndecl);
9063 /* Set up the number of SSE registers used for passing SFmode
9064 and DFmode arguments. Warn for mismatching ABI. */
9065 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
9068 cfun->machine->arg_reg_available = (cum->nregs > 0);
9071 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
9072 But in the case of vector types, it is some vector mode.
9074 When we have only some of our vector isa extensions enabled, then there
9075 are some modes for which vector_mode_supported_p is false. For these
9076 modes, the generic vector support in gcc will choose some non-vector mode
9077 in order to implement the type. By computing the natural mode, we'll
9078 select the proper ABI location for the operand and not depend on whatever
9079 the middle-end decides to do with these vector types.
9081 The middle-end can't deal with vector types > 16 bytes.  In this
9082 case, we return the original mode and warn about the ABI change if
9083 CUM isn't NULL.
9085 If IN_RETURN is true, warn about the ABI change if the vector mode
9086 isn't available for the function return value.  */
9088 static machine_mode
9089 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
9090 bool in_return)
9092 machine_mode mode = TYPE_MODE (type);
9094 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
9096 HOST_WIDE_INT size = int_size_in_bytes (type);
9097 if ((size == 8 || size == 16 || size == 32 || size == 64)
9098 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
9099 && TYPE_VECTOR_SUBPARTS (type) > 1)
9101 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
9103 /* There are no XFmode vector modes. */
9104 if (innermode == XFmode)
9105 return mode;
9107 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
9108 mode = MIN_MODE_VECTOR_FLOAT;
9109 else
9110 mode = MIN_MODE_VECTOR_INT;
9112 /* Get the mode which has this inner mode and number of units. */
9113 FOR_EACH_MODE_FROM (mode, mode)
9114 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
9115 && GET_MODE_INNER (mode) == innermode)
9117 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
9119 static bool warnedavx512f;
9120 static bool warnedavx512f_ret;
9122 if (cum && cum->warn_avx512f && !warnedavx512f)
9124 if (warning (OPT_Wpsabi, "AVX512F vector argument "
9125 "without AVX512F enabled changes the ABI"))
9126 warnedavx512f = true;
9128 else if (in_return && !warnedavx512f_ret)
9130 if (warning (OPT_Wpsabi, "AVX512F vector return "
9131 "without AVX512F enabled changes the ABI"))
9132 warnedavx512f_ret = true;
9135 return TYPE_MODE (type);
9137 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
9139 static bool warnedavx;
9140 static bool warnedavx_ret;
9142 if (cum && cum->warn_avx && !warnedavx)
9144 if (warning (OPT_Wpsabi, "AVX vector argument "
9145 "without AVX enabled changes the ABI"))
9146 warnedavx = true;
9148 else if (in_return && !warnedavx_ret)
9150 if (warning (OPT_Wpsabi, "AVX vector return "
9151 "without AVX enabled changes the ABI"))
9152 warnedavx_ret = true;
9155 return TYPE_MODE (type);
9157 else if (((size == 8 && TARGET_64BIT) || size == 16)
9158 && !TARGET_SSE
9159 && !TARGET_IAMCU)
9161 static bool warnedsse;
9162 static bool warnedsse_ret;
9164 if (cum && cum->warn_sse && !warnedsse)
9166 if (warning (OPT_Wpsabi, "SSE vector argument "
9167 "without SSE enabled changes the ABI"))
9168 warnedsse = true;
9170 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
9172 if (warning (OPT_Wpsabi, "SSE vector return "
9173 "without SSE enabled changes the ABI"))
9174 warnedsse_ret = true;
9177 else if ((size == 8 && !TARGET_64BIT)
9178 && (!cfun
9179 || cfun->machine->func_type == TYPE_NORMAL)
9180 && !TARGET_MMX
9181 && !TARGET_IAMCU)
9183 static bool warnedmmx;
9184 static bool warnedmmx_ret;
9186 if (cum && cum->warn_mmx && !warnedmmx)
9188 if (warning (OPT_Wpsabi, "MMX vector argument "
9189 "without MMX enabled changes the ABI"))
9190 warnedmmx = true;
9192 else if (in_return && !warnedmmx_ret)
9194 if (warning (OPT_Wpsabi, "MMX vector return "
9195 "without MMX enabled changes the ABI"))
9196 warnedmmx_ret = true;
9199 return mode;
9202 gcc_unreachable ();
9206 return mode;
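/* Illustrative sketch (added for exposition; not part of the original file):
   a type declared as

     typedef int v4si __attribute__ ((vector_size (16)));

   has natural mode V4SImode here even when SSE is disabled and the middle-end
   has lowered it to TImode; in that case the function also warns (-Wpsabi)
   that the ABI differs from an SSE-enabled compilation.  */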
9209 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
9210 this may not agree with the mode that the type system has chosen for the
9211 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
9212 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
9214 static rtx
9215 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
9216 unsigned int regno)
9218 rtx tmp;
9220 if (orig_mode != BLKmode)
9221 tmp = gen_rtx_REG (orig_mode, regno);
9222 else
9224 tmp = gen_rtx_REG (mode, regno);
9225 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
9226 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
9229 return tmp;
9232 /* x86-64 register passing implementation.  See the x86-64 ABI for details.
9233 The goal of this code is to classify each eightbyte of an incoming argument
9234 by register class and assign registers accordingly.  */
9236 /* Return the union class of CLASS1 and CLASS2.
9237 See the x86-64 PS ABI for details. */
9239 static enum x86_64_reg_class
9240 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
9242 /* Rule #1: If both classes are equal, this is the resulting class. */
9243 if (class1 == class2)
9244 return class1;
9246 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
9247 the other class. */
9248 if (class1 == X86_64_NO_CLASS)
9249 return class2;
9250 if (class2 == X86_64_NO_CLASS)
9251 return class1;
9253 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
9254 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
9255 return X86_64_MEMORY_CLASS;
9257 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
9258 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
9259 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
9260 return X86_64_INTEGERSI_CLASS;
9261 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
9262 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
9263 return X86_64_INTEGER_CLASS;
9265 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
9266 MEMORY is used. */
9267 if (class1 == X86_64_X87_CLASS
9268 || class1 == X86_64_X87UP_CLASS
9269 || class1 == X86_64_COMPLEX_X87_CLASS
9270 || class2 == X86_64_X87_CLASS
9271 || class2 == X86_64_X87UP_CLASS
9272 || class2 == X86_64_COMPLEX_X87_CLASS)
9273 return X86_64_MEMORY_CLASS;
9275 /* Rule #6: Otherwise class SSE is used. */
9276 return X86_64_SSE_CLASS;
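/* Illustrative sketch (added for exposition; not part of the original file):
   merging X86_64_INTEGERSI_CLASS with X86_64_SSESF_CLASS yields
   X86_64_INTEGERSI_CLASS (the special case under rule #4 above), while
   merging X86_64_SSEDF_CLASS with X86_64_X87_CLASS yields
   X86_64_MEMORY_CLASS by rule #5.  */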
9279 /* Classify the argument of type TYPE and mode MODE.
9280 CLASSES will be filled by the register class used to pass each word
9281 of the operand. The number of words is returned. In case the parameter
9282 should be passed in memory, 0 is returned. As a special case for zero
9283 sized containers, classes[0] will be NO_CLASS and 1 is returned.
9285 BIT_OFFSET is used internally for handling records and specifies the
9286 offset in bits, taken modulo 512 to avoid overflow cases.
9288 See the x86-64 PS ABI for details.
9291 static int
9292 classify_argument (machine_mode mode, const_tree type,
9293 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
9295 HOST_WIDE_INT bytes =
9296 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
9297 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
9299 /* Variable sized entities are always passed/returned in memory. */
9300 if (bytes < 0)
9301 return 0;
9303 if (mode != VOIDmode
9304 && targetm.calls.must_pass_in_stack (mode, type))
9305 return 0;
9307 if (type && AGGREGATE_TYPE_P (type))
9309 int i;
9310 tree field;
9311 enum x86_64_reg_class subclasses[MAX_CLASSES];
9313 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
9314 if (bytes > 64)
9315 return 0;
9317 for (i = 0; i < words; i++)
9318 classes[i] = X86_64_NO_CLASS;
9320 /* Zero sized arrays or structures are NO_CLASS.  We return 0 to
9321 signal the memory class, so handle it as a special case.  */
9322 if (!words)
9324 classes[0] = X86_64_NO_CLASS;
9325 return 1;
9328 /* Classify each field of record and merge classes. */
9329 switch (TREE_CODE (type))
9331 case RECORD_TYPE:
9332 /* And now merge the fields of structure. */
9333 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
9335 if (TREE_CODE (field) == FIELD_DECL)
9337 int num;
9339 if (TREE_TYPE (field) == error_mark_node)
9340 continue;
9342 /* Bitfields are always classified as integer. Handle them
9343 early, since later code would consider them to be
9344 misaligned integers. */
9345 if (DECL_BIT_FIELD (field))
9347 for (i = (int_bit_position (field)
9348 + (bit_offset % 64)) / 8 / 8;
9349 i < ((int_bit_position (field) + (bit_offset % 64))
9350 + tree_to_shwi (DECL_SIZE (field))
9351 + 63) / 8 / 8; i++)
9352 classes[i] =
9353 merge_classes (X86_64_INTEGER_CLASS,
9354 classes[i]);
9356 else
9358 int pos;
9360 type = TREE_TYPE (field);
9362 /* Flexible array member is ignored. */
9363 if (TYPE_MODE (type) == BLKmode
9364 && TREE_CODE (type) == ARRAY_TYPE
9365 && TYPE_SIZE (type) == NULL_TREE
9366 && TYPE_DOMAIN (type) != NULL_TREE
9367 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
9368 == NULL_TREE))
9370 static bool warned;
9372 if (!warned && warn_psabi)
9374 warned = true;
9375 inform (input_location,
9376 "the ABI of passing struct with"
9377 " a flexible array member has"
9378 " changed in GCC 4.4");
9380 continue;
9382 num = classify_argument (TYPE_MODE (type), type,
9383 subclasses,
9384 (int_bit_position (field)
9385 + bit_offset) % 512);
9386 if (!num)
9387 return 0;
9388 pos = (int_bit_position (field)
9389 + (bit_offset % 64)) / 8 / 8;
9390 for (i = 0; i < num && (i + pos) < words; i++)
9391 classes[i + pos] =
9392 merge_classes (subclasses[i], classes[i + pos]);
9396 break;
9398 case ARRAY_TYPE:
9399 /* Arrays are handled as small records. */
9401 int num;
9402 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
9403 TREE_TYPE (type), subclasses, bit_offset);
9404 if (!num)
9405 return 0;
9407 /* The partial classes are now full classes. */
9408 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
9409 subclasses[0] = X86_64_SSE_CLASS;
9410 if (subclasses[0] == X86_64_INTEGERSI_CLASS
9411 && !((bit_offset % 64) == 0 && bytes == 4))
9412 subclasses[0] = X86_64_INTEGER_CLASS;
9414 for (i = 0; i < words; i++)
9415 classes[i] = subclasses[i % num];
9417 break;
9419 case UNION_TYPE:
9420 case QUAL_UNION_TYPE:
9421 /* Unions are similar to RECORD_TYPE but the offset is always 0.  */
9423 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
9425 if (TREE_CODE (field) == FIELD_DECL)
9427 int num;
9429 if (TREE_TYPE (field) == error_mark_node)
9430 continue;
9432 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
9433 TREE_TYPE (field), subclasses,
9434 bit_offset);
9435 if (!num)
9436 return 0;
9437 for (i = 0; i < num && i < words; i++)
9438 classes[i] = merge_classes (subclasses[i], classes[i]);
9441 break;
9443 default:
9444 gcc_unreachable ();
9447 if (words > 2)
9449 /* When size > 16 bytes, if the first class isn't
9450 X86_64_SSE_CLASS or any of the others isn't
9451 X86_64_SSEUP_CLASS, everything should be passed in
9452 memory.  */
9453 if (classes[0] != X86_64_SSE_CLASS)
9454 return 0;
9456 for (i = 1; i < words; i++)
9457 if (classes[i] != X86_64_SSEUP_CLASS)
9458 return 0;
9461 /* Final merger cleanup. */
9462 for (i = 0; i < words; i++)
9464 /* If one class is MEMORY, everything should be passed in
9465 memory. */
9466 if (classes[i] == X86_64_MEMORY_CLASS)
9467 return 0;
9469 /* The X86_64_SSEUP_CLASS should always be preceded by
9470 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS.  */
9471 if (classes[i] == X86_64_SSEUP_CLASS
9472 && classes[i - 1] != X86_64_SSE_CLASS
9473 && classes[i - 1] != X86_64_SSEUP_CLASS)
9475 /* The first one should never be X86_64_SSEUP_CLASS. */
9476 gcc_assert (i != 0);
9477 classes[i] = X86_64_SSE_CLASS;
9480 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
9481 everything should be passed in memory. */
9482 if (classes[i] == X86_64_X87UP_CLASS
9483 && (classes[i - 1] != X86_64_X87_CLASS))
9485 static bool warned;
9487 /* The first one should never be X86_64_X87UP_CLASS. */
9488 gcc_assert (i != 0);
9489 if (!warned && warn_psabi)
9491 warned = true;
9492 inform (input_location,
9493 "the ABI of passing union with long double"
9494 " has changed in GCC 4.4");
9496 return 0;
9499 return words;
9502 /* Compute the alignment needed.  We align all types to natural boundaries,
9503 with the exception of XFmode, which is aligned to 64 bits.  */
9504 if (mode != VOIDmode && mode != BLKmode)
9506 int mode_alignment = GET_MODE_BITSIZE (mode);
9508 if (mode == XFmode)
9509 mode_alignment = 128;
9510 else if (mode == XCmode)
9511 mode_alignment = 256;
9512 if (COMPLEX_MODE_P (mode))
9513 mode_alignment /= 2;
9514 /* Misaligned fields are always returned in memory. */
9515 if (bit_offset % mode_alignment)
9516 return 0;
9519 /* for V1xx modes, just use the base mode */
9520 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
9521 && GET_MODE_UNIT_SIZE (mode) == bytes)
9522 mode = GET_MODE_INNER (mode);
9524 /* Classification of atomic types. */
9525 switch (mode)
9527 case E_SDmode:
9528 case E_DDmode:
9529 classes[0] = X86_64_SSE_CLASS;
9530 return 1;
9531 case E_TDmode:
9532 classes[0] = X86_64_SSE_CLASS;
9533 classes[1] = X86_64_SSEUP_CLASS;
9534 return 2;
9535 case E_DImode:
9536 case E_SImode:
9537 case E_HImode:
9538 case E_QImode:
9539 case E_CSImode:
9540 case E_CHImode:
9541 case E_CQImode:
9543 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
9545 /* Analyze last 128 bits only. */
9546 size = (size - 1) & 0x7f;
9548 if (size < 32)
9550 classes[0] = X86_64_INTEGERSI_CLASS;
9551 return 1;
9553 else if (size < 64)
9555 classes[0] = X86_64_INTEGER_CLASS;
9556 return 1;
9558 else if (size < 64+32)
9560 classes[0] = X86_64_INTEGER_CLASS;
9561 classes[1] = X86_64_INTEGERSI_CLASS;
9562 return 2;
9564 else if (size < 64+64)
9566 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
9567 return 2;
9569 else
9570 gcc_unreachable ();
9572 case E_CDImode:
9573 case E_TImode:
9574 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
9575 return 2;
9576 case E_COImode:
9577 case E_OImode:
9578 /* OImode shouldn't be used directly. */
9579 gcc_unreachable ();
9580 case E_CTImode:
9581 return 0;
9582 case E_SFmode:
9583 if (!(bit_offset % 64))
9584 classes[0] = X86_64_SSESF_CLASS;
9585 else
9586 classes[0] = X86_64_SSE_CLASS;
9587 return 1;
9588 case E_DFmode:
9589 classes[0] = X86_64_SSEDF_CLASS;
9590 return 1;
9591 case E_XFmode:
9592 classes[0] = X86_64_X87_CLASS;
9593 classes[1] = X86_64_X87UP_CLASS;
9594 return 2;
9595 case E_TFmode:
9596 classes[0] = X86_64_SSE_CLASS;
9597 classes[1] = X86_64_SSEUP_CLASS;
9598 return 2;
9599 case E_SCmode:
9600 classes[0] = X86_64_SSE_CLASS;
9601 if (!(bit_offset % 64))
9602 return 1;
9603 else
9605 static bool warned;
9607 if (!warned && warn_psabi)
9609 warned = true;
9610 inform (input_location,
9611 "the ABI of passing structure with complex float"
9612 " member has changed in GCC 4.4");
9614 classes[1] = X86_64_SSESF_CLASS;
9615 return 2;
9617 case E_DCmode:
9618 classes[0] = X86_64_SSEDF_CLASS;
9619 classes[1] = X86_64_SSEDF_CLASS;
9620 return 2;
9621 case E_XCmode:
9622 classes[0] = X86_64_COMPLEX_X87_CLASS;
9623 return 1;
9624 case E_TCmode:
9625 /* This mode is larger than 16 bytes.  */
9626 return 0;
9627 case E_V8SFmode:
9628 case E_V8SImode:
9629 case E_V32QImode:
9630 case E_V16HImode:
9631 case E_V4DFmode:
9632 case E_V4DImode:
9633 classes[0] = X86_64_SSE_CLASS;
9634 classes[1] = X86_64_SSEUP_CLASS;
9635 classes[2] = X86_64_SSEUP_CLASS;
9636 classes[3] = X86_64_SSEUP_CLASS;
9637 return 4;
9638 case E_V8DFmode:
9639 case E_V16SFmode:
9640 case E_V8DImode:
9641 case E_V16SImode:
9642 case E_V32HImode:
9643 case E_V64QImode:
9644 classes[0] = X86_64_SSE_CLASS;
9645 classes[1] = X86_64_SSEUP_CLASS;
9646 classes[2] = X86_64_SSEUP_CLASS;
9647 classes[3] = X86_64_SSEUP_CLASS;
9648 classes[4] = X86_64_SSEUP_CLASS;
9649 classes[5] = X86_64_SSEUP_CLASS;
9650 classes[6] = X86_64_SSEUP_CLASS;
9651 classes[7] = X86_64_SSEUP_CLASS;
9652 return 8;
9653 case E_V4SFmode:
9654 case E_V4SImode:
9655 case E_V16QImode:
9656 case E_V8HImode:
9657 case E_V2DFmode:
9658 case E_V2DImode:
9659 classes[0] = X86_64_SSE_CLASS;
9660 classes[1] = X86_64_SSEUP_CLASS;
9661 return 2;
9662 case E_V1TImode:
9663 case E_V1DImode:
9664 case E_V2SFmode:
9665 case E_V2SImode:
9666 case E_V4HImode:
9667 case E_V8QImode:
9668 classes[0] = X86_64_SSE_CLASS;
9669 return 1;
9670 case E_BLKmode:
9671 case E_VOIDmode:
9672 return 0;
9673 default:
9674 gcc_assert (VECTOR_MODE_P (mode));
9676 if (bytes > 16)
9677 return 0;
9679 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
9681 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
9682 classes[0] = X86_64_INTEGERSI_CLASS;
9683 else
9684 classes[0] = X86_64_INTEGER_CLASS;
9685 classes[1] = X86_64_INTEGER_CLASS;
9686 return 1 + (bytes > 8);
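/* Illustrative sketch (added for exposition; not part of the original file):
   under the classification above a structure such as

     struct s { double d; long l; };

   occupies two eightbytes classified as X86_64_SSEDF_CLASS and
   X86_64_INTEGER_CLASS, so it can be passed in one SSE register and one
   integer register; an aggregate larger than 16 bytes that is not entirely
   SSE/SSEUP is passed in memory instead.  */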
9690 /* Examine the argument and set the number of registers required in each
9691 class.  Return true iff the parameter should be passed in memory.  */
9693 static bool
9694 examine_argument (machine_mode mode, const_tree type, int in_return,
9695 int *int_nregs, int *sse_nregs)
9697 enum x86_64_reg_class regclass[MAX_CLASSES];
9698 int n = classify_argument (mode, type, regclass, 0);
9700 *int_nregs = 0;
9701 *sse_nregs = 0;
9703 if (!n)
9704 return true;
9705 for (n--; n >= 0; n--)
9706 switch (regclass[n])
9708 case X86_64_INTEGER_CLASS:
9709 case X86_64_INTEGERSI_CLASS:
9710 (*int_nregs)++;
9711 break;
9712 case X86_64_SSE_CLASS:
9713 case X86_64_SSESF_CLASS:
9714 case X86_64_SSEDF_CLASS:
9715 (*sse_nregs)++;
9716 break;
9717 case X86_64_NO_CLASS:
9718 case X86_64_SSEUP_CLASS:
9719 break;
9720 case X86_64_X87_CLASS:
9721 case X86_64_X87UP_CLASS:
9722 case X86_64_COMPLEX_X87_CLASS:
9723 if (!in_return)
9724 return true;
9725 break;
9726 case X86_64_MEMORY_CLASS:
9727 gcc_unreachable ();
9730 return false;
9733 /* Construct container for the argument used by GCC interface. See
9734 FUNCTION_ARG for the detailed description. */
9736 static rtx
9737 construct_container (machine_mode mode, machine_mode orig_mode,
9738 const_tree type, int in_return, int nintregs, int nsseregs,
9739 const int *intreg, int sse_regno)
9741 /* The following variables hold the static issued_error state. */
9742 static bool issued_sse_arg_error;
9743 static bool issued_sse_ret_error;
9744 static bool issued_x87_ret_error;
9746 machine_mode tmpmode;
9747 int bytes =
9748 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
9749 enum x86_64_reg_class regclass[MAX_CLASSES];
9750 int n;
9751 int i;
9752 int nexps = 0;
9753 int needed_sseregs, needed_intregs;
9754 rtx exp[MAX_CLASSES];
9755 rtx ret;
9757 n = classify_argument (mode, type, regclass, 0);
9758 if (!n)
9759 return NULL;
9760 if (examine_argument (mode, type, in_return, &needed_intregs,
9761 &needed_sseregs))
9762 return NULL;
9763 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
9764 return NULL;
9766 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
9767 some less clueful developer tries to use floating-point anyway. */
9768 if (needed_sseregs && !TARGET_SSE)
9770 if (in_return)
9772 if (!issued_sse_ret_error)
9774 error ("SSE register return with SSE disabled");
9775 issued_sse_ret_error = true;
9778 else if (!issued_sse_arg_error)
9780 error ("SSE register argument with SSE disabled");
9781 issued_sse_arg_error = true;
9783 return NULL;
9786 /* Likewise, error if the ABI requires us to return values in the
9787 x87 registers and the user specified -mno-80387. */
9788 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
9789 for (i = 0; i < n; i++)
9790 if (regclass[i] == X86_64_X87_CLASS
9791 || regclass[i] == X86_64_X87UP_CLASS
9792 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
9794 if (!issued_x87_ret_error)
9796 error ("x87 register return with x87 disabled");
9797 issued_x87_ret_error = true;
9799 return NULL;
9802 /* First construct the simple cases.  Avoid SCmode, since we want to use
9803 a single register to pass this type.  */
9804 if (n == 1 && mode != SCmode)
9805 switch (regclass[0])
9807 case X86_64_INTEGER_CLASS:
9808 case X86_64_INTEGERSI_CLASS:
9809 return gen_rtx_REG (mode, intreg[0]);
9810 case X86_64_SSE_CLASS:
9811 case X86_64_SSESF_CLASS:
9812 case X86_64_SSEDF_CLASS:
9813 if (mode != BLKmode)
9814 return gen_reg_or_parallel (mode, orig_mode,
9815 SSE_REGNO (sse_regno));
9816 break;
9817 case X86_64_X87_CLASS:
9818 case X86_64_COMPLEX_X87_CLASS:
9819 return gen_rtx_REG (mode, FIRST_STACK_REG);
9820 case X86_64_NO_CLASS:
9821 /* Zero sized array, struct or class. */
9822 return NULL;
9823 default:
9824 gcc_unreachable ();
9826 if (n == 2
9827 && regclass[0] == X86_64_SSE_CLASS
9828 && regclass[1] == X86_64_SSEUP_CLASS
9829 && mode != BLKmode)
9830 return gen_reg_or_parallel (mode, orig_mode,
9831 SSE_REGNO (sse_regno));
9832 if (n == 4
9833 && regclass[0] == X86_64_SSE_CLASS
9834 && regclass[1] == X86_64_SSEUP_CLASS
9835 && regclass[2] == X86_64_SSEUP_CLASS
9836 && regclass[3] == X86_64_SSEUP_CLASS
9837 && mode != BLKmode)
9838 return gen_reg_or_parallel (mode, orig_mode,
9839 SSE_REGNO (sse_regno));
9840 if (n == 8
9841 && regclass[0] == X86_64_SSE_CLASS
9842 && regclass[1] == X86_64_SSEUP_CLASS
9843 && regclass[2] == X86_64_SSEUP_CLASS
9844 && regclass[3] == X86_64_SSEUP_CLASS
9845 && regclass[4] == X86_64_SSEUP_CLASS
9846 && regclass[5] == X86_64_SSEUP_CLASS
9847 && regclass[6] == X86_64_SSEUP_CLASS
9848 && regclass[7] == X86_64_SSEUP_CLASS
9849 && mode != BLKmode)
9850 return gen_reg_or_parallel (mode, orig_mode,
9851 SSE_REGNO (sse_regno));
9852 if (n == 2
9853 && regclass[0] == X86_64_X87_CLASS
9854 && regclass[1] == X86_64_X87UP_CLASS)
9855 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
9857 if (n == 2
9858 && regclass[0] == X86_64_INTEGER_CLASS
9859 && regclass[1] == X86_64_INTEGER_CLASS
9860 && (mode == CDImode || mode == TImode)
9861 && intreg[0] + 1 == intreg[1])
9862 return gen_rtx_REG (mode, intreg[0]);
9864 /* Otherwise figure out the entries of the PARALLEL. */
9865 for (i = 0; i < n; i++)
9867 int pos;
9869 switch (regclass[i])
9871 case X86_64_NO_CLASS:
9872 break;
9873 case X86_64_INTEGER_CLASS:
9874 case X86_64_INTEGERSI_CLASS:
9875 /* Merge TImodes on aligned occasions here too. */
9876 if (i * 8 + 8 > bytes)
9877 tmpmode
9878 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
9879 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
9880 tmpmode = SImode;
9881 else
9882 tmpmode = DImode;
9883 /* We've requested 24 bytes for which we
9884 don't have a mode.  Use DImode.  */
9885 if (tmpmode == BLKmode)
9886 tmpmode = DImode;
9887 exp [nexps++]
9888 = gen_rtx_EXPR_LIST (VOIDmode,
9889 gen_rtx_REG (tmpmode, *intreg),
9890 GEN_INT (i*8));
9891 intreg++;
9892 break;
9893 case X86_64_SSESF_CLASS:
9894 exp [nexps++]
9895 = gen_rtx_EXPR_LIST (VOIDmode,
9896 gen_rtx_REG (SFmode,
9897 SSE_REGNO (sse_regno)),
9898 GEN_INT (i*8));
9899 sse_regno++;
9900 break;
9901 case X86_64_SSEDF_CLASS:
9902 exp [nexps++]
9903 = gen_rtx_EXPR_LIST (VOIDmode,
9904 gen_rtx_REG (DFmode,
9905 SSE_REGNO (sse_regno)),
9906 GEN_INT (i*8));
9907 sse_regno++;
9908 break;
9909 case X86_64_SSE_CLASS:
9910 pos = i;
9911 switch (n)
9913 case 1:
9914 tmpmode = DImode;
9915 break;
9916 case 2:
9917 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
9919 tmpmode = TImode;
9920 i++;
9922 else
9923 tmpmode = DImode;
9924 break;
9925 case 4:
9926 gcc_assert (i == 0
9927 && regclass[1] == X86_64_SSEUP_CLASS
9928 && regclass[2] == X86_64_SSEUP_CLASS
9929 && regclass[3] == X86_64_SSEUP_CLASS);
9930 tmpmode = OImode;
9931 i += 3;
9932 break;
9933 case 8:
9934 gcc_assert (i == 0
9935 && regclass[1] == X86_64_SSEUP_CLASS
9936 && regclass[2] == X86_64_SSEUP_CLASS
9937 && regclass[3] == X86_64_SSEUP_CLASS
9938 && regclass[4] == X86_64_SSEUP_CLASS
9939 && regclass[5] == X86_64_SSEUP_CLASS
9940 && regclass[6] == X86_64_SSEUP_CLASS
9941 && regclass[7] == X86_64_SSEUP_CLASS);
9942 tmpmode = XImode;
9943 i += 7;
9944 break;
9945 default:
9946 gcc_unreachable ();
9948 exp [nexps++]
9949 = gen_rtx_EXPR_LIST (VOIDmode,
9950 gen_rtx_REG (tmpmode,
9951 SSE_REGNO (sse_regno)),
9952 GEN_INT (pos*8));
9953 sse_regno++;
9954 break;
9955 default:
9956 gcc_unreachable ();
9960 /* Empty aligned struct, union or class. */
9961 if (nexps == 0)
9962 return NULL;
9964 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
9965 for (i = 0; i < nexps; i++)
9966 XVECEXP (ret, 0, i) = exp [i];
9967 return ret;
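/* Illustrative sketch (added for exposition; not part of the original file):
   for the struct { double d; long l; } example above, construct_container
   builds a two-element PARALLEL along the lines of

     (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                (expr_list (reg:DI di)   (const_int 8))])

   recording which register holds which eightbyte of the argument.  */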
9970 /* Update the data in CUM to advance over an argument of mode MODE
9971 and data type TYPE.  (TYPE is null for libcalls where that information
9972 may not be available.)
9974 Return the number of integer registers advanced over.  */
9976 static int
9977 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
9978 const_tree type, HOST_WIDE_INT bytes,
9979 HOST_WIDE_INT words)
9981 int res = 0;
9982 bool error_p = false;
9984 if (TARGET_IAMCU)
9986 /* Intel MCU psABI passes scalars and aggregates no larger than 8
9987 bytes in registers. */
9988 if (!VECTOR_MODE_P (mode) && bytes <= 8)
9989 goto pass_in_reg;
9990 return res;
9993 switch (mode)
9995 default:
9996 break;
9998 case E_BLKmode:
9999 if (bytes < 0)
10000 break;
10001 /* FALLTHRU */
10003 case E_DImode:
10004 case E_SImode:
10005 case E_HImode:
10006 case E_QImode:
10007 pass_in_reg:
10008 cum->words += words;
10009 cum->nregs -= words;
10010 cum->regno += words;
10011 if (cum->nregs >= 0)
10012 res = words;
10013 if (cum->nregs <= 0)
10015 cum->nregs = 0;
10016 cfun->machine->arg_reg_available = false;
10017 cum->regno = 0;
10019 break;
10021 case E_OImode:
10022 /* OImode shouldn't be used directly. */
10023 gcc_unreachable ();
10025 case E_DFmode:
10026 if (cum->float_in_sse == -1)
10027 error_p = true;
10028 if (cum->float_in_sse < 2)
10029 break;
10030 /* FALLTHRU */
10031 case E_SFmode:
10032 if (cum->float_in_sse == -1)
10033 error_p = true;
10034 if (cum->float_in_sse < 1)
10035 break;
10036 /* FALLTHRU */
10038 case E_V8SFmode:
10039 case E_V8SImode:
10040 case E_V64QImode:
10041 case E_V32HImode:
10042 case E_V16SImode:
10043 case E_V8DImode:
10044 case E_V16SFmode:
10045 case E_V8DFmode:
10046 case E_V32QImode:
10047 case E_V16HImode:
10048 case E_V4DFmode:
10049 case E_V4DImode:
10050 case E_TImode:
10051 case E_V16QImode:
10052 case E_V8HImode:
10053 case E_V4SImode:
10054 case E_V2DImode:
10055 case E_V4SFmode:
10056 case E_V2DFmode:
10057 if (!type || !AGGREGATE_TYPE_P (type))
10059 cum->sse_words += words;
10060 cum->sse_nregs -= 1;
10061 cum->sse_regno += 1;
10062 if (cum->sse_nregs <= 0)
10064 cum->sse_nregs = 0;
10065 cum->sse_regno = 0;
10068 break;
10070 case E_V8QImode:
10071 case E_V4HImode:
10072 case E_V2SImode:
10073 case E_V2SFmode:
10074 case E_V1TImode:
10075 case E_V1DImode:
10076 if (!type || !AGGREGATE_TYPE_P (type))
10078 cum->mmx_words += words;
10079 cum->mmx_nregs -= 1;
10080 cum->mmx_regno += 1;
10081 if (cum->mmx_nregs <= 0)
10083 cum->mmx_nregs = 0;
10084 cum->mmx_regno = 0;
10087 break;
10089 if (error_p)
10091 cum->float_in_sse = 0;
10092 error ("calling %qD with SSE calling convention without "
10093 "SSE/SSE2 enabled", cum->decl);
10094 sorry ("this is a GCC bug that can be worked around by adding "
10095 "attribute used to function called");
10098 return res;
10101 static int
10102 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
10103 const_tree type, HOST_WIDE_INT words, bool named)
10105 int int_nregs, sse_nregs;
10107 /* Unnamed 512-bit and 256-bit vector mode parameters are passed on the stack.  */
10108 if (!named && (VALID_AVX512F_REG_MODE (mode)
10109 || VALID_AVX256_REG_MODE (mode)))
10110 return 0;
10112 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
10113 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
10115 cum->nregs -= int_nregs;
10116 cum->sse_nregs -= sse_nregs;
10117 cum->regno += int_nregs;
10118 cum->sse_regno += sse_nregs;
10119 return int_nregs;
10121 else
10123 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
10124 cum->words = ROUND_UP (cum->words, align);
10125 cum->words += words;
10126 return 0;
10130 static int
10131 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
10132 HOST_WIDE_INT words)
10134 /* Otherwise, this should be passed indirectly.  */
10135 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
10137 cum->words += words;
10138 if (cum->nregs > 0)
10140 cum->nregs -= 1;
10141 cum->regno += 1;
10142 return 1;
10144 return 0;
10147 /* Update the data in CUM to advance over an argument of mode MODE and
10148 data type TYPE. (TYPE is null for libcalls where that information
10149 may not be available.) */
10151 static void
10152 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
10153 const_tree type, bool named)
10155 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10156 HOST_WIDE_INT bytes, words;
10157 int nregs;
10159 /* The argument of interrupt handler is a special case and is
10160 handled in ix86_function_arg. */
10161 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
10162 return;
10164 if (mode == BLKmode)
10165 bytes = int_size_in_bytes (type);
10166 else
10167 bytes = GET_MODE_SIZE (mode);
10168 words = CEIL (bytes, UNITS_PER_WORD);
10170 if (type)
10171 mode = type_natural_mode (type, NULL, false);
10173 if ((type && POINTER_BOUNDS_TYPE_P (type))
10174 || POINTER_BOUNDS_MODE_P (mode))
10176 /* If we pass bounds in BT then just update the remaining bounds count.  */
10177 if (cum->bnds_in_bt)
10179 cum->bnds_in_bt--;
10180 return;
10183 /* Update the remaining number of bounds to force.  */
10184 if (cum->force_bnd_pass)
10185 cum->force_bnd_pass--;
10187 cum->bnd_regno++;
10189 return;
10192 /* The first arg not going to Bounds Tables resets this counter. */
10193 cum->bnds_in_bt = 0;
10194 /* For unnamed args we always pass bounds to avoid a bounds mess when
10195 the passed and received types do not match.  If bounds do not follow an
10196 unnamed arg, still pretend the required number of bounds was passed.  */
10197 if (cum->force_bnd_pass)
10199 cum->bnd_regno += cum->force_bnd_pass;
10200 cum->force_bnd_pass = 0;
10203 if (TARGET_64BIT)
10205 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10207 if (call_abi == MS_ABI)
10208 nregs = function_arg_advance_ms_64 (cum, bytes, words);
10209 else
10210 nregs = function_arg_advance_64 (cum, mode, type, words, named);
10212 else
10213 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
10215 /* For stdarg we expect bounds to be passed for each value passed
10216 in register. */
10217 if (cum->stdarg)
10218 cum->force_bnd_pass = nregs;
10219 /* For pointers passed in memory we expect bounds passed in Bounds
10220 Table. */
10221 if (!nregs)
10223 /* Track if there are outgoing arguments on stack. */
10224 if (cum->caller)
10225 cfun->machine->outgoing_args_on_stack = true;
10227 cum->bnds_in_bt = chkp_type_bounds_count (type);
10231 /* Define where to put the arguments to a function.
10232 Value is zero to push the argument on the stack,
10233 or a hard register in which to store the argument.
10235 MODE is the argument's machine mode.
10236 TYPE is the data type of the argument (as a tree).
10237 This is null for libcalls where that information may
10238 not be available.
10239 CUM is a variable of type CUMULATIVE_ARGS which gives info about
10240 the preceding args and about the function being called.
10241 NAMED is nonzero if this argument is a named parameter
10242 (otherwise it is an extra parameter matching an ellipsis). */
10244 static rtx
10245 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
10246 machine_mode orig_mode, const_tree type,
10247 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
10249 bool error_p = false;
10251 /* Avoid the AL settings for the Unix64 ABI. */
10252 if (mode == VOIDmode)
10253 return constm1_rtx;
10255 if (TARGET_IAMCU)
10257 /* Intel MCU psABI passes scalars and aggregates no larger than 8
10258 bytes in registers. */
10259 if (!VECTOR_MODE_P (mode) && bytes <= 8)
10260 goto pass_in_reg;
10261 return NULL_RTX;
10264 switch (mode)
10266 default:
10267 break;
10269 case E_BLKmode:
10270 if (bytes < 0)
10271 break;
10272 /* FALLTHRU */
10273 case E_DImode:
10274 case E_SImode:
10275 case E_HImode:
10276 case E_QImode:
10277 pass_in_reg:
10278 if (words <= cum->nregs)
10280 int regno = cum->regno;
10282 /* Fastcall allocates the first two DWORD (SImode) or
10283 smaller arguments to ECX and EDX if they aren't
10284 aggregate types.  */
10285 if (cum->fastcall)
10287 if (mode == BLKmode
10288 || mode == DImode
10289 || (type && AGGREGATE_TYPE_P (type)))
10290 break;
10292 /* ECX, not EAX, is the first allocated register.  */
10293 if (regno == AX_REG)
10294 regno = CX_REG;
10296 return gen_rtx_REG (mode, regno);
10298 break;
10300 case E_DFmode:
10301 if (cum->float_in_sse == -1)
10302 error_p = true;
10303 if (cum->float_in_sse < 2)
10304 break;
10305 /* FALLTHRU */
10306 case E_SFmode:
10307 if (cum->float_in_sse == -1)
10308 error_p = true;
10309 if (cum->float_in_sse < 1)
10310 break;
10311 /* FALLTHRU */
10312 case E_TImode:
10313 /* In 32-bit mode, we pass TImode in xmm registers.  */
10314 case E_V16QImode:
10315 case E_V8HImode:
10316 case E_V4SImode:
10317 case E_V2DImode:
10318 case E_V4SFmode:
10319 case E_V2DFmode:
10320 if (!type || !AGGREGATE_TYPE_P (type))
10322 if (cum->sse_nregs)
10323 return gen_reg_or_parallel (mode, orig_mode,
10324 cum->sse_regno + FIRST_SSE_REG);
10326 break;
10328 case E_OImode:
10329 case E_XImode:
10330 /* OImode and XImode shouldn't be used directly. */
10331 gcc_unreachable ();
10333 case E_V64QImode:
10334 case E_V32HImode:
10335 case E_V16SImode:
10336 case E_V8DImode:
10337 case E_V16SFmode:
10338 case E_V8DFmode:
10339 case E_V8SFmode:
10340 case E_V8SImode:
10341 case E_V32QImode:
10342 case E_V16HImode:
10343 case E_V4DFmode:
10344 case E_V4DImode:
10345 if (!type || !AGGREGATE_TYPE_P (type))
10347 if (cum->sse_nregs)
10348 return gen_reg_or_parallel (mode, orig_mode,
10349 cum->sse_regno + FIRST_SSE_REG);
10351 break;
10353 case E_V8QImode:
10354 case E_V4HImode:
10355 case E_V2SImode:
10356 case E_V2SFmode:
10357 case E_V1TImode:
10358 case E_V1DImode:
10359 if (!type || !AGGREGATE_TYPE_P (type))
10361 if (cum->mmx_nregs)
10362 return gen_reg_or_parallel (mode, orig_mode,
10363 cum->mmx_regno + FIRST_MMX_REG);
10365 break;
10367 if (error_p)
10369 cum->float_in_sse = 0;
10370 error ("calling %qD with SSE calling convention without "
10371 "SSE/SSE2 enabled", cum->decl);
10372 sorry ("this is a GCC bug that can be worked around by adding "
10373 "attribute used to function called");
10376 return NULL_RTX;
10379 static rtx
10380 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
10381 machine_mode orig_mode, const_tree type, bool named)
10383 /* Handle a hidden AL argument containing the number of SSE registers
10384 used, for varargs x86-64 functions.  */
10385 if (mode == VOIDmode)
10386 return GEN_INT (cum->maybe_vaarg
10387 ? (cum->sse_nregs < 0
10388 ? X86_64_SSE_REGPARM_MAX
10389 : cum->sse_regno)
10390 : -1);
10392 switch (mode)
10394 default:
10395 break;
10397 case E_V8SFmode:
10398 case E_V8SImode:
10399 case E_V32QImode:
10400 case E_V16HImode:
10401 case E_V4DFmode:
10402 case E_V4DImode:
10403 case E_V16SFmode:
10404 case E_V16SImode:
10405 case E_V64QImode:
10406 case E_V32HImode:
10407 case E_V8DFmode:
10408 case E_V8DImode:
10409 /* Unnamed 256-bit and 512-bit vector mode parameters are passed on the stack.  */
10410 if (!named)
10411 return NULL;
10412 break;
10415 return construct_container (mode, orig_mode, type, 0, cum->nregs,
10416 cum->sse_nregs,
10417 &x86_64_int_parameter_registers [cum->regno],
10418 cum->sse_regno);
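/* Illustrative note (added for exposition; not part of the original file):
   the VOIDmode case above implements the hidden %al argument of the SysV
   x86-64 varargs convention; e.g. for printf ("%f", x) the caller sets %al
   to 1, an upper bound on the number of vector registers used by the call.  */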
10421 static rtx
10422 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
10423 machine_mode orig_mode, bool named,
10424 HOST_WIDE_INT bytes)
10426 unsigned int regno;
10428 /* We need to add a clobber for MS_ABI -> SYSV ABI calls in expand_call.
10429 We use the value -2 to specify that the current function call is MS ABI.  */
10430 if (mode == VOIDmode)
10431 return GEN_INT (-2);
10433 /* If we've run out of registers, it goes on the stack. */
10434 if (cum->nregs == 0)
10435 return NULL_RTX;
10437 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
10439 /* Only floating point modes are passed in anything but integer regs. */
10440 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
10442 if (named)
10443 regno = cum->regno + FIRST_SSE_REG;
10444 else
10446 rtx t1, t2;
10448 /* Unnamed floating parameters are passed in both the
10449 SSE and integer registers. */
10450 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
10451 t2 = gen_rtx_REG (mode, regno);
10452 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
10453 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
10454 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
10457 /* Handle aggregate types passed in registers. */
10458 if (orig_mode == BLKmode)
10460 if (bytes > 0 && bytes <= 8)
10461 mode = (bytes > 4 ? DImode : SImode);
10462 if (mode == BLKmode)
10463 mode = DImode;
10466 return gen_reg_or_parallel (mode, orig_mode, regno);
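/* For illustration only (assumed example, not from this file): with the
   Microsoft x64 convention handled above, the first four slots map to
   %rcx/%rdx/%r8/%r9, or to %xmm0-%xmm3 for named SFmode/DFmode values,
   chosen by slot position rather than by type.  So for

     struct s8 { char c[8]; };
     void f (int a, double b, struct s8 s);

   A goes in %ecx, B in %xmm1, and the 8-byte aggregate S in %r8 as
   DImode, matching the BLKmode remapping to SImode/DImode done above.  */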
10469 /* Return where to put the arguments to a function.
10470 Return zero to push the argument on the stack, or a hard register in which to store the argument.
10472 MODE is the argument's machine mode. TYPE is the data type of the
10473 argument. It is null for libcalls where that information may not be
10474 available. CUM gives information about the preceding args and about
10475 the function being called. NAMED is nonzero if this argument is a
10476 named parameter (otherwise it is an extra parameter matching an
10477 ellipsis). */
10479 static rtx
10480 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
10481 const_tree type, bool named)
10483 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10484 machine_mode mode = omode;
10485 HOST_WIDE_INT bytes, words;
10486 rtx arg;
10488 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
10490 gcc_assert (type != NULL_TREE);
10491 if (POINTER_TYPE_P (type))
10493 /* This is the pointer argument. */
10494 gcc_assert (TYPE_MODE (type) == Pmode);
10495 /* It is at -WORD(AP) in the current frame in interrupt and
10496 exception handlers. */
10497 arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD);
10499 else
10501 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
10502 && TREE_CODE (type) == INTEGER_TYPE
10503 && TYPE_MODE (type) == word_mode);
10504 /* The error code is the word-mode integer argument at
10505 -2 * WORD(AP) in the current frame of the exception
10506 handler. */
10507 arg = gen_rtx_MEM (word_mode,
10508 plus_constant (Pmode,
10509 arg_pointer_rtx,
10510 -2 * UNITS_PER_WORD));
10512 return arg;
10515 /* All pointer bounds arguments are handled separately here. */
10516 if ((type && POINTER_BOUNDS_TYPE_P (type))
10517 || POINTER_BOUNDS_MODE_P (mode))
10519 /* Return NULL if bounds are forced to go in Bounds Table. */
10520 if (cum->bnds_in_bt)
10521 arg = NULL;
10522 /* Return the next available bound reg if any. */
10523 else if (cum->bnd_regno <= LAST_BND_REG)
10524 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
10525 /* Return the next special slot number otherwise. */
10526 else
10527 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
10529 return arg;
10532 if (mode == BLKmode)
10533 bytes = int_size_in_bytes (type);
10534 else
10535 bytes = GET_MODE_SIZE (mode);
10536 words = CEIL (bytes, UNITS_PER_WORD);
10538 /* To simplify the code below, represent vector types with a vector mode
10539 even if MMX/SSE are not active. */
10540 if (type && TREE_CODE (type) == VECTOR_TYPE)
10541 mode = type_natural_mode (type, cum, false);
10543 if (TARGET_64BIT)
10545 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10547 if (call_abi == MS_ABI)
10548 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
10549 else
10550 arg = function_arg_64 (cum, mode, omode, type, named);
10552 else
10553 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
10555 /* Track if there are outgoing arguments on stack. */
10556 if (arg == NULL_RTX && cum->caller)
10557 cfun->machine->outgoing_args_on_stack = true;
10559 return arg;
10562 /* A C expression that indicates when an argument must be passed by
10563 reference. If nonzero for an argument, a copy of that argument is
10564 made in memory and a pointer to the argument is passed instead of
10565 the argument itself. The pointer is passed in whatever way is
10566 appropriate for passing a pointer to that type. */
10568 static bool
10569 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
10570 const_tree type, bool)
10572 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10574 /* Bounds are never passed by reference. */
10575 if ((type && POINTER_BOUNDS_TYPE_P (type))
10576 || POINTER_BOUNDS_MODE_P (mode))
10577 return false;
10579 if (TARGET_64BIT)
10581 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10583 /* See Windows x64 Software Convention. */
10584 if (call_abi == MS_ABI)
10586 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
10588 if (type)
10590 /* Arrays are passed by reference. */
10591 if (TREE_CODE (type) == ARRAY_TYPE)
10592 return true;
10594 if (RECORD_OR_UNION_TYPE_P (type))
10596 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
10597 are passed by reference. */
10598 msize = int_size_in_bytes (type);
10602 /* __m128 is passed by reference. */
10603 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
10605 else if (type && int_size_in_bytes (type) == -1)
10606 return true;
10609 return false;
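/* For illustration only (assumed examples): under the MS_ABI branch above

     struct { int a, b; }      ->  8 bytes, passed by value
     struct { int a, b, c; }   -> 12 bytes, passed by reference
     __m128                    -> 16 bytes, passed by reference

   whereas on the SysV side only variably-sized types (int_size_in_bytes
   returning -1) are forced by reference.  */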
10612 /* Return true when TYPE should be 128bit aligned for 32bit argument
10613 passing ABI. XXX: This function is obsolete and is only used for
10614 checking psABI compatibility with previous versions of GCC. */
10616 static bool
10617 ix86_compat_aligned_value_p (const_tree type)
10619 machine_mode mode = TYPE_MODE (type);
10620 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
10621 || mode == TDmode
10622 || mode == TFmode
10623 || mode == TCmode)
10624 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
10625 return true;
10626 if (TYPE_ALIGN (type) < 128)
10627 return false;
10629 if (AGGREGATE_TYPE_P (type))
10631 /* Walk the aggregates recursively. */
10632 switch (TREE_CODE (type))
10634 case RECORD_TYPE:
10635 case UNION_TYPE:
10636 case QUAL_UNION_TYPE:
10638 tree field;
10640 /* Walk all the structure fields. */
10641 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
10643 if (TREE_CODE (field) == FIELD_DECL
10644 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
10645 return true;
10647 break;
10650 case ARRAY_TYPE:
10651 /* Just in case some language passes arrays by value. */
10652 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
10653 return true;
10654 break;
10656 default:
10657 gcc_unreachable ();
10660 return false;
10663 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
10664 XXX: This function is obsolete and is only used for checking psABI
10665 compatibility with previous versions of GCC. */
10667 static unsigned int
10668 ix86_compat_function_arg_boundary (machine_mode mode,
10669 const_tree type, unsigned int align)
10671 /* In 32bit, only _Decimal128 and __float128 are aligned to their
10672 natural boundaries. */
10673 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
10675 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
10676 make an exception for SSE modes since these require 128bit
10677 alignment.
10679 The handling here differs from field_alignment. ICC aligns MMX
10680 arguments to 4 byte boundaries, while structure fields are aligned
10681 to 8 byte boundaries. */
10682 if (!type)
10684 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
10685 align = PARM_BOUNDARY;
10687 else
10689 if (!ix86_compat_aligned_value_p (type))
10690 align = PARM_BOUNDARY;
10693 if (align > BIGGEST_ALIGNMENT)
10694 align = BIGGEST_ALIGNMENT;
10695 return align;
10698 /* Return true when TYPE should be 128bit aligned for 32bit argument
10699 passing ABI. */
10701 static bool
10702 ix86_contains_aligned_value_p (const_tree type)
10704 machine_mode mode = TYPE_MODE (type);
10706 if (mode == XFmode || mode == XCmode)
10707 return false;
10709 if (TYPE_ALIGN (type) < 128)
10710 return false;
10712 if (AGGREGATE_TYPE_P (type))
10714 /* Walk the aggregates recursively. */
10715 switch (TREE_CODE (type))
10717 case RECORD_TYPE:
10718 case UNION_TYPE:
10719 case QUAL_UNION_TYPE:
10721 tree field;
10723 /* Walk all the structure fields. */
10724 for (field = TYPE_FIELDS (type);
10725 field;
10726 field = DECL_CHAIN (field))
10728 if (TREE_CODE (field) == FIELD_DECL
10729 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
10730 return true;
10732 break;
10735 case ARRAY_TYPE:
10736 /* Just in case some language passes arrays by value. */
10737 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
10738 return true;
10739 break;
10741 default:
10742 gcc_unreachable ();
10745 else
10746 return TYPE_ALIGN (type) >= 128;
10748 return false;
10751 /* Gives the alignment boundary, in bits, of an argument with the
10752 specified mode and type. */
10754 static unsigned int
10755 ix86_function_arg_boundary (machine_mode mode, const_tree type)
10757 unsigned int align;
10758 if (type)
10760 /* Since the main variant type is used for the call, convert the
10761 passed type to its main variant. */
10762 type = TYPE_MAIN_VARIANT (type);
10763 align = TYPE_ALIGN (type);
10765 else
10766 align = GET_MODE_ALIGNMENT (mode);
10767 if (align < PARM_BOUNDARY)
10768 align = PARM_BOUNDARY;
10769 else
10771 static bool warned;
10772 unsigned int saved_align = align;
10774 if (!TARGET_64BIT)
10776 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
10777 if (!type)
10779 if (mode == XFmode || mode == XCmode)
10780 align = PARM_BOUNDARY;
10782 else if (!ix86_contains_aligned_value_p (type))
10783 align = PARM_BOUNDARY;
10785 if (align < 128)
10786 align = PARM_BOUNDARY;
10789 if (warn_psabi
10790 && !warned
10791 && align != ix86_compat_function_arg_boundary (mode, type,
10792 saved_align))
10794 warned = true;
10795 inform (input_location,
10796 "The ABI for passing parameters with %d-byte"
10797 " alignment has changed in GCC 4.6",
10798 align / BITS_PER_UNIT);
10802 return align;
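/* For illustration only (assumed 32-bit examples): an "int" or "double"
   argument ends up with PARM_BOUNDARY (32-bit) alignment, while an
   __m128 argument, or an aggregate containing one, is detected by
   ix86_contains_aligned_value_p and bumped to 128 bits; the warn_psabi
   note above fires when that answer differs from the pre-GCC 4.6 one.  */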
10805 /* Return true if N is a possible register number of function value. */
10807 static bool
10808 ix86_function_value_regno_p (const unsigned int regno)
10810 switch (regno)
10812 case AX_REG:
10813 return true;
10814 case DX_REG:
10815 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
10816 case DI_REG:
10817 case SI_REG:
10818 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
10820 case BND0_REG:
10821 case BND1_REG:
10822 return chkp_function_instrumented_p (current_function_decl);
10824 /* Complex values are returned in %st(0)/%st(1) pair. */
10825 case ST0_REG:
10826 case ST1_REG:
10827 /* TODO: The function should depend on current function ABI but
10828 builtins.c would need updating then. Therefore we use the
10829 default ABI. */
10830 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
10831 return false;
10832 return TARGET_FLOAT_RETURNS_IN_80387;
10834 /* Complex values are returned in %xmm0/%xmm1 pair. */
10835 case XMM0_REG:
10836 case XMM1_REG:
10837 return TARGET_SSE;
10839 case MM0_REG:
10840 if (TARGET_MACHO || TARGET_64BIT)
10841 return false;
10842 return TARGET_MMX;
10845 return false;
10848 /* Define how to find the value returned by a function.
10849 VALTYPE is the data type of the value (as a tree).
10850 If the precise function being called is known, FUNC is its FUNCTION_DECL;
10851 otherwise, FUNC is 0. */
10853 static rtx
10854 function_value_32 (machine_mode orig_mode, machine_mode mode,
10855 const_tree fntype, const_tree fn)
10857 unsigned int regno;
10859 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
10860 we normally prevent this case when mmx is not available. However
10861 some ABIs may require the result to be returned like DImode. */
10862 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
10863 regno = FIRST_MMX_REG;
10865 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
10866 we prevent this case when sse is not available. However some ABIs
10867 may require the result to be returned like integer TImode. */
10868 else if (mode == TImode
10869 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
10870 regno = FIRST_SSE_REG;
10872 /* 32-byte vector modes in %ymm0. */
10873 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
10874 regno = FIRST_SSE_REG;
10876 /* 64-byte vector modes in %zmm0. */
10877 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
10878 regno = FIRST_SSE_REG;
10880 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
10881 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
10882 regno = FIRST_FLOAT_REG;
10883 else
10884 /* Most things go in %eax. */
10885 regno = AX_REG;
10887 /* Override FP return register with %xmm0 for local functions when
10888 SSE math is enabled or for functions with sseregparm attribute. */
10889 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
10891 int sse_level = ix86_function_sseregparm (fntype, fn, false);
10892 if (sse_level == -1)
10894 error ("calling %qD with SSE calling convention without "
10895 "SSE/SSE2 enabled", fn);
10896 sorry ("this is a GCC bug that can be worked around by adding "
10897 "attribute used to function called");
10899 else if ((sse_level >= 1 && mode == SFmode)
10900 || (sse_level == 2 && mode == DFmode))
10901 regno = FIRST_SSE_REG;
10904 /* OImode shouldn't be used directly. */
10905 gcc_assert (mode != OImode);
10907 return gen_rtx_REG (orig_mode, regno);
10910 static rtx
10911 function_value_64 (machine_mode orig_mode, machine_mode mode,
10912 const_tree valtype)
10914 rtx ret;
10916 /* Handle libcalls, which don't provide a type node. */
10917 if (valtype == NULL)
10919 unsigned int regno;
10921 switch (mode)
10923 case E_SFmode:
10924 case E_SCmode:
10925 case E_DFmode:
10926 case E_DCmode:
10927 case E_TFmode:
10928 case E_SDmode:
10929 case E_DDmode:
10930 case E_TDmode:
10931 regno = FIRST_SSE_REG;
10932 break;
10933 case E_XFmode:
10934 case E_XCmode:
10935 regno = FIRST_FLOAT_REG;
10936 break;
10937 case E_TCmode:
10938 return NULL;
10939 default:
10940 regno = AX_REG;
10943 return gen_rtx_REG (mode, regno);
10945 else if (POINTER_TYPE_P (valtype))
10947 /* Pointers are always returned in word_mode. */
10948 mode = word_mode;
10951 ret = construct_container (mode, orig_mode, valtype, 1,
10952 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
10953 x86_64_int_return_registers, 0);
10955 /* For zero-sized structures, construct_container returns NULL, but we
10956 need to keep the rest of the compiler happy by returning a meaningful value. */
10957 if (!ret)
10958 ret = gen_rtx_REG (orig_mode, AX_REG);
10960 return ret;
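/* For illustration only (standard SysV x86-64 psABI examples, nothing new):

     long                   -> %rax
     double                 -> %xmm0
     _Complex double        -> %xmm0/%xmm1
     long double (XFmode)   -> %st(0)
     struct { long a, b; }  -> %rax/%rdx pair via construct_container

   and a zero-sized aggregate falls back to the dummy %rax value built
   above.  */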
10963 static rtx
10964 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
10965 const_tree valtype)
10967 unsigned int regno = AX_REG;
10969 if (TARGET_SSE)
10971 switch (GET_MODE_SIZE (mode))
10973 case 16:
10974 if (valtype != NULL_TREE
10975 && !VECTOR_INTEGER_TYPE_P (valtype)
10977 && !INTEGRAL_TYPE_P (valtype)
10978 && !VECTOR_FLOAT_TYPE_P (valtype))
10979 break;
10980 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
10981 && !COMPLEX_MODE_P (mode))
10982 regno = FIRST_SSE_REG;
10983 break;
10984 case 8:
10985 case 4:
10986 if (mode == SFmode || mode == DFmode)
10987 regno = FIRST_SSE_REG;
10988 break;
10989 default:
10990 break;
10993 return gen_rtx_REG (orig_mode, regno);
10996 static rtx
10997 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
10998 machine_mode orig_mode, machine_mode mode)
11000 const_tree fn, fntype;
11002 fn = NULL_TREE;
11003 if (fntype_or_decl && DECL_P (fntype_or_decl))
11004 fn = fntype_or_decl;
11005 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
11007 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
11008 || POINTER_BOUNDS_MODE_P (mode))
11009 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
11010 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
11011 return function_value_ms_64 (orig_mode, mode, valtype);
11012 else if (TARGET_64BIT)
11013 return function_value_64 (orig_mode, mode, valtype);
11014 else
11015 return function_value_32 (orig_mode, mode, fntype, fn);
11018 static rtx
11019 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
11021 machine_mode mode, orig_mode;
11023 orig_mode = TYPE_MODE (valtype);
11024 mode = type_natural_mode (valtype, NULL, true);
11025 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
11028 /* Return an RTX representing a place where a function returns
11029 or receives pointer bounds, or NULL if no bounds are returned.
11031 VALTYPE is a data type of a value returned by the function.
11033 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
11034 or FUNCTION_TYPE of the function.
11036 If OUTGOING is false, return a place in which the caller will
11037 see the return value. Otherwise, return a place where a
11038 function returns a value. */
11040 static rtx
11041 ix86_function_value_bounds (const_tree valtype,
11042 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
11043 bool outgoing ATTRIBUTE_UNUSED)
11045 rtx res = NULL_RTX;
11047 if (BOUNDED_TYPE_P (valtype))
11048 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
11049 else if (chkp_type_has_pointer (valtype))
11051 bitmap slots;
11052 rtx bounds[2];
11053 bitmap_iterator bi;
11054 unsigned i, bnd_no = 0;
11056 bitmap_obstack_initialize (NULL);
11057 slots = BITMAP_ALLOC (NULL);
11058 chkp_find_bound_slots (valtype, slots);
11060 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
11062 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
11063 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
11064 gcc_assert (bnd_no < 2);
11065 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
11068 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
11070 BITMAP_FREE (slots);
11071 bitmap_obstack_release (NULL);
11073 else
11074 res = NULL_RTX;
11076 return res;
11079 /* Pointer function arguments and return values are promoted to
11080 word_mode for normal functions. */
11082 static machine_mode
11083 ix86_promote_function_mode (const_tree type, machine_mode mode,
11084 int *punsignedp, const_tree fntype,
11085 int for_return)
11087 if (cfun->machine->func_type == TYPE_NORMAL
11088 && type != NULL_TREE
11089 && POINTER_TYPE_P (type))
11091 *punsignedp = POINTERS_EXTEND_UNSIGNED;
11092 return word_mode;
11094 return default_promote_function_mode (type, mode, punsignedp, fntype,
11095 for_return);
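/* For illustration only (assumed example): with the hook above, a
   parameter declared as "char *p" is promoted from ptr_mode to
   word_mode, so on x32 a 32-bit pointer argument is zero-extended
   (POINTERS_EXTEND_UNSIGNED) to 64 bits; on plain -m64, ptr_mode already
   equals word_mode and nothing changes.  */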
11098 /* Return true if a structure, union or array with MODE containing FIELD
11099 should be accessed using BLKmode. */
11101 static bool
11102 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
11104 /* Union with XFmode must be in BLKmode. */
11105 return (mode == XFmode
11106 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
11107 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
11110 rtx
11111 ix86_libcall_value (machine_mode mode)
11113 return ix86_function_value_1 (NULL, NULL, mode, mode);
11116 /* Return true iff type is returned in memory. */
11118 static bool
11119 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
11121 #ifdef SUBTARGET_RETURN_IN_MEMORY
11122 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
11123 #else
11124 const machine_mode mode = type_natural_mode (type, NULL, true);
11125 HOST_WIDE_INT size;
11127 if (POINTER_BOUNDS_TYPE_P (type))
11128 return false;
11130 if (TARGET_64BIT)
11132 if (ix86_function_type_abi (fntype) == MS_ABI)
11134 size = int_size_in_bytes (type);
11136 /* __m128 is returned in xmm0. */
11137 if ((!type || VECTOR_INTEGER_TYPE_P (type)
11138 || INTEGRAL_TYPE_P (type)
11139 || VECTOR_FLOAT_TYPE_P (type))
11140 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
11141 && !COMPLEX_MODE_P (mode)
11142 && (GET_MODE_SIZE (mode) == 16 || size == 16))
11143 return false;
11145 /* Otherwise, the size must be exactly in [1248]. */
11146 return size != 1 && size != 2 && size != 4 && size != 8;
11148 else
11150 int needed_intregs, needed_sseregs;
11152 return examine_argument (mode, type, 1,
11153 &needed_intregs, &needed_sseregs);
11156 else
11158 size = int_size_in_bytes (type);
11160 /* Intel MCU psABI returns scalars and aggregates no larger than 8
11161 bytes in registers. */
11162 if (TARGET_IAMCU)
11163 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
11165 if (mode == BLKmode)
11166 return true;
11168 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
11169 return false;
11171 if (VECTOR_MODE_P (mode) || mode == TImode)
11173 /* User-created vectors small enough to fit in EAX. */
11174 if (size < 8)
11175 return false;
11177 /* Unless the ABI prescribes otherwise,
11178 MMX/3dNow values are returned in MM0 if available. */
11180 if (size == 8)
11181 return TARGET_VECT8_RETURNS || !TARGET_MMX;
11183 /* SSE values are returned in XMM0 if available. */
11184 if (size == 16)
11185 return !TARGET_SSE;
11187 /* AVX values are returned in YMM0 if available. */
11188 if (size == 32)
11189 return !TARGET_AVX;
11191 /* AVX512F values are returned in ZMM0 if available. */
11192 if (size == 64)
11193 return !TARGET_AVX512F;
11196 if (mode == XFmode)
11197 return false;
11199 if (size > 12)
11200 return true;
11202 /* OImode shouldn't be used directly. */
11203 gcc_assert (mode != OImode);
11205 return false;
11207 #endif
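/* For illustration only (assumed examples of the rules above):

     x86-64 MS ABI:  struct { int a, b, c; } (12 bytes) -> in memory
                     struct { long a; }      (8 bytes)  -> in %rax
     x86-64 SysV:    a 32-byte struct of four doubles is classified as
                     MEMORY, so the examine_argument check above sends
                     it to memory
     ia32 with SSE:  a 16-byte vector is returned in %xmm0, not memory  */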
11211 /* Create the va_list data type. */
11213 static tree
11214 ix86_build_builtin_va_list_64 (void)
11216 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
11218 record = lang_hooks.types.make_type (RECORD_TYPE);
11219 type_decl = build_decl (BUILTINS_LOCATION,
11220 TYPE_DECL, get_identifier ("__va_list_tag"), record);
11222 f_gpr = build_decl (BUILTINS_LOCATION,
11223 FIELD_DECL, get_identifier ("gp_offset"),
11224 unsigned_type_node);
11225 f_fpr = build_decl (BUILTINS_LOCATION,
11226 FIELD_DECL, get_identifier ("fp_offset"),
11227 unsigned_type_node);
11228 f_ovf = build_decl (BUILTINS_LOCATION,
11229 FIELD_DECL, get_identifier ("overflow_arg_area"),
11230 ptr_type_node);
11231 f_sav = build_decl (BUILTINS_LOCATION,
11232 FIELD_DECL, get_identifier ("reg_save_area"),
11233 ptr_type_node);
11235 va_list_gpr_counter_field = f_gpr;
11236 va_list_fpr_counter_field = f_fpr;
11238 DECL_FIELD_CONTEXT (f_gpr) = record;
11239 DECL_FIELD_CONTEXT (f_fpr) = record;
11240 DECL_FIELD_CONTEXT (f_ovf) = record;
11241 DECL_FIELD_CONTEXT (f_sav) = record;
11243 TYPE_STUB_DECL (record) = type_decl;
11244 TYPE_NAME (record) = type_decl;
11245 TYPE_FIELDS (record) = f_gpr;
11246 DECL_CHAIN (f_gpr) = f_fpr;
11247 DECL_CHAIN (f_fpr) = f_ovf;
11248 DECL_CHAIN (f_ovf) = f_sav;
11250 layout_type (record);
11252 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
11253 NULL_TREE, TYPE_ATTRIBUTES (record));
11255 /* The correct type is an array type of one element. */
11256 return build_array_type (record, build_index_type (size_zero_node));
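/* For illustration only: the record built above matches the familiar
   SysV x86-64 va_list layout, roughly

     typedef struct __va_list_tag {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __builtin_va_list[1];

   (a sketch; the real type additionally carries the "sysv_abi va_list"
   attribute attached above).  */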
11259 /* Setup the builtin va_list data type and for 64-bit the additional
11260 calling convention specific va_list data types. */
11262 static tree
11263 ix86_build_builtin_va_list (void)
11265 if (TARGET_64BIT)
11267 /* Initialize ABI specific va_list builtin types.
11269 In lto1, we can encounter two va_list types:
11270 - one as a result of the type-merge across TUs, and
11271 - the one constructed here.
11272 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
11273 a type identity check in canonical_va_list_type based on
11274 TYPE_MAIN_VARIANT (which we used to have) will not work.
11275 Instead, we tag each va_list_type_node with its unique attribute, and
11276 look for the attribute in the type identity check in
11277 canonical_va_list_type.
11279 Tagging sysv_va_list_type_node directly with the attribute is
11280 problematic since it's an array of one record, which will degrade into a
11281 pointer to the record when used as a parameter (see build_va_arg comments for
11282 an example), dropping the attribute in the process. So we tag the
11283 record instead. */
11285 /* For SYSV_ABI we use an array of one record. */
11286 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
11288 /* For MS_ABI we use plain pointer to argument area. */
11289 tree char_ptr_type = build_pointer_type (char_type_node);
11290 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
11291 TYPE_ATTRIBUTES (char_ptr_type));
11292 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
11294 return ((ix86_abi == MS_ABI)
11295 ? ms_va_list_type_node
11296 : sysv_va_list_type_node);
11298 else
11300 /* For i386 we use plain pointer to argument area. */
11301 return build_pointer_type (char_type_node);
11305 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
11307 static void
11308 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
11310 rtx save_area, mem;
11311 alias_set_type set;
11312 int i, max;
11314 /* GPR size of varargs save area. */
11315 if (cfun->va_list_gpr_size)
11316 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
11317 else
11318 ix86_varargs_gpr_size = 0;
11320 /* FPR size of varargs save area. We don't need it if we don't pass
11321 anything in SSE registers. */
11322 if (TARGET_SSE && cfun->va_list_fpr_size)
11323 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
11324 else
11325 ix86_varargs_fpr_size = 0;
11327 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
11328 return;
11330 save_area = frame_pointer_rtx;
11331 set = get_varargs_alias_set ();
11333 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
11334 if (max > X86_64_REGPARM_MAX)
11335 max = X86_64_REGPARM_MAX;
11337 for (i = cum->regno; i < max; i++)
11339 mem = gen_rtx_MEM (word_mode,
11340 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
11341 MEM_NOTRAP_P (mem) = 1;
11342 set_mem_alias_set (mem, set);
11343 emit_move_insn (mem,
11344 gen_rtx_REG (word_mode,
11345 x86_64_int_parameter_registers[i]));
11348 if (ix86_varargs_fpr_size)
11350 machine_mode smode;
11351 rtx_code_label *label;
11352 rtx test;
11354 /* Now emit code to save SSE registers. The AX parameter contains number
11355 of SSE parameter registers used to call this function, though all we
11356 actually check here is the zero/non-zero status. */
11358 label = gen_label_rtx ();
11359 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
11360 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
11361 label));
11363 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
11364 we used movdqa (i.e. TImode) instead? Perhaps even better would
11365 be if we could determine the real mode of the data, via a hook
11366 into pass_stdarg. Ignore all that for now. */
11367 smode = V4SFmode;
11368 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
11369 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
11371 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
11372 if (max > X86_64_SSE_REGPARM_MAX)
11373 max = X86_64_SSE_REGPARM_MAX;
11375 for (i = cum->sse_regno; i < max; ++i)
11377 mem = plus_constant (Pmode, save_area,
11378 i * 16 + ix86_varargs_gpr_size);
11379 mem = gen_rtx_MEM (smode, mem);
11380 MEM_NOTRAP_P (mem) = 1;
11381 set_mem_alias_set (mem, set);
11382 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
11384 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
11387 emit_label (label);
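/* For illustration only (layout implied by the stores above):

     reg_save_area + 0 .. 40        <- %rdi, %rsi, %rdx, %rcx, %r8, %r9
     reg_save_area + 48 + 16 * i    <- %xmm0 .. %xmm7, saved as V4SFmode

   with the whole SSE block skipped when the hidden %al count is zero,
   via the cbranch emitted above.  */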
11391 static void
11392 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
11394 alias_set_type set = get_varargs_alias_set ();
11395 int i;
11397 /* Reset to zero, as there might be a sysv vaarg used
11398 before. */
11399 ix86_varargs_gpr_size = 0;
11400 ix86_varargs_fpr_size = 0;
11402 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
11404 rtx reg, mem;
11406 mem = gen_rtx_MEM (Pmode,
11407 plus_constant (Pmode, virtual_incoming_args_rtx,
11408 i * UNITS_PER_WORD));
11409 MEM_NOTRAP_P (mem) = 1;
11410 set_mem_alias_set (mem, set);
11412 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
11413 emit_move_insn (mem, reg);
11417 static void
11418 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
11419 tree type, int *, int no_rtl)
11421 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
11422 CUMULATIVE_ARGS next_cum;
11423 tree fntype;
11425 /* This argument doesn't appear to be used anymore. Which is good,
11426 because the old code here didn't suppress rtl generation. */
11427 gcc_assert (!no_rtl);
11429 if (!TARGET_64BIT)
11430 return;
11432 fntype = TREE_TYPE (current_function_decl);
11434 /* For varargs, we do not want to skip the dummy va_dcl argument.
11435 For stdargs, we do want to skip the last named argument. */
11436 next_cum = *cum;
11437 if (stdarg_p (fntype))
11438 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
11439 true);
11441 if (cum->call_abi == MS_ABI)
11442 setup_incoming_varargs_ms_64 (&next_cum);
11443 else
11444 setup_incoming_varargs_64 (&next_cum);
11447 static void
11448 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
11449 machine_mode mode,
11450 tree type,
11451 int *pretend_size ATTRIBUTE_UNUSED,
11452 int no_rtl)
11454 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
11455 CUMULATIVE_ARGS next_cum;
11456 tree fntype;
11457 rtx save_area;
11458 int bnd_reg, i, max;
11460 gcc_assert (!no_rtl);
11462 /* Do nothing if we use plain pointer to argument area. */
11463 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
11464 return;
11466 fntype = TREE_TYPE (current_function_decl);
11468 /* For varargs, we do not want to skip the dummy va_dcl argument.
11469 For stdargs, we do want to skip the last named argument. */
11470 next_cum = *cum;
11471 if (stdarg_p (fntype))
11472 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
11473 true);
11474 save_area = frame_pointer_rtx;
11476 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
11477 if (max > X86_64_REGPARM_MAX)
11478 max = X86_64_REGPARM_MAX;
11480 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
11481 if (chkp_function_instrumented_p (current_function_decl))
11482 for (i = cum->regno; i < max; i++)
11484 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
11485 rtx ptr = gen_rtx_REG (Pmode,
11486 x86_64_int_parameter_registers[i]);
11487 rtx bounds;
11489 if (bnd_reg <= LAST_BND_REG)
11490 bounds = gen_rtx_REG (BNDmode, bnd_reg);
11491 else
11493 rtx ldx_addr =
11494 plus_constant (Pmode, arg_pointer_rtx,
11495 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
11496 bounds = gen_reg_rtx (BNDmode);
11497 emit_insn (BNDmode == BND64mode
11498 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
11499 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
11502 emit_insn (BNDmode == BND64mode
11503 ? gen_bnd64_stx (addr, ptr, bounds)
11504 : gen_bnd32_stx (addr, ptr, bounds));
11506 bnd_reg++;
11511 /* Checks if TYPE is of kind va_list char *. */
11513 static bool
11514 is_va_list_char_pointer (tree type)
11516 tree canonic;
11518 /* For 32-bit it is always true. */
11519 if (!TARGET_64BIT)
11520 return true;
11521 canonic = ix86_canonical_va_list_type (type);
11522 return (canonic == ms_va_list_type_node
11523 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
11526 /* Implement va_start. */
11528 static void
11529 ix86_va_start (tree valist, rtx nextarg)
11531 HOST_WIDE_INT words, n_gpr, n_fpr;
11532 tree f_gpr, f_fpr, f_ovf, f_sav;
11533 tree gpr, fpr, ovf, sav, t;
11534 tree type;
11535 rtx ovf_rtx;
11537 if (flag_split_stack
11538 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11540 unsigned int scratch_regno;
11542 /* When we are splitting the stack, we can't refer to the stack
11543 arguments using internal_arg_pointer, because they may be on
11544 the old stack. The split stack prologue will arrange to
11545 leave a pointer to the old stack arguments in a scratch
11546 register, which we here copy to a pseudo-register. The split
11547 stack prologue can't set the pseudo-register directly because
11548 it (the prologue) runs before any registers have been saved. */
11550 scratch_regno = split_stack_prologue_scratch_regno ();
11551 if (scratch_regno != INVALID_REGNUM)
11553 rtx reg;
11554 rtx_insn *seq;
11556 reg = gen_reg_rtx (Pmode);
11557 cfun->machine->split_stack_varargs_pointer = reg;
11559 start_sequence ();
11560 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
11561 seq = get_insns ();
11562 end_sequence ();
11564 push_topmost_sequence ();
11565 emit_insn_after (seq, entry_of_function ());
11566 pop_topmost_sequence ();
11570 /* Only 64bit target needs something special. */
11571 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11573 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11574 std_expand_builtin_va_start (valist, nextarg);
11575 else
11577 rtx va_r, next;
11579 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
11580 next = expand_binop (ptr_mode, add_optab,
11581 cfun->machine->split_stack_varargs_pointer,
11582 crtl->args.arg_offset_rtx,
11583 NULL_RTX, 0, OPTAB_LIB_WIDEN);
11584 convert_move (va_r, next, 0);
11586 /* Store zero bounds for va_list. */
11587 if (chkp_function_instrumented_p (current_function_decl))
11588 chkp_expand_bounds_reset_for_mem (valist,
11589 make_tree (TREE_TYPE (valist),
11590 next));
11593 return;
11596 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11597 f_fpr = DECL_CHAIN (f_gpr);
11598 f_ovf = DECL_CHAIN (f_fpr);
11599 f_sav = DECL_CHAIN (f_ovf);
11601 valist = build_simple_mem_ref (valist);
11602 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
11603 /* The following should be folded into the MEM_REF offset. */
11604 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
11605 f_gpr, NULL_TREE);
11606 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
11607 f_fpr, NULL_TREE);
11608 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
11609 f_ovf, NULL_TREE);
11610 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
11611 f_sav, NULL_TREE);
11613 /* Count number of gp and fp argument registers used. */
11614 words = crtl->args.info.words;
11615 n_gpr = crtl->args.info.regno;
11616 n_fpr = crtl->args.info.sse_regno;
11618 if (cfun->va_list_gpr_size)
11620 type = TREE_TYPE (gpr);
11621 t = build2 (MODIFY_EXPR, type,
11622 gpr, build_int_cst (type, n_gpr * 8));
11623 TREE_SIDE_EFFECTS (t) = 1;
11624 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11627 if (TARGET_SSE && cfun->va_list_fpr_size)
11629 type = TREE_TYPE (fpr);
11630 t = build2 (MODIFY_EXPR, type, fpr,
11631 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
11632 TREE_SIDE_EFFECTS (t) = 1;
11633 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11636 /* Find the overflow area. */
11637 type = TREE_TYPE (ovf);
11638 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11639 ovf_rtx = crtl->args.internal_arg_pointer;
11640 else
11641 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
11642 t = make_tree (type, ovf_rtx);
11643 if (words != 0)
11644 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
11646 /* Store zero bounds for overflow area pointer. */
11647 if (chkp_function_instrumented_p (current_function_decl))
11648 chkp_expand_bounds_reset_for_mem (ovf, t);
11650 t = build2 (MODIFY_EXPR, type, ovf, t);
11651 TREE_SIDE_EFFECTS (t) = 1;
11652 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11654 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
11656 /* Find the register save area.
11657 The function prologue saves it right above the stack frame. */
11658 type = TREE_TYPE (sav);
11659 t = make_tree (type, frame_pointer_rtx);
11660 if (!ix86_varargs_gpr_size)
11661 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
11663 /* Store zero bounds for save area pointer. */
11664 if (chkp_function_instrumented_p (current_function_decl))
11665 chkp_expand_bounds_reset_for_mem (sav, t);
11667 t = build2 (MODIFY_EXPR, type, sav, t);
11668 TREE_SIDE_EFFECTS (t) = 1;
11669 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
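/* For illustration only (assumed sketch of the expansion above): for

     void f (int n, ...) { va_list ap; va_start (ap, n); ... }

   the assignments above amount to

     ap->gp_offset         = 1 * 8;       (one GPR consumed by N)
     ap->fp_offset         = 48 + 0 * 16;
     ap->overflow_arg_area = incoming argument pointer + words * 8;
     ap->reg_save_area     = the frame-pointer based save area;  */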
11673 /* Implement va_arg. */
11675 static tree
11676 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
11677 gimple_seq *post_p)
11679 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
11680 tree f_gpr, f_fpr, f_ovf, f_sav;
11681 tree gpr, fpr, ovf, sav, t;
11682 int size, rsize;
11683 tree lab_false, lab_over = NULL_TREE;
11684 tree addr, t2;
11685 rtx container;
11686 int indirect_p = 0;
11687 tree ptrtype;
11688 machine_mode nat_mode;
11689 unsigned int arg_boundary;
11691 /* Only 64bit target needs something special. */
11692 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11693 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
11695 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11696 f_fpr = DECL_CHAIN (f_gpr);
11697 f_ovf = DECL_CHAIN (f_fpr);
11698 f_sav = DECL_CHAIN (f_ovf);
11700 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
11701 valist, f_gpr, NULL_TREE);
11703 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
11704 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
11705 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
11707 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
11708 if (indirect_p)
11709 type = build_pointer_type (type);
11710 size = int_size_in_bytes (type);
11711 rsize = CEIL (size, UNITS_PER_WORD);
11713 nat_mode = type_natural_mode (type, NULL, false);
11714 switch (nat_mode)
11716 case E_V8SFmode:
11717 case E_V8SImode:
11718 case E_V32QImode:
11719 case E_V16HImode:
11720 case E_V4DFmode:
11721 case E_V4DImode:
11722 case E_V16SFmode:
11723 case E_V16SImode:
11724 case E_V64QImode:
11725 case E_V32HImode:
11726 case E_V8DFmode:
11727 case E_V8DImode:
11728 /* Unnamed 256-bit and 512-bit vector mode parameters are passed on the stack. */
11729 if (!TARGET_64BIT_MS_ABI)
11731 container = NULL;
11732 break;
11734 /* FALLTHRU */
11736 default:
11737 container = construct_container (nat_mode, TYPE_MODE (type),
11738 type, 0, X86_64_REGPARM_MAX,
11739 X86_64_SSE_REGPARM_MAX, intreg,
11741 break;
11744 /* Pull the value out of the saved registers. */
11746 addr = create_tmp_var (ptr_type_node, "addr");
11748 if (container)
11750 int needed_intregs, needed_sseregs;
11751 bool need_temp;
11752 tree int_addr, sse_addr;
11754 lab_false = create_artificial_label (UNKNOWN_LOCATION);
11755 lab_over = create_artificial_label (UNKNOWN_LOCATION);
11757 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
11759 need_temp = (!REG_P (container)
11760 && ((needed_intregs && TYPE_ALIGN (type) > 64)
11761 || TYPE_ALIGN (type) > 128));
11763 /* In case we are passing a structure, verify that it is a consecutive block
11764 in the register save area. If not, we need to do moves. */
11765 if (!need_temp && !REG_P (container))
11767 /* Verify that all registers are strictly consecutive. */
11768 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
11770 int i;
11772 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11774 rtx slot = XVECEXP (container, 0, i);
11775 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
11776 || INTVAL (XEXP (slot, 1)) != i * 16)
11777 need_temp = true;
11780 else
11782 int i;
11784 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11786 rtx slot = XVECEXP (container, 0, i);
11787 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
11788 || INTVAL (XEXP (slot, 1)) != i * 8)
11789 need_temp = true;
11793 if (!need_temp)
11795 int_addr = addr;
11796 sse_addr = addr;
11798 else
11800 int_addr = create_tmp_var (ptr_type_node, "int_addr");
11801 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
11804 /* First ensure that we fit completely in registers. */
11805 if (needed_intregs)
11807 t = build_int_cst (TREE_TYPE (gpr),
11808 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
11809 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
11810 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11811 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11812 gimplify_and_add (t, pre_p);
11814 if (needed_sseregs)
11816 t = build_int_cst (TREE_TYPE (fpr),
11817 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
11818 + X86_64_REGPARM_MAX * 8);
11819 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
11820 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11821 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11822 gimplify_and_add (t, pre_p);
11825 /* Compute index to start of area used for integer regs. */
11826 if (needed_intregs)
11828 /* int_addr = gpr + sav; */
11829 t = fold_build_pointer_plus (sav, gpr);
11830 gimplify_assign (int_addr, t, pre_p);
11832 if (needed_sseregs)
11834 /* sse_addr = fpr + sav; */
11835 t = fold_build_pointer_plus (sav, fpr);
11836 gimplify_assign (sse_addr, t, pre_p);
11838 if (need_temp)
11840 int i, prev_size = 0;
11841 tree temp = create_tmp_var (type, "va_arg_tmp");
11843 /* addr = &temp; */
11844 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
11845 gimplify_assign (addr, t, pre_p);
11847 for (i = 0; i < XVECLEN (container, 0); i++)
11849 rtx slot = XVECEXP (container, 0, i);
11850 rtx reg = XEXP (slot, 0);
11851 machine_mode mode = GET_MODE (reg);
11852 tree piece_type;
11853 tree addr_type;
11854 tree daddr_type;
11855 tree src_addr, src;
11856 int src_offset;
11857 tree dest_addr, dest;
11858 int cur_size = GET_MODE_SIZE (mode);
11860 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
11861 prev_size = INTVAL (XEXP (slot, 1));
11862 if (prev_size + cur_size > size)
11864 cur_size = size - prev_size;
11865 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
11866 if (mode == BLKmode)
11867 mode = QImode;
11869 piece_type = lang_hooks.types.type_for_mode (mode, 1);
11870 if (mode == GET_MODE (reg))
11871 addr_type = build_pointer_type (piece_type);
11872 else
11873 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11874 true);
11875 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11876 true);
11878 if (SSE_REGNO_P (REGNO (reg)))
11880 src_addr = sse_addr;
11881 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
11883 else
11885 src_addr = int_addr;
11886 src_offset = REGNO (reg) * 8;
11888 src_addr = fold_convert (addr_type, src_addr);
11889 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
11891 dest_addr = fold_convert (daddr_type, addr);
11892 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
11893 if (cur_size == GET_MODE_SIZE (mode))
11895 src = build_va_arg_indirect_ref (src_addr);
11896 dest = build_va_arg_indirect_ref (dest_addr);
11898 gimplify_assign (dest, src, pre_p);
11900 else
11902 tree copy
11903 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
11904 3, dest_addr, src_addr,
11905 size_int (cur_size));
11906 gimplify_and_add (copy, pre_p);
11908 prev_size += cur_size;
11912 if (needed_intregs)
11914 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
11915 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
11916 gimplify_assign (gpr, t, pre_p);
11919 if (needed_sseregs)
11921 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
11922 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
11923 gimplify_assign (unshare_expr (fpr), t, pre_p);
11926 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
11928 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
11931 /* ... otherwise out of the overflow area. */
11933 /* When we align a parameter on the stack for the caller, if the parameter
11934 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
11935 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee with
11936 the caller here. */
11937 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
11938 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
11939 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
11941 /* Care for on-stack alignment if needed. */
11942 if (arg_boundary <= 64 || size == 0)
11943 t = ovf;
11944 else
11946 HOST_WIDE_INT align = arg_boundary / 8;
11947 t = fold_build_pointer_plus_hwi (ovf, align - 1);
11948 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
11949 build_int_cst (TREE_TYPE (t), -align));
11952 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
11953 gimplify_assign (addr, t, pre_p);
11955 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
11956 gimplify_assign (unshare_expr (ovf), t, pre_p);
11958 if (container)
11959 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
11961 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
11962 addr = fold_convert (ptrtype, addr);
11964 if (indirect_p)
11965 addr = build_va_arg_indirect_ref (addr);
11966 return build_va_arg_indirect_ref (addr);
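/* For illustration only, the register/overflow split built above for a
   plain "int" argument behaves roughly like (sketch, not real code):

     if (ap->gp_offset < 6 * 8)
       {
         addr = ap->reg_save_area + ap->gp_offset;
         ap->gp_offset += 8;
       }
     else
       {
         addr = ap->overflow_arg_area;
         ap->overflow_arg_area += 8;
       }
     result = *(int *) addr;

   with the temporary-plus-memcpy path above only needed when a structure
   is not laid out as consecutive slots in the register save area.  */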
11969 /* Return true if OPNUM's MEM should be matched
11970 in movabs* patterns. */
11972 bool
11973 ix86_check_movabs (rtx insn, int opnum)
11975 rtx set, mem;
11977 set = PATTERN (insn);
11978 if (GET_CODE (set) == PARALLEL)
11979 set = XVECEXP (set, 0, 0);
11980 gcc_assert (GET_CODE (set) == SET);
11981 mem = XEXP (set, opnum);
11982 while (SUBREG_P (mem))
11983 mem = SUBREG_REG (mem);
11984 gcc_assert (MEM_P (mem));
11985 return volatile_ok || !MEM_VOLATILE_P (mem);
11988 /* Return false if INSN contains a MEM with a non-default address space. */
11989 bool
11990 ix86_check_no_addr_space (rtx insn)
11992 subrtx_var_iterator::array_type array;
11993 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
11995 rtx x = *iter;
11996 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
11997 return false;
11999 return true;
12002 /* Initialize the table of extra 80387 mathematical constants. */
12004 static void
12005 init_ext_80387_constants (void)
12007 static const char * cst[5] =
12009 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
12010 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
12011 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
12012 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
12013 "3.1415926535897932385128089594061862044", /* 4: fldpi */
12015 int i;
12017 for (i = 0; i < 5; i++)
12019 real_from_string (&ext_80387_constants_table[i], cst[i]);
12020 /* Ensure each constant is rounded to XFmode precision. */
12021 real_convert (&ext_80387_constants_table[i],
12022 XFmode, &ext_80387_constants_table[i]);
12025 ext_80387_constants_init = 1;
12028 /* Return non-zero if the constant is something that
12029 can be loaded with a special instruction. */
12031 int
12032 standard_80387_constant_p (rtx x)
12034 machine_mode mode = GET_MODE (x);
12036 const REAL_VALUE_TYPE *r;
12038 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
12039 return -1;
12041 if (x == CONST0_RTX (mode))
12042 return 1;
12043 if (x == CONST1_RTX (mode))
12044 return 2;
12046 r = CONST_DOUBLE_REAL_VALUE (x);
12048 /* For XFmode constants, try to find a special 80387 instruction when
12049 optimizing for size or on those CPUs that benefit from them. */
12050 if (mode == XFmode
12051 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
12053 int i;
12055 if (! ext_80387_constants_init)
12056 init_ext_80387_constants ();
12058 for (i = 0; i < 5; i++)
12059 if (real_identical (r, &ext_80387_constants_table[i]))
12060 return i + 3;
12063 /* A load of the constant -0.0 or -1.0 will be split into an
12064 fldz;fchs or fld1;fchs sequence. */
12065 if (real_isnegzero (r))
12066 return 8;
12067 if (real_identical (r, &dconstm1))
12068 return 9;
12070 return 0;
12073 /* Return the opcode of the special instruction to be used to load
12074 the constant X. */
12076 const char *
12077 standard_80387_constant_opcode (rtx x)
12079 switch (standard_80387_constant_p (x))
12081 case 1:
12082 return "fldz";
12083 case 2:
12084 return "fld1";
12085 case 3:
12086 return "fldlg2";
12087 case 4:
12088 return "fldln2";
12089 case 5:
12090 return "fldl2e";
12091 case 6:
12092 return "fldl2t";
12093 case 7:
12094 return "fldpi";
12095 case 8:
12096 case 9:
12097 return "#";
12098 default:
12099 gcc_unreachable ();
12103 /* Return the CONST_DOUBLE representing the 80387 constant that is
12104 loaded by the specified special instruction. The argument IDX
12105 matches the return value from standard_80387_constant_p. */
12107 rtx
12108 standard_80387_constant_rtx (int idx)
12110 int i;
12112 if (! ext_80387_constants_init)
12113 init_ext_80387_constants ();
12115 switch (idx)
12117 case 3:
12118 case 4:
12119 case 5:
12120 case 6:
12121 case 7:
12122 i = idx - 3;
12123 break;
12125 default:
12126 gcc_unreachable ();
12129 return const_double_from_real_value (ext_80387_constants_table[i],
12130 XFmode);
12133 /* Return 1 if X is all bits 0 and 2 if X is all bits 1
12134 in a supported SSE/AVX vector mode. */
12136 int
12137 standard_sse_constant_p (rtx x, machine_mode pred_mode)
12139 machine_mode mode;
12141 if (!TARGET_SSE)
12142 return 0;
12144 mode = GET_MODE (x);
12146 if (x == const0_rtx || const0_operand (x, mode))
12147 return 1;
12149 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
12151 /* VOIDmode integer constant, get mode from the predicate. */
12152 if (mode == VOIDmode)
12153 mode = pred_mode;
12155 switch (GET_MODE_SIZE (mode))
12157 case 64:
12158 if (TARGET_AVX512F)
12159 return 2;
12160 break;
12161 case 32:
12162 if (TARGET_AVX2)
12163 return 2;
12164 break;
12165 case 16:
12166 if (TARGET_SSE2)
12167 return 2;
12168 break;
12169 case 0:
12170 /* VOIDmode */
12171 gcc_unreachable ();
12172 default:
12173 break;
12177 return 0;
12180 /* Return the opcode of the special instruction to be used to load
12181 the constant X. */
12183 const char *
12184 standard_sse_constant_opcode (rtx_insn *insn, rtx x)
12186 machine_mode mode;
12188 gcc_assert (TARGET_SSE);
12190 mode = GET_MODE (x);
12192 if (x == const0_rtx || const0_operand (x, mode))
12194 switch (get_attr_mode (insn))
12196 case MODE_XI:
12197 return "vpxord\t%g0, %g0, %g0";
12198 case MODE_OI:
12199 return (TARGET_AVX512VL
12200 ? "vpxord\t%x0, %x0, %x0"
12201 : "vpxor\t%x0, %x0, %x0");
12202 case MODE_TI:
12203 return (TARGET_AVX512VL
12204 ? "vpxord\t%t0, %t0, %t0"
12205 : "%vpxor\t%0, %d0");
12207 case MODE_V8DF:
12208 return (TARGET_AVX512DQ
12209 ? "vxorpd\t%g0, %g0, %g0"
12210 : "vpxorq\t%g0, %g0, %g0");
12211 case MODE_V4DF:
12212 return "vxorpd\t%x0, %x0, %x0";
12213 case MODE_V2DF:
12214 return "%vxorpd\t%0, %d0";
12216 case MODE_V16SF:
12217 return (TARGET_AVX512DQ
12218 ? "vxorps\t%g0, %g0, %g0"
12219 : "vpxord\t%g0, %g0, %g0");
12220 case MODE_V8SF:
12221 return "vxorps\t%x0, %x0, %x0";
12222 case MODE_V4SF:
12223 return "%vxorps\t%0, %d0";
12225 default:
12226 gcc_unreachable ();
12229 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
12231 enum attr_mode insn_mode = get_attr_mode (insn);
12233 switch (insn_mode)
12235 case MODE_XI:
12236 case MODE_V8DF:
12237 case MODE_V16SF:
12238 gcc_assert (TARGET_AVX512F);
12239 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
12241 case MODE_OI:
12242 case MODE_V4DF:
12243 case MODE_V8SF:
12244 gcc_assert (TARGET_AVX2);
12245 /* FALLTHRU */
12246 case MODE_TI:
12247 case MODE_V2DF:
12248 case MODE_V4SF:
12249 gcc_assert (TARGET_SSE2);
12250 return (TARGET_AVX
12251 ? "vpcmpeqd\t%0, %0, %0"
12252 : "pcmpeqd\t%0, %0");
12254 default:
12255 gcc_unreachable ();
12259 gcc_unreachable ();
12262 /* Returns true if INSN can be transformed from a memory load
12263 to a supported FP constant load. */
12265 bool
12266 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
12268 rtx src = find_constant_src (insn);
12270 gcc_assert (REG_P (dst));
12272 if (src == NULL
12273 || (SSE_REGNO_P (REGNO (dst))
12274 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
12275 || (STACK_REGNO_P (REGNO (dst))
12276 && standard_80387_constant_p (src) < 1))
12277 return false;
12279 return true;
12282 /* Returns true if OP contains a symbol reference */
12284 bool
12285 symbolic_reference_mentioned_p (rtx op)
12287 const char *fmt;
12288 int i;
12290 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
12291 return true;
12293 fmt = GET_RTX_FORMAT (GET_CODE (op));
12294 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
12296 if (fmt[i] == 'E')
12298 int j;
12300 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
12301 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
12302 return true;
12305 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
12306 return true;
12309 return false;
12312 /* Return true if it is appropriate to emit `ret' instructions in the
12313 body of a function. Do this only if the epilogue is simple, needing a
12314 couple of insns. Prior to reloading, we can't tell how many registers
12315 must be saved, so return false then. Return false if there is no frame
12316 marker to de-allocate. */
12318 bool
12319 ix86_can_use_return_insn_p (void)
12321 struct ix86_frame frame;
12323 if (ix86_function_naked (current_function_decl))
12324 return false;
12326 /* Don't use `ret' instruction in interrupt handler. */
12327 if (! reload_completed
12328 || frame_pointer_needed
12329 || cfun->machine->func_type != TYPE_NORMAL)
12330 return 0;
12332 /* Don't allow more than 32k pop, since that's all we can do
12333 with one instruction. */
12334 if (crtl->args.pops_args && crtl->args.size >= 32768)
12335 return 0;
12337 frame = cfun->machine->frame;
12338 return (frame.stack_pointer_offset == UNITS_PER_WORD
12339 && (frame.nregs + frame.nsseregs) == 0);
12342 /* Value should be nonzero if functions must have frame pointers.
12343 Zero means the frame pointer need not be set up (and parms may
12344 be accessed via the stack pointer) in functions that seem suitable. */
12346 static bool
12347 ix86_frame_pointer_required (void)
12349 /* If we accessed previous frames, then the generated code expects
12350 to be able to access the saved ebp value in our frame. */
12351 if (cfun->machine->accesses_prev_frame)
12352 return true;
12354 /* Several x86 OSes need a frame pointer for other reasons,
12355 usually pertaining to setjmp. */
12356 if (SUBTARGET_FRAME_POINTER_REQUIRED)
12357 return true;
12359 /* For older 32-bit runtimes, setjmp requires a valid frame pointer. */
12360 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
12361 return true;
12363 /* Under Win64 SEH, very large frames need a frame pointer, as the maximum
12364 stack allocation is 4GB. */
12365 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
12366 return true;
12368 /* SSE saves require a frame pointer when the stack is misaligned. */
12369 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
12370 return true;
12372 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
12373 turns off the frame pointer by default. Turn it back on now if
12374 we've not got a leaf function. */
12375 if (TARGET_OMIT_LEAF_FRAME_POINTER
12376 && (!crtl->is_leaf
12377 || ix86_current_function_calls_tls_descriptor))
12378 return true;
12380 if (crtl->profile && !flag_fentry)
12381 return true;
12383 return false;
12386 /* Record that the current function accesses previous call frames. */
12388 void
12389 ix86_setup_frame_addresses (void)
12391 cfun->machine->accesses_prev_frame = 1;
12394 #ifndef USE_HIDDEN_LINKONCE
12395 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
12396 # define USE_HIDDEN_LINKONCE 1
12397 # else
12398 # define USE_HIDDEN_LINKONCE 0
12399 # endif
12400 #endif
12402 static int pic_labels_used;
12404 /* Fills in the label name that should be used for a pc thunk for
12405 the given register. */
12407 static void
12408 get_pc_thunk_name (char name[32], unsigned int regno)
12410 gcc_assert (!TARGET_64BIT);
12412 if (USE_HIDDEN_LINKONCE)
12413 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
12414 else
12415 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
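/* For illustration only: with USE_HIDDEN_LINKONCE the thunk that loads
   the PC into %ebx is named "__x86.get_pc_thunk.bx" (the suffix comes
   from reg_names[]); otherwise it degrades to a compiler-local "LPR"
   label.  */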
12419 /* This function generates code for -fpic that loads %ebx with
12420 the return address of the caller and then returns. */
12422 static void
12423 ix86_code_end (void)
12425 rtx xops[2];
12426 int regno;
12428 for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++)
12430 char name[32];
12431 tree decl;
12433 if (!(pic_labels_used & (1 << regno)))
12434 continue;
12436 get_pc_thunk_name (name, regno);
12438 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
12439 get_identifier (name),
12440 build_function_type_list (void_type_node, NULL_TREE));
12441 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
12442 NULL_TREE, void_type_node);
12443 TREE_PUBLIC (decl) = 1;
12444 TREE_STATIC (decl) = 1;
12445 DECL_IGNORED_P (decl) = 1;
12447 #if TARGET_MACHO
12448 if (TARGET_MACHO)
12450 switch_to_section (darwin_sections[picbase_thunk_section]);
12451 fputs ("\t.weak_definition\t", asm_out_file);
12452 assemble_name (asm_out_file, name);
12453 fputs ("\n\t.private_extern\t", asm_out_file);
12454 assemble_name (asm_out_file, name);
12455 putc ('\n', asm_out_file);
12456 ASM_OUTPUT_LABEL (asm_out_file, name);
12457 DECL_WEAK (decl) = 1;
12459 else
12460 #endif
12461 if (USE_HIDDEN_LINKONCE)
12463 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
12465 targetm.asm_out.unique_section (decl, 0);
12466 switch_to_section (get_named_section (decl, NULL, 0));
12468 targetm.asm_out.globalize_label (asm_out_file, name);
12469 fputs ("\t.hidden\t", asm_out_file);
12470 assemble_name (asm_out_file, name);
12471 putc ('\n', asm_out_file);
12472 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
12474 else
12476 switch_to_section (text_section);
12477 ASM_OUTPUT_LABEL (asm_out_file, name);
12480 DECL_INITIAL (decl) = make_node (BLOCK);
12481 current_function_decl = decl;
12482 allocate_struct_function (decl, false);
12483 init_function_start (decl);
12484 /* We're about to hide the function body from callees of final_* by
12485 emitting it directly; tell them we're a thunk, if they care. */
12486 cfun->is_thunk = true;
12487 first_function_block_is_cold = false;
12488 /* Make sure unwind info is emitted for the thunk if needed. */
12489 final_start_function (emit_barrier (), asm_out_file, 1);
12491 /* Pad stack IP move with 4 instructions (two NOPs count
12492 as one instruction). */
12493 if (TARGET_PAD_SHORT_FUNCTION)
12495 int i = 8;
12497 while (i--)
12498 fputs ("\tnop\n", asm_out_file);
12501 xops[0] = gen_rtx_REG (Pmode, regno);
12502 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
12503 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
12504 output_asm_insn ("%!ret", NULL);
12505 final_end_function ();
12506 init_insn_lengths ();
12507 free_after_compilation (cfun);
12508 set_cfun (NULL);
12509 current_function_decl = NULL;
12512 if (flag_split_stack)
12513 file_end_indicate_split_stack ();
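/* For illustration only: on a 32-bit ELF target the thunk emitted above
   for, say, %ebx amounts to

	__x86.get_pc_thunk.bx:
		mov	(%esp), %ebx
		ret

   i.e. it copies the caller's return address (the address of the
   instruction following the call) into the requested register.  */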
12516 /* Emit code for the SET_GOT patterns. */
12518 const char *
12519 output_set_got (rtx dest, rtx label)
12521 rtx xops[3];
12523 xops[0] = dest;
12525 if (TARGET_VXWORKS_RTP && flag_pic)
12527 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
12528 xops[2] = gen_rtx_MEM (Pmode,
12529 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
12530 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
12532 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
12533 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
12534 an unadorned address. */
12535 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
12536 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
12537 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
12538 return "";
12541 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
12543 if (flag_pic)
12545 char name[32];
12546 get_pc_thunk_name (name, REGNO (dest));
12547 pic_labels_used |= 1 << REGNO (dest);
12549 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
12550 xops[2] = gen_rtx_MEM (QImode, xops[2]);
12551 output_asm_insn ("%!call\t%X2", xops);
12553 #if TARGET_MACHO
12554 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
12555 This is what will be referenced by the Mach-O PIC subsystem. */
12556 if (machopic_should_output_picbase_label () || !label)
12557 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
12559 /* When we are restoring the pic base at the site of a nonlocal label,
12560 and we decided to emit the pic base above, we will still output a
12561 local label used for calculating the correction offset (even though
12562 the offset will be 0 in that case). */
12563 if (label)
12564 targetm.asm_out.internal_label (asm_out_file, "L",
12565 CODE_LABEL_NUMBER (label));
12566 #endif
12568 else
12570 if (TARGET_MACHO)
12571 /* We don't need a pic base, we're not producing pic. */
12572 gcc_unreachable ();
12574 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
12575 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
12576 targetm.asm_out.internal_label (asm_out_file, "L",
12577 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
12580 if (!TARGET_MACHO)
12581 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
12583 return "";
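/* Illustrative sketch: for the common 32-bit ELF PIC case, the sequence
   printed above for %ebx is roughly

	call	__x86.get_pc_thunk.bx
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx

   where the assembler resolves _GLOBAL_OFFSET_TABLE_ with a PC-relative
   (GOTPC) relocation.  */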
12586 /* Generate a "push" pattern for input ARG. */
12588 static rtx
12589 gen_push (rtx arg)
12591 struct machine_function *m = cfun->machine;
12593 if (m->fs.cfa_reg == stack_pointer_rtx)
12594 m->fs.cfa_offset += UNITS_PER_WORD;
12595 m->fs.sp_offset += UNITS_PER_WORD;
12597 if (REG_P (arg) && GET_MODE (arg) != word_mode)
12598 arg = gen_rtx_REG (word_mode, REGNO (arg));
12600 return gen_rtx_SET (gen_rtx_MEM (word_mode,
12601 gen_rtx_PRE_DEC (Pmode,
12602 stack_pointer_rtx)),
12603 arg);
12606 /* Generate a "pop" pattern for input ARG. */
12608 static rtx
12609 gen_pop (rtx arg)
12611 if (REG_P (arg) && GET_MODE (arg) != word_mode)
12612 arg = gen_rtx_REG (word_mode, REGNO (arg));
12614 return gen_rtx_SET (arg,
12615 gen_rtx_MEM (word_mode,
12616 gen_rtx_POST_INC (Pmode,
12617 stack_pointer_rtx)));
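/* As a sketch, on a 32-bit target (word_mode == SImode) gen_push and
   gen_pop above build RTL of roughly this shape:

	(set (mem:SI (pre_dec:SI (reg:SI sp))) (reg:SI <arg>))
	(set (reg:SI <arg>) (mem:SI (post_inc:SI (reg:SI sp))))

   which correspond to the push and pop instructions respectively.  */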
12620 /* Return >= 0 if there is an unused call-clobbered register available
12621 for the entire function. */
12623 static unsigned int
12624 ix86_select_alt_pic_regnum (void)
12626 if (ix86_use_pseudo_pic_reg ())
12627 return INVALID_REGNUM;
12629 if (crtl->is_leaf
12630 && !crtl->profile
12631 && !ix86_current_function_calls_tls_descriptor)
12633 int i, drap;
12634 /* Can't use the same register for both PIC and DRAP. */
12635 if (crtl->drap_reg)
12636 drap = REGNO (crtl->drap_reg);
12637 else
12638 drap = -1;
12639 for (i = 2; i >= 0; --i)
12640 if (i != drap && !df_regs_ever_live_p (i))
12641 return i;
12644 return INVALID_REGNUM;
12647 /* Return true if REGNO is used by the epilogue. */
12649 bool
12650 ix86_epilogue_uses (int regno)
12652 /* If there are no caller-saved registers, we preserve all registers,
12653 except for MMX and x87 registers which aren't supported when saving
12654 and restoring registers. Don't explicitly save SP register since
12655 it is always preserved. */
12656 return (epilogue_completed
12657 && cfun->machine->no_caller_saved_registers
12658 && !fixed_regs[regno]
12659 && !STACK_REGNO_P (regno)
12660 && !MMX_REGNO_P (regno));
12663 /* Return nonzero if register REGNO can be used as a scratch register
12664 in peephole2. */
12666 static bool
12667 ix86_hard_regno_scratch_ok (unsigned int regno)
12669 /* If there are no caller-saved registers, we can't use any register
12670 as a scratch register after epilogue and use REGNO as scratch
12671 register only if it has been used before to avoid saving and
12672 restoring it. */
12673 return (!cfun->machine->no_caller_saved_registers
12674 || (!epilogue_completed
12675 && df_regs_ever_live_p (regno)));
12678 /* Return true if register class CL should be an additional allocno
12679 class. */
12681 static bool
12682 ix86_additional_allocno_class_p (reg_class_t cl)
12684 return cl == MOD4_SSE_REGS;
12687 /* Return TRUE if we need to save REGNO. */
12689 static bool
12690 ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
12692 /* If there are no caller-saved registers, we preserve all registers,
12693 except for MMX and x87 registers which aren't supported when saving
12694 and restoring registers. Don't explicitly save SP register since
12695 it is always preserved. */
12696 if (cfun->machine->no_caller_saved_registers)
12698 /* Don't preserve registers used for function return value. */
12699 rtx reg = crtl->return_rtx;
12700 if (reg)
12702 unsigned int i = REGNO (reg);
12703 unsigned int nregs = hard_regno_nregs[i][GET_MODE (reg)];
12704 while (nregs-- > 0)
12705 if ((i + nregs) == regno)
12706 return false;
12708 reg = crtl->return_bnd;
12709 if (reg)
12711 i = REGNO (reg);
12712 nregs = hard_regno_nregs[i][GET_MODE (reg)];
12713 while (nregs-- > 0)
12714 if ((i + nregs) == regno)
12715 return false;
12719 return (df_regs_ever_live_p (regno)
12720 && !fixed_regs[regno]
12721 && !STACK_REGNO_P (regno)
12722 && !MMX_REGNO_P (regno)
12723 && (regno != HARD_FRAME_POINTER_REGNUM
12724 || !frame_pointer_needed));
12727 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
12728 && pic_offset_table_rtx)
12730 if (ix86_use_pseudo_pic_reg ())
12732 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
12733 _mcount in prologue. */
12734 if (!TARGET_64BIT && flag_pic && crtl->profile)
12735 return true;
12737 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
12738 || crtl->profile
12739 || crtl->calls_eh_return
12740 || crtl->uses_const_pool
12741 || cfun->has_nonlocal_label)
12742 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
12745 if (crtl->calls_eh_return && maybe_eh_return)
12747 unsigned i;
12748 for (i = 0; ; i++)
12750 unsigned test = EH_RETURN_DATA_REGNO (i);
12751 if (test == INVALID_REGNUM)
12752 break;
12753 if (test == regno)
12754 return true;
12758 if (ignore_outlined && cfun->machine->call_ms2sysv)
12760 unsigned count = cfun->machine->call_ms2sysv_extra_regs
12761 + xlogue_layout::MIN_REGS;
12762 if (xlogue_layout::is_stub_managed_reg (regno, count))
12763 return false;
12766 if (crtl->drap_reg
12767 && regno == REGNO (crtl->drap_reg)
12768 && !cfun->machine->no_drap_save_restore)
12769 return true;
12771 return (df_regs_ever_live_p (regno)
12772 && !call_used_regs[regno]
12773 && !fixed_regs[regno]
12774 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
12777 /* Return the number of saved general purpose registers. */
12779 static int
12780 ix86_nsaved_regs (void)
12782 int nregs = 0;
12783 int regno;
12785 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12786 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12787 nregs ++;
12788 return nregs;
12791 /* Return the number of saved SSE registers. */
12793 static int
12794 ix86_nsaved_sseregs (void)
12796 int nregs = 0;
12797 int regno;
12799 if (!TARGET_64BIT_MS_ABI)
12800 return 0;
12801 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12802 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12803 nregs ++;
12804 return nregs;
12807 /* Given FROM and TO register numbers, say whether this elimination is
12808 allowed. If stack alignment is needed, we can only replace argument
12809 pointer with hard frame pointer, or replace frame pointer with stack
12810 pointer. Otherwise, frame pointer elimination is automatically
12811 handled and all other eliminations are valid. */
12813 static bool
12814 ix86_can_eliminate (const int from, const int to)
12816 if (stack_realign_fp)
12817 return ((from == ARG_POINTER_REGNUM
12818 && to == HARD_FRAME_POINTER_REGNUM)
12819 || (from == FRAME_POINTER_REGNUM
12820 && to == STACK_POINTER_REGNUM));
12821 else
12822 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
12825 /* Return the offset between two registers, one to be eliminated, and the other
12826 its replacement, at the start of a routine. */
12828 HOST_WIDE_INT
12829 ix86_initial_elimination_offset (int from, int to)
12831 struct ix86_frame frame = cfun->machine->frame;
12833 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
12834 return frame.hard_frame_pointer_offset;
12835 else if (from == FRAME_POINTER_REGNUM
12836 && to == HARD_FRAME_POINTER_REGNUM)
12837 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
12838 else
12840 gcc_assert (to == STACK_POINTER_REGNUM);
12842 if (from == ARG_POINTER_REGNUM)
12843 return frame.stack_pointer_offset;
12845 gcc_assert (from == FRAME_POINTER_REGNUM);
12846 return frame.stack_pointer_offset - frame.frame_pointer_offset;
12850 /* In a dynamically-aligned function, we can't know the offset from
12851 stack pointer to frame pointer, so we must ensure that setjmp
12852 eliminates fp against the hard fp (%ebp) rather than trying to
12853 index from %esp up to the top of the frame across a gap that is
12854 of unknown (at compile-time) size. */
12855 static rtx
12856 ix86_builtin_setjmp_frame_value (void)
12858 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
12861 /* Emits a warning for unsupported msabi to sysv pro/epilogues. */
12862 static void warn_once_call_ms2sysv_xlogues (const char *feature)
12864 static bool warned_once = false;
12865 if (!warned_once)
12867 warning (0, "-mcall-ms2sysv-xlogues is not compatible with %s",
12868 feature);
12869 warned_once = true;
12873 /* When using -fsplit-stack, the allocation routines set a field in
12874 the TCB to the bottom of the stack plus this much space, measured
12875 in bytes. */
12877 #define SPLIT_STACK_AVAILABLE 256
12879 /* Fill structure ix86_frame about frame of currently computed function. */
12881 static void
12882 ix86_compute_frame_layout (void)
12884 struct ix86_frame *frame = &cfun->machine->frame;
12885 struct machine_function *m = cfun->machine;
12886 unsigned HOST_WIDE_INT stack_alignment_needed;
12887 HOST_WIDE_INT offset;
12888 unsigned HOST_WIDE_INT preferred_alignment;
12889 HOST_WIDE_INT size = get_frame_size ();
12890 HOST_WIDE_INT to_allocate;
12892 /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit
12893 * ms_abi functions that call a sysv function. We now need to prune away
12894 * cases where it should be disabled. */
12895 if (TARGET_64BIT && m->call_ms2sysv)
12897 gcc_assert (TARGET_64BIT_MS_ABI);
12898 gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES);
12899 gcc_assert (!TARGET_SEH);
12900 gcc_assert (TARGET_SSE);
12901 gcc_assert (!ix86_using_red_zone ());
12903 if (crtl->calls_eh_return)
12905 gcc_assert (!reload_completed);
12906 m->call_ms2sysv = false;
12907 warn_once_call_ms2sysv_xlogues ("__builtin_eh_return");
12910 else if (ix86_static_chain_on_stack)
12912 gcc_assert (!reload_completed);
12913 m->call_ms2sysv = false;
12914 warn_once_call_ms2sysv_xlogues ("static call chains");
12917 /* Finally, compute which registers the stub will manage. */
12918 else
12920 unsigned count = xlogue_layout::count_stub_managed_regs ();
12921 m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS;
12922 m->call_ms2sysv_pad_in = 0;
12926 frame->nregs = ix86_nsaved_regs ();
12927 frame->nsseregs = ix86_nsaved_sseregs ();
12929 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
12930 except for function prologues, leaf functions and when the default
12931 incoming stack boundary is overridden at the command line or via the
12932 force_align_arg_pointer attribute. */
12933 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
12934 && (!crtl->is_leaf || cfun->calls_alloca != 0
12935 || ix86_current_function_calls_tls_descriptor
12936 || ix86_incoming_stack_boundary < 128))
12938 crtl->preferred_stack_boundary = 128;
12939 crtl->stack_alignment_needed = 128;
12942 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
12943 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
12945 gcc_assert (!size || stack_alignment_needed);
12946 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
12947 gcc_assert (preferred_alignment <= stack_alignment_needed);
12949 /* The only ABI saving SSE regs should be 64-bit ms_abi. */
12950 gcc_assert (TARGET_64BIT || !frame->nsseregs);
12951 if (TARGET_64BIT && m->call_ms2sysv)
12953 gcc_assert (stack_alignment_needed >= 16);
12954 gcc_assert (!frame->nsseregs);
12957 /* For SEH we have to limit the amount of code movement into the prologue.
12958 At present we do this via a BLOCKAGE, at which point there's very little
12959 scheduling that can be done, which means that there's very little point
12960 in doing anything except PUSHs. */
12961 if (TARGET_SEH)
12962 m->use_fast_prologue_epilogue = false;
12963 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)))
12965 int count = frame->nregs;
12966 struct cgraph_node *node = cgraph_node::get (current_function_decl);
12968 /* The fast prologue uses move instead of push to save registers. This
12969 is significantly longer, but also executes faster as modern hardware
12970 can execute the moves in parallel, but can't do that for push/pop.
12972 Be careful about choosing what prologue to emit: When function takes
12973 many instructions to execute we may use slow version as well as in
12974 case function is known to be outside hot spot (this is known with
12975 feedback only). Weight the size of function by number of registers
12976 to save as it is cheap to use one or two push instructions but very
12977 slow to use many of them. */
12978 if (count)
12979 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
12980 if (node->frequency < NODE_FREQUENCY_NORMAL
12981 || (flag_branch_probabilities
12982 && node->frequency < NODE_FREQUENCY_HOT))
12983 m->use_fast_prologue_epilogue = false;
12984 else
12985 m->use_fast_prologue_epilogue
12986 = !expensive_function_p (count);
12989 frame->save_regs_using_mov
12990 = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
12991 /* If static stack checking is enabled and done with probes,
12992 the registers need to be saved before allocating the frame. */
12993 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
12995 /* Skip return address and error code in exception handler. */
12996 offset = INCOMING_FRAME_SP_OFFSET;
12998 /* Skip pushed static chain. */
12999 if (ix86_static_chain_on_stack)
13000 offset += UNITS_PER_WORD;
13002 /* Skip saved base pointer. */
13003 if (frame_pointer_needed)
13004 offset += UNITS_PER_WORD;
13005 frame->hfp_save_offset = offset;
13007 /* The traditional frame pointer location is at the top of the frame. */
13008 frame->hard_frame_pointer_offset = offset;
13010 /* Register save area */
13011 offset += frame->nregs * UNITS_PER_WORD;
13012 frame->reg_save_offset = offset;
13014 /* On SEH target, registers are pushed just before the frame pointer
13015 location. */
13016 if (TARGET_SEH)
13017 frame->hard_frame_pointer_offset = offset;
13019 /* Calculate the size of the va-arg area (not including padding, if any). */
13020 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
13022 if (stack_realign_fp)
13024 /* We may need a 16-byte aligned stack for the remainder of the
13025 register save area, but the stack frame for the local function
13026 may require a greater alignment if using AVX/2/512. In order
13027 to avoid wasting space, we first calculate the space needed for
13028 the rest of the register saves, add that to the stack pointer,
13029 and then realign the stack to the boundary of the start of the
13030 frame for the local function. */
13031 HOST_WIDE_INT space_needed = 0;
13032 HOST_WIDE_INT sse_reg_space_needed = 0;
13034 if (TARGET_64BIT)
13036 if (m->call_ms2sysv)
13038 m->call_ms2sysv_pad_in = 0;
13039 space_needed = xlogue_layout::get_instance ().get_stack_space_used ();
13042 else if (frame->nsseregs)
13043 /* The only ABI that has saved SSE registers (Win64) also has a
13044 16-byte aligned default stack. However, many programs violate
13045 the ABI, and Wine64 forces stack realignment to compensate. */
13046 space_needed = frame->nsseregs * 16;
13048 sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16);
13050 /* 64-bit frame->va_arg_size should always be a multiple of 16, but
13051 we round anyway to be pedantic. */
13052 space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16);
13054 else
13055 space_needed = frame->va_arg_size;
13057 /* Record the allocation size required prior to the realignment AND. */
13058 frame->stack_realign_allocate = space_needed;
13060 /* The re-aligned stack starts at frame->stack_realign_offset. Values
13061 before this point are not directly comparable with values below
13062 this point. Use sp_valid_at to determine if the stack pointer is
13063 valid for a given offset, fp_valid_at for the frame pointer, or
13064 choose_baseaddr to have a base register chosen for you.
13066 Note that the result of (frame->stack_realign_offset
13067 & (stack_alignment_needed - 1)) may not equal zero. */
13068 offset = ROUND_UP (offset + space_needed, stack_alignment_needed);
13069 frame->stack_realign_offset = offset - space_needed;
13070 frame->sse_reg_save_offset = frame->stack_realign_offset
13071 + sse_reg_space_needed;
13073 else
13075 frame->stack_realign_offset = offset;
13077 if (TARGET_64BIT && m->call_ms2sysv)
13079 m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD);
13080 offset += xlogue_layout::get_instance ().get_stack_space_used ();
13083 /* Align and set SSE register save area. */
13084 else if (frame->nsseregs)
13086 /* If the incoming stack boundary is at least 16 bytes, or DRAP is
13087 required and the DRAP re-alignment boundary is at least 16 bytes,
13088 then we want the SSE register save area properly aligned. */
13089 if (ix86_incoming_stack_boundary >= 128
13090 || (stack_realign_drap && stack_alignment_needed >= 16))
13091 offset = ROUND_UP (offset, 16);
13092 offset += frame->nsseregs * 16;
13094 frame->sse_reg_save_offset = offset;
13095 offset += frame->va_arg_size;
13098 /* Align start of frame for local function. */
13099 if (m->call_ms2sysv
13100 || frame->va_arg_size != 0
13101 || size != 0
13102 || !crtl->is_leaf
13103 || cfun->calls_alloca
13104 || ix86_current_function_calls_tls_descriptor)
13105 offset = ROUND_UP (offset, stack_alignment_needed);
13107 /* Frame pointer points here. */
13108 frame->frame_pointer_offset = offset;
13110 offset += size;
13112 /* Add outgoing arguments area. Can be skipped if we eliminated
13113 all the function calls as dead code.
13114 Skipping is however impossible when function calls alloca. Alloca
13115 expander assumes that last crtl->outgoing_args_size
13116 of stack frame are unused. */
13117 if (ACCUMULATE_OUTGOING_ARGS
13118 && (!crtl->is_leaf || cfun->calls_alloca
13119 || ix86_current_function_calls_tls_descriptor))
13121 offset += crtl->outgoing_args_size;
13122 frame->outgoing_arguments_size = crtl->outgoing_args_size;
13124 else
13125 frame->outgoing_arguments_size = 0;
13127 /* Align stack boundary. Only needed if we're calling another function
13128 or using alloca. */
13129 if (!crtl->is_leaf || cfun->calls_alloca
13130 || ix86_current_function_calls_tls_descriptor)
13131 offset = ROUND_UP (offset, preferred_alignment);
13133 /* We've reached end of stack frame. */
13134 frame->stack_pointer_offset = offset;
13136 /* Size prologue needs to allocate. */
13137 to_allocate = offset - frame->sse_reg_save_offset;
13139 if ((!to_allocate && frame->nregs <= 1)
13140 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)))
13141 frame->save_regs_using_mov = false;
13143 if (ix86_using_red_zone ()
13144 && crtl->sp_is_unchanging
13145 && crtl->is_leaf
13146 && !ix86_pc_thunk_call_expanded
13147 && !ix86_current_function_calls_tls_descriptor)
13149 frame->red_zone_size = to_allocate;
13150 if (frame->save_regs_using_mov)
13151 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
13152 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
13153 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
13155 else
13156 frame->red_zone_size = 0;
13157 frame->stack_pointer_offset -= frame->red_zone_size;
13159 /* The SEH frame pointer location is near the bottom of the frame.
13160 This is enforced by the fact that the difference between the
13161 stack pointer and the frame pointer is limited to 240 bytes in
13162 the unwind data structure. */
13163 if (TARGET_SEH)
13165 HOST_WIDE_INT diff;
13167 /* If we can leave the frame pointer where it is, do so. Also, returns
13168 the establisher frame for __builtin_frame_address (0). */
13169 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
13170 if (diff <= SEH_MAX_FRAME_SIZE
13171 && (diff > 240 || (diff & 15) != 0)
13172 && !crtl->accesses_prior_frames)
13174 /* Ideally we'd determine what portion of the local stack frame
13175 (within the constraint of the lowest 240) is most heavily used.
13176 But without that complication, simply bias the frame pointer
13177 by 128 bytes so as to maximize the amount of the local stack
13178 frame that is addressable with 8-bit offsets. */
13179 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
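/* A rough, illustrative picture of the offsets computed above (measured
   downwards from the CFA), assuming frame_pointer_needed and no stack
   realignment; bracketed areas may be empty:

	return address (+ error code for exception handlers)
	[pushed static chain]
	saved hard frame pointer
				<- hard_frame_pointer_offset
	saved general registers
				<- reg_save_offset
	[SSE register save area]
				<- sse_reg_save_offset
	[va_arg register save area]
				<- frame_pointer_offset
	local variables
	[outgoing arguments]
				<- stack_pointer_offset  */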
13184 /* This is semi-inlined memory_address_length, but simplified
13185 since we know that we're always dealing with reg+offset, and
13186 to avoid having to create and discard all that rtl. */
13188 static inline int
13189 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
13191 int len = 4;
13193 if (offset == 0)
13195 /* EBP and R13 cannot be encoded without an offset. */
13196 len = (regno == BP_REG || regno == R13_REG);
13198 else if (IN_RANGE (offset, -128, 127))
13199 len = 1;
13201 /* ESP and R12 must be encoded with a SIB byte. */
13202 if (regno == SP_REG || regno == R12_REG)
13203 len++;
13205 return len;
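/* A few illustrative data points for the length estimate above:
   (BP_REG, 0) still needs a disp8 and counts as 1; (SP_REG, 8) needs a
   disp8 plus a SIB byte and counts as 2; any other base register with a
   displacement outside [-128, 127] counts as 4.  */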
13208 /* Determine if the stack pointer is valid for accessing the CFA_OFFSET in
13209 the frame save area. The register is saved at CFA - CFA_OFFSET. */
13211 static bool
13212 sp_valid_at (HOST_WIDE_INT cfa_offset)
13214 const struct machine_frame_state &fs = cfun->machine->fs;
13215 if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset)
13217 /* Validate that the cfa_offset isn't in a "no-man's land". */
13218 gcc_assert (cfa_offset <= fs.sp_realigned_fp_last);
13219 return false;
13221 return fs.sp_valid;
13224 /* Determine if the frame pointer is valid for accessing the CFA_OFFSET in
13225 the frame save area. The register is saved at CFA - CFA_OFFSET. */
13227 static inline bool
13228 fp_valid_at (HOST_WIDE_INT cfa_offset)
13230 const struct machine_frame_state &fs = cfun->machine->fs;
13231 if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last)
13233 /* Validate that the cfa_offset isn't in a "no-man's land". */
13234 gcc_assert (cfa_offset >= fs.sp_realigned_offset);
13235 return false;
13237 return fs.fp_valid;
13240 /* Choose a base register based upon alignment requested, speed and/or
13241 size. */
13243 static void
13244 choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg,
13245 HOST_WIDE_INT &base_offset,
13246 unsigned int align_requested, unsigned int *align)
13248 const struct machine_function *m = cfun->machine;
13249 unsigned int hfp_align;
13250 unsigned int drap_align;
13251 unsigned int sp_align;
13252 bool hfp_ok = fp_valid_at (cfa_offset);
13253 bool drap_ok = m->fs.drap_valid;
13254 bool sp_ok = sp_valid_at (cfa_offset);
13256 hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY;
13258 /* Filter out any registers that don't meet the requested alignment
13259 criteria. */
13260 if (align_requested)
13262 if (m->fs.realigned)
13263 hfp_align = drap_align = sp_align = crtl->stack_alignment_needed;
13264 /* SEH unwind code does not currently support REG_CFA_EXPRESSION
13265 notes (which we would need to use a realigned stack pointer),
13266 so disable on SEH targets. */
13267 else if (m->fs.sp_realigned)
13268 sp_align = crtl->stack_alignment_needed;
13270 hfp_ok = hfp_ok && hfp_align >= align_requested;
13271 drap_ok = drap_ok && drap_align >= align_requested;
13272 sp_ok = sp_ok && sp_align >= align_requested;
13275 if (m->use_fast_prologue_epilogue)
13277 /* Choose the base register most likely to allow the most scheduling
13278 opportunities. Generally FP is valid throughout the function,
13279 while DRAP must be reloaded within the epilogue. But choose either
13280 over the SP due to increased encoding size. */
13282 if (hfp_ok)
13284 base_reg = hard_frame_pointer_rtx;
13285 base_offset = m->fs.fp_offset - cfa_offset;
13287 else if (drap_ok)
13289 base_reg = crtl->drap_reg;
13290 base_offset = 0 - cfa_offset;
13292 else if (sp_ok)
13294 base_reg = stack_pointer_rtx;
13295 base_offset = m->fs.sp_offset - cfa_offset;
13298 else
13300 HOST_WIDE_INT toffset;
13301 int len = 16, tlen;
13303 /* Choose the base register with the smallest address encoding.
13304 With a tie, choose FP > DRAP > SP. */
13305 if (sp_ok)
13307 base_reg = stack_pointer_rtx;
13308 base_offset = m->fs.sp_offset - cfa_offset;
13309 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
13311 if (drap_ok)
13313 toffset = 0 - cfa_offset;
13314 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
13315 if (tlen <= len)
13317 base_reg = crtl->drap_reg;
13318 base_offset = toffset;
13319 len = tlen;
13322 if (hfp_ok)
13324 toffset = m->fs.fp_offset - cfa_offset;
13325 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
13326 if (tlen <= len)
13328 base_reg = hard_frame_pointer_rtx;
13329 base_offset = toffset;
13330 len = tlen;
13335 /* Set the align return value. */
13336 if (align)
13338 if (base_reg == stack_pointer_rtx)
13339 *align = sp_align;
13340 else if (base_reg == crtl->drap_reg)
13341 *align = drap_align;
13342 else if (base_reg == hard_frame_pointer_rtx)
13343 *align = hfp_align;
13347 /* Return an RTX that points to CFA_OFFSET within the stack frame and
13348 the alignment of address. If ALIGN is non-null, it should point to
13349 an alignment value (in bits) that is preferred or zero and will
13350 receive the alignment of the base register that was selected,
13351 irrespective of whether or not CFA_OFFSET is a multiple of that
13352 alignment value.
13354 The valid base registers are taken from CFUN->MACHINE->FS. */
13356 static rtx
13357 choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align)
13359 rtx base_reg = NULL;
13360 HOST_WIDE_INT base_offset = 0;
13362 /* If a specific alignment is requested, try to get a base register
13363 with that alignment first. */
13364 if (align && *align)
13365 choose_basereg (cfa_offset, base_reg, base_offset, *align, align);
13367 if (!base_reg)
13368 choose_basereg (cfa_offset, base_reg, base_offset, 0, align);
13370 gcc_assert (base_reg != NULL);
13371 return plus_constant (Pmode, base_reg, base_offset);
13374 /* Emit code to save registers in the prologue. */
13376 static void
13377 ix86_emit_save_regs (void)
13379 unsigned int regno;
13380 rtx_insn *insn;
13382 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
13383 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
13385 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
13386 RTX_FRAME_RELATED_P (insn) = 1;
13390 /* Emit a single register save at CFA - CFA_OFFSET. */
13392 static void
13393 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
13394 HOST_WIDE_INT cfa_offset)
13396 struct machine_function *m = cfun->machine;
13397 rtx reg = gen_rtx_REG (mode, regno);
13398 rtx mem, addr, base, insn;
13399 unsigned int align = GET_MODE_ALIGNMENT (mode);
13401 addr = choose_baseaddr (cfa_offset, &align);
13402 mem = gen_frame_mem (mode, addr);
13404 /* The location alignment depends upon the base register. */
13405 align = MIN (GET_MODE_ALIGNMENT (mode), align);
13406 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
13407 set_mem_align (mem, align);
13409 insn = emit_insn (gen_rtx_SET (mem, reg));
13410 RTX_FRAME_RELATED_P (insn) = 1;
13412 base = addr;
13413 if (GET_CODE (base) == PLUS)
13414 base = XEXP (base, 0);
13415 gcc_checking_assert (REG_P (base));
13417 /* When saving registers into a re-aligned local stack frame, avoid
13418 any tricky guessing by dwarf2out. */
13419 if (m->fs.realigned)
13421 gcc_checking_assert (stack_realign_drap);
13423 if (regno == REGNO (crtl->drap_reg))
13425 /* A bit of a hack. We force the DRAP register to be saved in
13426 the re-aligned stack frame, which provides us with a copy
13427 of the CFA that will last past the prologue. Install it. */
13428 gcc_checking_assert (cfun->machine->fs.fp_valid);
13429 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
13430 cfun->machine->fs.fp_offset - cfa_offset);
13431 mem = gen_rtx_MEM (mode, addr);
13432 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
13434 else
13436 /* The frame pointer is a stable reference within the
13437 aligned frame. Use it. */
13438 gcc_checking_assert (cfun->machine->fs.fp_valid);
13439 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
13440 cfun->machine->fs.fp_offset - cfa_offset);
13441 mem = gen_rtx_MEM (mode, addr);
13442 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
13446 else if (base == stack_pointer_rtx && m->fs.sp_realigned
13447 && cfa_offset >= m->fs.sp_realigned_offset)
13449 gcc_checking_assert (stack_realign_fp);
13450 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
13453 /* The memory may not be relative to the current CFA register,
13454 which means that we may need to generate a new pattern for
13455 use by the unwind info. */
13456 else if (base != m->fs.cfa_reg)
13458 addr = plus_constant (Pmode, m->fs.cfa_reg,
13459 m->fs.cfa_offset - cfa_offset);
13460 mem = gen_rtx_MEM (mode, addr);
13461 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
13465 /* Emit code to save registers using MOV insns.
13466 First register is stored at CFA - CFA_OFFSET. */
13467 static void
13468 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
13470 unsigned int regno;
13472 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13473 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
13475 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
13476 cfa_offset -= UNITS_PER_WORD;
13480 /* Emit code to save SSE registers using MOV insns.
13481 First register is stored at CFA - CFA_OFFSET. */
13482 static void
13483 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
13485 unsigned int regno;
13487 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13488 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
13490 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
13491 cfa_offset -= GET_MODE_SIZE (V4SFmode);
13495 static GTY(()) rtx queued_cfa_restores;
13497 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
13498 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
13499 Don't add the note if the previously saved value will be left untouched
13500 within stack red-zone till return, as unwinders can find the same value
13501 in the register and on the stack. */
13503 static void
13504 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
13506 if (!crtl->shrink_wrapped
13507 && cfa_offset <= cfun->machine->fs.red_zone_offset)
13508 return;
13510 if (insn)
13512 add_reg_note (insn, REG_CFA_RESTORE, reg);
13513 RTX_FRAME_RELATED_P (insn) = 1;
13515 else
13516 queued_cfa_restores
13517 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
13520 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
13522 static void
13523 ix86_add_queued_cfa_restore_notes (rtx insn)
13525 rtx last;
13526 if (!queued_cfa_restores)
13527 return;
13528 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
13530 XEXP (last, 1) = REG_NOTES (insn);
13531 REG_NOTES (insn) = queued_cfa_restores;
13532 queued_cfa_restores = NULL_RTX;
13533 RTX_FRAME_RELATED_P (insn) = 1;
13536 /* Expand prologue or epilogue stack adjustment.
13537 The pattern exists to put a dependency on all ebp-based memory accesses.
13538 STYLE should be negative if instructions should be marked as frame related,
13539 zero if %r11 register is live and cannot be freely used and positive
13540 otherwise. */
13542 static void
13543 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
13544 int style, bool set_cfa)
13546 struct machine_function *m = cfun->machine;
13547 rtx insn;
13548 bool add_frame_related_expr = false;
13550 if (Pmode == SImode)
13551 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
13552 else if (x86_64_immediate_operand (offset, DImode))
13553 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
13554 else
13556 rtx tmp;
13557 /* r11 is used by indirect sibcall return as well, set before the
13558 epilogue and used after the epilogue. */
13559 if (style)
13560 tmp = gen_rtx_REG (DImode, R11_REG);
13561 else
13563 gcc_assert (src != hard_frame_pointer_rtx
13564 && dest != hard_frame_pointer_rtx);
13565 tmp = hard_frame_pointer_rtx;
13567 insn = emit_insn (gen_rtx_SET (tmp, offset));
13568 if (style < 0)
13569 add_frame_related_expr = true;
13571 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
13574 insn = emit_insn (insn);
13575 if (style >= 0)
13576 ix86_add_queued_cfa_restore_notes (insn);
13578 if (set_cfa)
13580 rtx r;
13582 gcc_assert (m->fs.cfa_reg == src);
13583 m->fs.cfa_offset += INTVAL (offset);
13584 m->fs.cfa_reg = dest;
13586 r = gen_rtx_PLUS (Pmode, src, offset);
13587 r = gen_rtx_SET (dest, r);
13588 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
13589 RTX_FRAME_RELATED_P (insn) = 1;
13591 else if (style < 0)
13593 RTX_FRAME_RELATED_P (insn) = 1;
13594 if (add_frame_related_expr)
13596 rtx r = gen_rtx_PLUS (Pmode, src, offset);
13597 r = gen_rtx_SET (dest, r);
13598 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
13602 if (dest == stack_pointer_rtx)
13604 HOST_WIDE_INT ooffset = m->fs.sp_offset;
13605 bool valid = m->fs.sp_valid;
13606 bool realigned = m->fs.sp_realigned;
13608 if (src == hard_frame_pointer_rtx)
13610 valid = m->fs.fp_valid;
13611 realigned = false;
13612 ooffset = m->fs.fp_offset;
13614 else if (src == crtl->drap_reg)
13616 valid = m->fs.drap_valid;
13617 realigned = false;
13618 ooffset = 0;
13620 else
13622 /* Otherwise there are two possibilities: SP itself, which we set
13623 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
13624 taken care of by hand along the eh_return path. */
13625 gcc_checking_assert (src == stack_pointer_rtx
13626 || offset == const0_rtx);
13629 m->fs.sp_offset = ooffset - INTVAL (offset);
13630 m->fs.sp_valid = valid;
13631 m->fs.sp_realigned = realigned;
13635 /* Find an available register to be used as dynamic realign argument
13636 pointer register. Such a register will be written in the prologue and
13637 used at the beginning of the body, so it must not be
13638 1. parameter passing register.
13639 2. GOT pointer.
13640 We reuse static-chain register if it is available. Otherwise, we
13641 use DI for i386 and R13 for x86-64. We chose R13 since it has
13642 shorter encoding.
13644 Return: the regno of chosen register. */
13646 static unsigned int
13647 find_drap_reg (void)
13649 tree decl = cfun->decl;
13651 /* Always use callee-saved register if there are no caller-saved
13652 registers. */
13653 if (TARGET_64BIT)
13655 /* Use R13 for a nested function or a function that needs a static chain.
13656 Since function with tail call may use any caller-saved
13657 registers in epilogue, DRAP must not use caller-saved
13658 register in such case. */
13659 if (DECL_STATIC_CHAIN (decl)
13660 || cfun->machine->no_caller_saved_registers
13661 || crtl->tail_call_emit)
13662 return R13_REG;
13664 return R10_REG;
13666 else
13668 /* Use DI for a nested function or a function that needs a static chain.
13669 Since function with tail call may use any caller-saved
13670 registers in epilogue, DRAP must not use caller-saved
13671 register in such case. */
13672 if (DECL_STATIC_CHAIN (decl)
13673 || cfun->machine->no_caller_saved_registers
13674 || crtl->tail_call_emit)
13675 return DI_REG;
13677 /* Reuse static chain register if it isn't used for parameter
13678 passing. */
13679 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
13681 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
13682 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
13683 return CX_REG;
13685 return DI_REG;
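/* Sketch of the net effect above: a 64-bit function that needs the static
   chain, has no caller-saved registers, or emits a tail call gets R13 and
   any other 64-bit function gets R10; in 32-bit mode the same special
   cases get DI, otherwise CX is reused when it is free for the calling
   convention, falling back to DI.  */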
13689 /* Handle a "force_align_arg_pointer" attribute. */
13691 static tree
13692 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
13693 tree, int, bool *no_add_attrs)
13695 if (TREE_CODE (*node) != FUNCTION_TYPE
13696 && TREE_CODE (*node) != METHOD_TYPE
13697 && TREE_CODE (*node) != FIELD_DECL
13698 && TREE_CODE (*node) != TYPE_DECL)
13700 warning (OPT_Wattributes, "%qE attribute only applies to functions",
13701 name);
13702 *no_add_attrs = true;
13705 return NULL_TREE;
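/* For example, the attribute handled above lets a single function request
   a realigned incoming stack (illustrative declaration only):

     void callback (void) __attribute__ ((force_align_arg_pointer));  */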
13708 /* Return minimum incoming stack alignment. */
13710 static unsigned int
13711 ix86_minimum_incoming_stack_boundary (bool sibcall)
13713 unsigned int incoming_stack_boundary;
13715 /* Stack of interrupt handler is aligned to 128 bits in 64bit mode. */
13716 if (cfun->machine->func_type != TYPE_NORMAL)
13717 incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY;
13718 /* Prefer the one specified at command line. */
13719 else if (ix86_user_incoming_stack_boundary)
13720 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
13721 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
13722 when -mstackrealign is used, this is not a sibcall check, and the
13723 estimated stack alignment is 128 bits. */
13724 else if (!sibcall
13725 && ix86_force_align_arg_pointer
13726 && crtl->stack_alignment_estimated == 128)
13727 incoming_stack_boundary = MIN_STACK_BOUNDARY;
13728 else
13729 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
13731 /* Incoming stack alignment can be changed on individual functions
13732 via force_align_arg_pointer attribute. We use the smallest
13733 incoming stack boundary. */
13734 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
13735 && lookup_attribute (ix86_force_align_arg_pointer_string,
13736 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
13737 incoming_stack_boundary = MIN_STACK_BOUNDARY;
13739 /* The incoming stack frame has to be aligned at least at
13740 parm_stack_boundary. */
13741 if (incoming_stack_boundary < crtl->parm_stack_boundary)
13742 incoming_stack_boundary = crtl->parm_stack_boundary;
13744 /* Stack at entrance of main is aligned by runtime. We use the
13745 smallest incoming stack boundary. */
13746 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
13747 && DECL_NAME (current_function_decl)
13748 && MAIN_NAME_P (DECL_NAME (current_function_decl))
13749 && DECL_FILE_SCOPE_P (current_function_decl))
13750 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
13752 return incoming_stack_boundary;
13755 /* Update incoming stack boundary and estimated stack alignment. */
13757 static void
13758 ix86_update_stack_boundary (void)
13760 ix86_incoming_stack_boundary
13761 = ix86_minimum_incoming_stack_boundary (false);
13763 /* x86_64 vararg needs 16byte stack alignment for register save
13764 area. */
13765 if (TARGET_64BIT
13766 && cfun->stdarg
13767 && crtl->stack_alignment_estimated < 128)
13768 crtl->stack_alignment_estimated = 128;
13770 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
13771 if (ix86_tls_descriptor_calls_expanded_in_cfun
13772 && crtl->preferred_stack_boundary < 128)
13773 crtl->preferred_stack_boundary = 128;
13776 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
13777 needed or an rtx for DRAP otherwise. */
13779 static rtx
13780 ix86_get_drap_rtx (void)
13782 /* We must use DRAP if there are outgoing arguments on stack and
13783 ACCUMULATE_OUTGOING_ARGS is false. */
13784 if (ix86_force_drap
13785 || (cfun->machine->outgoing_args_on_stack
13786 && !ACCUMULATE_OUTGOING_ARGS))
13787 crtl->need_drap = true;
13789 if (stack_realign_drap)
13791 /* Assign DRAP to vDRAP and return vDRAP. */
13792 unsigned int regno = find_drap_reg ();
13793 rtx drap_vreg;
13794 rtx arg_ptr;
13795 rtx_insn *seq, *insn;
13797 arg_ptr = gen_rtx_REG (Pmode, regno);
13798 crtl->drap_reg = arg_ptr;
13800 start_sequence ();
13801 drap_vreg = copy_to_reg (arg_ptr);
13802 seq = get_insns ();
13803 end_sequence ();
13805 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
13806 if (!optimize)
13808 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
13809 RTX_FRAME_RELATED_P (insn) = 1;
13811 return drap_vreg;
13813 else
13814 return NULL;
13817 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
13819 static rtx
13820 ix86_internal_arg_pointer (void)
13822 return virtual_incoming_args_rtx;
13825 struct scratch_reg {
13826 rtx reg;
13827 bool saved;
13830 /* Return a short-lived scratch register for use on function entry.
13831 In 32-bit mode, it is valid only after the registers are saved
13832 in the prologue. This register must be released by means of
13833 release_scratch_register_on_entry once it is dead. */
13835 static void
13836 get_scratch_register_on_entry (struct scratch_reg *sr)
13838 int regno;
13840 sr->saved = false;
13842 if (TARGET_64BIT)
13844 /* We always use R11 in 64-bit mode. */
13845 regno = R11_REG;
13847 else
13849 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
13850 bool fastcall_p
13851 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13852 bool thiscall_p
13853 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13854 bool static_chain_p = DECL_STATIC_CHAIN (decl);
13855 int regparm = ix86_function_regparm (fntype, decl);
13856 int drap_regno
13857 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
13859 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
13860 for the static chain register. */
13861 if ((regparm < 1 || (fastcall_p && !static_chain_p))
13862 && drap_regno != AX_REG)
13863 regno = AX_REG;
13864 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
13865 for the static chain register. */
13866 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
13867 regno = AX_REG;
13868 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
13869 regno = DX_REG;
13870 /* ecx is the static chain register. */
13871 else if (regparm < 3 && !fastcall_p && !thiscall_p
13872 && !static_chain_p
13873 && drap_regno != CX_REG)
13874 regno = CX_REG;
13875 else if (ix86_save_reg (BX_REG, true, false))
13876 regno = BX_REG;
13877 /* esi is the static chain register. */
13878 else if (!(regparm == 3 && static_chain_p)
13879 && ix86_save_reg (SI_REG, true, false))
13880 regno = SI_REG;
13881 else if (ix86_save_reg (DI_REG, true, false))
13882 regno = DI_REG;
13883 else
13885 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
13886 sr->saved = true;
13890 sr->reg = gen_rtx_REG (Pmode, regno);
13891 if (sr->saved)
13893 rtx_insn *insn = emit_insn (gen_push (sr->reg));
13894 RTX_FRAME_RELATED_P (insn) = 1;
13898 /* Release a scratch register obtained from the preceding function. */
13900 static void
13901 release_scratch_register_on_entry (struct scratch_reg *sr)
13903 if (sr->saved)
13905 struct machine_function *m = cfun->machine;
13906 rtx x, insn = emit_insn (gen_pop (sr->reg));
13908 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
13909 RTX_FRAME_RELATED_P (insn) = 1;
13910 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
13911 x = gen_rtx_SET (stack_pointer_rtx, x);
13912 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
13913 m->fs.sp_offset -= UNITS_PER_WORD;
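/* Taken together, get_scratch_register_on_entry and the release function
   above bracket the use of a live register with an ordinary push/pop when
   no free one is found, e.g. (32-bit sketch):

	push	%eax
	... use %eax as the scratch register ...
	pop	%eax  */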
13917 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
13919 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
13921 static void
13922 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
13924 /* We skip the probe for the first interval + a small dope of 4 words and
13925 probe that many bytes past the specified size to maintain a protection
13926 area at the bottom of the stack. */
13927 const int dope = 4 * UNITS_PER_WORD;
13928 rtx size_rtx = GEN_INT (size), last;
13930 /* See if we have a constant small number of probes to generate. If so,
13931 that's the easy case. The run-time loop is made up of 9 insns in the
13932 generic case while the compile-time loop is made up of 3+2*(n-1) insns
13933 for n # of intervals. */
13934 if (size <= 4 * PROBE_INTERVAL)
13936 HOST_WIDE_INT i, adjust;
13937 bool first_probe = true;
13939 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
13940 values of N from 1 until it exceeds SIZE. If only one probe is
13941 needed, this will not generate any code. Then adjust and probe
13942 to PROBE_INTERVAL + SIZE. */
13943 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
13945 if (first_probe)
13947 adjust = 2 * PROBE_INTERVAL + dope;
13948 first_probe = false;
13950 else
13951 adjust = PROBE_INTERVAL;
13953 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13954 plus_constant (Pmode, stack_pointer_rtx,
13955 -adjust)));
13956 emit_stack_probe (stack_pointer_rtx);
13959 if (first_probe)
13960 adjust = size + PROBE_INTERVAL + dope;
13961 else
13962 adjust = size + PROBE_INTERVAL - i;
13964 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13965 plus_constant (Pmode, stack_pointer_rtx,
13966 -adjust)));
13967 emit_stack_probe (stack_pointer_rtx);
13969 /* Adjust back to account for the additional first interval. */
13970 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13971 plus_constant (Pmode, stack_pointer_rtx,
13972 PROBE_INTERVAL + dope)));
13975 /* Otherwise, do the same as above, but in a loop. Note that we must be
13976 extra careful with variables wrapping around because we might be at
13977 the very top (or the very bottom) of the address space and we have
13978 to be able to handle this case properly; in particular, we use an
13979 equality test for the loop condition. */
13980 else
13982 HOST_WIDE_INT rounded_size;
13983 struct scratch_reg sr;
13985 get_scratch_register_on_entry (&sr);
13988 /* Step 1: round SIZE to the previous multiple of the interval. */
13990 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
13993 /* Step 2: compute initial and final value of the loop counter. */
13995 /* SP = SP_0 + PROBE_INTERVAL. */
13996 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13997 plus_constant (Pmode, stack_pointer_rtx,
13998 - (PROBE_INTERVAL + dope))));
14000 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
14001 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
14002 emit_insn (gen_rtx_SET (sr.reg,
14003 plus_constant (Pmode, stack_pointer_rtx,
14004 -rounded_size)));
14005 else
14007 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
14008 emit_insn (gen_rtx_SET (sr.reg,
14009 gen_rtx_PLUS (Pmode, sr.reg,
14010 stack_pointer_rtx)));
14014 /* Step 3: the loop
14018 SP = SP + PROBE_INTERVAL
14019 probe at SP
14021 while (SP != LAST_ADDR)
14023 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
14024 values of N from 1 until it is equal to ROUNDED_SIZE. */
14026 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
14029 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
14030 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
14032 if (size != rounded_size)
14034 emit_insn (gen_rtx_SET (stack_pointer_rtx,
14035 plus_constant (Pmode, stack_pointer_rtx,
14036 rounded_size - size)));
14037 emit_stack_probe (stack_pointer_rtx);
14040 /* Adjust back to account for the additional first interval. */
14041 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
14042 plus_constant (Pmode, stack_pointer_rtx,
14043 PROBE_INTERVAL + dope)));
14045 release_scratch_register_on_entry (&sr);
14048 /* Even if the stack pointer isn't the CFA register, we need to correctly
14049 describe the adjustments made to it, in particular differentiate the
14050 frame-related ones from the frame-unrelated ones. */
14051 if (size > 0)
14053 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
14054 XVECEXP (expr, 0, 0)
14055 = gen_rtx_SET (stack_pointer_rtx,
14056 plus_constant (Pmode, stack_pointer_rtx, -size));
14057 XVECEXP (expr, 0, 1)
14058 = gen_rtx_SET (stack_pointer_rtx,
14059 plus_constant (Pmode, stack_pointer_rtx,
14060 PROBE_INTERVAL + dope + size));
14061 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
14062 RTX_FRAME_RELATED_P (last) = 1;
14064 cfun->machine->fs.sp_offset += size;
14067 /* Make sure nothing is scheduled before we are done. */
14068 emit_insn (gen_blockage ());
14071 /* Adjust the stack pointer up to REG while probing it. */
14073 const char *
14074 output_adjust_stack_and_probe (rtx reg)
14076 static int labelno = 0;
14077 char loop_lab[32];
14078 rtx xops[2];
14080 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
14082 /* Loop. */
14083 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
14085 /* SP = SP + PROBE_INTERVAL. */
14086 xops[0] = stack_pointer_rtx;
14087 xops[1] = GEN_INT (PROBE_INTERVAL);
14088 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
14090 /* Probe at SP. */
14091 xops[1] = const0_rtx;
14092 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
14094 /* Test if SP == LAST_ADDR. */
14095 xops[0] = stack_pointer_rtx;
14096 xops[1] = reg;
14097 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
14099 /* Branch. */
14100 fputs ("\tjne\t", asm_out_file);
14101 assemble_name_raw (asm_out_file, loop_lab);
14102 fputc ('\n', asm_out_file);
14104 return "";
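/* For illustration, assuming the usual 4096-byte PROBE_INTERVAL, the loop
   printed above looks roughly like this in 64-bit AT&T syntax, with %r11
   holding LAST_ADDR:

	.LPSRL0:
		subq	$4096, %rsp
		orq	$0, (%rsp)
		cmpq	%r11, %rsp
		jne	.LPSRL0  */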
14107 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
14108 inclusive. These are offsets from the current stack pointer. */
14110 static void
14111 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
14113 /* See if we have a constant small number of probes to generate. If so,
14114 that's the easy case. The run-time loop is made up of 6 insns in the
14115 generic case while the compile-time loop is made up of n insns for n #
14116 of intervals. */
14117 if (size <= 6 * PROBE_INTERVAL)
14119 HOST_WIDE_INT i;
14121 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
14122 it exceeds SIZE. If only one probe is needed, this will not
14123 generate any code. Then probe at FIRST + SIZE. */
14124 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
14125 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
14126 -(first + i)));
14128 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
14129 -(first + size)));
14132 /* Otherwise, do the same as above, but in a loop. Note that we must be
14133 extra careful with variables wrapping around because we might be at
14134 the very top (or the very bottom) of the address space and we have
14135 to be able to handle this case properly; in particular, we use an
14136 equality test for the loop condition. */
14137 else
14139 HOST_WIDE_INT rounded_size, last;
14140 struct scratch_reg sr;
14142 get_scratch_register_on_entry (&sr);
14145 /* Step 1: round SIZE to the previous multiple of the interval. */
14147 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
14150 /* Step 2: compute initial and final value of the loop counter. */
14152 /* TEST_OFFSET = FIRST. */
14153 emit_move_insn (sr.reg, GEN_INT (-first));
14155 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
14156 last = first + rounded_size;
14159 /* Step 3: the loop
14163 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
14164 probe at TEST_ADDR
14166 while (TEST_ADDR != LAST_ADDR)
14168 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
14169 until it is equal to ROUNDED_SIZE. */
14171 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
14174 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
14175 that SIZE is equal to ROUNDED_SIZE. */
14177 if (size != rounded_size)
14178 emit_stack_probe (plus_constant (Pmode,
14179 gen_rtx_PLUS (Pmode,
14180 stack_pointer_rtx,
14181 sr.reg),
14182 rounded_size - size));
14184 release_scratch_register_on_entry (&sr);
14187 /* Make sure nothing is scheduled before we are done. */
14188 emit_insn (gen_blockage ());
14191 /* Probe a range of stack addresses from REG to END, inclusive. These are
14192 offsets from the current stack pointer. */
14194 const char *
14195 output_probe_stack_range (rtx reg, rtx end)
14197 static int labelno = 0;
14198 char loop_lab[32];
14199 rtx xops[3];
14201 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
14203 /* Loop. */
14204 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
14206 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
14207 xops[0] = reg;
14208 xops[1] = GEN_INT (PROBE_INTERVAL);
14209 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
14211 /* Probe at TEST_ADDR. */
14212 xops[0] = stack_pointer_rtx;
14213 xops[1] = reg;
14214 xops[2] = const0_rtx;
14215 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
14217 /* Test if TEST_ADDR == LAST_ADDR. */
14218 xops[0] = reg;
14219 xops[1] = end;
14220 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
14222 /* Branch. */
14223 fputs ("\tjne\t", asm_out_file);
14224 assemble_name_raw (asm_out_file, loop_lab);
14225 fputc ('\n', asm_out_file);
14227 return "";
14230 /* Finalize stack_realign_needed and frame_pointer_needed flags, which
14231 will guide prologue/epilogue to be generated in correct form. */
14233 static void
14234 ix86_finalize_stack_frame_flags (void)
14236 /* Check if stack realignment is really needed after reload, and
14237 store the result in cfun. */
14238 unsigned int incoming_stack_boundary
14239 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
14240 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
14241 unsigned int stack_realign
14242 = (incoming_stack_boundary
14243 < (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
14244 ? crtl->max_used_stack_slot_alignment
14245 : crtl->stack_alignment_needed));
14246 bool recompute_frame_layout_p = false;
14248 if (crtl->stack_realign_finalized)
14250 /* After stack_realign_needed is finalized, we can no longer
14251 change it. */
14252 gcc_assert (crtl->stack_realign_needed == stack_realign);
14253 return;
14256 /* If the only reason for frame_pointer_needed is that we conservatively
14257 assumed stack realignment might be needed or -fno-omit-frame-pointer
14258 is used, but in the end nothing that needed the stack alignment was
14259 spilled and there was no stack access, clear frame_pointer_needed and say we
14260 don't need stack realignment. */
14261 if ((stack_realign || !flag_omit_frame_pointer)
14262 && frame_pointer_needed
14263 && crtl->is_leaf
14264 && crtl->sp_is_unchanging
14265 && !ix86_current_function_calls_tls_descriptor
14266 && !crtl->accesses_prior_frames
14267 && !cfun->calls_alloca
14268 && !crtl->calls_eh_return
14269 /* See ira_setup_eliminable_regset for the rationale. */
14270 && !(STACK_CHECK_MOVING_SP
14271 && flag_stack_check
14272 && flag_exceptions
14273 && cfun->can_throw_non_call_exceptions)
14274 && !ix86_frame_pointer_required ()
14275 && get_frame_size () == 0
14276 && ix86_nsaved_sseregs () == 0
14277 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
14279 HARD_REG_SET set_up_by_prologue, prologue_used;
14280 basic_block bb;
14282 CLEAR_HARD_REG_SET (prologue_used);
14283 CLEAR_HARD_REG_SET (set_up_by_prologue);
14284 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
14285 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
14286 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
14287 HARD_FRAME_POINTER_REGNUM);
14288 FOR_EACH_BB_FN (bb, cfun)
14290 rtx_insn *insn;
14291 FOR_BB_INSNS (bb, insn)
14292 if (NONDEBUG_INSN_P (insn)
14293 && requires_stack_frame_p (insn, prologue_used,
14294 set_up_by_prologue))
14296 if (crtl->stack_realign_needed != stack_realign)
14297 recompute_frame_layout_p = true;
14298 crtl->stack_realign_needed = stack_realign;
14299 crtl->stack_realign_finalized = true;
14300 if (recompute_frame_layout_p)
14301 ix86_compute_frame_layout ();
14302 return;
14306 /* If drap has been set, but it actually isn't live at the start
14307 of the function, there is no reason to set it up. */
14308 if (crtl->drap_reg)
14310 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
14311 if (! REGNO_REG_SET_P (DF_LR_IN (bb), REGNO (crtl->drap_reg)))
14313 crtl->drap_reg = NULL_RTX;
14314 crtl->need_drap = false;
14317 else
14318 cfun->machine->no_drap_save_restore = true;
14320 frame_pointer_needed = false;
14321 stack_realign = false;
14322 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
14323 crtl->stack_alignment_needed = incoming_stack_boundary;
14324 crtl->stack_alignment_estimated = incoming_stack_boundary;
14325 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
14326 crtl->preferred_stack_boundary = incoming_stack_boundary;
14327 df_finish_pass (true);
14328 df_scan_alloc (NULL);
14329 df_scan_blocks ();
14330 df_compute_regs_ever_live (true);
14331 df_analyze ();
14333 if (flag_var_tracking)
14335 /* Since frame pointer is no longer available, replace it with
14336 stack pointer - UNITS_PER_WORD in debug insns. */
14337 df_ref ref, next;
14338 for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM);
14339 ref; ref = next)
14341 rtx_insn *insn = DF_REF_INSN (ref);
14342 /* Make sure the next ref is for a different instruction,
14343 so that we're not affected by the rescan. */
14344 next = DF_REF_NEXT_REG (ref);
14345 while (next && DF_REF_INSN (next) == insn)
14346 next = DF_REF_NEXT_REG (next);
14348 if (DEBUG_INSN_P (insn))
14350 bool changed = false;
14351 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
14353 rtx *loc = DF_REF_LOC (ref);
14354 if (*loc == hard_frame_pointer_rtx)
14356 *loc = plus_constant (Pmode,
14357 stack_pointer_rtx,
14358 -UNITS_PER_WORD);
14359 changed = true;
14362 if (changed)
14363 df_insn_rescan (insn);
14368 recompute_frame_layout_p = true;
14371 if (crtl->stack_realign_needed != stack_realign)
14372 recompute_frame_layout_p = true;
14373 crtl->stack_realign_needed = stack_realign;
14374 crtl->stack_realign_finalized = true;
14375 if (recompute_frame_layout_p)
14376 ix86_compute_frame_layout ();
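/* For illustration (a hypothetical translation unit, not part of this
   file): a small leaf such as

	static int inc (int x) { return x + 1; }

   compiled at -O2 spills nothing, needs no stack slots, no saved SSE
   registers and no varargs area, so the code above will normally clear
   frame_pointer_needed and stack_realign for it even when they were
   conservatively set before reload.  */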
14379 /* Delete SET_GOT right after entry block if it is allocated to reg. */
14381 static void
14382 ix86_elim_entry_set_got (rtx reg)
14384 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
14385 rtx_insn *c_insn = BB_HEAD (bb);
14386 if (!NONDEBUG_INSN_P (c_insn))
14387 c_insn = next_nonnote_nondebug_insn (c_insn);
14388 if (c_insn && NONJUMP_INSN_P (c_insn))
14390 rtx pat = PATTERN (c_insn);
14391 if (GET_CODE (pat) == PARALLEL)
14393 rtx vec = XVECEXP (pat, 0, 0);
14394 if (GET_CODE (vec) == SET
14395 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
14396 && REGNO (XEXP (vec, 0)) == REGNO (reg))
14397 delete_insn (c_insn);
14402 static rtx
14403 gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store)
14405 rtx addr, mem;
14407 if (offset)
14408 addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset));
14409 mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg);
14410 return gen_rtx_SET (store ? mem : reg, store ? reg : mem);
14413 static inline rtx
14414 gen_frame_load (rtx reg, rtx frame_reg, int offset)
14416 return gen_frame_set (reg, frame_reg, offset, false);
14419 static inline rtx
14420 gen_frame_store (rtx reg, rtx frame_reg, int offset)
14422 return gen_frame_set (reg, frame_reg, offset, true);
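/* Roughly, the RTL built by the two helpers above is

	(set (mem:M (plus:P (reg:P FRAME_REG) (const_int OFFSET))) (reg:M R))	;; store
	(set (reg:M R) (mem:M (plus:P (reg:P FRAME_REG) (const_int OFFSET))))	;; load

   with the PLUS omitted when OFFSET is zero; M and P stand for the
   register's mode and Pmode respectively.  */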
14425 static void
14426 ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame)
14428 struct machine_function *m = cfun->machine;
14429 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
14430 + m->call_ms2sysv_extra_regs;
14431 rtvec v = rtvec_alloc (ncregs + 1);
14432 unsigned int align, i, vi = 0;
14433 rtx_insn *insn;
14434 rtx sym, addr;
14435 rtx rax = gen_rtx_REG (word_mode, AX_REG);
14436 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
14437 HOST_WIDE_INT allocate = frame.stack_pointer_offset - m->fs.sp_offset;
14439 /* AL should only be live with sysv_abi. */
14440 gcc_assert (!ix86_eax_live_at_start_p ());
14442 /* Set up RAX as the stub's base pointer. We use stack_realign_offset
14443 regardless of whether we've actually realigned the stack or not. */
14444 align = GET_MODE_ALIGNMENT (V4SFmode);
14445 addr = choose_baseaddr (frame.stack_realign_offset
14446 + xlogue.get_stub_ptr_offset (), &align);
14447 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
14448 emit_insn (gen_rtx_SET (rax, addr));
14450 /* Allocate stack if not already done. */
14451 if (allocate > 0)
14452 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14453 GEN_INT (-allocate), -1, false);
14455 /* Get the stub symbol. */
14456 sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP
14457 : XLOGUE_STUB_SAVE);
14458 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
14460 for (i = 0; i < ncregs; ++i)
14462 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
14463 rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode),
14464 r.regno);
14465 RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset);
14468 gcc_assert (vi == (unsigned)GET_NUM_ELEM (v));
14470 insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v));
14471 RTX_FRAME_RELATED_P (insn) = true;
14474 /* Expand the prologue into a bunch of separate insns. */
14476 void
14477 ix86_expand_prologue (void)
14479 struct machine_function *m = cfun->machine;
14480 rtx insn, t;
14481 struct ix86_frame frame;
14482 HOST_WIDE_INT allocate;
14483 bool int_registers_saved;
14484 bool sse_registers_saved;
14485 rtx static_chain = NULL_RTX;
14487 if (ix86_function_naked (current_function_decl))
14488 return;
14490 ix86_finalize_stack_frame_flags ();
14492 /* DRAP should not coexist with stack_realign_fp */
14493 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
14495 memset (&m->fs, 0, sizeof (m->fs));
14497 /* Initialize CFA state for before the prologue. */
14498 m->fs.cfa_reg = stack_pointer_rtx;
14499 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
14501 /* Track SP offset to the CFA. We continue tracking this after we've
14502 swapped the CFA register away from SP. In the case of re-alignment
14503 this is fudged; we're interested in offsets within the local frame. */
14504 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
14505 m->fs.sp_valid = true;
14506 m->fs.sp_realigned = false;
14508 frame = m->frame;
14510 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
14512 /* We should have already generated an error for any use of
14513 ms_hook on a nested function. */
14514 gcc_checking_assert (!ix86_static_chain_on_stack);
14516 /* Check if profiling is active and whether we shall use the
14517 profiling-before-prologue variant. If so, sorry. */
14518 if (crtl->profile && flag_fentry != 0)
14519 sorry ("ms_hook_prologue attribute isn%'t compatible "
14520 "with -mfentry for 32-bit");
14522 /* In ix86_asm_output_function_label we emitted:
14523 8b ff movl.s %edi,%edi
14524 55 push %ebp
14525 8b ec movl.s %esp,%ebp
14527 This matches the hookable function prologue in Win32 API
14528 functions in Microsoft Windows XP Service Pack 2 and newer.
14529 Wine uses this to enable Windows apps to hook the Win32 API
14530 functions provided by Wine.
14532 What that means is that we've already set up the frame pointer. */
14534 if (frame_pointer_needed
14535 && !(crtl->drap_reg && crtl->stack_realign_needed))
14537 rtx push, mov;
14539 /* We've decided to use the frame pointer already set up.
14540 Describe this to the unwinder by pretending that both
14541 push and mov insns happen right here.
14543 Putting the unwind info here at the end of the ms_hook
14544 is done so that we can make absolutely certain we get
14545 the required byte sequence at the start of the function,
14546 rather than relying on an assembler that can produce
14547 the exact encoding required.
14549 However it does mean (in the unpatched case) that we have
14550 a 1 insn window where the asynchronous unwind info is
14551 incorrect. However, if we placed the unwind info at
14552 its correct location we would have incorrect unwind info
14553 in the patched case. Which is probably all moot since
14554 I don't expect Wine generates dwarf2 unwind info for the
14555 system libraries that use this feature. */
14557 insn = emit_insn (gen_blockage ());
14559 push = gen_push (hard_frame_pointer_rtx);
14560 mov = gen_rtx_SET (hard_frame_pointer_rtx,
14561 stack_pointer_rtx);
14562 RTX_FRAME_RELATED_P (push) = 1;
14563 RTX_FRAME_RELATED_P (mov) = 1;
14565 RTX_FRAME_RELATED_P (insn) = 1;
14566 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14567 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
14569 /* Note that gen_push incremented m->fs.cfa_offset, even
14570 though we didn't emit the push insn here. */
14571 m->fs.cfa_reg = hard_frame_pointer_rtx;
14572 m->fs.fp_offset = m->fs.cfa_offset;
14573 m->fs.fp_valid = true;
14575 else
14577 /* The frame pointer is not needed so pop %ebp again.
14578 This leaves us with a pristine state. */
14579 emit_insn (gen_pop (hard_frame_pointer_rtx));
14583 /* The first insn of a function that accepts its static chain on the
14584 stack is to push the register that would be filled in by a direct
14585 call. This insn will be skipped by the trampoline. */
14586 else if (ix86_static_chain_on_stack)
14588 static_chain = ix86_static_chain (cfun->decl, false);
14589 insn = emit_insn (gen_push (static_chain));
14590 emit_insn (gen_blockage ());
14592 /* We don't want to interpret this push insn as a register save,
14593 only as a stack adjustment. The real copy of the register as
14594 a save will be done later, if needed. */
14595 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
14596 t = gen_rtx_SET (stack_pointer_rtx, t);
14597 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
14598 RTX_FRAME_RELATED_P (insn) = 1;
14601 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
14602 DRAP is needed and stack realignment is really needed after reload. */
14603 if (stack_realign_drap)
14605 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
14607 /* Can't use DRAP in interrupt function. */
14608 if (cfun->machine->func_type != TYPE_NORMAL)
14609 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
14610 "in interrupt service routine. This may be worked "
14611 "around by avoiding functions with aggregate return.");
14613 /* Only need to push parameter pointer reg if it is caller saved. */
14614 if (!call_used_regs[REGNO (crtl->drap_reg)])
14616 /* Push arg pointer reg */
14617 insn = emit_insn (gen_push (crtl->drap_reg));
14618 RTX_FRAME_RELATED_P (insn) = 1;
14621 /* Grab the argument pointer. */
14622 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
14623 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14624 RTX_FRAME_RELATED_P (insn) = 1;
14625 m->fs.cfa_reg = crtl->drap_reg;
14626 m->fs.cfa_offset = 0;
14628 /* Align the stack. */
14629 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
14630 stack_pointer_rtx,
14631 GEN_INT (-align_bytes)));
14632 RTX_FRAME_RELATED_P (insn) = 1;
14634 /* Replicate the return address on the stack so that return
14635 address can be reached via (argp - 1) slot. This is needed
14636 to implement macro RETURN_ADDR_RTX and intrinsic function
14637 expand_builtin_return_addr etc. */
14638 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
14639 t = gen_frame_mem (word_mode, t);
14640 insn = emit_insn (gen_push (t));
14641 RTX_FRAME_RELATED_P (insn) = 1;
14643 /* For the purposes of frame and register save area addressing,
14644 we've started over with a new frame. */
14645 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
14646 m->fs.realigned = true;
14648 if (static_chain)
14650 /* Replicate static chain on the stack so that static chain
14651 can be reached via (argp - 2) slot. This is needed for
14652 nested function with stack realignment. */
14653 insn = emit_insn (gen_push (static_chain));
14654 RTX_FRAME_RELATED_P (insn) = 1;
14658 int_registers_saved = (frame.nregs == 0);
14659 sse_registers_saved = (frame.nsseregs == 0);
14661 if (frame_pointer_needed && !m->fs.fp_valid)
14663 /* Note: AT&T enter does NOT have reversed args. Enter is probably
14664 slower on all targets. Also sdb doesn't like it. */
14665 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
14666 RTX_FRAME_RELATED_P (insn) = 1;
14668 /* Push registers now, before setting the frame pointer
14669 on SEH target. */
14670 if (!int_registers_saved
14671 && TARGET_SEH
14672 && !frame.save_regs_using_mov)
14674 ix86_emit_save_regs ();
14675 int_registers_saved = true;
14676 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
14679 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
14681 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
14682 RTX_FRAME_RELATED_P (insn) = 1;
14684 if (m->fs.cfa_reg == stack_pointer_rtx)
14685 m->fs.cfa_reg = hard_frame_pointer_rtx;
14686 m->fs.fp_offset = m->fs.sp_offset;
14687 m->fs.fp_valid = true;
14691 if (!int_registers_saved)
14693 /* If saving registers via PUSH, do so now. */
14694 if (!frame.save_regs_using_mov)
14696 ix86_emit_save_regs ();
14697 int_registers_saved = true;
14698 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
14701 /* When using the red zone we may start register saving before allocating
14702 the stack frame, saving one cycle of the prologue. However, avoid
14703 doing this if we have to probe the stack; at least on x86_64 the
14704 stack probe can turn into a call that clobbers a red zone location. */
14705 else if (ix86_using_red_zone ()
14706 && (! TARGET_STACK_PROBE
14707 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
14709 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
14710 int_registers_saved = true;
14714 if (stack_realign_fp)
14716 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
14717 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
14719 /* Record last valid frame pointer offset. */
14720 m->fs.sp_realigned_fp_last = frame.reg_save_offset;
14722 /* The computation of the size of the re-aligned stack frame means
14723 that we must allocate the size of the register save area before
14724 performing the actual alignment. Otherwise we cannot guarantee
14725 that there's enough storage above the realignment point. */
14726 allocate = frame.reg_save_offset - m->fs.sp_offset
14727 + frame.stack_realign_allocate;
14728 if (allocate)
14729 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14730 GEN_INT (-allocate), -1, false);
14732 /* Align the stack. */
14733 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
14734 stack_pointer_rtx,
14735 GEN_INT (-align_bytes)));
14736 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
14737 m->fs.sp_realigned_offset = m->fs.sp_offset
14738 - frame.stack_realign_allocate;
14739 /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset.
14740 Beyond this point, stack access should be done via choose_baseaddr or
14741 by using sp_valid_at and fp_valid_at to determine the correct base
14742 register. Henceforth, any CFA offset should be thought of as logical
14743 and not physical. */
14744 gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last);
14745 gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset);
14746 m->fs.sp_realigned = true;
14748 /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which
14749 is needed to describe where a register is saved using a realigned
14750 stack pointer, so we need to invalidate the stack pointer for that
14751 target. */
14752 if (TARGET_SEH)
14753 m->fs.sp_valid = false;
14756 if (m->call_ms2sysv)
14757 ix86_emit_outlined_ms2sysv_save (frame);
14759 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
14761 if (flag_stack_usage_info)
14763 /* We start to count from ARG_POINTER. */
14764 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
14766 /* If it was realigned, take into account the fake frame. */
14767 if (stack_realign_drap)
14769 if (ix86_static_chain_on_stack)
14770 stack_size += UNITS_PER_WORD;
14772 if (!call_used_regs[REGNO (crtl->drap_reg)])
14773 stack_size += UNITS_PER_WORD;
14775 /* This over-estimates by 1 minimal-stack-alignment-unit but
14776 mitigates that by counting in the new return address slot. */
14777 current_function_dynamic_stack_size
14778 += crtl->stack_alignment_needed / BITS_PER_UNIT;
14781 current_function_static_stack_size = stack_size;
14784 /* On SEH target with very large frame size, allocate an area to save
14785 SSE registers (as the very large allocation won't be described). */
14786 if (TARGET_SEH
14787 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
14788 && !sse_registers_saved)
14790 HOST_WIDE_INT sse_size =
14791 frame.sse_reg_save_offset - frame.reg_save_offset;
14793 gcc_assert (int_registers_saved);
14795 /* No need to do stack checking as the area will be immediately
14796 written. */
14797 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14798 GEN_INT (-sse_size), -1,
14799 m->fs.cfa_reg == stack_pointer_rtx);
14800 allocate -= sse_size;
14801 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
14802 sse_registers_saved = true;
14805 /* The stack has already been decremented by the instruction calling us
14806 so probe if the size is non-negative to preserve the protection area. */
14807 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
14809 /* We expect the GP registers to be saved when probes are used. */
14810 gcc_assert (int_registers_saved);
14812 if (STACK_CHECK_MOVING_SP)
14814 if (!(crtl->is_leaf && !cfun->calls_alloca
14815 && allocate <= PROBE_INTERVAL))
14817 ix86_adjust_stack_and_probe (allocate);
14818 allocate = 0;
14821 else
14823 HOST_WIDE_INT size = allocate;
14825 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
14826 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
14828 if (TARGET_STACK_PROBE)
14830 if (crtl->is_leaf && !cfun->calls_alloca)
14832 if (size > PROBE_INTERVAL)
14833 ix86_emit_probe_stack_range (0, size);
14835 else
14836 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
14838 else
14840 if (crtl->is_leaf && !cfun->calls_alloca)
14842 if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT)
14843 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT,
14844 size - STACK_CHECK_PROTECT);
14846 else
14847 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
14852 if (allocate == 0)
14854 else if (!ix86_target_stack_probe ()
14855 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
14857 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14858 GEN_INT (-allocate), -1,
14859 m->fs.cfa_reg == stack_pointer_rtx);
14861 else
14863 rtx eax = gen_rtx_REG (Pmode, AX_REG);
14864 rtx r10 = NULL;
14865 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
14866 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
14867 bool eax_live = ix86_eax_live_at_start_p ();
14868 bool r10_live = false;
14870 if (TARGET_64BIT)
14871 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
14873 if (eax_live)
14875 insn = emit_insn (gen_push (eax));
14876 allocate -= UNITS_PER_WORD;
14877 /* Note that SEH directives need to continue tracking the stack
14878 pointer even after the frame pointer has been set up. */
14879 if (sp_is_cfa_reg || TARGET_SEH)
14881 if (sp_is_cfa_reg)
14882 m->fs.cfa_offset += UNITS_PER_WORD;
14883 RTX_FRAME_RELATED_P (insn) = 1;
14884 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14885 gen_rtx_SET (stack_pointer_rtx,
14886 plus_constant (Pmode, stack_pointer_rtx,
14887 -UNITS_PER_WORD)));
14891 if (r10_live)
14893 r10 = gen_rtx_REG (Pmode, R10_REG);
14894 insn = emit_insn (gen_push (r10));
14895 allocate -= UNITS_PER_WORD;
14896 if (sp_is_cfa_reg || TARGET_SEH)
14898 if (sp_is_cfa_reg)
14899 m->fs.cfa_offset += UNITS_PER_WORD;
14900 RTX_FRAME_RELATED_P (insn) = 1;
14901 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14902 gen_rtx_SET (stack_pointer_rtx,
14903 plus_constant (Pmode, stack_pointer_rtx,
14904 -UNITS_PER_WORD)));
14908 emit_move_insn (eax, GEN_INT (allocate));
14909 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
14911 /* Use the fact that AX still contains ALLOCATE. */
14912 adjust_stack_insn = (Pmode == DImode
14913 ? gen_pro_epilogue_adjust_stack_di_sub
14914 : gen_pro_epilogue_adjust_stack_si_sub);
14916 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
14917 stack_pointer_rtx, eax));
14919 if (sp_is_cfa_reg || TARGET_SEH)
14921 if (sp_is_cfa_reg)
14922 m->fs.cfa_offset += allocate;
14923 RTX_FRAME_RELATED_P (insn) = 1;
14924 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14925 gen_rtx_SET (stack_pointer_rtx,
14926 plus_constant (Pmode, stack_pointer_rtx,
14927 -allocate)));
14929 m->fs.sp_offset += allocate;
14931 /* Use stack_pointer_rtx for relative addressing so that code
14932 works for realigned stack, too. */
14933 if (r10_live && eax_live)
14935 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14936 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
14937 gen_frame_mem (word_mode, t));
14938 t = plus_constant (Pmode, t, UNITS_PER_WORD);
14939 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
14940 gen_frame_mem (word_mode, t));
14942 else if (eax_live || r10_live)
14944 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14945 emit_move_insn (gen_rtx_REG (word_mode,
14946 (eax_live ? AX_REG : R10_REG)),
14947 gen_frame_mem (word_mode, t));
14950 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
14952 /* If we haven't already set up the frame pointer, do so now. */
14953 if (frame_pointer_needed && !m->fs.fp_valid)
14955 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
14956 GEN_INT (frame.stack_pointer_offset
14957 - frame.hard_frame_pointer_offset));
14958 insn = emit_insn (insn);
14959 RTX_FRAME_RELATED_P (insn) = 1;
14960 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
14962 if (m->fs.cfa_reg == stack_pointer_rtx)
14963 m->fs.cfa_reg = hard_frame_pointer_rtx;
14964 m->fs.fp_offset = frame.hard_frame_pointer_offset;
14965 m->fs.fp_valid = true;
14968 if (!int_registers_saved)
14969 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
14970 if (!sse_registers_saved)
14971 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
14973 /* For mcount profiling in 32-bit PIC mode we need to emit SET_GOT
14974 in the prologue. */
14975 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
14977 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
14978 insn = emit_insn (gen_set_got (pic));
14979 RTX_FRAME_RELATED_P (insn) = 1;
14980 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
14981 emit_insn (gen_prologue_use (pic));
14982 /* Delete an already-emitted SET_GOT if it exists and is allocated to
14983 REAL_PIC_OFFSET_TABLE_REGNUM. */
14984 ix86_elim_entry_set_got (pic);
14987 if (crtl->drap_reg && !crtl->stack_realign_needed)
14989 /* vDRAP is set up, but after reload it turns out stack realignment
14990 isn't necessary; here we emit prologue code to set up DRAP
14991 without the stack realignment adjustment. */
14992 t = choose_baseaddr (0, NULL);
14993 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14996 /* Prevent instructions from being scheduled into register save push
14997 sequence when access to the redzone area is done through frame pointer.
14998 The offset between the frame pointer and the stack pointer is calculated
14999 relative to the value of the stack pointer at the end of the function
15000 prologue, and moving instructions that access redzone area via frame
15001 pointer inside push sequence violates this assumption. */
15002 if (frame_pointer_needed && frame.red_zone_size)
15003 emit_insn (gen_memory_blockage ());
15005 /* SEH requires that the prologue end within 256 bytes of the start of
15006 the function. Prevent instruction schedules that would extend that.
15007 Further, prevent alloca modifications to the stack pointer from being
15008 combined with prologue modifications. */
15009 if (TARGET_SEH)
15010 emit_insn (gen_prologue_use (stack_pointer_rtx));
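/* As an illustrative example of the DRAP path above (typical -m32
   -mstackrealign output; exact registers and alignment are assumptions),
   the emitted prologue looks roughly like

	leal	4(%esp), %ecx		# DRAP <- address of incoming args
	andl	$-16, %esp		# align the stack
	pushl	-4(%ecx)		# replicate the return address
	pushl	%ebp			# usual frame pointer setup
	movl	%esp, %ebp
	pushl	%ecx			# save the DRAP register

   so that the incoming arguments stay reachable through %ecx after the
   realignment.  */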
15013 /* Emit code to restore REG using a POP insn. */
15015 static void
15016 ix86_emit_restore_reg_using_pop (rtx reg)
15018 struct machine_function *m = cfun->machine;
15019 rtx_insn *insn = emit_insn (gen_pop (reg));
15021 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
15022 m->fs.sp_offset -= UNITS_PER_WORD;
15024 if (m->fs.cfa_reg == crtl->drap_reg
15025 && REGNO (reg) == REGNO (crtl->drap_reg))
15027 /* Previously we'd represented the CFA as an expression
15028 like *(%ebp - 8). We've just popped that value from
15029 the stack, which means we need to reset the CFA to
15030 the drap register. This will remain until we restore
15031 the stack pointer. */
15032 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
15033 RTX_FRAME_RELATED_P (insn) = 1;
15035 /* This means that the DRAP register is valid for addressing too. */
15036 m->fs.drap_valid = true;
15037 return;
15040 if (m->fs.cfa_reg == stack_pointer_rtx)
15042 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
15043 x = gen_rtx_SET (stack_pointer_rtx, x);
15044 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
15045 RTX_FRAME_RELATED_P (insn) = 1;
15047 m->fs.cfa_offset -= UNITS_PER_WORD;
15050 /* When the frame pointer is the CFA, and we pop it, we are
15051 swapping back to the stack pointer as the CFA. This happens
15052 for stack frames that don't allocate other data, so we assume
15053 the stack pointer is now pointing at the return address, i.e.
15054 the function entry state, which makes the offset be 1 word. */
15055 if (reg == hard_frame_pointer_rtx)
15057 m->fs.fp_valid = false;
15058 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
15060 m->fs.cfa_reg = stack_pointer_rtx;
15061 m->fs.cfa_offset -= UNITS_PER_WORD;
15063 add_reg_note (insn, REG_CFA_DEF_CFA,
15064 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15065 GEN_INT (m->fs.cfa_offset)));
15066 RTX_FRAME_RELATED_P (insn) = 1;
15071 /* Emit code to restore saved registers using POP insns. */
15073 static void
15074 ix86_emit_restore_regs_using_pop (void)
15076 unsigned int regno;
15078 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
15079 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
15080 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
15083 /* Emit code and notes for the LEAVE instruction. If insn is non-null,
15084 omits the emit and only attaches the notes. */
15086 static void
15087 ix86_emit_leave (rtx_insn *insn)
15089 struct machine_function *m = cfun->machine;
15090 if (!insn)
15091 insn = emit_insn (ix86_gen_leave ());
15093 ix86_add_queued_cfa_restore_notes (insn);
15095 gcc_assert (m->fs.fp_valid);
15096 m->fs.sp_valid = true;
15097 m->fs.sp_realigned = false;
15098 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
15099 m->fs.fp_valid = false;
15101 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
15103 m->fs.cfa_reg = stack_pointer_rtx;
15104 m->fs.cfa_offset = m->fs.sp_offset;
15106 add_reg_note (insn, REG_CFA_DEF_CFA,
15107 plus_constant (Pmode, stack_pointer_rtx,
15108 m->fs.sp_offset));
15109 RTX_FRAME_RELATED_P (insn) = 1;
15111 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
15112 m->fs.fp_offset);
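/* The leave instruction emitted above is equivalent to
	mov	%ebp, %esp	(movq %rbp, %rsp in 64-bit mode)
	pop	%ebp
   which is why sp_offset becomes fp_offset - UNITS_PER_WORD and, when the
   frame pointer was the CFA register, the CFA moves back to the stack
   pointer.  */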
15115 /* Emit code to restore saved registers using MOV insns.
15116 First register is restored from CFA - CFA_OFFSET. */
15117 static void
15118 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
15119 bool maybe_eh_return)
15121 struct machine_function *m = cfun->machine;
15122 unsigned int regno;
15124 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
15125 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
15127 rtx reg = gen_rtx_REG (word_mode, regno);
15128 rtx mem;
15129 rtx_insn *insn;
15131 mem = choose_baseaddr (cfa_offset, NULL);
15132 mem = gen_frame_mem (word_mode, mem);
15133 insn = emit_move_insn (reg, mem);
15135 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
15137 /* Previously we'd represented the CFA as an expression
15138 like *(%ebp - 8). We've just popped that value from
15139 the stack, which means we need to reset the CFA to
15140 the drap register. This will remain until we restore
15141 the stack pointer. */
15142 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
15143 RTX_FRAME_RELATED_P (insn) = 1;
15145 /* This means that the DRAP register is valid for addressing. */
15146 m->fs.drap_valid = true;
15148 else
15149 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
15151 cfa_offset -= UNITS_PER_WORD;
15155 /* Emit code to restore saved SSE registers using MOV insns.
15156 The first register is restored from CFA - CFA_OFFSET. */
15157 static void
15158 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
15159 bool maybe_eh_return)
15161 unsigned int regno;
15163 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
15164 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
15166 rtx reg = gen_rtx_REG (V4SFmode, regno);
15167 rtx mem;
15168 unsigned int align = GET_MODE_ALIGNMENT (V4SFmode);
15170 mem = choose_baseaddr (cfa_offset, &align);
15171 mem = gen_rtx_MEM (V4SFmode, mem);
15173 /* The location alignment depends upon the base register. */
15174 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align);
15175 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
15176 set_mem_align (mem, align);
15177 emit_insn (gen_rtx_SET (reg, mem));
15179 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
15181 cfa_offset -= GET_MODE_SIZE (V4SFmode);
15185 static void
15186 ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame,
15187 bool use_call, int style)
15189 struct machine_function *m = cfun->machine;
15190 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
15191 + m->call_ms2sysv_extra_regs;
15192 rtvec v;
15193 unsigned int elems_needed, align, i, vi = 0;
15194 rtx_insn *insn;
15195 rtx sym, tmp;
15196 rtx rsi = gen_rtx_REG (word_mode, SI_REG);
15197 rtx r10 = NULL_RTX;
15198 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
15199 HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset ();
15200 HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset;
15201 rtx rsi_frame_load = NULL_RTX;
15202 HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1;
15203 enum xlogue_stub stub;
15205 gcc_assert (!m->fs.fp_valid || frame_pointer_needed);
15207 /* If using a realigned stack, we should never start with padding. */
15208 gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ());
15210 /* Setup RSI as the stub's base pointer. */
15211 align = GET_MODE_ALIGNMENT (V4SFmode);
15212 tmp = choose_baseaddr (rsi_offset, &align);
15213 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
15214 emit_insn (gen_rtx_SET (rsi, tmp));
15216 /* Get a symbol for the stub. */
15217 if (frame_pointer_needed)
15218 stub = use_call ? XLOGUE_STUB_RESTORE_HFP
15219 : XLOGUE_STUB_RESTORE_HFP_TAIL;
15220 else
15221 stub = use_call ? XLOGUE_STUB_RESTORE
15222 : XLOGUE_STUB_RESTORE_TAIL;
15223 sym = xlogue.get_stub_rtx (stub);
15225 elems_needed = ncregs;
15226 if (use_call)
15227 elems_needed += 1;
15228 else
15229 elems_needed += frame_pointer_needed ? 5 : 3;
15230 v = rtvec_alloc (elems_needed);
15232 /* We call the epilogue stub when we need to pop incoming args or we are
15233 doing a sibling call as the tail. Otherwise, we will emit a jmp to the
15234 epilogue stub and it is the tail-call. */
15235 if (use_call)
15236 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
15237 else
15239 RTVEC_ELT (v, vi++) = ret_rtx;
15240 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
15241 if (frame_pointer_needed)
15243 rtx rbp = gen_rtx_REG (DImode, BP_REG);
15244 gcc_assert (m->fs.fp_valid);
15245 gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx);
15247 tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8));
15248 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp);
15249 RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp));
15250 tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
15251 RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp);
15253 else
15255 /* If no hard frame pointer, we set R10 to the SP restore value. */
15256 gcc_assert (!m->fs.fp_valid);
15257 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
15258 gcc_assert (m->fs.sp_valid);
15260 r10 = gen_rtx_REG (DImode, R10_REG);
15261 tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset));
15262 emit_insn (gen_rtx_SET (r10, tmp));
15264 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10);
15268 /* Generate frame load insns and restore notes. */
15269 for (i = 0; i < ncregs; ++i)
15271 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
15272 machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode;
15273 rtx reg, frame_load;
15275 reg = gen_rtx_REG (mode, r.regno);
15276 frame_load = gen_frame_load (reg, rsi, r.offset);
15278 /* Save RSI frame load insn & note to add last. */
15279 if (r.regno == SI_REG)
15281 gcc_assert (!rsi_frame_load);
15282 rsi_frame_load = frame_load;
15283 rsi_restore_offset = r.offset;
15285 else
15287 RTVEC_ELT (v, vi++) = frame_load;
15288 ix86_add_cfa_restore_note (NULL, reg, r.offset);
15292 /* Add RSI frame load & restore note at the end. */
15293 gcc_assert (rsi_frame_load);
15294 gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1);
15295 RTVEC_ELT (v, vi++) = rsi_frame_load;
15296 ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG),
15297 rsi_restore_offset);
15299 /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */
15300 if (!use_call && !frame_pointer_needed)
15302 gcc_assert (m->fs.sp_valid);
15303 gcc_assert (!m->fs.sp_realigned);
15305 /* At this point, R10 should point to frame.stack_realign_offset. */
15306 if (m->fs.cfa_reg == stack_pointer_rtx)
15307 m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset;
15308 m->fs.sp_offset = frame.stack_realign_offset;
15311 gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v));
15312 tmp = gen_rtx_PARALLEL (VOIDmode, v);
15313 if (use_call)
15314 insn = emit_insn (tmp);
15315 else
15317 insn = emit_jump_insn (tmp);
15318 JUMP_LABEL (insn) = ret_rtx;
15320 if (frame_pointer_needed)
15321 ix86_emit_leave (insn);
15322 else
15324 /* Need CFA adjust note. */
15325 tmp = gen_rtx_SET (stack_pointer_rtx, r10);
15326 add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp);
15330 RTX_FRAME_RELATED_P (insn) = true;
15331 ix86_add_queued_cfa_restore_notes (insn);
15333 /* If we're not doing a tail-call, we need to adjust the stack. */
15334 if (use_call && m->fs.sp_valid)
15336 HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset;
15337 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15338 GEN_INT (dealloc), style,
15339 m->fs.cfa_reg == stack_pointer_rtx);
15343 /* Restore function stack, frame, and registers. */
15345 void
15346 ix86_expand_epilogue (int style)
15348 struct machine_function *m = cfun->machine;
15349 struct machine_frame_state frame_state_save = m->fs;
15350 struct ix86_frame frame;
15351 bool restore_regs_via_mov;
15352 bool using_drap;
15353 bool restore_stub_is_tail = false;
15355 if (ix86_function_naked (current_function_decl))
15357 /* The program should not reach this point. */
15358 emit_insn (gen_ud2 ());
15359 return;
15362 ix86_finalize_stack_frame_flags ();
15363 frame = m->frame;
15365 m->fs.sp_realigned = stack_realign_fp;
15366 m->fs.sp_valid = stack_realign_fp
15367 || !frame_pointer_needed
15368 || crtl->sp_is_unchanging;
15369 gcc_assert (!m->fs.sp_valid
15370 || m->fs.sp_offset == frame.stack_pointer_offset);
15372 /* The FP must be valid if the frame pointer is present. */
15373 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
15374 gcc_assert (!m->fs.fp_valid
15375 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
15377 /* We must have *some* valid pointer to the stack frame. */
15378 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
15380 /* The DRAP is never valid at this point. */
15381 gcc_assert (!m->fs.drap_valid);
15383 /* See the comment about red zone and frame
15384 pointer usage in ix86_expand_prologue. */
15385 if (frame_pointer_needed && frame.red_zone_size)
15386 emit_insn (gen_memory_blockage ());
15388 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
15389 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
15391 /* Determine the CFA offset of the end of the red-zone. */
15392 m->fs.red_zone_offset = 0;
15393 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
15395 /* The red-zone begins below return address and error code in
15396 exception handler. */
15397 m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET;
15399 /* When the register save area is in the aligned portion of
15400 the stack, determine the maximum runtime displacement that
15401 matches up with the aligned frame. */
15402 if (stack_realign_drap)
15403 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
15404 + UNITS_PER_WORD);
15407 /* Special care must be taken for the normal return case of a function
15408 using eh_return: the eax and edx registers are marked as saved, but
15409 not restored along this path. Adjust the save location to match. */
15410 if (crtl->calls_eh_return && style != 2)
15411 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
15413 /* EH_RETURN requires the use of moves to function properly. */
15414 if (crtl->calls_eh_return)
15415 restore_regs_via_mov = true;
15416 /* SEH requires the use of pops to identify the epilogue. */
15417 else if (TARGET_SEH)
15418 restore_regs_via_mov = false;
15419 /* If we're only restoring one register and sp cannot be used, then
15420 use a move instruction to restore the register, since it's
15421 less work than reloading sp and popping the register. */
15422 else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
15423 restore_regs_via_mov = true;
15424 else if (TARGET_EPILOGUE_USING_MOVE
15425 && cfun->machine->use_fast_prologue_epilogue
15426 && (frame.nregs > 1
15427 || m->fs.sp_offset != frame.reg_save_offset))
15428 restore_regs_via_mov = true;
15429 else if (frame_pointer_needed
15430 && !frame.nregs
15431 && m->fs.sp_offset != frame.reg_save_offset)
15432 restore_regs_via_mov = true;
15433 else if (frame_pointer_needed
15434 && TARGET_USE_LEAVE
15435 && cfun->machine->use_fast_prologue_epilogue
15436 && frame.nregs == 1)
15437 restore_regs_via_mov = true;
15438 else
15439 restore_regs_via_mov = false;
15441 if (restore_regs_via_mov || frame.nsseregs)
15443 /* Ensure that the entire register save area is addressable via
15444 the stack pointer, if we will restore SSE regs via sp. */
15445 if (TARGET_64BIT
15446 && m->fs.sp_offset > 0x7fffffff
15447 && sp_valid_at (frame.stack_realign_offset)
15448 && (frame.nsseregs + frame.nregs) != 0)
15450 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15451 GEN_INT (m->fs.sp_offset
15452 - frame.sse_reg_save_offset),
15453 style,
15454 m->fs.cfa_reg == stack_pointer_rtx);
15458 /* If there are any SSE registers to restore, then we have to do it
15459 via moves, since there's obviously no pop for SSE regs. */
15460 if (frame.nsseregs)
15461 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
15462 style == 2);
15464 if (m->call_ms2sysv)
15466 int pop_incoming_args = crtl->args.pops_args && crtl->args.size;
15468 /* We cannot use a tail-call for the stub if:
15469 1. We have to pop incoming args,
15470 2. We have additional int regs to restore, or
15471 3. A sibling call will be the tail-call, or
15472 4. We are emitting an eh_return_internal epilogue.
15474 TODO: Item 4 has not yet been tested!
15476 If any of the above are true, we will call the stub rather than
15477 jump to it. */
15478 restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1);
15479 ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style);
15482 /* If using an out-of-line stub that is a tail-call, then... */
15483 if (m->call_ms2sysv && restore_stub_is_tail)
15485 /* TODO: paranoid tests. (remove eventually) */
15486 gcc_assert (m->fs.sp_valid);
15487 gcc_assert (!m->fs.sp_realigned);
15488 gcc_assert (!m->fs.fp_valid);
15489 gcc_assert (!m->fs.realigned);
15490 gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
15491 gcc_assert (!crtl->drap_reg);
15492 gcc_assert (!frame.nregs);
15494 else if (restore_regs_via_mov)
15496 rtx t;
15498 if (frame.nregs)
15499 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
15501 /* eh_return epilogues need %ecx added to the stack pointer. */
15502 if (style == 2)
15504 rtx sa = EH_RETURN_STACKADJ_RTX;
15505 rtx_insn *insn;
15507 /* %ecx can't be used for both DRAP register and eh_return. */
15508 if (crtl->drap_reg)
15509 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
15511 /* regparm nested functions don't work with eh_return. */
15512 gcc_assert (!ix86_static_chain_on_stack);
15514 if (frame_pointer_needed)
15516 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
15517 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
15518 emit_insn (gen_rtx_SET (sa, t));
15520 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
15521 insn = emit_move_insn (hard_frame_pointer_rtx, t);
15523 /* Note that we use SA as a temporary CFA, as the return
15524 address is at the proper place relative to it. We
15525 pretend this happens at the FP restore insn because
15526 prior to this insn the FP would be stored at the wrong
15527 offset relative to SA, and after this insn we have no
15528 other reasonable register to use for the CFA. We don't
15529 bother resetting the CFA to the SP for the duration of
15530 the return insn. */
15531 add_reg_note (insn, REG_CFA_DEF_CFA,
15532 plus_constant (Pmode, sa, UNITS_PER_WORD));
15533 ix86_add_queued_cfa_restore_notes (insn);
15534 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
15535 RTX_FRAME_RELATED_P (insn) = 1;
15537 m->fs.cfa_reg = sa;
15538 m->fs.cfa_offset = UNITS_PER_WORD;
15539 m->fs.fp_valid = false;
15541 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
15542 const0_rtx, style, false);
15544 else
15546 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
15547 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
15548 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
15549 ix86_add_queued_cfa_restore_notes (insn);
15551 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
15552 if (m->fs.cfa_offset != UNITS_PER_WORD)
15554 m->fs.cfa_offset = UNITS_PER_WORD;
15555 add_reg_note (insn, REG_CFA_DEF_CFA,
15556 plus_constant (Pmode, stack_pointer_rtx,
15557 UNITS_PER_WORD));
15558 RTX_FRAME_RELATED_P (insn) = 1;
15561 m->fs.sp_offset = UNITS_PER_WORD;
15562 m->fs.sp_valid = true;
15563 m->fs.sp_realigned = false;
15566 else
15568 /* SEH requires that the function end with (1) a stack adjustment
15569 if necessary, (2) a sequence of pops, and (3) a return or
15570 jump instruction. Prevent insns from the function body from
15571 being scheduled into this sequence. */
15572 if (TARGET_SEH)
15574 /* Prevent a catch region from being adjacent to the standard
15575 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
15576 several other flags that would be interesting to test are
15577 yet set up. */
15578 if (flag_non_call_exceptions)
15579 emit_insn (gen_nops (const1_rtx));
15580 else
15581 emit_insn (gen_blockage ());
15584 /* First step is to deallocate the stack frame so that we can
15585 pop the registers. If the stack pointer was realigned, it needs
15586 to be restored now. Also do it on SEH target for very large
15587 frame as the emitted instructions aren't allowed by the ABI
15588 in epilogues. */
15589 if (!m->fs.sp_valid || m->fs.sp_realigned
15590 || (TARGET_SEH
15591 && (m->fs.sp_offset - frame.reg_save_offset
15592 >= SEH_MAX_FRAME_SIZE)))
15594 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
15595 GEN_INT (m->fs.fp_offset
15596 - frame.reg_save_offset),
15597 style, false);
15599 else if (m->fs.sp_offset != frame.reg_save_offset)
15601 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15602 GEN_INT (m->fs.sp_offset
15603 - frame.reg_save_offset),
15604 style,
15605 m->fs.cfa_reg == stack_pointer_rtx);
15608 ix86_emit_restore_regs_using_pop ();
15611 /* If we used a frame pointer and haven't already got rid of it,
15612 then do so now. */
15613 if (m->fs.fp_valid)
15615 /* If the stack pointer is valid and pointing at the frame
15616 pointer store address, then we only need a pop. */
15617 if (sp_valid_at (frame.hfp_save_offset)
15618 && m->fs.sp_offset == frame.hfp_save_offset)
15619 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
15620 /* Leave results in shorter dependency chains on CPUs that are
15621 able to grok it fast. */
15622 else if (TARGET_USE_LEAVE
15623 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
15624 || !cfun->machine->use_fast_prologue_epilogue)
15625 ix86_emit_leave (NULL);
15626 else
15628 pro_epilogue_adjust_stack (stack_pointer_rtx,
15629 hard_frame_pointer_rtx,
15630 const0_rtx, style, !using_drap);
15631 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
15635 if (using_drap)
15637 int param_ptr_offset = UNITS_PER_WORD;
15638 rtx_insn *insn;
15640 gcc_assert (stack_realign_drap);
15642 if (ix86_static_chain_on_stack)
15643 param_ptr_offset += UNITS_PER_WORD;
15644 if (!call_used_regs[REGNO (crtl->drap_reg)])
15645 param_ptr_offset += UNITS_PER_WORD;
15647 insn = emit_insn (gen_rtx_SET
15648 (stack_pointer_rtx,
15649 gen_rtx_PLUS (Pmode,
15650 crtl->drap_reg,
15651 GEN_INT (-param_ptr_offset))));
15652 m->fs.cfa_reg = stack_pointer_rtx;
15653 m->fs.cfa_offset = param_ptr_offset;
15654 m->fs.sp_offset = param_ptr_offset;
15655 m->fs.realigned = false;
15657 add_reg_note (insn, REG_CFA_DEF_CFA,
15658 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15659 GEN_INT (param_ptr_offset)));
15660 RTX_FRAME_RELATED_P (insn) = 1;
15662 if (!call_used_regs[REGNO (crtl->drap_reg)])
15663 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
15666 /* At this point the stack pointer must be valid, and we must have
15667 restored all of the registers. We may not have deallocated the
15668 entire stack frame. We've delayed this until now because it may
15669 be possible to merge the local stack deallocation with the
15670 deallocation forced by ix86_static_chain_on_stack. */
15671 gcc_assert (m->fs.sp_valid);
15672 gcc_assert (!m->fs.sp_realigned);
15673 gcc_assert (!m->fs.fp_valid);
15674 gcc_assert (!m->fs.realigned);
15675 if (m->fs.sp_offset != UNITS_PER_WORD)
15677 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15678 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
15679 style, true);
15681 else
15682 ix86_add_queued_cfa_restore_notes (get_last_insn ());
15684 /* Sibcall epilogues don't want a return instruction. */
15685 if (style == 0)
15687 m->fs = frame_state_save;
15688 return;
15691 if (cfun->machine->func_type != TYPE_NORMAL)
15692 emit_jump_insn (gen_interrupt_return ());
15693 else if (crtl->args.pops_args && crtl->args.size)
15695 rtx popc = GEN_INT (crtl->args.pops_args);
15697 /* i386 can only pop 64K bytes. If asked to pop more, pop return
15698 address, do explicit add, and jump indirectly to the caller. */
15700 if (crtl->args.pops_args >= 65536)
15702 rtx ecx = gen_rtx_REG (SImode, CX_REG);
15703 rtx_insn *insn;
15705 /* There is no "pascal" calling convention in any 64bit ABI. */
15706 gcc_assert (!TARGET_64BIT);
15708 insn = emit_insn (gen_pop (ecx));
15709 m->fs.cfa_offset -= UNITS_PER_WORD;
15710 m->fs.sp_offset -= UNITS_PER_WORD;
15712 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
15713 x = gen_rtx_SET (stack_pointer_rtx, x);
15714 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
15715 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
15716 RTX_FRAME_RELATED_P (insn) = 1;
15718 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15719 popc, -1, true);
15720 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
15722 else
15723 emit_jump_insn (gen_simple_return_pop_internal (popc));
15725 else if (!m->call_ms2sysv || !restore_stub_is_tail)
15726 emit_jump_insn (gen_simple_return_internal ());
15728 /* Restore the state back to the state from the prologue,
15729 so that it's correct for the next epilogue. */
15730 m->fs = frame_state_save;
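/* Illustrative epilogue shapes produced by the code above (registers and
   constants are assumptions, not real output):

	leave				# frame pointer + TARGET_USE_LEAVE
	ret

	addq	$24, %rsp		# no frame pointer: deallocate,
	popq	%rbx			# pop saved registers, return
	popq	%rbp
	ret

	ret	$8			# ia32 callee-pops (pops_args) case  */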
15733 /* Reset from the function's potential modifications. */
15735 static void
15736 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED)
15738 if (pic_offset_table_rtx
15739 && !ix86_use_pseudo_pic_reg ())
15740 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
15742 if (TARGET_MACHO)
15744 rtx_insn *insn = get_last_insn ();
15745 rtx_insn *deleted_debug_label = NULL;
15747 /* Mach-O doesn't support labels at the end of objects, so if
15748 it looks like we might want one, take special action.
15749 First, collect any sequence of deleted debug labels. */
15750 while (insn
15751 && NOTE_P (insn)
15752 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
15754 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
15755 notes; instead set their CODE_LABEL_NUMBER to -1,
15756 otherwise there would be code generation differences
15757 between -g and -g0. */
15758 if (NOTE_P (insn) && NOTE_KIND (insn)
15759 == NOTE_INSN_DELETED_DEBUG_LABEL)
15760 deleted_debug_label = insn;
15761 insn = PREV_INSN (insn);
15764 /* If we have:
15765 label:
15766 barrier
15767 then this needs to be detected, so skip past the barrier. */
15769 if (insn && BARRIER_P (insn))
15770 insn = PREV_INSN (insn);
15772 /* Up to now we've only seen notes or barriers. */
15773 if (insn)
15775 if (LABEL_P (insn)
15776 || (NOTE_P (insn)
15777 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
15778 /* Trailing label. */
15779 fputs ("\tnop\n", file);
15780 else if (cfun && ! cfun->is_thunk)
15782 /* See if we have a completely empty function body, skipping
15783 the special case of the picbase thunk emitted as asm. */
15784 while (insn && ! INSN_P (insn))
15785 insn = PREV_INSN (insn);
15786 /* If we don't find any insns, we've got an empty function body;
15787 i.e. completely empty, without a return or branch. This is
15788 taken as the case where a function body has been removed
15789 because it contains an inline __builtin_unreachable(). GCC
15790 declares that reaching __builtin_unreachable() means UB so
15791 we're not obliged to do anything special; however, we want
15792 non-zero-sized function bodies. To meet this, and help the
15793 user out, let's trap the case. */
15794 if (insn == NULL)
15795 fputs ("\tud2\n", file);
15798 else if (deleted_debug_label)
15799 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
15800 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
15801 CODE_LABEL_NUMBER (insn) = -1;
15805 /* Return a scratch register to use in the split stack prologue. The
15806 split stack prologue is used for -fsplit-stack. It is the first
15807 instructions in the function, even before the regular prologue.
15808 The scratch register can be any caller-saved register which is not
15809 used for parameters or for the static chain. */
15811 static unsigned int
15812 split_stack_prologue_scratch_regno (void)
15814 if (TARGET_64BIT)
15815 return R11_REG;
15816 else
15818 bool is_fastcall, is_thiscall;
15819 int regparm;
15821 is_fastcall = (lookup_attribute ("fastcall",
15822 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
15823 != NULL);
15824 is_thiscall = (lookup_attribute ("thiscall",
15825 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
15826 != NULL);
15827 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
15829 if (is_fastcall)
15831 if (DECL_STATIC_CHAIN (cfun->decl))
15833 sorry ("-fsplit-stack does not support fastcall with "
15834 "nested function");
15835 return INVALID_REGNUM;
15837 return AX_REG;
15839 else if (is_thiscall)
15841 if (!DECL_STATIC_CHAIN (cfun->decl))
15842 return DX_REG;
15843 return AX_REG;
15845 else if (regparm < 3)
15847 if (!DECL_STATIC_CHAIN (cfun->decl))
15848 return CX_REG;
15849 else
15851 if (regparm >= 2)
15853 sorry ("-fsplit-stack does not support 2 register "
15854 "parameters for a nested function");
15855 return INVALID_REGNUM;
15857 return DX_REG;
15860 else
15862 /* FIXME: We could make this work by pushing a register
15863 around the addition and comparison. */
15864 sorry ("-fsplit-stack does not support 3 register parameters");
15865 return INVALID_REGNUM;
15870 /* A SYMBOL_REF for the function which allocates new stackspace for
15871 -fsplit-stack. */
15873 static GTY(()) rtx split_stack_fn;
15875 /* A SYMBOL_REF for the more stack function when using the large
15876 model. */
15878 static GTY(()) rtx split_stack_fn_large;
15880 /* Return location of the stack guard value in the TLS block. */
15883 ix86_split_stack_guard (void)
15885 int offset;
15886 addr_space_t as = DEFAULT_TLS_SEG_REG;
15887 rtx r;
15889 gcc_assert (flag_split_stack);
15891 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
15892 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
15893 #else
15894 gcc_unreachable ();
15895 #endif
15897 r = GEN_INT (offset);
15898 r = gen_const_mem (Pmode, r);
15899 set_mem_addr_space (r, as);
15901 return r;
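/* The returned MEM is a load from the TLS segment at
   TARGET_THREAD_SPLIT_STACK_OFFSET; conceptually the split-stack check
   emitted in the prologue becomes

	if (sp - frame_size >= TLS_SEG:[TARGET_THREAD_SPLIT_STACK_OFFSET])
	  goto enough_stack;
	else
	  call __morestack;

   where the segment register (%fs or %gs) and the offset are
   target-defined.  */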
15904 /* Handle -fsplit-stack. These are the first instructions in the
15905 function, even before the regular prologue. */
15907 void
15908 ix86_expand_split_stack_prologue (void)
15910 struct ix86_frame frame;
15911 HOST_WIDE_INT allocate;
15912 unsigned HOST_WIDE_INT args_size;
15913 rtx_code_label *label;
15914 rtx limit, current, allocate_rtx, call_insn, call_fusage;
15915 rtx scratch_reg = NULL_RTX;
15916 rtx_code_label *varargs_label = NULL;
15917 rtx fn;
15919 gcc_assert (flag_split_stack && reload_completed);
15921 ix86_finalize_stack_frame_flags ();
15922 frame = cfun->machine->frame;
15923 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
15925 /* This is the label we will branch to if we have enough stack
15926 space. We expect the basic block reordering pass to reverse this
15927 branch if optimizing, so that we branch in the unlikely case. */
15928 label = gen_label_rtx ();
15930 /* We need to compare the stack pointer minus the frame size with
15931 the stack boundary in the TCB. The stack boundary always gives
15932 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
15933 can compare directly. Otherwise we need to do an addition. */
15935 limit = ix86_split_stack_guard ();
15937 if (allocate < SPLIT_STACK_AVAILABLE)
15938 current = stack_pointer_rtx;
15939 else
15941 unsigned int scratch_regno;
15942 rtx offset;
15944 /* We need a scratch register to hold the stack pointer minus
15945 the required frame size. Since this is the very start of the
15946 function, the scratch register can be any caller-saved
15947 register which is not used for parameters. */
15948 offset = GEN_INT (- allocate);
15949 scratch_regno = split_stack_prologue_scratch_regno ();
15950 if (scratch_regno == INVALID_REGNUM)
15951 return;
15952 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
15953 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
15955 /* We don't use ix86_gen_add3 in this case because it will
15956 want to split to lea, but when not optimizing the insn
15957 will not be split after this point. */
15958 emit_insn (gen_rtx_SET (scratch_reg,
15959 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15960 offset)));
15962 else
15964 emit_move_insn (scratch_reg, offset);
15965 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
15966 stack_pointer_rtx));
15968 current = scratch_reg;
15971 ix86_expand_branch (GEU, current, limit, label);
15972 rtx_insn *jump_insn = get_last_insn ();
15973 JUMP_LABEL (jump_insn) = label;
15975 /* Mark the jump as very likely to be taken. */
15976 add_reg_br_prob_note (jump_insn, profile_probability::very_likely ());
15978 if (split_stack_fn == NULL_RTX)
15980 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
15981 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
15983 fn = split_stack_fn;
15985 /* Get more stack space. We pass in the desired stack space and the
15986 size of the arguments to copy to the new stack. In 32-bit mode
15987 we push the parameters; __morestack will return on a new stack
15988 anyhow. In 64-bit mode we pass the parameters in r10 and
15989 r11. */
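/* As a sketch (register choice fixed by the code below, sizes purely
   illustrative), the 64-bit path emits something along the lines of

	movq	$frame_size, %r10
	movq	$args_size, %r11
	callq	__morestack

   followed by the split_stack_return unspec, while the 32-bit path
   pushes args_size and frame_size instead, removed again around the
   call.  */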
15990 allocate_rtx = GEN_INT (allocate);
15991 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
15992 call_fusage = NULL_RTX;
15993 rtx pop = NULL_RTX;
15994 if (TARGET_64BIT)
15996 rtx reg10, reg11;
15998 reg10 = gen_rtx_REG (Pmode, R10_REG);
15999 reg11 = gen_rtx_REG (Pmode, R11_REG);
16001 /* If this function uses a static chain, it will be in %r10.
16002 Preserve it across the call to __morestack. */
16003 if (DECL_STATIC_CHAIN (cfun->decl))
16005 rtx rax;
16007 rax = gen_rtx_REG (word_mode, AX_REG);
16008 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
16009 use_reg (&call_fusage, rax);
16012 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
16013 && !TARGET_PECOFF)
16015 HOST_WIDE_INT argval;
16017 gcc_assert (Pmode == DImode);
16018 /* When using the large model we need to load the address
16019 into a register, and we've run out of registers. So we
16020 switch to a different calling convention, and we call a
16021 different function: __morestack_large_model. We pass the
16022 argument size in the upper 32 bits of r10 and pass the
16023 frame size in the lower 32 bits. */
16024 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
16025 gcc_assert ((args_size & 0xffffffff) == args_size);
16027 if (split_stack_fn_large == NULL_RTX)
16029 split_stack_fn_large =
16030 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
16031 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
16033 if (ix86_cmodel == CM_LARGE_PIC)
16035 rtx_code_label *label;
16036 rtx x;
16038 label = gen_label_rtx ();
16039 emit_label (label);
16040 LABEL_PRESERVE_P (label) = 1;
16041 emit_insn (gen_set_rip_rex64 (reg10, label));
16042 emit_insn (gen_set_got_offset_rex64 (reg11, label));
16043 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
16044 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
16045 UNSPEC_GOT);
16046 x = gen_rtx_CONST (Pmode, x);
16047 emit_move_insn (reg11, x);
16048 x = gen_rtx_PLUS (Pmode, reg10, reg11);
16049 x = gen_const_mem (Pmode, x);
16050 emit_move_insn (reg11, x);
16052 else
16053 emit_move_insn (reg11, split_stack_fn_large);
16055 fn = reg11;
16057 argval = ((args_size << 16) << 16) + allocate;
16058 emit_move_insn (reg10, GEN_INT (argval));
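/* For instance, args_size == 32 and allocate == 4096 (purely illustrative
   values) give argval == (32 << 32) + 4096 == 0x2000001000.  The shift is
   written as two 16-bit shifts so that no single shift by 32 is needed,
   which would be undefined if the operand type were only 32 bits wide.  */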
16060 else
16062 emit_move_insn (reg10, allocate_rtx);
16063 emit_move_insn (reg11, GEN_INT (args_size));
16064 use_reg (&call_fusage, reg11);
16067 use_reg (&call_fusage, reg10);
16069 else
16071 rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
16072 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
16073 insn = emit_insn (gen_push (allocate_rtx));
16074 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
16075 pop = GEN_INT (2 * UNITS_PER_WORD);
16077 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
16078 GEN_INT (UNITS_PER_WORD), constm1_rtx,
16079 pop, false);
16080 add_function_usage_to (call_insn, call_fusage);
16081 if (!TARGET_64BIT)
16082 add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
16083 /* Indicate that this function can't jump to non-local gotos. */
16084 make_reg_eh_region_note_nothrow_nononlocal (as_a <rtx_insn *> (call_insn));
16086 /* In order to make call/return prediction work right, we now need
16087 to execute a return instruction. See
16088 libgcc/config/i386/morestack.S for the details on how this works.
16090 For flow purposes gcc must not see this as a return
16091 instruction--we need control flow to continue at the subsequent
16092 label. Therefore, we use an unspec. */
16093 gcc_assert (crtl->args.pops_args < 65536);
16094 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
16096 /* If we are in 64-bit mode and this function uses a static chain,
16097 we saved %r10 in %rax before calling __morestack. */
16098 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
16099 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
16100 gen_rtx_REG (word_mode, AX_REG));
16102 /* If this function calls va_start, we need to store a pointer to
16103 the arguments on the old stack, because they may not have been
16104 all copied to the new stack. At this point the old stack can be
16105 found at the frame pointer value used by __morestack, because
16106 __morestack has set that up before calling back to us. Here we
16107 store that pointer in a scratch register, and in
16108 ix86_expand_prologue we store the scratch register in a stack
16109 slot. */
16110 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
16112 unsigned int scratch_regno;
16113 rtx frame_reg;
16114 int words;
16116 scratch_regno = split_stack_prologue_scratch_regno ();
16117 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
16118 frame_reg = gen_rtx_REG (Pmode, BP_REG);
16120 /* 64-bit:
16121 fp -> old fp value
16122 return address within this function
16123 return address of caller of this function
16124 stack arguments
16125 So we add three words to get to the stack arguments.
16127 32-bit:
16128 fp -> old fp value
16129 return address within this function
16130 first argument to __morestack
16131 second argument to __morestack
16132 return address of caller of this function
16133 stack arguments
16134 So we add five words to get to the stack arguments.
16136 words = TARGET_64BIT ? 3 : 5;
16137 emit_insn (gen_rtx_SET (scratch_reg,
16138 gen_rtx_PLUS (Pmode, frame_reg,
16139 GEN_INT (words * UNITS_PER_WORD))));
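/* E.g. in 64-bit mode this computes scratch_reg = %rbp + 3*8, the address
   just past the three saved words listed above; the actual scratch
   register is whatever split_stack_prologue_scratch_regno returned.  */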
16141 varargs_label = gen_label_rtx ();
16142 emit_jump_insn (gen_jump (varargs_label));
16143 JUMP_LABEL (get_last_insn ()) = varargs_label;
16145 emit_barrier ();
16148 emit_label (label);
16149 LABEL_NUSES (label) = 1;
16151 /* If this function calls va_start, we now have to set the scratch
16152 register for the case where we do not call __morestack. In this
16153 case we need to set it based on the stack pointer. */
16154 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
16156 emit_insn (gen_rtx_SET (scratch_reg,
16157 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
16158 GEN_INT (UNITS_PER_WORD))));
16160 emit_label (varargs_label);
16161 LABEL_NUSES (varargs_label) = 1;
16165 /* We may have to tell the dataflow pass that the split stack prologue
16166 is initializing a scratch register. */
16168 static void
16169 ix86_live_on_entry (bitmap regs)
16171 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
16173 gcc_assert (flag_split_stack);
16174 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
16178 /* Extract the parts of an RTL expression that is a valid memory address
16179 for an instruction. Return 0 if the structure of the address is
16180 grossly off. Return -1 if the address contains ASHIFT, so it is not
16181 strictly valid, but is still used for computing the length of the lea instruction. */
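/* As a worked example (register choice purely illustrative), the canonical
   address

	(plus (plus (mult (reg B) (const_int 4)) (reg A)) (const_int 16))

   decomposes into base = A, index = B, scale = 4, disp = 16, i.e. the
   operand 16(%eax,%ebx,4) in AT&T syntax when A is %eax and B is %ebx.  */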
16183 int
16184 ix86_decompose_address (rtx addr, struct ix86_address *out)
16186 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
16187 rtx base_reg, index_reg;
16188 HOST_WIDE_INT scale = 1;
16189 rtx scale_rtx = NULL_RTX;
16190 rtx tmp;
16191 int retval = 1;
16192 addr_space_t seg = ADDR_SPACE_GENERIC;
16194 /* Allow zero-extended SImode addresses;
16195 they will be emitted with the addr32 prefix. */
16196 if (TARGET_64BIT && GET_MODE (addr) == DImode)
16198 if (GET_CODE (addr) == ZERO_EXTEND
16199 && GET_MODE (XEXP (addr, 0)) == SImode)
16201 addr = XEXP (addr, 0);
16202 if (CONST_INT_P (addr))
16203 return 0;
16205 else if (GET_CODE (addr) == AND
16206 && const_32bit_mask (XEXP (addr, 1), DImode))
16208 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
16209 if (addr == NULL_RTX)
16210 return 0;
16212 if (CONST_INT_P (addr))
16213 return 0;
16217 /* Allow SImode subregs of DImode addresses;
16218 they will be emitted with the addr32 prefix. */
16219 if (TARGET_64BIT && GET_MODE (addr) == SImode)
16221 if (SUBREG_P (addr)
16222 && GET_MODE (SUBREG_REG (addr)) == DImode)
16224 addr = SUBREG_REG (addr);
16225 if (CONST_INT_P (addr))
16226 return 0;
16230 if (REG_P (addr))
16231 base = addr;
16232 else if (SUBREG_P (addr))
16234 if (REG_P (SUBREG_REG (addr)))
16235 base = addr;
16236 else
16237 return 0;
16239 else if (GET_CODE (addr) == PLUS)
16241 rtx addends[4], op;
16242 int n = 0, i;
16244 op = addr;
16247 if (n >= 4)
16248 return 0;
16249 addends[n++] = XEXP (op, 1);
16250 op = XEXP (op, 0);
16252 while (GET_CODE (op) == PLUS);
16253 if (n >= 4)
16254 return 0;
16255 addends[n] = op;
16257 for (i = n; i >= 0; --i)
16259 op = addends[i];
16260 switch (GET_CODE (op))
16262 case MULT:
16263 if (index)
16264 return 0;
16265 index = XEXP (op, 0);
16266 scale_rtx = XEXP (op, 1);
16267 break;
16269 case ASHIFT:
16270 if (index)
16271 return 0;
16272 index = XEXP (op, 0);
16273 tmp = XEXP (op, 1);
16274 if (!CONST_INT_P (tmp))
16275 return 0;
16276 scale = INTVAL (tmp);
16277 if ((unsigned HOST_WIDE_INT) scale > 3)
16278 return 0;
16279 scale = 1 << scale;
16280 break;
16282 case ZERO_EXTEND:
16283 op = XEXP (op, 0);
16284 if (GET_CODE (op) != UNSPEC)
16285 return 0;
16286 /* FALLTHRU */
16288 case UNSPEC:
16289 if (XINT (op, 1) == UNSPEC_TP
16290 && TARGET_TLS_DIRECT_SEG_REFS
16291 && seg == ADDR_SPACE_GENERIC)
16292 seg = DEFAULT_TLS_SEG_REG;
16293 else
16294 return 0;
16295 break;
16297 case SUBREG:
16298 if (!REG_P (SUBREG_REG (op)))
16299 return 0;
16300 /* FALLTHRU */
16302 case REG:
16303 if (!base)
16304 base = op;
16305 else if (!index)
16306 index = op;
16307 else
16308 return 0;
16309 break;
16311 case CONST:
16312 case CONST_INT:
16313 case SYMBOL_REF:
16314 case LABEL_REF:
16315 if (disp)
16316 return 0;
16317 disp = op;
16318 break;
16320 default:
16321 return 0;
16325 else if (GET_CODE (addr) == MULT)
16327 index = XEXP (addr, 0); /* index*scale */
16328 scale_rtx = XEXP (addr, 1);
16330 else if (GET_CODE (addr) == ASHIFT)
16332 /* We're called for lea too, which implements ashift on occasion. */
16333 index = XEXP (addr, 0);
16334 tmp = XEXP (addr, 1);
16335 if (!CONST_INT_P (tmp))
16336 return 0;
16337 scale = INTVAL (tmp);
16338 if ((unsigned HOST_WIDE_INT) scale > 3)
16339 return 0;
16340 scale = 1 << scale;
16341 retval = -1;
16343 else
16344 disp = addr; /* displacement */
16346 if (index)
16348 if (REG_P (index))
16350 else if (SUBREG_P (index)
16351 && REG_P (SUBREG_REG (index)))
16353 else
16354 return 0;
16357 /* Extract the integral value of scale. */
16358 if (scale_rtx)
16360 if (!CONST_INT_P (scale_rtx))
16361 return 0;
16362 scale = INTVAL (scale_rtx);
16365 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
16366 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
16368 /* Avoid useless 0 displacement. */
16369 if (disp == const0_rtx && (base || index))
16370 disp = NULL_RTX;
16372 /* Allow arg pointer and stack pointer as index if there is no scaling. */
16373 if (base_reg && index_reg && scale == 1
16374 && (REGNO (index_reg) == ARG_POINTER_REGNUM
16375 || REGNO (index_reg) == FRAME_POINTER_REGNUM
16376 || REGNO (index_reg) == SP_REG))
16378 std::swap (base, index);
16379 std::swap (base_reg, index_reg);
16382 /* Special case: %ebp cannot be encoded as a base without a displacement.
16383 Similarly %r13. */
16384 if (!disp && base_reg
16385 && (REGNO (base_reg) == ARG_POINTER_REGNUM
16386 || REGNO (base_reg) == FRAME_POINTER_REGNUM
16387 || REGNO (base_reg) == BP_REG
16388 || REGNO (base_reg) == R13_REG))
16389 disp = const0_rtx;
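/* Encoding background: in the ModRM byte, mod == 00 with a base field of
   101b does not mean (%ebp) but "disp32 only" (or %rip-relative in 64-bit
   mode), so (%ebp) and (%r13) have to be emitted as 0(%ebp) / 0(%r13)
   with an explicit displacement.  */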
16391 /* Special case: on K6, [%esi] causes the instruction to be vector decoded.
16392 Avoid this by transforming to [%esi+0].
16393 Reload calls address legitimization without cfun defined, so we need
16394 to test cfun for being non-NULL. */
16395 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
16396 && base_reg && !index_reg && !disp
16397 && REGNO (base_reg) == SI_REG)
16398 disp = const0_rtx;
16400 /* Special case: encode reg+reg instead of reg*2. */
16401 if (!base && index && scale == 2)
16402 base = index, base_reg = index_reg, scale = 1;
16404 /* Special case: scaling cannot be encoded without base or displacement. */
16405 if (!base && !disp && index && scale != 1)
16406 disp = const0_rtx;
16408 out->base = base;
16409 out->index = index;
16410 out->disp = disp;
16411 out->scale = scale;
16412 out->seg = seg;
16414 return retval;
16417 /* Return cost of the memory address x.
16418 For i386, it is better to use a complex address than let gcc copy
16419 the address into a reg and make a new pseudo. But not if the address
16420 requires two regs - that would mean more pseudos with longer
16421 lifetimes. */
16422 static int
16423 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
16425 struct ix86_address parts;
16426 int cost = 1;
16427 int ok = ix86_decompose_address (x, &parts);
16429 gcc_assert (ok);
16431 if (parts.base && SUBREG_P (parts.base))
16432 parts.base = SUBREG_REG (parts.base);
16433 if (parts.index && SUBREG_P (parts.index))
16434 parts.index = SUBREG_REG (parts.index);
16436 /* Attempt to minimize the number of registers in the address by increasing
16437 address cost for each used register. We don't increase address cost
16438 for "pic_offset_table_rtx". When a memop with "pic_offset_table_rtx"
16439 is not invariant itself, it most likely means that base or index is not
16440 invariant. Therefore only "pic_offset_table_rtx" could be hoisted out,
16441 which is not profitable for x86. */
16442 if (parts.base
16443 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
16444 && (current_pass->type == GIMPLE_PASS
16445 || !pic_offset_table_rtx
16446 || !REG_P (parts.base)
16447 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
16448 cost++;
16450 if (parts.index
16451 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
16452 && (current_pass->type == GIMPLE_PASS
16453 || !pic_offset_table_rtx
16454 || !REG_P (parts.index)
16455 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
16456 cost++;
16458 /* AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
16459 since its predecode logic can't detect the length of instructions
16460 and it degenerates to vector decoding. Increase the cost of such
16461 addresses here. The penalty is at least 2 cycles. It may be worthwhile
16462 to split such addresses or even refuse such addresses at all.
16464 The following addressing modes are affected:
16465 [base+scale*index]
16466 [scale*index+disp]
16467 [base+index]
16469 The first and last case may be avoidable by explicitly coding the zero in
16470 memory address, but I don't have an AMD-K6 machine handy to check this
16471 theory. */
16473 if (TARGET_K6
16474 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
16475 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
16476 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
16477 cost += 10;
16479 return cost;
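/* Worked example with the rules above (hard registers assumed): for the
   address (%eax,%ebx,2) we start at 1, add 1 for the base and 1 for the
   index, giving 3; on K6 the base+index+scale!=1, no-displacement form
   additionally costs 10, for a total of 13.  */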
16482 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
16483 this is used to form addresses to local data when -fPIC is in
16484 use. */
16486 static bool
16487 darwin_local_data_pic (rtx disp)
16489 return (GET_CODE (disp) == UNSPEC
16490 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
16493 /* True if operand X should be loaded from GOT. */
16495 bool
16496 ix86_force_load_from_GOT_p (rtx x)
16498 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
16499 && !TARGET_PECOFF && !TARGET_MACHO
16500 && !flag_plt && !flag_pic
16501 && ix86_cmodel != CM_LARGE
16502 && GET_CODE (x) == SYMBOL_REF
16503 && SYMBOL_REF_FUNCTION_P (x)
16504 && !SYMBOL_REF_LOCAL_P (x));
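/* The visible effect (assuming a 64-bit ELF target, or 32-bit with
   HAVE_AS_IX86_GOT32X): with -fno-plt and without -fpic, a call to an
   external function foo is emitted as an indirect call through the GOT,
   e.g.

	call	*foo@GOTPCREL(%rip)

   on x86-64, instead of going through a PLT entry.  */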
16507 /* Determine if a given RTX is a valid constant. We already know this
16508 satisfies CONSTANT_P. */
16510 static bool
16511 ix86_legitimate_constant_p (machine_mode mode, rtx x)
16513 /* Pointer bounds constants are not valid. */
16514 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
16515 return false;
16517 switch (GET_CODE (x))
16519 case CONST:
16520 x = XEXP (x, 0);
16522 if (GET_CODE (x) == PLUS)
16524 if (!CONST_INT_P (XEXP (x, 1)))
16525 return false;
16526 x = XEXP (x, 0);
16529 if (TARGET_MACHO && darwin_local_data_pic (x))
16530 return true;
16532 /* Only some unspecs are valid as "constants". */
16533 if (GET_CODE (x) == UNSPEC)
16534 switch (XINT (x, 1))
16536 case UNSPEC_GOT:
16537 case UNSPEC_GOTOFF:
16538 case UNSPEC_PLTOFF:
16539 return TARGET_64BIT;
16540 case UNSPEC_TPOFF:
16541 case UNSPEC_NTPOFF:
16542 x = XVECEXP (x, 0, 0);
16543 return (GET_CODE (x) == SYMBOL_REF
16544 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
16545 case UNSPEC_DTPOFF:
16546 x = XVECEXP (x, 0, 0);
16547 return (GET_CODE (x) == SYMBOL_REF
16548 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
16549 default:
16550 return false;
16553 /* We must have drilled down to a symbol. */
16554 if (GET_CODE (x) == LABEL_REF)
16555 return true;
16556 if (GET_CODE (x) != SYMBOL_REF)
16557 return false;
16558 /* FALLTHRU */
16560 case SYMBOL_REF:
16561 /* TLS symbols are never valid. */
16562 if (SYMBOL_REF_TLS_MODEL (x))
16563 return false;
16565 /* DLLIMPORT symbols are never valid. */
16566 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
16567 && SYMBOL_REF_DLLIMPORT_P (x))
16568 return false;
16570 #if TARGET_MACHO
16571 /* mdynamic-no-pic */
16572 if (MACHO_DYNAMIC_NO_PIC_P)
16573 return machopic_symbol_defined_p (x);
16574 #endif
16576 /* An external function address should be loaded
16577 via the GOT slot to avoid the PLT. */
16578 if (ix86_force_load_from_GOT_p (x))
16579 return false;
16581 break;
16583 CASE_CONST_SCALAR_INT:
16584 switch (mode)
16586 case E_TImode:
16587 if (TARGET_64BIT)
16588 return true;
16589 /* FALLTHRU */
16590 case E_OImode:
16591 case E_XImode:
16592 if (!standard_sse_constant_p (x, mode))
16593 return false;
16594 default:
16595 break;
16597 break;
16599 case CONST_VECTOR:
16600 if (!standard_sse_constant_p (x, mode))
16601 return false;
16603 default:
16604 break;
16607 /* Otherwise we handle everything else in the move patterns. */
16608 return true;
16611 /* Determine if it's legal to put X into the constant pool. This
16612 is not possible for the address of thread-local symbols, which
16613 is checked above. */
16615 static bool
16616 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
16618 /* We can put any immediate constant in memory. */
16619 switch (GET_CODE (x))
16621 CASE_CONST_ANY:
16622 return false;
16624 default:
16625 break;
16628 return !ix86_legitimate_constant_p (mode, x);
16631 /* True if the symbol is marked as dllimport or as a stub variable,
16632 otherwise false. */
16634 static bool
16635 is_imported_p (rtx x)
16637 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
16638 || GET_CODE (x) != SYMBOL_REF)
16639 return false;
16641 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
16645 /* Nonzero if the constant value X is a legitimate general operand
16646 when generating PIC code. It is given that flag_pic is on and
16647 that X satisfies CONSTANT_P. */
16649 bool
16650 legitimate_pic_operand_p (rtx x)
16652 rtx inner;
16654 switch (GET_CODE (x))
16656 case CONST:
16657 inner = XEXP (x, 0);
16658 if (GET_CODE (inner) == PLUS
16659 && CONST_INT_P (XEXP (inner, 1)))
16660 inner = XEXP (inner, 0);
16662 /* Only some unspecs are valid as "constants". */
16663 if (GET_CODE (inner) == UNSPEC)
16664 switch (XINT (inner, 1))
16666 case UNSPEC_GOT:
16667 case UNSPEC_GOTOFF:
16668 case UNSPEC_PLTOFF:
16669 return TARGET_64BIT;
16670 case UNSPEC_TPOFF:
16671 x = XVECEXP (inner, 0, 0);
16672 return (GET_CODE (x) == SYMBOL_REF
16673 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
16674 case UNSPEC_MACHOPIC_OFFSET:
16675 return legitimate_pic_address_disp_p (x);
16676 default:
16677 return false;
16679 /* FALLTHRU */
16681 case SYMBOL_REF:
16682 case LABEL_REF:
16683 return legitimate_pic_address_disp_p (x);
16685 default:
16686 return true;
16690 /* Determine if a given CONST RTX is a valid memory displacement
16691 in PIC mode. */
16693 bool
16694 legitimate_pic_address_disp_p (rtx disp)
16696 bool saw_plus;
16698 /* In 64bit mode we can allow direct addresses of symbols and labels
16699 when they are not dynamic symbols. */
16700 if (TARGET_64BIT)
16702 rtx op0 = disp, op1;
16704 switch (GET_CODE (disp))
16706 case LABEL_REF:
16707 return true;
16709 case CONST:
16710 if (GET_CODE (XEXP (disp, 0)) != PLUS)
16711 break;
16712 op0 = XEXP (XEXP (disp, 0), 0);
16713 op1 = XEXP (XEXP (disp, 0), 1);
16714 if (!CONST_INT_P (op1)
16715 || INTVAL (op1) >= 16*1024*1024
16716 || INTVAL (op1) < -16*1024*1024)
16717 break;
16718 if (GET_CODE (op0) == LABEL_REF)
16719 return true;
16720 if (GET_CODE (op0) == CONST
16721 && GET_CODE (XEXP (op0, 0)) == UNSPEC
16722 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
16723 return true;
16724 if (GET_CODE (op0) == UNSPEC
16725 && XINT (op0, 1) == UNSPEC_PCREL)
16726 return true;
16727 if (GET_CODE (op0) != SYMBOL_REF)
16728 break;
16729 /* FALLTHRU */
16731 case SYMBOL_REF:
16732 /* TLS references should always be enclosed in UNSPEC.
16733 A dllimported symbol always needs to be resolved. */
16734 if (SYMBOL_REF_TLS_MODEL (op0)
16735 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
16736 return false;
16738 if (TARGET_PECOFF)
16740 if (is_imported_p (op0))
16741 return true;
16743 if (SYMBOL_REF_FAR_ADDR_P (op0)
16744 || !SYMBOL_REF_LOCAL_P (op0))
16745 break;
16747 /* Function symbols need to be resolved only for
16748 the large model.
16749 For the small model we don't need to resolve anything
16750 here. */
16751 if ((ix86_cmodel != CM_LARGE_PIC
16752 && SYMBOL_REF_FUNCTION_P (op0))
16753 || ix86_cmodel == CM_SMALL_PIC)
16754 return true;
16755 /* Non-external symbols don't need to be resolved for
16756 the large and medium models. */
16757 if ((ix86_cmodel == CM_LARGE_PIC
16758 || ix86_cmodel == CM_MEDIUM_PIC)
16759 && !SYMBOL_REF_EXTERNAL_P (op0))
16760 return true;
16762 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
16763 && (SYMBOL_REF_LOCAL_P (op0)
16764 || (HAVE_LD_PIE_COPYRELOC
16765 && flag_pie
16766 && !SYMBOL_REF_WEAK (op0)
16767 && !SYMBOL_REF_FUNCTION_P (op0)))
16768 && ix86_cmodel != CM_LARGE_PIC)
16769 return true;
16770 break;
16772 default:
16773 break;
16776 if (GET_CODE (disp) != CONST)
16777 return false;
16778 disp = XEXP (disp, 0);
16780 if (TARGET_64BIT)
16782 /* It is unsafe to allow PLUS expressions. This limits the allowed
16783 distance of GOT table references. We should not need these anyway. */
16784 if (GET_CODE (disp) != UNSPEC
16785 || (XINT (disp, 1) != UNSPEC_GOTPCREL
16786 && XINT (disp, 1) != UNSPEC_GOTOFF
16787 && XINT (disp, 1) != UNSPEC_PCREL
16788 && XINT (disp, 1) != UNSPEC_PLTOFF))
16789 return false;
16791 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
16792 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
16793 return false;
16794 return true;
16797 saw_plus = false;
16798 if (GET_CODE (disp) == PLUS)
16800 if (!CONST_INT_P (XEXP (disp, 1)))
16801 return false;
16802 disp = XEXP (disp, 0);
16803 saw_plus = true;
16806 if (TARGET_MACHO && darwin_local_data_pic (disp))
16807 return true;
16809 if (GET_CODE (disp) != UNSPEC)
16810 return false;
16812 switch (XINT (disp, 1))
16814 case UNSPEC_GOT:
16815 if (saw_plus)
16816 return false;
16817 /* We need to check for both symbols and labels because VxWorks loads
16818 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
16819 details. */
16820 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
16821 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
16822 case UNSPEC_GOTOFF:
16823 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
16824 While the ABI also specifies a 32bit relocation, we don't produce it
16825 in the small PIC model at all. */
16826 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
16827 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
16828 && !TARGET_64BIT)
16829 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
16830 return false;
16831 case UNSPEC_GOTTPOFF:
16832 case UNSPEC_GOTNTPOFF:
16833 case UNSPEC_INDNTPOFF:
16834 if (saw_plus)
16835 return false;
16836 disp = XVECEXP (disp, 0, 0);
16837 return (GET_CODE (disp) == SYMBOL_REF
16838 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
16839 case UNSPEC_NTPOFF:
16840 disp = XVECEXP (disp, 0, 0);
16841 return (GET_CODE (disp) == SYMBOL_REF
16842 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
16843 case UNSPEC_DTPOFF:
16844 disp = XVECEXP (disp, 0, 0);
16845 return (GET_CODE (disp) == SYMBOL_REF
16846 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
16849 return false;
16852 /* Determine if OP is a suitable RTX for an address register.
16853 Return the naked register if a register or a register subreg is
16854 found, otherwise return NULL_RTX. */
16856 static rtx
16857 ix86_validate_address_register (rtx op)
16859 machine_mode mode = GET_MODE (op);
16861 /* Only SImode or DImode registers can form the address. */
16862 if (mode != SImode && mode != DImode)
16863 return NULL_RTX;
16865 if (REG_P (op))
16866 return op;
16867 else if (SUBREG_P (op))
16869 rtx reg = SUBREG_REG (op);
16871 if (!REG_P (reg))
16872 return NULL_RTX;
16874 mode = GET_MODE (reg);
16876 /* Don't allow SUBREGs that span more than a word. It can
16877 lead to spill failures when the register is one word out
16878 of a two word structure. */
16879 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
16880 return NULL_RTX;
16882 /* Allow only SUBREGs of non-eliminable hard registers. */
16883 if (register_no_elim_operand (reg, mode))
16884 return reg;
16887 /* Op is not a register. */
16888 return NULL_RTX;
16891 /* Recognizes RTL expressions that are valid memory addresses for an
16892 instruction. The MODE argument is the machine mode for the MEM
16893 expression that wants to use this address.
16895 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
16896 convert common non-canonical forms to canonical form so that they will
16897 be recognized. */
16899 static bool
16900 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
16902 struct ix86_address parts;
16903 rtx base, index, disp;
16904 HOST_WIDE_INT scale;
16905 addr_space_t seg;
16907 if (ix86_decompose_address (addr, &parts) <= 0)
16908 /* Decomposition failed. */
16909 return false;
16911 base = parts.base;
16912 index = parts.index;
16913 disp = parts.disp;
16914 scale = parts.scale;
16915 seg = parts.seg;
16917 /* Validate base register. */
16918 if (base)
16920 rtx reg = ix86_validate_address_register (base);
16922 if (reg == NULL_RTX)
16923 return false;
16925 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
16926 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
16927 /* Base is not valid. */
16928 return false;
16931 /* Validate index register. */
16932 if (index)
16934 rtx reg = ix86_validate_address_register (index);
16936 if (reg == NULL_RTX)
16937 return false;
16939 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
16940 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
16941 /* Index is not valid. */
16942 return false;
16945 /* Index and base should have the same mode. */
16946 if (base && index
16947 && GET_MODE (base) != GET_MODE (index))
16948 return false;
16950 /* Address override works only on the (%reg) part of %fs:(%reg). */
16951 if (seg != ADDR_SPACE_GENERIC
16952 && ((base && GET_MODE (base) != word_mode)
16953 || (index && GET_MODE (index) != word_mode)))
16954 return false;
16956 /* Validate scale factor. */
16957 if (scale != 1)
16959 if (!index)
16960 /* Scale without index. */
16961 return false;
16963 if (scale != 2 && scale != 4 && scale != 8)
16964 /* Scale is not a valid multiplier. */
16965 return false;
16968 /* Validate displacement. */
16969 if (disp)
16971 if (GET_CODE (disp) == CONST
16972 && GET_CODE (XEXP (disp, 0)) == UNSPEC
16973 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
16974 switch (XINT (XEXP (disp, 0), 1))
16976 /* Refuse GOTOFF and GOT in 64bit mode since they are always 64bit
16977 when used. While the ABI also specifies 32bit relocations, we
16978 don't produce them at all and use IP-relative addressing instead.
16979 Allow GOT in 32bit mode for both PIC and non-PIC if symbol
16980 should be loaded via GOT. */
16981 case UNSPEC_GOT:
16982 if (!TARGET_64BIT
16983 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
16984 goto is_legitimate_pic;
16985 /* FALLTHRU */
16986 case UNSPEC_GOTOFF:
16987 gcc_assert (flag_pic);
16988 if (!TARGET_64BIT)
16989 goto is_legitimate_pic;
16991 /* 64bit address unspec. */
16992 return false;
16994 case UNSPEC_GOTPCREL:
16995 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
16996 goto is_legitimate_pic;
16997 /* FALLTHRU */
16998 case UNSPEC_PCREL:
16999 gcc_assert (flag_pic);
17000 goto is_legitimate_pic;
17002 case UNSPEC_GOTTPOFF:
17003 case UNSPEC_GOTNTPOFF:
17004 case UNSPEC_INDNTPOFF:
17005 case UNSPEC_NTPOFF:
17006 case UNSPEC_DTPOFF:
17007 break;
17009 default:
17010 /* Invalid address unspec. */
17011 return false;
17014 else if (SYMBOLIC_CONST (disp)
17015 && (flag_pic
17016 || (TARGET_MACHO
17017 #if TARGET_MACHO
17018 && MACHOPIC_INDIRECT
17019 && !machopic_operand_p (disp)
17020 #endif
17024 is_legitimate_pic:
17025 if (TARGET_64BIT && (index || base))
17027 /* foo@dtpoff(%rX) is ok. */
17028 if (GET_CODE (disp) != CONST
17029 || GET_CODE (XEXP (disp, 0)) != PLUS
17030 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
17031 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
17032 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
17033 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
17034 /* Non-constant pic memory reference. */
17035 return false;
17037 else if ((!TARGET_MACHO || flag_pic)
17038 && ! legitimate_pic_address_disp_p (disp))
17039 /* Displacement is an invalid pic construct. */
17040 return false;
17041 #if TARGET_MACHO
17042 else if (MACHO_DYNAMIC_NO_PIC_P
17043 && !ix86_legitimate_constant_p (Pmode, disp))
17044 /* displacement must be referenced via non_lazy_pointer */
17045 return false;
17046 #endif
17048 /* This code used to verify that a symbolic pic displacement
17049 includes the pic_offset_table_rtx register.
17051 While this is a good idea, unfortunately these constructs may
17052 be created by "adds using lea" optimization for incorrect
17053 code like:
17055 int a;
17056 int foo(int i)
17058 return *(&a+i);
17061 This code is nonsensical, but results in addressing the
17062 GOT table with a pic_offset_table_rtx base. We can't
17063 just refuse it easily, since it gets matched by the
17064 "addsi3" pattern, which later gets split to lea when the
17065 output register differs from the input. While this
17066 could be handled by a separate addsi pattern for this case
17067 that never results in lea, disabling this test seems to be
17068 the easier and correct fix for the crash. */
17070 else if (GET_CODE (disp) != LABEL_REF
17071 && !CONST_INT_P (disp)
17072 && (GET_CODE (disp) != CONST
17073 || !ix86_legitimate_constant_p (Pmode, disp))
17074 && (GET_CODE (disp) != SYMBOL_REF
17075 || !ix86_legitimate_constant_p (Pmode, disp)))
17076 /* Displacement is not constant. */
17077 return false;
17078 else if (TARGET_64BIT
17079 && !x86_64_immediate_operand (disp, VOIDmode))
17080 /* Displacement is out of range. */
17081 return false;
17082 /* In x32 mode, constant addresses are sign extended to 64bit, so
17083 we have to prevent addresses from 0x80000000 to 0xffffffff. */
17084 else if (TARGET_X32 && !(index || base)
17085 && CONST_INT_P (disp)
17086 && val_signbit_known_set_p (SImode, INTVAL (disp)))
17087 return false;
17090 /* Everything looks valid. */
17091 return true;
17094 /* Determine if a given RTX is a valid constant address. */
17096 bool
17097 constant_address_p (rtx x)
17099 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
17102 /* Return a unique alias set for the GOT. */
17104 static alias_set_type
17105 ix86_GOT_alias_set (void)
17107 static alias_set_type set = -1;
17108 if (set == -1)
17109 set = new_alias_set ();
17110 return set;
17113 /* Return a legitimate reference for ORIG (an address) using the
17114 register REG. If REG is 0, a new pseudo is generated.
17116 There are two types of references that must be handled:
17118 1. Global data references must load the address from the GOT, via
17119 the PIC reg. An insn is emitted to do this load, and the reg is
17120 returned.
17122 2. Static data references, constant pool addresses, and code labels
17123 compute the address as an offset from the GOT, whose base is in
17124 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
17125 differentiate them from global data objects. The returned
17126 address is the PIC reg + an unspec constant.
17128 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
17129 reg also appears in the address. */
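/* To make the two cases concrete for 32-bit ELF PIC (register names are
   only illustrative, with %ebx as the PIC register):

   1. global data:	movl	sym@GOT(%ebx), %reg
      i.e. the address is loaded from the GOT slot;

   2. static data:	leal	sym@GOTOFF(%ebx), %reg
      i.e. the PIC register plus a link-time constant offset.  */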
17131 static rtx
17132 legitimize_pic_address (rtx orig, rtx reg)
17134 rtx addr = orig;
17135 rtx new_rtx = orig;
17137 #if TARGET_MACHO
17138 if (TARGET_MACHO && !TARGET_64BIT)
17140 if (reg == 0)
17141 reg = gen_reg_rtx (Pmode);
17142 /* Use the generic Mach-O PIC machinery. */
17143 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
17145 #endif
17147 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
17149 rtx tmp = legitimize_pe_coff_symbol (addr, true);
17150 if (tmp)
17151 return tmp;
17154 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
17155 new_rtx = addr;
17156 else if ((!TARGET_64BIT
17157 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
17158 && !TARGET_PECOFF
17159 && gotoff_operand (addr, Pmode))
17161 /* This symbol may be referenced via a displacement
17162 from the PIC base address (@GOTOFF). */
17163 if (GET_CODE (addr) == CONST)
17164 addr = XEXP (addr, 0);
17166 if (GET_CODE (addr) == PLUS)
17168 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
17169 UNSPEC_GOTOFF);
17170 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
17172 else
17173 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
17175 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17177 if (TARGET_64BIT)
17178 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
17180 if (reg != 0)
17182 gcc_assert (REG_P (reg));
17183 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
17184 new_rtx, reg, 1, OPTAB_DIRECT);
17186 else
17187 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
17189 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
17190 /* We can't use @GOTOFF for text labels
17191 on VxWorks, see gotoff_operand. */
17192 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
17194 rtx tmp = legitimize_pe_coff_symbol (addr, true);
17195 if (tmp)
17196 return tmp;
17198 /* For x64 PE-COFF there is no GOT table,
17199 so we use the address directly. */
17200 if (TARGET_64BIT && TARGET_PECOFF)
17202 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
17203 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17205 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
17207 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
17208 UNSPEC_GOTPCREL);
17209 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17210 new_rtx = gen_const_mem (Pmode, new_rtx);
17211 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
17213 else
17215 /* This symbol must be referenced via a load
17216 from the Global Offset Table (@GOT). */
17217 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
17218 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17219 if (TARGET_64BIT)
17220 new_rtx = force_reg (Pmode, new_rtx);
17221 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
17222 new_rtx = gen_const_mem (Pmode, new_rtx);
17223 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
17226 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
17228 else
17230 if (CONST_INT_P (addr)
17231 && !x86_64_immediate_operand (addr, VOIDmode))
17232 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
17233 else if (GET_CODE (addr) == CONST)
17235 addr = XEXP (addr, 0);
17237 /* We must match stuff we generate before. Assume the only
17238 unspecs that can get here are ours. Not that we could do
17239 anything with them anyway.... */
17240 if (GET_CODE (addr) == UNSPEC
17241 || (GET_CODE (addr) == PLUS
17242 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
17243 return orig;
17244 gcc_assert (GET_CODE (addr) == PLUS);
17247 if (GET_CODE (addr) == PLUS)
17249 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
17251 /* Check first to see if this is a constant
17252 offset from a @GOTOFF symbol reference. */
17253 if (!TARGET_PECOFF
17254 && gotoff_operand (op0, Pmode)
17255 && CONST_INT_P (op1))
17257 if (!TARGET_64BIT)
17259 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
17260 UNSPEC_GOTOFF);
17261 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
17262 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17264 if (reg != 0)
17266 gcc_assert (REG_P (reg));
17267 new_rtx = expand_simple_binop (Pmode, PLUS,
17268 pic_offset_table_rtx,
17269 new_rtx, reg, 1,
17270 OPTAB_DIRECT);
17272 else
17273 new_rtx
17274 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
17276 else
17278 if (INTVAL (op1) < -16*1024*1024
17279 || INTVAL (op1) >= 16*1024*1024)
17281 if (!x86_64_immediate_operand (op1, Pmode))
17282 op1 = force_reg (Pmode, op1);
17284 new_rtx
17285 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
17289 else
17291 rtx base = legitimize_pic_address (op0, reg);
17292 machine_mode mode = GET_MODE (base);
17293 new_rtx
17294 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
17296 if (CONST_INT_P (new_rtx))
17298 if (INTVAL (new_rtx) < -16*1024*1024
17299 || INTVAL (new_rtx) >= 16*1024*1024)
17301 if (!x86_64_immediate_operand (new_rtx, mode))
17302 new_rtx = force_reg (mode, new_rtx);
17304 new_rtx
17305 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
17307 else
17308 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
17310 else
17312 /* For %rip addressing, we have to use
17313 just disp32, with neither base nor index. */
17314 if (TARGET_64BIT
17315 && (GET_CODE (base) == SYMBOL_REF
17316 || GET_CODE (base) == LABEL_REF))
17317 base = force_reg (mode, base);
17318 if (GET_CODE (new_rtx) == PLUS
17319 && CONSTANT_P (XEXP (new_rtx, 1)))
17321 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
17322 new_rtx = XEXP (new_rtx, 1);
17324 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
17329 return new_rtx;
17332 /* Load the thread pointer. If TO_REG is true, force it into a register. */
17334 static rtx
17335 get_thread_pointer (machine_mode tp_mode, bool to_reg)
17337 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
17339 if (GET_MODE (tp) != tp_mode)
17341 gcc_assert (GET_MODE (tp) == SImode);
17342 gcc_assert (tp_mode == DImode);
17344 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
17347 if (to_reg)
17348 tp = copy_to_mode_reg (tp_mode, tp);
17350 return tp;
17353 /* Construct the SYMBOL_REF for the tls_get_addr function. */
17355 static GTY(()) rtx ix86_tls_symbol;
17357 static rtx
17358 ix86_tls_get_addr (void)
17360 if (!ix86_tls_symbol)
17362 const char *sym
17363 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
17364 ? "___tls_get_addr" : "__tls_get_addr");
17366 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
17369 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
17371 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
17372 UNSPEC_PLTOFF);
17373 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
17374 gen_rtx_CONST (Pmode, unspec));
17377 return ix86_tls_symbol;
17380 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
17382 static GTY(()) rtx ix86_tls_module_base_symbol;
17384 static rtx
17385 ix86_tls_module_base (void)
17387 if (!ix86_tls_module_base_symbol)
17389 ix86_tls_module_base_symbol
17390 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
17392 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
17393 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
17396 return ix86_tls_module_base_symbol;
17399 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
17400 false if we expect this to be used for a memory address and true if
17401 we expect to load the address into a register. */
17403 static rtx
17404 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
17406 rtx dest, base, off;
17407 rtx pic = NULL_RTX, tp = NULL_RTX;
17408 machine_mode tp_mode = Pmode;
17409 int type;
17411 /* Fall back to the global dynamic model if the toolchain cannot
17412 support local dynamic. */
17413 if (TARGET_SUN_TLS && !TARGET_64BIT
17414 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
17415 && model == TLS_MODEL_LOCAL_DYNAMIC)
17416 model = TLS_MODEL_GLOBAL_DYNAMIC;
17418 switch (model)
17420 case TLS_MODEL_GLOBAL_DYNAMIC:
17421 dest = gen_reg_rtx (Pmode);
17423 if (!TARGET_64BIT)
17425 if (flag_pic && !TARGET_PECOFF)
17426 pic = pic_offset_table_rtx;
17427 else
17429 pic = gen_reg_rtx (Pmode);
17430 emit_insn (gen_set_got (pic));
17434 if (TARGET_GNU2_TLS)
17436 if (TARGET_64BIT)
17437 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
17438 else
17439 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
17441 tp = get_thread_pointer (Pmode, true);
17442 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
17444 if (GET_MODE (x) != Pmode)
17445 x = gen_rtx_ZERO_EXTEND (Pmode, x);
17447 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
17449 else
17451 rtx caddr = ix86_tls_get_addr ();
17453 if (TARGET_64BIT)
17455 rtx rax = gen_rtx_REG (Pmode, AX_REG);
17456 rtx_insn *insns;
17458 start_sequence ();
17459 emit_call_insn
17460 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
17461 insns = get_insns ();
17462 end_sequence ();
17464 if (GET_MODE (x) != Pmode)
17465 x = gen_rtx_ZERO_EXTEND (Pmode, x);
17467 RTL_CONST_CALL_P (insns) = 1;
17468 emit_libcall_block (insns, dest, rax, x);
17470 else
17471 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
17473 break;
17475 case TLS_MODEL_LOCAL_DYNAMIC:
17476 base = gen_reg_rtx (Pmode);
17478 if (!TARGET_64BIT)
17480 if (flag_pic)
17481 pic = pic_offset_table_rtx;
17482 else
17484 pic = gen_reg_rtx (Pmode);
17485 emit_insn (gen_set_got (pic));
17489 if (TARGET_GNU2_TLS)
17491 rtx tmp = ix86_tls_module_base ();
17493 if (TARGET_64BIT)
17494 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
17495 else
17496 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
17498 tp = get_thread_pointer (Pmode, true);
17499 set_unique_reg_note (get_last_insn (), REG_EQUAL,
17500 gen_rtx_MINUS (Pmode, tmp, tp));
17502 else
17504 rtx caddr = ix86_tls_get_addr ();
17506 if (TARGET_64BIT)
17508 rtx rax = gen_rtx_REG (Pmode, AX_REG);
17509 rtx_insn *insns;
17510 rtx eqv;
17512 start_sequence ();
17513 emit_call_insn
17514 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
17515 insns = get_insns ();
17516 end_sequence ();
17518 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
17519 share the LD_BASE result with other LD model accesses. */
17520 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
17521 UNSPEC_TLS_LD_BASE);
17523 RTL_CONST_CALL_P (insns) = 1;
17524 emit_libcall_block (insns, base, rax, eqv);
17526 else
17527 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
17530 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
17531 off = gen_rtx_CONST (Pmode, off);
17533 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
17535 if (TARGET_GNU2_TLS)
17537 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
17539 if (GET_MODE (x) != Pmode)
17540 x = gen_rtx_ZERO_EXTEND (Pmode, x);
17542 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
17544 break;
17546 case TLS_MODEL_INITIAL_EXEC:
17547 if (TARGET_64BIT)
17549 if (TARGET_SUN_TLS && !TARGET_X32)
17551 /* The Sun linker took the AMD64 TLS spec literally
17552 and can only handle %rax as the destination of the
17553 initial-exec code sequence. */
17555 dest = gen_reg_rtx (DImode);
17556 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
17557 return dest;
17560 /* Generate DImode references to avoid %fs:(%reg32)
17561 problems and the linker IE->LE relaxation bug. */
17562 tp_mode = DImode;
17563 pic = NULL;
17564 type = UNSPEC_GOTNTPOFF;
17566 else if (flag_pic)
17568 pic = pic_offset_table_rtx;
17569 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
17571 else if (!TARGET_ANY_GNU_TLS)
17573 pic = gen_reg_rtx (Pmode);
17574 emit_insn (gen_set_got (pic));
17575 type = UNSPEC_GOTTPOFF;
17577 else
17579 pic = NULL;
17580 type = UNSPEC_INDNTPOFF;
17583 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
17584 off = gen_rtx_CONST (tp_mode, off);
17585 if (pic)
17586 off = gen_rtx_PLUS (tp_mode, pic, off);
17587 off = gen_const_mem (tp_mode, off);
17588 set_mem_alias_set (off, ix86_GOT_alias_set ());
17590 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
17592 base = get_thread_pointer (tp_mode,
17593 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
17594 off = force_reg (tp_mode, off);
17595 dest = gen_rtx_PLUS (tp_mode, base, off);
17596 if (tp_mode != Pmode)
17597 dest = convert_to_mode (Pmode, dest, 1);
17599 else
17601 base = get_thread_pointer (Pmode, true);
17602 dest = gen_reg_rtx (Pmode);
17603 emit_insn (ix86_gen_sub3 (dest, base, off));
17605 break;
17607 case TLS_MODEL_LOCAL_EXEC:
17608 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
17609 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
17610 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
17611 off = gen_rtx_CONST (Pmode, off);
17613 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
17615 base = get_thread_pointer (Pmode,
17616 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
17617 return gen_rtx_PLUS (Pmode, base, off);
17619 else
17621 base = get_thread_pointer (Pmode, true);
17622 dest = gen_reg_rtx (Pmode);
17623 emit_insn (ix86_gen_sub3 (dest, base, off));
17625 break;
17627 default:
17628 gcc_unreachable ();
17631 return dest;
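/* Two common x86-64 GNU/Linux sequences produced from the models handled
   above (only sketches; the exact code depends on TARGET_GNU2_TLS and on
   whether the result feeds a move or an address computation):

     initial-exec:	movq	x@gottpoff(%rip), %rax
			movq	%fs:(%rax), %rdx

     local-exec:	movq	%fs:0, %rax
			leaq	x@tpoff(%rax), %rdx

   where %fs:0 is the thread pointer obtained via get_thread_pointer.  */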
17634 /* Create or return the unique __imp_DECL dllimport symbol corresponding
17635 to symbol DECL if BEIMPORT is true. Otherwise create or return the
17636 unique refptr-DECL symbol corresponding to symbol DECL. */
17638 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
17640 static inline hashval_t hash (tree_map *m) { return m->hash; }
17641 static inline bool
17642 equal (tree_map *a, tree_map *b)
17644 return a->base.from == b->base.from;
17647 static int
17648 keep_cache_entry (tree_map *&m)
17650 return ggc_marked_p (m->base.from);
17654 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
17656 static tree
17657 get_dllimport_decl (tree decl, bool beimport)
17659 struct tree_map *h, in;
17660 const char *name;
17661 const char *prefix;
17662 size_t namelen, prefixlen;
17663 char *imp_name;
17664 tree to;
17665 rtx rtl;
17667 if (!dllimport_map)
17668 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
17670 in.hash = htab_hash_pointer (decl);
17671 in.base.from = decl;
17672 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
17673 h = *loc;
17674 if (h)
17675 return h->to;
17677 *loc = h = ggc_alloc<tree_map> ();
17678 h->hash = in.hash;
17679 h->base.from = decl;
17680 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
17681 VAR_DECL, NULL, ptr_type_node);
17682 DECL_ARTIFICIAL (to) = 1;
17683 DECL_IGNORED_P (to) = 1;
17684 DECL_EXTERNAL (to) = 1;
17685 TREE_READONLY (to) = 1;
17687 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
17688 name = targetm.strip_name_encoding (name);
17689 if (beimport)
17690 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
17691 ? "*__imp_" : "*__imp__";
17692 else
17693 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
17694 namelen = strlen (name);
17695 prefixlen = strlen (prefix);
17696 imp_name = (char *) alloca (namelen + prefixlen + 1);
17697 memcpy (imp_name, prefix, prefixlen);
17698 memcpy (imp_name + prefixlen, name, namelen + 1);
17700 name = ggc_alloc_string (imp_name, namelen + prefixlen);
17701 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
17702 SET_SYMBOL_REF_DECL (rtl, to);
17703 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
17704 if (!beimport)
17706 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
17707 #ifdef SUB_TARGET_RECORD_STUB
17708 SUB_TARGET_RECORD_STUB (name);
17709 #endif
17712 rtl = gen_const_mem (Pmode, rtl);
17713 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
17715 SET_DECL_RTL (to, rtl);
17716 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
17718 return to;
17721 /* Expand SYMBOL into its corresponding far-address symbol.
17722 WANT_REG is true if we require the result be a register. */
17724 static rtx
17725 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
17727 tree imp_decl;
17728 rtx x;
17730 gcc_assert (SYMBOL_REF_DECL (symbol));
17731 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
17733 x = DECL_RTL (imp_decl);
17734 if (want_reg)
17735 x = force_reg (Pmode, x);
17736 return x;
17739 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
17740 true if we require the result be a register. */
17742 static rtx
17743 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
17745 tree imp_decl;
17746 rtx x;
17748 gcc_assert (SYMBOL_REF_DECL (symbol));
17749 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
17751 x = DECL_RTL (imp_decl);
17752 if (want_reg)
17753 x = force_reg (Pmode, x);
17754 return x;
17757 /* Expand ADDR into its corresponding dllimport or refptr symbol. INREG
17758 is true if we require the result be a register. */
17760 static rtx
17761 legitimize_pe_coff_symbol (rtx addr, bool inreg)
17763 if (!TARGET_PECOFF)
17764 return NULL_RTX;
17766 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
17768 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
17769 return legitimize_dllimport_symbol (addr, inreg);
17770 if (GET_CODE (addr) == CONST
17771 && GET_CODE (XEXP (addr, 0)) == PLUS
17772 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
17773 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
17775 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
17776 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
17780 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
17781 return NULL_RTX;
17782 if (GET_CODE (addr) == SYMBOL_REF
17783 && !is_imported_p (addr)
17784 && SYMBOL_REF_EXTERNAL_P (addr)
17785 && SYMBOL_REF_DECL (addr))
17786 return legitimize_pe_coff_extern_decl (addr, inreg);
17788 if (GET_CODE (addr) == CONST
17789 && GET_CODE (XEXP (addr, 0)) == PLUS
17790 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
17791 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
17792 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
17793 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
17795 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
17796 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
17798 return NULL_RTX;
17801 /* Try machine-dependent ways of modifying an illegitimate address
17802 to be legitimate. If we find one, return the new, valid address.
17803 This macro is used in only one place: `memory_address' in explow.c.
17805 OLDX is the address as it was before break_out_memory_refs was called.
17806 In some cases it is useful to look at this to decide what needs to be done.
17808 It is always safe for this macro to do nothing. It exists to recognize
17809 opportunities to optimize the output.
17811 For the 80386, we handle X+REG by loading X into a register R and
17812 using R+REG. R will go in a general reg and indexing will be used.
17813 However, if REG is a broken-out memory address or multiplication,
17814 nothing needs to be done because REG can certainly go in a general reg.
17816 When -fpic is used, special handling is needed for symbolic references.
17817 See comments by legitimize_pic_address in i386.c for details. */
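/* A small example of the canonicalization done below (registers purely
   illustrative): the address

	(plus (ashift (reg A) (const_int 2)) (reg B))

   is rewritten as

	(plus (mult (reg A) (const_int 4)) (reg B))

   so that it matches the base + index*scale form that
   ix86_decompose_address accepts, i.e. (%ebx,%eax,4) when A is %eax
   and B is %ebx.  */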
17819 static rtx
17820 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
17822 bool changed = false;
17823 unsigned log;
17825 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
17826 if (log)
17827 return legitimize_tls_address (x, (enum tls_model) log, false);
17828 if (GET_CODE (x) == CONST
17829 && GET_CODE (XEXP (x, 0)) == PLUS
17830 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
17831 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
17833 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
17834 (enum tls_model) log, false);
17835 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
17838 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
17840 rtx tmp = legitimize_pe_coff_symbol (x, true);
17841 if (tmp)
17842 return tmp;
17845 if (flag_pic && SYMBOLIC_CONST (x))
17846 return legitimize_pic_address (x, 0);
17848 #if TARGET_MACHO
17849 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
17850 return machopic_indirect_data_reference (x, 0);
17851 #endif
17853 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
17854 if (GET_CODE (x) == ASHIFT
17855 && CONST_INT_P (XEXP (x, 1))
17856 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
17858 changed = true;
17859 log = INTVAL (XEXP (x, 1));
17860 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
17861 GEN_INT (1 << log));
17864 if (GET_CODE (x) == PLUS)
17866 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
17868 if (GET_CODE (XEXP (x, 0)) == ASHIFT
17869 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
17870 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
17872 changed = true;
17873 log = INTVAL (XEXP (XEXP (x, 0), 1));
17874 XEXP (x, 0) = gen_rtx_MULT (Pmode,
17875 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
17876 GEN_INT (1 << log));
17879 if (GET_CODE (XEXP (x, 1)) == ASHIFT
17880 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
17881 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
17883 changed = true;
17884 log = INTVAL (XEXP (XEXP (x, 1), 1));
17885 XEXP (x, 1) = gen_rtx_MULT (Pmode,
17886 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
17887 GEN_INT (1 << log));
17890 /* Put multiply first if it isn't already. */
17891 if (GET_CODE (XEXP (x, 1)) == MULT)
17893 std::swap (XEXP (x, 0), XEXP (x, 1));
17894 changed = true;
17897 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
17898 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
17899 created by virtual register instantiation, register elimination, and
17900 similar optimizations. */
17901 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
17903 changed = true;
17904 x = gen_rtx_PLUS (Pmode,
17905 gen_rtx_PLUS (Pmode, XEXP (x, 0),
17906 XEXP (XEXP (x, 1), 0)),
17907 XEXP (XEXP (x, 1), 1));
17910 /* Canonicalize
17911 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
17912 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
17913 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
17914 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
17915 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
17916 && CONSTANT_P (XEXP (x, 1)))
17918 rtx constant;
17919 rtx other = NULL_RTX;
17921 if (CONST_INT_P (XEXP (x, 1)))
17923 constant = XEXP (x, 1);
17924 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
17926 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
17928 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
17929 other = XEXP (x, 1);
17931 else
17932 constant = 0;
17934 if (constant)
17936 changed = true;
17937 x = gen_rtx_PLUS (Pmode,
17938 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
17939 XEXP (XEXP (XEXP (x, 0), 1), 0)),
17940 plus_constant (Pmode, other,
17941 INTVAL (constant)));
17945 if (changed && ix86_legitimate_address_p (mode, x, false))
17946 return x;
17948 if (GET_CODE (XEXP (x, 0)) == MULT)
17950 changed = true;
17951 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
17954 if (GET_CODE (XEXP (x, 1)) == MULT)
17956 changed = true;
17957 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
17960 if (changed
17961 && REG_P (XEXP (x, 1))
17962 && REG_P (XEXP (x, 0)))
17963 return x;
17965 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
17967 changed = true;
17968 x = legitimize_pic_address (x, 0);
17971 if (changed && ix86_legitimate_address_p (mode, x, false))
17972 return x;
17974 if (REG_P (XEXP (x, 0)))
17976 rtx temp = gen_reg_rtx (Pmode);
17977 rtx val = force_operand (XEXP (x, 1), temp);
17978 if (val != temp)
17980 val = convert_to_mode (Pmode, val, 1);
17981 emit_move_insn (temp, val);
17984 XEXP (x, 1) = temp;
17985 return x;
17988 else if (REG_P (XEXP (x, 1)))
17990 rtx temp = gen_reg_rtx (Pmode);
17991 rtx val = force_operand (XEXP (x, 0), temp);
17992 if (val != temp)
17994 val = convert_to_mode (Pmode, val, 1);
17995 emit_move_insn (temp, val);
17998 XEXP (x, 0) = temp;
17999 return x;
18003 return x;
18006 /* Print an integer constant expression in assembler syntax. Addition
18007 and subtraction are the only arithmetic that may appear in these
18008 expressions. FILE is the stdio stream to write to, X is the rtx, and
18009 CODE is the operand print code from the output string. */
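/* As a loose example (assuming a SYMBOL_REF "foo"; the exact output varies
   with assembler dialect and target flags): a
   (const (unspec [foo] UNSPEC_GOTOFF)) operand comes out as "foo@GOTOFF",
   and a PLUS of foo and (const_int 4) is printed with the constant first,
   as "4+foo".  */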
18011 static void
18012 output_pic_addr_const (FILE *file, rtx x, int code)
18014 char buf[256];
18016 switch (GET_CODE (x))
18018 case PC:
18019 gcc_assert (flag_pic);
18020 putc ('.', file);
18021 break;
18023 case SYMBOL_REF:
18024 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
18025 output_addr_const (file, x);
18026 else
18028 const char *name = XSTR (x, 0);
18030 /* Mark the decl as referenced so that cgraph will
18031 output the function. */
18032 if (SYMBOL_REF_DECL (x))
18033 mark_decl_referenced (SYMBOL_REF_DECL (x));
18035 #if TARGET_MACHO
18036 if (MACHOPIC_INDIRECT
18037 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
18038 name = machopic_indirection_name (x, /*stub_p=*/true);
18039 #endif
18040 assemble_name (file, name);
18042 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
18043 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
18044 fputs ("@PLT", file);
18045 break;
18047 case LABEL_REF:
18048 x = XEXP (x, 0);
18049 /* FALLTHRU */
18050 case CODE_LABEL:
18051 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
18052 assemble_name (asm_out_file, buf);
18053 break;
18055 case CONST_INT:
18056 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
18057 break;
18059 case CONST:
18060 /* This used to output parentheses around the expression,
18061 but that does not work on the 386 (either ATT or BSD assembler). */
18062 output_pic_addr_const (file, XEXP (x, 0), code);
18063 break;
18065 case CONST_DOUBLE:
18066 /* We can't handle floating point constants;
18067 TARGET_PRINT_OPERAND must handle them. */
18068 output_operand_lossage ("floating constant misused");
18069 break;
18071 case PLUS:
18072 /* Some assemblers need integer constants to appear first. */
18073 if (CONST_INT_P (XEXP (x, 0)))
18075 output_pic_addr_const (file, XEXP (x, 0), code);
18076 putc ('+', file);
18077 output_pic_addr_const (file, XEXP (x, 1), code);
18079 else
18081 gcc_assert (CONST_INT_P (XEXP (x, 1)));
18082 output_pic_addr_const (file, XEXP (x, 1), code);
18083 putc ('+', file);
18084 output_pic_addr_const (file, XEXP (x, 0), code);
18086 break;
18088 case MINUS:
18089 if (!TARGET_MACHO)
18090 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
18091 output_pic_addr_const (file, XEXP (x, 0), code);
18092 putc ('-', file);
18093 output_pic_addr_const (file, XEXP (x, 1), code);
18094 if (!TARGET_MACHO)
18095 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
18096 break;
18098 case UNSPEC:
18099 gcc_assert (XVECLEN (x, 0) == 1);
18100 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
18101 switch (XINT (x, 1))
18103 case UNSPEC_GOT:
18104 fputs ("@GOT", file);
18105 break;
18106 case UNSPEC_GOTOFF:
18107 fputs ("@GOTOFF", file);
18108 break;
18109 case UNSPEC_PLTOFF:
18110 fputs ("@PLTOFF", file);
18111 break;
18112 case UNSPEC_PCREL:
18113 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18114 "(%rip)" : "[rip]", file);
18115 break;
18116 case UNSPEC_GOTPCREL:
18117 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18118 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
18119 break;
18120 case UNSPEC_GOTTPOFF:
18121 /* FIXME: This might be @TPOFF in Sun ld too. */
18122 fputs ("@gottpoff", file);
18123 break;
18124 case UNSPEC_TPOFF:
18125 fputs ("@tpoff", file);
18126 break;
18127 case UNSPEC_NTPOFF:
18128 if (TARGET_64BIT)
18129 fputs ("@tpoff", file);
18130 else
18131 fputs ("@ntpoff", file);
18132 break;
18133 case UNSPEC_DTPOFF:
18134 fputs ("@dtpoff", file);
18135 break;
18136 case UNSPEC_GOTNTPOFF:
18137 if (TARGET_64BIT)
18138 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18139 "@gottpoff(%rip)": "@gottpoff[rip]", file);
18140 else
18141 fputs ("@gotntpoff", file);
18142 break;
18143 case UNSPEC_INDNTPOFF:
18144 fputs ("@indntpoff", file);
18145 break;
18146 #if TARGET_MACHO
18147 case UNSPEC_MACHOPIC_OFFSET:
18148 putc ('-', file);
18149 machopic_output_function_base_name (file);
18150 break;
18151 #endif
18152 default:
18153 output_operand_lossage ("invalid UNSPEC as operand");
18154 break;
18156 break;
18158 default:
18159 output_operand_lossage ("invalid expression as operand");
18163 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
18164 We need to emit DTP-relative relocations. */
18166 static void ATTRIBUTE_UNUSED
18167 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
18169 fputs (ASM_LONG, file);
18170 output_addr_const (file, x);
18171 fputs ("@dtpoff", file);
18172 switch (size)
18174 case 4:
18175 break;
18176 case 8:
18177 fputs (", 0", file);
18178 break;
18179 default:
18180 gcc_unreachable ();
18184 /* Return true if X is a representation of the PIC register. This copes
18185 with calls from ix86_find_base_term, where the register might have
18186 been replaced by a cselib value. */
18188 static bool
18189 ix86_pic_register_p (rtx x)
18191 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
18192 return (pic_offset_table_rtx
18193 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
18194 else if (!REG_P (x))
18195 return false;
18196 else if (pic_offset_table_rtx)
18198 if (REGNO (x) == REGNO (pic_offset_table_rtx))
18199 return true;
18200 if (HARD_REGISTER_P (x)
18201 && !HARD_REGISTER_P (pic_offset_table_rtx)
18202 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
18203 return true;
18204 return false;
18206 else
18207 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
18210 /* Helper function for ix86_delegitimize_address.
18211 Attempt to delegitimize TLS local-exec accesses. */
18213 static rtx
18214 ix86_delegitimize_tls_address (rtx orig_x)
18216 rtx x = orig_x, unspec;
18217 struct ix86_address addr;
18219 if (!TARGET_TLS_DIRECT_SEG_REFS)
18220 return orig_x;
18221 if (MEM_P (x))
18222 x = XEXP (x, 0);
18223 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
18224 return orig_x;
18225 if (ix86_decompose_address (x, &addr) == 0
18226 || addr.seg != DEFAULT_TLS_SEG_REG
18227 || addr.disp == NULL_RTX
18228 || GET_CODE (addr.disp) != CONST)
18229 return orig_x;
18230 unspec = XEXP (addr.disp, 0);
18231 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
18232 unspec = XEXP (unspec, 0);
18233 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
18234 return orig_x;
18235 x = XVECEXP (unspec, 0, 0);
18236 gcc_assert (GET_CODE (x) == SYMBOL_REF);
18237 if (unspec != XEXP (addr.disp, 0))
18238 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
18239 if (addr.index)
18241 rtx idx = addr.index;
18242 if (addr.scale != 1)
18243 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
18244 x = gen_rtx_PLUS (Pmode, idx, x);
18246 if (addr.base)
18247 x = gen_rtx_PLUS (Pmode, addr.base, x);
18248 if (MEM_P (orig_x))
18249 x = replace_equiv_address_nv (orig_x, x);
18250 return x;
18253 /* In the name of slightly smaller debug output, and to cater to
18254 general assembler lossage, recognize PIC+GOTOFF and turn it back
18255 into a direct symbol reference.
18257 On Darwin, this is necessary to avoid a crash, because Darwin
18258 has a different PIC label for each routine but the DWARF debugging
18259 information is not associated with any particular routine, so it's
18260 necessary to remove references to the PIC label from RTL stored by
18261 the DWARF output code.
18263 This helper is used in the normal ix86_delegitimize_address
18264 entrypoint (e.g. used in the target delegitimization hook) and
18265 in ix86_find_base_term. As a compile-time memory optimization, we
18266 avoid allocating rtxes that will not change the outcome for the
18267 callers (find_base_value and find_base_term). */
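/* A sketch of the typical -m32 case handled below (hypothetical operands):
   an rtx of the form
     (plus (reg ebx) (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)))
   is reduced back to (symbol_ref "foo"), so debug output and alias analysis
   see the symbol rather than the PIC-register arithmetic.  */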
18269 static inline rtx
18270 ix86_delegitimize_address_1 (rtx x, bool base_term_p)
18272 rtx orig_x = delegitimize_mem_from_attrs (x);
18273 /* addend is NULL or some rtx if x is something+GOTOFF where
18274 something doesn't include the PIC register. */
18275 rtx addend = NULL_RTX;
18276 /* reg_addend is NULL or a multiple of some register. */
18277 rtx reg_addend = NULL_RTX;
18278 /* const_addend is NULL or a const_int. */
18279 rtx const_addend = NULL_RTX;
18280 /* This is the result, or NULL. */
18281 rtx result = NULL_RTX;
18283 x = orig_x;
18285 if (MEM_P (x))
18286 x = XEXP (x, 0);
18288 if (TARGET_64BIT)
18290 if (GET_CODE (x) == CONST
18291 && GET_CODE (XEXP (x, 0)) == PLUS
18292 && GET_MODE (XEXP (x, 0)) == Pmode
18293 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
18294 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
18295 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
18297 /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
18298 base. A CONST can't be arg_pointer_rtx based. */
18299 if (base_term_p && MEM_P (orig_x))
18300 return orig_x;
18301 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
18302 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
18303 if (MEM_P (orig_x))
18304 x = replace_equiv_address_nv (orig_x, x);
18305 return x;
18308 if (GET_CODE (x) == CONST
18309 && GET_CODE (XEXP (x, 0)) == UNSPEC
18310 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
18311 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
18312 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
18314 x = XVECEXP (XEXP (x, 0), 0, 0);
18315 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
18317 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
18318 if (x == NULL_RTX)
18319 return orig_x;
18321 return x;
18324 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
18325 return ix86_delegitimize_tls_address (orig_x);
18327 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
18328 and -mcmodel=medium -fpic. */
18331 if (GET_CODE (x) != PLUS
18332 || GET_CODE (XEXP (x, 1)) != CONST)
18333 return ix86_delegitimize_tls_address (orig_x);
18335 if (ix86_pic_register_p (XEXP (x, 0)))
18336 /* %ebx + GOT/GOTOFF */
18338 else if (GET_CODE (XEXP (x, 0)) == PLUS)
18340 /* %ebx + %reg * scale + GOT/GOTOFF */
18341 reg_addend = XEXP (x, 0);
18342 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
18343 reg_addend = XEXP (reg_addend, 1);
18344 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
18345 reg_addend = XEXP (reg_addend, 0);
18346 else
18348 reg_addend = NULL_RTX;
18349 addend = XEXP (x, 0);
18352 else
18353 addend = XEXP (x, 0);
18355 x = XEXP (XEXP (x, 1), 0);
18356 if (GET_CODE (x) == PLUS
18357 && CONST_INT_P (XEXP (x, 1)))
18359 const_addend = XEXP (x, 1);
18360 x = XEXP (x, 0);
18363 if (GET_CODE (x) == UNSPEC
18364 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
18365 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
18366 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
18367 && !MEM_P (orig_x) && !addend)))
18368 result = XVECEXP (x, 0, 0);
18370 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
18371 && !MEM_P (orig_x))
18372 result = XVECEXP (x, 0, 0);
18374 if (! result)
18375 return ix86_delegitimize_tls_address (orig_x);
18377 /* For (PLUS something CONST_INT) both find_base_{value,term} just
18378 recurse on the first operand. */
18379 if (const_addend && !base_term_p)
18380 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
18381 if (reg_addend)
18382 result = gen_rtx_PLUS (Pmode, reg_addend, result);
18383 if (addend)
18385 /* If the rest of original X doesn't involve the PIC register, add
18386 addend and subtract pic_offset_table_rtx. This can happen e.g.
18387 for code like:
18388 leal (%ebx, %ecx, 4), %ecx
18390 movl foo@GOTOFF(%ecx), %edx
18391 in which case we return (%ecx - %ebx) + foo
18392 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
18393 and reload has completed. */
18394 if (pic_offset_table_rtx
18395 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
18396 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
18397 pic_offset_table_rtx),
18398 result);
18399 else if (pic_offset_table_rtx && !TARGET_MACHO && !TARGET_VXWORKS_RTP)
18401 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
18402 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
18403 result = gen_rtx_PLUS (Pmode, tmp, result);
18405 else
18406 return orig_x;
18408 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
18410 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
18411 if (result == NULL_RTX)
18412 return orig_x;
18414 return result;
18417 /* The normal instantiation of the above template. */
18419 static rtx
18420 ix86_delegitimize_address (rtx x)
18422 return ix86_delegitimize_address_1 (x, false);
18425 /* If X is a machine specific address (i.e. a symbol or label being
18426 referenced as a displacement from the GOT implemented using an
18427 UNSPEC), then return the base term. Otherwise return X. */
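/* For instance, in the 64-bit case below an X of the form
   (const (unspec [(symbol_ref "foo")] UNSPEC_GOTPCREL)), possibly wrapped in
   a PLUS with a CONST_INT, yields (symbol_ref "foo") as the base term.  */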
18430 ix86_find_base_term (rtx x)
18432 rtx term;
18434 if (TARGET_64BIT)
18436 if (GET_CODE (x) != CONST)
18437 return x;
18438 term = XEXP (x, 0);
18439 if (GET_CODE (term) == PLUS
18440 && CONST_INT_P (XEXP (term, 1)))
18441 term = XEXP (term, 0);
18442 if (GET_CODE (term) != UNSPEC
18443 || (XINT (term, 1) != UNSPEC_GOTPCREL
18444 && XINT (term, 1) != UNSPEC_PCREL))
18445 return x;
18447 return XVECEXP (term, 0, 0);
18450 return ix86_delegitimize_address_1 (x, true);
18453 static void
18454 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
18455 bool fp, FILE *file)
18457 const char *suffix;
18459 if (mode == CCFPmode || mode == CCFPUmode)
18461 code = ix86_fp_compare_code_to_integer (code);
18462 mode = CCmode;
18464 if (reverse)
18465 code = reverse_condition (code);
18467 switch (code)
18469 case EQ:
18470 switch (mode)
18472 case E_CCAmode:
18473 suffix = "a";
18474 break;
18475 case E_CCCmode:
18476 suffix = "c";
18477 break;
18478 case E_CCOmode:
18479 suffix = "o";
18480 break;
18481 case E_CCPmode:
18482 suffix = "p";
18483 break;
18484 case E_CCSmode:
18485 suffix = "s";
18486 break;
18487 default:
18488 suffix = "e";
18489 break;
18491 break;
18492 case NE:
18493 switch (mode)
18495 case E_CCAmode:
18496 suffix = "na";
18497 break;
18498 case E_CCCmode:
18499 suffix = "nc";
18500 break;
18501 case E_CCOmode:
18502 suffix = "no";
18503 break;
18504 case E_CCPmode:
18505 suffix = "np";
18506 break;
18507 case E_CCSmode:
18508 suffix = "ns";
18509 break;
18510 default:
18511 suffix = "ne";
18512 break;
18514 break;
18515 case GT:
18516 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
18517 suffix = "g";
18518 break;
18519 case GTU:
18520 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
18521 Those same assemblers have the same but opposite lossage on cmov. */
18522 if (mode == CCmode)
18523 suffix = fp ? "nbe" : "a";
18524 else
18525 gcc_unreachable ();
18526 break;
18527 case LT:
18528 switch (mode)
18530 case E_CCNOmode:
18531 case E_CCGOCmode:
18532 suffix = "s";
18533 break;
18535 case E_CCmode:
18536 case E_CCGCmode:
18537 suffix = "l";
18538 break;
18540 default:
18541 gcc_unreachable ();
18543 break;
18544 case LTU:
18545 if (mode == CCmode)
18546 suffix = "b";
18547 else if (mode == CCCmode)
18548 suffix = fp ? "b" : "c";
18549 else
18550 gcc_unreachable ();
18551 break;
18552 case GE:
18553 switch (mode)
18555 case E_CCNOmode:
18556 case E_CCGOCmode:
18557 suffix = "ns";
18558 break;
18560 case E_CCmode:
18561 case E_CCGCmode:
18562 suffix = "ge";
18563 break;
18565 default:
18566 gcc_unreachable ();
18568 break;
18569 case GEU:
18570 if (mode == CCmode)
18571 suffix = "nb";
18572 else if (mode == CCCmode)
18573 suffix = fp ? "nb" : "nc";
18574 else
18575 gcc_unreachable ();
18576 break;
18577 case LE:
18578 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
18579 suffix = "le";
18580 break;
18581 case LEU:
18582 if (mode == CCmode)
18583 suffix = "be";
18584 else
18585 gcc_unreachable ();
18586 break;
18587 case UNORDERED:
18588 suffix = fp ? "u" : "p";
18589 break;
18590 case ORDERED:
18591 suffix = fp ? "nu" : "np";
18592 break;
18593 default:
18594 gcc_unreachable ();
18596 fputs (suffix, file);
18599 /* Print the name of register X to FILE based on its machine mode and number.
18600 If CODE is 'w', pretend the mode is HImode.
18601 If CODE is 'b', pretend the mode is QImode.
18602 If CODE is 'k', pretend the mode is SImode.
18603 If CODE is 'q', pretend the mode is DImode.
18604 If CODE is 'x', pretend the mode is V4SFmode.
18605 If CODE is 't', pretend the mode is V8SFmode.
18606 If CODE is 'g', pretend the mode is V16SFmode.
18607 If CODE is 'h', pretend the reg is the 'high' byte register.
18608 If CODE is 'y', print "st(0)" instead of "st", if the reg is a stack op.
18609 If CODE is 'd', duplicate the operand for an AVX instruction.
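   As a loose example (AT&T dialect, 64-bit target assumed): for the DImode
   register AX, code 0 prints "%rax", 'k' prints "%eax", 'w' prints "%ax",
   'b' prints "%al" and 'h' prints "%ah".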
18612 void
18613 print_reg (rtx x, int code, FILE *file)
18615 const char *reg;
18616 int msize;
18617 unsigned int regno;
18618 bool duplicated;
18620 if (ASSEMBLER_DIALECT == ASM_ATT)
18621 putc ('%', file);
18623 if (x == pc_rtx)
18625 gcc_assert (TARGET_64BIT);
18626 fputs ("rip", file);
18627 return;
18630 if (code == 'y' && STACK_TOP_P (x))
18632 fputs ("st(0)", file);
18633 return;
18636 if (code == 'w')
18637 msize = 2;
18638 else if (code == 'b')
18639 msize = 1;
18640 else if (code == 'k')
18641 msize = 4;
18642 else if (code == 'q')
18643 msize = 8;
18644 else if (code == 'h')
18645 msize = 0;
18646 else if (code == 'x')
18647 msize = 16;
18648 else if (code == 't')
18649 msize = 32;
18650 else if (code == 'g')
18651 msize = 64;
18652 else
18653 msize = GET_MODE_SIZE (GET_MODE (x));
18655 regno = REGNO (x);
18657 if (regno == ARG_POINTER_REGNUM
18658 || regno == FRAME_POINTER_REGNUM
18659 || regno == FPSR_REG
18660 || regno == FPCR_REG)
18662 output_operand_lossage
18663 ("invalid use of register '%s'", reg_names[regno]);
18664 return;
18666 else if (regno == FLAGS_REG)
18668 output_operand_lossage ("invalid use of asm flag output");
18669 return;
18672 duplicated = code == 'd' && TARGET_AVX;
18674 switch (msize)
18676 case 16:
18677 case 12:
18678 case 8:
18679 if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode))
18680 warning (0, "unsupported size for integer register");
18681 /* FALLTHRU */
18682 case 4:
18683 if (LEGACY_INT_REGNO_P (regno))
18684 putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file);
18685 /* FALLTHRU */
18686 case 2:
18687 normal:
18688 reg = hi_reg_name[regno];
18689 break;
18690 case 1:
18691 if (regno >= ARRAY_SIZE (qi_reg_name))
18692 goto normal;
18693 if (!ANY_QI_REGNO_P (regno))
18694 error ("unsupported size for integer register");
18695 reg = qi_reg_name[regno];
18696 break;
18697 case 0:
18698 if (regno >= ARRAY_SIZE (qi_high_reg_name))
18699 goto normal;
18700 reg = qi_high_reg_name[regno];
18701 break;
18702 case 32:
18703 case 64:
18704 if (SSE_REGNO_P (regno))
18706 gcc_assert (!duplicated);
18707 putc (msize == 32 ? 'y' : 'z', file);
18708 reg = hi_reg_name[regno] + 1;
18709 break;
18711 goto normal;
18712 default:
18713 gcc_unreachable ();
18716 fputs (reg, file);
18718 /* Irritatingly, AMD extended registers use a
18719 different naming convention: "r%d[bwd]". */
18720 if (REX_INT_REGNO_P (regno))
18722 gcc_assert (TARGET_64BIT);
18723 switch (msize)
18725 case 0:
18726 error ("extended registers have no high halves");
18727 break;
18728 case 1:
18729 putc ('b', file);
18730 break;
18731 case 2:
18732 putc ('w', file);
18733 break;
18734 case 4:
18735 putc ('d', file);
18736 break;
18737 case 8:
18738 /* no suffix */
18739 break;
18740 default:
18741 error ("unsupported operand size for extended register");
18742 break;
18744 return;
18747 if (duplicated)
18749 if (ASSEMBLER_DIALECT == ASM_ATT)
18750 fprintf (file, ", %%%s", reg);
18751 else
18752 fprintf (file, ", %s", reg);
18756 /* Meaning of CODE:
18757 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
18758 C -- print opcode suffix for set/cmov insn.
18759 c -- like C, but print reversed condition
18760 F,f -- likewise, but for floating-point.
18761 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
18762 otherwise nothing
18763 R -- print embedded rounding and sae.
18764 r -- print only sae.
18765 z -- print the opcode suffix for the size of the current operand.
18766 Z -- likewise, with special suffixes for x87 instructions.
18767 * -- print a star (in certain assembler syntax)
18768 A -- print an absolute memory reference.
18769 E -- print address with DImode register names if TARGET_64BIT.
18770 w -- print the operand as if it's a "word" (HImode) even if it isn't.
18771 s -- print a shift double count, followed by the assembler's argument
18772 delimiter.
18773 b -- print the QImode name of the register for the indicated operand.
18774 %b0 would print %al if operands[0] is reg 0.
18775 w -- likewise, print the HImode name of the register.
18776 k -- likewise, print the SImode name of the register.
18777 q -- likewise, print the DImode name of the register.
18778 x -- likewise, print the V4SFmode name of the register.
18779 t -- likewise, print the V8SFmode name of the register.
18780 g -- likewise, print the V16SFmode name of the register.
18781 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
18782 y -- print "st(0)" instead of "st" as a register.
18783 d -- print duplicated register operand for AVX instruction.
18784 D -- print condition for SSE cmp instruction.
18785 P -- if PIC, print an @PLT suffix.
18786 p -- print raw symbol name.
18787 X -- don't print any sort of PIC '@' suffix for a symbol.
18788 & -- print some in-use local-dynamic symbol name.
18789 H -- print a memory address offset by 8; used for sse high-parts
18790 Y -- print condition for XOP pcom* instruction.
18791 + -- print a branch hint as 'cs' or 'ds' prefix
18792 ; -- print a semicolon (after prefixes due to bug in older gas).
18793 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
18794 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
18795 ! -- print MPX prefix for jxx/call/ret instructions if required.
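   As a loose example of the size-suffix codes, a template such as
   "mov%z0\t{%1, %0|%0, %1}" with an SImode operand 0 becomes "movl ..." in
   AT&T syntax, while the %z expansion is suppressed for the Intel dialect,
   whose mnemonics carry no size suffix.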
18798 void
18799 ix86_print_operand (FILE *file, rtx x, int code)
18801 if (code)
18803 switch (code)
18805 case 'A':
18806 switch (ASSEMBLER_DIALECT)
18808 case ASM_ATT:
18809 putc ('*', file);
18810 break;
18812 case ASM_INTEL:
18813 /* Intel syntax. For absolute addresses, registers should not
18814 be surrounded by brackets. */
18815 if (!REG_P (x))
18817 putc ('[', file);
18818 ix86_print_operand (file, x, 0);
18819 putc (']', file);
18820 return;
18822 break;
18824 default:
18825 gcc_unreachable ();
18828 ix86_print_operand (file, x, 0);
18829 return;
18831 case 'E':
18832 /* Wrap address in an UNSPEC to declare special handling. */
18833 if (TARGET_64BIT)
18834 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
18836 output_address (VOIDmode, x);
18837 return;
18839 case 'L':
18840 if (ASSEMBLER_DIALECT == ASM_ATT)
18841 putc ('l', file);
18842 return;
18844 case 'W':
18845 if (ASSEMBLER_DIALECT == ASM_ATT)
18846 putc ('w', file);
18847 return;
18849 case 'B':
18850 if (ASSEMBLER_DIALECT == ASM_ATT)
18851 putc ('b', file);
18852 return;
18854 case 'Q':
18855 if (ASSEMBLER_DIALECT == ASM_ATT)
18856 putc ('l', file);
18857 return;
18859 case 'S':
18860 if (ASSEMBLER_DIALECT == ASM_ATT)
18861 putc ('s', file);
18862 return;
18864 case 'T':
18865 if (ASSEMBLER_DIALECT == ASM_ATT)
18866 putc ('t', file);
18867 return;
18869 case 'O':
18870 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
18871 if (ASSEMBLER_DIALECT != ASM_ATT)
18872 return;
18874 switch (GET_MODE_SIZE (GET_MODE (x)))
18876 case 2:
18877 putc ('w', file);
18878 break;
18880 case 4:
18881 putc ('l', file);
18882 break;
18884 case 8:
18885 putc ('q', file);
18886 break;
18888 default:
18889 output_operand_lossage ("invalid operand size for operand "
18890 "code 'O'");
18891 return;
18894 putc ('.', file);
18895 #endif
18896 return;
18898 case 'z':
18899 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18901 /* Opcodes don't get size suffixes when using the Intel syntax. */
18902 if (ASSEMBLER_DIALECT == ASM_INTEL)
18903 return;
18905 switch (GET_MODE_SIZE (GET_MODE (x)))
18907 case 1:
18908 putc ('b', file);
18909 return;
18911 case 2:
18912 putc ('w', file);
18913 return;
18915 case 4:
18916 putc ('l', file);
18917 return;
18919 case 8:
18920 putc ('q', file);
18921 return;
18923 default:
18924 output_operand_lossage ("invalid operand size for operand "
18925 "code 'z'");
18926 return;
18930 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18931 warning (0, "non-integer operand used with operand code 'z'");
18932 /* FALLTHRU */
18934 case 'Z':
18935 /* 387 opcodes don't get size suffixes when using the Intel syntax. */
18936 if (ASSEMBLER_DIALECT == ASM_INTEL)
18937 return;
18939 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18941 switch (GET_MODE_SIZE (GET_MODE (x)))
18943 case 2:
18944 #ifdef HAVE_AS_IX86_FILDS
18945 putc ('s', file);
18946 #endif
18947 return;
18949 case 4:
18950 putc ('l', file);
18951 return;
18953 case 8:
18954 #ifdef HAVE_AS_IX86_FILDQ
18955 putc ('q', file);
18956 #else
18957 fputs ("ll", file);
18958 #endif
18959 return;
18961 default:
18962 break;
18965 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18967 /* 387 opcodes don't get size suffixes
18968 if the operands are registers. */
18969 if (STACK_REG_P (x))
18970 return;
18972 switch (GET_MODE_SIZE (GET_MODE (x)))
18974 case 4:
18975 putc ('s', file);
18976 return;
18978 case 8:
18979 putc ('l', file);
18980 return;
18982 case 12:
18983 case 16:
18984 putc ('t', file);
18985 return;
18987 default:
18988 break;
18991 else
18993 output_operand_lossage ("invalid operand type used with "
18994 "operand code 'Z'");
18995 return;
18998 output_operand_lossage ("invalid operand size for operand code 'Z'");
18999 return;
19001 case 'd':
19002 case 'b':
19003 case 'w':
19004 case 'k':
19005 case 'q':
19006 case 'h':
19007 case 't':
19008 case 'g':
19009 case 'y':
19010 case 'x':
19011 case 'X':
19012 case 'P':
19013 case 'p':
19014 break;
19016 case 's':
19017 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
19019 ix86_print_operand (file, x, 0);
19020 fputs (", ", file);
19022 return;
19024 case 'Y':
19025 switch (GET_CODE (x))
19027 case NE:
19028 fputs ("neq", file);
19029 break;
19030 case EQ:
19031 fputs ("eq", file);
19032 break;
19033 case GE:
19034 case GEU:
19035 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
19036 break;
19037 case GT:
19038 case GTU:
19039 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
19040 break;
19041 case LE:
19042 case LEU:
19043 fputs ("le", file);
19044 break;
19045 case LT:
19046 case LTU:
19047 fputs ("lt", file);
19048 break;
19049 case UNORDERED:
19050 fputs ("unord", file);
19051 break;
19052 case ORDERED:
19053 fputs ("ord", file);
19054 break;
19055 case UNEQ:
19056 fputs ("ueq", file);
19057 break;
19058 case UNGE:
19059 fputs ("nlt", file);
19060 break;
19061 case UNGT:
19062 fputs ("nle", file);
19063 break;
19064 case UNLE:
19065 fputs ("ule", file);
19066 break;
19067 case UNLT:
19068 fputs ("ult", file);
19069 break;
19070 case LTGT:
19071 fputs ("une", file);
19072 break;
19073 default:
19074 output_operand_lossage ("operand is not a condition code, "
19075 "invalid operand code 'Y'");
19076 return;
19078 return;
19080 case 'D':
19081 /* A little bit of braindamage here. The SSE compare instructions
19082 use completely different names for the comparisons than the
19083 fp conditional moves do. */
19084 switch (GET_CODE (x))
19086 case UNEQ:
19087 if (TARGET_AVX)
19089 fputs ("eq_us", file);
19090 break;
19092 /* FALLTHRU */
19093 case EQ:
19094 fputs ("eq", file);
19095 break;
19096 case UNLT:
19097 if (TARGET_AVX)
19099 fputs ("nge", file);
19100 break;
19102 /* FALLTHRU */
19103 case LT:
19104 fputs ("lt", file);
19105 break;
19106 case UNLE:
19107 if (TARGET_AVX)
19109 fputs ("ngt", file);
19110 break;
19112 /* FALLTHRU */
19113 case LE:
19114 fputs ("le", file);
19115 break;
19116 case UNORDERED:
19117 fputs ("unord", file);
19118 break;
19119 case LTGT:
19120 if (TARGET_AVX)
19122 fputs ("neq_oq", file);
19123 break;
19125 /* FALLTHRU */
19126 case NE:
19127 fputs ("neq", file);
19128 break;
19129 case GE:
19130 if (TARGET_AVX)
19132 fputs ("ge", file);
19133 break;
19135 /* FALLTHRU */
19136 case UNGE:
19137 fputs ("nlt", file);
19138 break;
19139 case GT:
19140 if (TARGET_AVX)
19142 fputs ("gt", file);
19143 break;
19145 /* FALLTHRU */
19146 case UNGT:
19147 fputs ("nle", file);
19148 break;
19149 case ORDERED:
19150 fputs ("ord", file);
19151 break;
19152 default:
19153 output_operand_lossage ("operand is not a condition code, "
19154 "invalid operand code 'D'");
19155 return;
19157 return;
19159 case 'F':
19160 case 'f':
19161 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
19162 if (ASSEMBLER_DIALECT == ASM_ATT)
19163 putc ('.', file);
19164 gcc_fallthrough ();
19165 #endif
19167 case 'C':
19168 case 'c':
19169 if (!COMPARISON_P (x))
19171 output_operand_lossage ("operand is not a condition code, "
19172 "invalid operand code '%c'", code);
19173 return;
19175 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
19176 code == 'c' || code == 'f',
19177 code == 'F' || code == 'f',
19178 file);
19179 return;
19181 case 'H':
19182 if (!offsettable_memref_p (x))
19184 output_operand_lossage ("operand is not an offsettable memory "
19185 "reference, invalid operand code 'H'");
19186 return;
19188 /* It doesn't actually matter what mode we use here, as we're
19189 only going to use this for printing. */
19190 x = adjust_address_nv (x, DImode, 8);
19191 /* Output 'qword ptr' for intel assembler dialect. */
19192 if (ASSEMBLER_DIALECT == ASM_INTEL)
19193 code = 'q';
19194 break;
19196 case 'K':
19197 if (!CONST_INT_P (x))
19199 output_operand_lossage ("operand is not an integer, invalid "
19200 "operand code 'K'");
19201 return;
19204 if (INTVAL (x) & IX86_HLE_ACQUIRE)
19205 #ifdef HAVE_AS_IX86_HLE
19206 fputs ("xacquire ", file);
19207 #else
19208 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
19209 #endif
19210 else if (INTVAL (x) & IX86_HLE_RELEASE)
19211 #ifdef HAVE_AS_IX86_HLE
19212 fputs ("xrelease ", file);
19213 #else
19214 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
19215 #endif
19216 /* We do not want to print the value of the operand. */
19217 return;
19219 case 'N':
19220 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
19221 fputs ("{z}", file);
19222 return;
19224 case 'r':
19225 if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
19227 output_operand_lossage ("operand is not a specific integer, "
19228 "invalid operand code 'r'");
19229 return;
19232 if (ASSEMBLER_DIALECT == ASM_INTEL)
19233 fputs (", ", file);
19235 fputs ("{sae}", file);
19237 if (ASSEMBLER_DIALECT == ASM_ATT)
19238 fputs (", ", file);
19240 return;
19242 case 'R':
19243 if (!CONST_INT_P (x))
19245 output_operand_lossage ("operand is not an integer, invalid "
19246 "operand code 'R'");
19247 return;
19250 if (ASSEMBLER_DIALECT == ASM_INTEL)
19251 fputs (", ", file);
19253 switch (INTVAL (x))
19255 case ROUND_NEAREST_INT | ROUND_SAE:
19256 fputs ("{rn-sae}", file);
19257 break;
19258 case ROUND_NEG_INF | ROUND_SAE:
19259 fputs ("{rd-sae}", file);
19260 break;
19261 case ROUND_POS_INF | ROUND_SAE:
19262 fputs ("{ru-sae}", file);
19263 break;
19264 case ROUND_ZERO | ROUND_SAE:
19265 fputs ("{rz-sae}", file);
19266 break;
19267 default:
19268 output_operand_lossage ("operand is not a specific integer, "
19269 "invalid operand code 'R'");
19272 if (ASSEMBLER_DIALECT == ASM_ATT)
19273 fputs (", ", file);
19275 return;
19277 case '*':
19278 if (ASSEMBLER_DIALECT == ASM_ATT)
19279 putc ('*', file);
19280 return;
19282 case '&':
19284 const char *name = get_some_local_dynamic_name ();
19285 if (name == NULL)
19286 output_operand_lossage ("'%%&' used without any "
19287 "local dynamic TLS references");
19288 else
19289 assemble_name (file, name);
19290 return;
19293 case '+':
19295 rtx x;
19297 if (!optimize
19298 || optimize_function_for_size_p (cfun)
19299 || !TARGET_BRANCH_PREDICTION_HINTS)
19300 return;
19302 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
19303 if (x)
19305 int pred_val = profile_probability::from_reg_br_prob_note
19306 (XINT (x, 0)).to_reg_br_prob_base ();
19308 if (pred_val < REG_BR_PROB_BASE * 45 / 100
19309 || pred_val > REG_BR_PROB_BASE * 55 / 100)
19311 bool taken = pred_val > REG_BR_PROB_BASE / 2;
19312 bool cputaken
19313 = final_forward_branch_p (current_output_insn) == 0;
19315 /* Emit hints only in cases where the default branch prediction
19316 heuristics would fail. */
19317 if (taken != cputaken)
19319 /* We use 3e (DS) prefix for taken branches and
19320 2e (CS) prefix for not taken branches. */
19321 if (taken)
19322 fputs ("ds ; ", file);
19323 else
19324 fputs ("cs ; ", file);
19328 return;
19331 case ';':
19332 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
19333 putc (';', file);
19334 #endif
19335 return;
19337 case '~':
19338 putc (TARGET_AVX2 ? 'i' : 'f', file);
19339 return;
19341 case '^':
19342 if (TARGET_64BIT && Pmode != word_mode)
19343 fputs ("addr32 ", file);
19344 return;
19346 case '!':
19347 if (ix86_bnd_prefixed_insn_p (current_output_insn))
19348 fputs ("bnd ", file);
19349 return;
19351 default:
19352 output_operand_lossage ("invalid operand code '%c'", code);
19356 if (REG_P (x))
19357 print_reg (x, code, file);
19359 else if (MEM_P (x))
19361 rtx addr = XEXP (x, 0);
19363 /* No `byte ptr' prefix for call instructions ... */
19364 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
19366 machine_mode mode = GET_MODE (x);
19367 const char *size;
19369 /* Check for explicit size override codes. */
19370 if (code == 'b')
19371 size = "BYTE";
19372 else if (code == 'w')
19373 size = "WORD";
19374 else if (code == 'k')
19375 size = "DWORD";
19376 else if (code == 'q')
19377 size = "QWORD";
19378 else if (code == 'x')
19379 size = "XMMWORD";
19380 else if (code == 't')
19381 size = "YMMWORD";
19382 else if (code == 'g')
19383 size = "ZMMWORD";
19384 else if (mode == BLKmode)
19385 /* ... or BLKmode operands, when not overridden. */
19386 size = NULL;
19387 else
19388 switch (GET_MODE_SIZE (mode))
19390 case 1: size = "BYTE"; break;
19391 case 2: size = "WORD"; break;
19392 case 4: size = "DWORD"; break;
19393 case 8: size = "QWORD"; break;
19394 case 12: size = "TBYTE"; break;
19395 case 16:
19396 if (mode == XFmode)
19397 size = "TBYTE";
19398 else
19399 size = "XMMWORD";
19400 break;
19401 case 32: size = "YMMWORD"; break;
19402 case 64: size = "ZMMWORD"; break;
19403 default:
19404 gcc_unreachable ();
19406 if (size)
19408 fputs (size, file);
19409 fputs (" PTR ", file);
19413 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
19414 output_operand_lossage ("invalid constraints for operand");
19415 else
19416 ix86_print_operand_address_as
19417 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
19420 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
19422 long l;
19424 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
19426 if (ASSEMBLER_DIALECT == ASM_ATT)
19427 putc ('$', file);
19428 /* Sign extend 32bit SFmode immediate to 8 bytes. */
19429 if (code == 'q')
19430 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
19431 (unsigned long long) (int) l);
19432 else
19433 fprintf (file, "0x%08x", (unsigned int) l);
19436 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
19438 long l[2];
19440 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
19442 if (ASSEMBLER_DIALECT == ASM_ATT)
19443 putc ('$', file);
19444 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
19447 /* These float cases don't actually occur as immediate operands. */
19448 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
19450 char dstr[30];
19452 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
19453 fputs (dstr, file);
19456 else
19458 /* We have patterns that allow zero sets of memory, for instance.
19459 In 64-bit mode, we should probably support all 8-byte vectors,
19460 since we can in fact encode that into an immediate. */
19461 if (GET_CODE (x) == CONST_VECTOR)
19463 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
19464 x = const0_rtx;
19467 if (code != 'P' && code != 'p')
19469 if (CONST_INT_P (x))
19471 if (ASSEMBLER_DIALECT == ASM_ATT)
19472 putc ('$', file);
19474 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
19475 || GET_CODE (x) == LABEL_REF)
19477 if (ASSEMBLER_DIALECT == ASM_ATT)
19478 putc ('$', file);
19479 else
19480 fputs ("OFFSET FLAT:", file);
19483 if (CONST_INT_P (x))
19484 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
19485 else if (flag_pic || MACHOPIC_INDIRECT)
19486 output_pic_addr_const (file, x, code);
19487 else
19488 output_addr_const (file, x);
19492 static bool
19493 ix86_print_operand_punct_valid_p (unsigned char code)
19495 return (code == '*' || code == '+' || code == '&' || code == ';'
19496 || code == '~' || code == '^' || code == '!');
19499 /* Print a memory operand whose address is ADDR. */
19501 static void
19502 ix86_print_operand_address_as (FILE *file, rtx addr,
19503 addr_space_t as, bool no_rip)
19505 struct ix86_address parts;
19506 rtx base, index, disp;
19507 int scale;
19508 int ok;
19509 bool vsib = false;
19510 int code = 0;
19512 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
19514 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
19515 gcc_assert (parts.index == NULL_RTX);
19516 parts.index = XVECEXP (addr, 0, 1);
19517 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
19518 addr = XVECEXP (addr, 0, 0);
19519 vsib = true;
19521 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
19523 gcc_assert (TARGET_64BIT);
19524 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
19525 code = 'q';
19527 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
19529 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
19530 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
19531 if (parts.base != NULL_RTX)
19533 parts.index = parts.base;
19534 parts.scale = 1;
19536 parts.base = XVECEXP (addr, 0, 0);
19537 addr = XVECEXP (addr, 0, 0);
19539 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
19541 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
19542 gcc_assert (parts.index == NULL_RTX);
19543 parts.index = XVECEXP (addr, 0, 1);
19544 addr = XVECEXP (addr, 0, 0);
19546 else
19547 ok = ix86_decompose_address (addr, &parts);
19549 gcc_assert (ok);
19551 base = parts.base;
19552 index = parts.index;
19553 disp = parts.disp;
19554 scale = parts.scale;
19556 if (ADDR_SPACE_GENERIC_P (as))
19557 as = parts.seg;
19558 else
19559 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
19561 if (!ADDR_SPACE_GENERIC_P (as))
19563 const char *string;
19565 if (as == ADDR_SPACE_SEG_FS)
19566 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
19567 else if (as == ADDR_SPACE_SEG_GS)
19568 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
19569 else
19570 gcc_unreachable ();
19571 fputs (string, file);
19574 /* Use one byte shorter RIP relative addressing for 64bit mode. */
19575 if (TARGET_64BIT && !base && !index && !no_rip)
19577 rtx symbol = disp;
19579 if (GET_CODE (disp) == CONST
19580 && GET_CODE (XEXP (disp, 0)) == PLUS
19581 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
19582 symbol = XEXP (XEXP (disp, 0), 0);
19584 if (GET_CODE (symbol) == LABEL_REF
19585 || (GET_CODE (symbol) == SYMBOL_REF
19586 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
19587 base = pc_rtx;
19590 if (!base && !index)
19592 /* A displacement-only address requires special attention. */
19593 if (CONST_INT_P (disp))
19595 if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as))
19596 fputs ("ds:", file);
19597 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
19599 /* Load the external function address via the GOT slot to avoid PLT. */
19600 else if (GET_CODE (disp) == CONST
19601 && GET_CODE (XEXP (disp, 0)) == UNSPEC
19602 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
19603 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
19604 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
19605 output_pic_addr_const (file, disp, 0);
19606 else if (flag_pic)
19607 output_pic_addr_const (file, disp, 0);
19608 else
19609 output_addr_const (file, disp);
19611 else
19613 /* Print SImode register names to force addr32 prefix. */
19614 if (SImode_address_operand (addr, VOIDmode))
19616 if (flag_checking)
19618 gcc_assert (TARGET_64BIT);
19619 switch (GET_CODE (addr))
19621 case SUBREG:
19622 gcc_assert (GET_MODE (addr) == SImode);
19623 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
19624 break;
19625 case ZERO_EXTEND:
19626 case AND:
19627 gcc_assert (GET_MODE (addr) == DImode);
19628 break;
19629 default:
19630 gcc_unreachable ();
19633 gcc_assert (!code);
19634 code = 'k';
19636 else if (code == 0
19637 && TARGET_X32
19638 && disp
19639 && CONST_INT_P (disp)
19640 && INTVAL (disp) < -16*1024*1024)
19642 /* X32 runs in 64-bit mode, where displacement, DISP, in
19643 address DISP(%r64), is encoded as 32-bit immediate sign-
19644 extended from 32-bit to 64-bit. For -0x40000300(%r64),
19645 address is %r64 + 0xffffffffbffffd00. When %r64 <
19646 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
19647 which is invalid for x32. The correct address is %r64
19648 - 0x40000300 == 0xf7ffdd64. To properly encode
19649 -0x40000300(%r64) for x32, we zero-extend negative
19650 displacement by forcing addr32 prefix which truncates
19651 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
19652 zero-extend all negative displacements, including -1(%rsp).
19653 However, for small negative displacements, sign-extension
19654 won't cause overflow. We only zero-extend negative
19655 displacements if they are < -16*1024*1024, which is also used
19656 to check legitimate address displacements for PIC. */
19657 code = 'k';
19660 if (ASSEMBLER_DIALECT == ASM_ATT)
19662 if (disp)
19664 if (flag_pic)
19665 output_pic_addr_const (file, disp, 0);
19666 else if (GET_CODE (disp) == LABEL_REF)
19667 output_asm_label (disp);
19668 else
19669 output_addr_const (file, disp);
19672 putc ('(', file);
19673 if (base)
19674 print_reg (base, code, file);
19675 if (index)
19677 putc (',', file);
19678 print_reg (index, vsib ? 0 : code, file);
19679 if (scale != 1 || vsib)
19680 fprintf (file, ",%d", scale);
19682 putc (')', file);
19684 else
19686 rtx offset = NULL_RTX;
19688 if (disp)
19690 /* Pull out the offset of a symbol; print any symbol itself. */
19691 if (GET_CODE (disp) == CONST
19692 && GET_CODE (XEXP (disp, 0)) == PLUS
19693 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
19695 offset = XEXP (XEXP (disp, 0), 1);
19696 disp = gen_rtx_CONST (VOIDmode,
19697 XEXP (XEXP (disp, 0), 0));
19700 if (flag_pic)
19701 output_pic_addr_const (file, disp, 0);
19702 else if (GET_CODE (disp) == LABEL_REF)
19703 output_asm_label (disp);
19704 else if (CONST_INT_P (disp))
19705 offset = disp;
19706 else
19707 output_addr_const (file, disp);
19710 putc ('[', file);
19711 if (base)
19713 print_reg (base, code, file);
19714 if (offset)
19716 if (INTVAL (offset) >= 0)
19717 putc ('+', file);
19718 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
19721 else if (offset)
19722 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
19723 else
19724 putc ('0', file);
19726 if (index)
19728 putc ('+', file);
19729 print_reg (index, vsib ? 0 : code, file);
19730 if (scale != 1 || vsib)
19731 fprintf (file, "*%d", scale);
19733 putc (']', file);
19738 static void
19739 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
19741 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
19744 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
19746 static bool
19747 i386_asm_output_addr_const_extra (FILE *file, rtx x)
19749 rtx op;
19751 if (GET_CODE (x) != UNSPEC)
19752 return false;
19754 op = XVECEXP (x, 0, 0);
19755 switch (XINT (x, 1))
19757 case UNSPEC_GOTTPOFF:
19758 output_addr_const (file, op);
19759 /* FIXME: This might be @TPOFF in Sun ld. */
19760 fputs ("@gottpoff", file);
19761 break;
19762 case UNSPEC_TPOFF:
19763 output_addr_const (file, op);
19764 fputs ("@tpoff", file);
19765 break;
19766 case UNSPEC_NTPOFF:
19767 output_addr_const (file, op);
19768 if (TARGET_64BIT)
19769 fputs ("@tpoff", file);
19770 else
19771 fputs ("@ntpoff", file);
19772 break;
19773 case UNSPEC_DTPOFF:
19774 output_addr_const (file, op);
19775 fputs ("@dtpoff", file);
19776 break;
19777 case UNSPEC_GOTNTPOFF:
19778 output_addr_const (file, op);
19779 if (TARGET_64BIT)
19780 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
19781 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
19782 else
19783 fputs ("@gotntpoff", file);
19784 break;
19785 case UNSPEC_INDNTPOFF:
19786 output_addr_const (file, op);
19787 fputs ("@indntpoff", file);
19788 break;
19789 #if TARGET_MACHO
19790 case UNSPEC_MACHOPIC_OFFSET:
19791 output_addr_const (file, op);
19792 putc ('-', file);
19793 machopic_output_function_base_name (file);
19794 break;
19795 #endif
19797 default:
19798 return false;
19801 return true;
19804 /* Split one or more double-mode RTL references into pairs of half-mode
19805 references. The RTL can be REG, offsettable MEM, integer constant, or
19806 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
19807 split and "num" is its length. lo_half and hi_half are output arrays
19808 that parallel "operands". */
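/* A minimal usage sketch (assumed caller context, not compiled in):  */
#if 0
  rtx ops[1] = { op };		/* op: some DImode REG or offsettable MEM */
  rtx lo[1], hi[1];
  split_double_mode (DImode, ops, 1, lo, hi);
  /* lo[0] is now the low SImode half of op, hi[0] the high half
     (at byte offset 4 when op is a MEM).  */
#endif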
19810 void
19811 split_double_mode (machine_mode mode, rtx operands[],
19812 int num, rtx lo_half[], rtx hi_half[])
19814 machine_mode half_mode;
19815 unsigned int byte;
19817 switch (mode)
19819 case E_TImode:
19820 half_mode = DImode;
19821 break;
19822 case E_DImode:
19823 half_mode = SImode;
19824 break;
19825 default:
19826 gcc_unreachable ();
19829 byte = GET_MODE_SIZE (half_mode);
19831 while (num--)
19833 rtx op = operands[num];
19835 /* simplify_subreg refuses to split volatile memory addresses,
19836 but we still have to handle them. */
19837 if (MEM_P (op))
19839 lo_half[num] = adjust_address (op, half_mode, 0);
19840 hi_half[num] = adjust_address (op, half_mode, byte);
19842 else
19844 lo_half[num] = simplify_gen_subreg (half_mode, op,
19845 GET_MODE (op) == VOIDmode
19846 ? mode : GET_MODE (op), 0);
19847 hi_half[num] = simplify_gen_subreg (half_mode, op,
19848 GET_MODE (op) == VOIDmode
19849 ? mode : GET_MODE (op), byte);
19854 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
19855 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
19856 is the expression of the binary operation. The output may either be
19857 emitted here, or returned to the caller, like all output_* functions.
19859 There is no guarantee that the operands are the same mode, as they
19860 might be within FLOAT or FLOAT_EXTEND expressions. */
19862 #ifndef SYSV386_COMPAT
19863 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
19864 wants to fix the assemblers because that causes incompatibility
19865 with gcc. No-one wants to fix gcc because that causes
19866 incompatibility with assemblers... You can use the option
19867 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
19868 #define SYSV386_COMPAT 1
19869 #endif
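/* A rough sketch of the returned templates (assuming the checks below pass):
   with AVX and SFmode SSE operands, the PLUS case yields
   "vaddss\t{%2, %1, %0|%0, %1, %2}"; for a plain x87 add with both operands
   on the register stack it starts from "fadd" and appends one of the
   {%y2, %0} / {%2, %0} operand forms chosen further down.  */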
19871 const char *
19872 output_387_binary_op (rtx_insn *insn, rtx *operands)
19874 static char buf[40];
19875 const char *p;
19876 const char *ssep;
19877 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
19879 /* Even if we do not want to check the inputs, this documents the input
19880 constraints, which helps in understanding the following code. */
19881 if (flag_checking)
19883 if (STACK_REG_P (operands[0])
19884 && ((REG_P (operands[1])
19885 && REGNO (operands[0]) == REGNO (operands[1])
19886 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
19887 || (REG_P (operands[2])
19888 && REGNO (operands[0]) == REGNO (operands[2])
19889 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
19890 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
19891 ; /* ok */
19892 else
19893 gcc_assert (is_sse);
19896 switch (GET_CODE (operands[3]))
19898 case PLUS:
19899 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19900 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19901 p = "fiadd";
19902 else
19903 p = "fadd";
19904 ssep = "vadd";
19905 break;
19907 case MINUS:
19908 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19909 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19910 p = "fisub";
19911 else
19912 p = "fsub";
19913 ssep = "vsub";
19914 break;
19916 case MULT:
19917 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19918 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19919 p = "fimul";
19920 else
19921 p = "fmul";
19922 ssep = "vmul";
19923 break;
19925 case DIV:
19926 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19927 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19928 p = "fidiv";
19929 else
19930 p = "fdiv";
19931 ssep = "vdiv";
19932 break;
19934 default:
19935 gcc_unreachable ();
19938 if (is_sse)
19940 if (TARGET_AVX)
19942 strcpy (buf, ssep);
19943 if (GET_MODE (operands[0]) == SFmode)
19944 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
19945 else
19946 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
19948 else
19950 strcpy (buf, ssep + 1);
19951 if (GET_MODE (operands[0]) == SFmode)
19952 strcat (buf, "ss\t{%2, %0|%0, %2}");
19953 else
19954 strcat (buf, "sd\t{%2, %0|%0, %2}");
19956 return buf;
19958 strcpy (buf, p);
19960 switch (GET_CODE (operands[3]))
19962 case MULT:
19963 case PLUS:
19964 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
19965 std::swap (operands[1], operands[2]);
19967 /* We know operands[0] == operands[1]. */
19969 if (MEM_P (operands[2]))
19971 p = "%Z2\t%2";
19972 break;
19975 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19977 if (STACK_TOP_P (operands[0]))
19978 /* How is it that we are storing to a dead operand[2]?
19979 Well, presumably operands[1] is dead too. We can't
19980 store the result to st(0) as st(0) gets popped on this
19981 instruction. Instead store to operands[2] (which I
19982 think has to be st(1)). st(1) will be popped later.
19983 gcc <= 2.8.1 didn't have this check and generated
19984 assembly code that the Unixware assembler rejected. */
19985 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
19986 else
19987 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
19988 break;
19991 if (STACK_TOP_P (operands[0]))
19992 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
19993 else
19994 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
19995 break;
19997 case MINUS:
19998 case DIV:
19999 if (MEM_P (operands[1]))
20001 p = "r%Z1\t%1";
20002 break;
20005 if (MEM_P (operands[2]))
20007 p = "%Z2\t%2";
20008 break;
20011 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
20013 #if SYSV386_COMPAT
20014 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
20015 derived assemblers, confusingly reverse the direction of
20016 the operation for fsub{r} and fdiv{r} when the
20017 destination register is not st(0). The Intel assembler
20018 doesn't have this brain damage. Read !SYSV386_COMPAT to
20019 figure out what the hardware really does. */
20020 if (STACK_TOP_P (operands[0]))
20021 p = "{p\t%0, %2|rp\t%2, %0}";
20022 else
20023 p = "{rp\t%2, %0|p\t%0, %2}";
20024 #else
20025 if (STACK_TOP_P (operands[0]))
20026 /* As above for fmul/fadd, we can't store to st(0). */
20027 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
20028 else
20029 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
20030 #endif
20031 break;
20034 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
20036 #if SYSV386_COMPAT
20037 if (STACK_TOP_P (operands[0]))
20038 p = "{rp\t%0, %1|p\t%1, %0}";
20039 else
20040 p = "{p\t%1, %0|rp\t%0, %1}";
20041 #else
20042 if (STACK_TOP_P (operands[0]))
20043 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
20044 else
20045 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
20046 #endif
20047 break;
20050 if (STACK_TOP_P (operands[0]))
20052 if (STACK_TOP_P (operands[1]))
20053 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
20054 else
20055 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
20056 break;
20058 else if (STACK_TOP_P (operands[1]))
20060 #if SYSV386_COMPAT
20061 p = "{\t%1, %0|r\t%0, %1}";
20062 #else
20063 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
20064 #endif
20066 else
20068 #if SYSV386_COMPAT
20069 p = "{r\t%2, %0|\t%0, %2}";
20070 #else
20071 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
20072 #endif
20074 break;
20076 default:
20077 gcc_unreachable ();
20080 strcat (buf, p);
20081 return buf;
20084 /* Return needed mode for entity in optimize_mode_switching pass. */
20086 static int
20087 ix86_dirflag_mode_needed (rtx_insn *insn)
20089 if (CALL_P (insn))
20091 if (cfun->machine->func_type == TYPE_NORMAL)
20092 return X86_DIRFLAG_ANY;
20093 else
20094 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
20095 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
20098 if (recog_memoized (insn) < 0)
20099 return X86_DIRFLAG_ANY;
20101 if (get_attr_type (insn) == TYPE_STR)
20103 /* Emit cld instruction if stringops are used in the function. */
20104 if (cfun->machine->func_type == TYPE_NORMAL)
20105 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
20106 else
20107 return X86_DIRFLAG_RESET;
20110 return X86_DIRFLAG_ANY;
20113 /* Check whether a 256bit AVX register is referenced inside EXP. */
20115 static bool
20116 ix86_check_avx256_register (const_rtx exp)
20118 if (SUBREG_P (exp))
20119 exp = SUBREG_REG (exp);
20121 return (REG_P (exp)
20122 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)));
20125 /* Return needed mode for entity in optimize_mode_switching pass. */
20127 static int
20128 ix86_avx_u128_mode_needed (rtx_insn *insn)
20130 if (CALL_P (insn))
20132 rtx link;
20134 /* Needed mode is set to AVX_U128_CLEAN if there are
20135 no 256bit modes used in function arguments. */
20136 for (link = CALL_INSN_FUNCTION_USAGE (insn);
20137 link;
20138 link = XEXP (link, 1))
20140 if (GET_CODE (XEXP (link, 0)) == USE)
20142 rtx arg = XEXP (XEXP (link, 0), 0);
20144 if (ix86_check_avx256_register (arg))
20145 return AVX_U128_DIRTY;
20149 return AVX_U128_CLEAN;
20152 /* Require DIRTY mode if a 256bit AVX register is referenced. The hardware
20153 changes state only when a 256bit register is written to, but we need
20154 to prevent the compiler from moving the optimal insertion point above
20155 an eventual read from a 256bit register. */
20156 subrtx_iterator::array_type array;
20157 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
20158 if (ix86_check_avx256_register (*iter))
20159 return AVX_U128_DIRTY;
20161 return AVX_U128_ANY;
20164 /* Return mode that i387 must be switched into
20165 prior to the execution of insn. */
20167 static int
20168 ix86_i387_mode_needed (int entity, rtx_insn *insn)
20170 enum attr_i387_cw mode;
20172 /* The mode UNINITIALIZED is used to store the control word after a
20173 function call or ASM pattern. The mode ANY specifies that the function
20174 has no requirements on the control word and makes no changes in the
20175 bits we are interested in. */
20177 if (CALL_P (insn)
20178 || (NONJUMP_INSN_P (insn)
20179 && (asm_noperands (PATTERN (insn)) >= 0
20180 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
20181 return I387_CW_UNINITIALIZED;
20183 if (recog_memoized (insn) < 0)
20184 return I387_CW_ANY;
20186 mode = get_attr_i387_cw (insn);
20188 switch (entity)
20190 case I387_TRUNC:
20191 if (mode == I387_CW_TRUNC)
20192 return mode;
20193 break;
20195 case I387_FLOOR:
20196 if (mode == I387_CW_FLOOR)
20197 return mode;
20198 break;
20200 case I387_CEIL:
20201 if (mode == I387_CW_CEIL)
20202 return mode;
20203 break;
20205 case I387_MASK_PM:
20206 if (mode == I387_CW_MASK_PM)
20207 return mode;
20208 break;
20210 default:
20211 gcc_unreachable ();
20214 return I387_CW_ANY;
20217 /* Return mode that entity must be switched into
20218 prior to the execution of insn. */
20220 static int
20221 ix86_mode_needed (int entity, rtx_insn *insn)
20223 switch (entity)
20225 case X86_DIRFLAG:
20226 return ix86_dirflag_mode_needed (insn);
20227 case AVX_U128:
20228 return ix86_avx_u128_mode_needed (insn);
20229 case I387_TRUNC:
20230 case I387_FLOOR:
20231 case I387_CEIL:
20232 case I387_MASK_PM:
20233 return ix86_i387_mode_needed (entity, insn);
20234 default:
20235 gcc_unreachable ();
20237 return 0;
20240 /* Check if a 256bit AVX register is referenced in stores. */
20242 static void
20243 ix86_check_avx256_stores (rtx dest, const_rtx, void *data)
20245 if (ix86_check_avx256_register (dest))
20247 bool *used = (bool *) data;
20248 *used = true;
20252 /* Calculate mode of upper 128bit AVX registers after the insn. */
20254 static int
20255 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
20257 rtx pat = PATTERN (insn);
20259 if (vzeroupper_operation (pat, VOIDmode)
20260 || vzeroall_operation (pat, VOIDmode))
20261 return AVX_U128_CLEAN;
20263 /* We know that the state is clean after a CALL insn if no
20264 256bit register is used for the function return value. */
20265 if (CALL_P (insn))
20267 bool avx_reg256_found = false;
20268 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
20270 return avx_reg256_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
20273 /* Otherwise, return current mode. Remember that if insn
20274 references AVX 256bit registers, the mode was already changed
20275 to DIRTY from MODE_NEEDED. */
20276 return mode;
20279 /* Return the mode that an insn results in. */
20281 static int
20282 ix86_mode_after (int entity, int mode, rtx_insn *insn)
20284 switch (entity)
20286 case X86_DIRFLAG:
20287 return mode;
20288 case AVX_U128:
20289 return ix86_avx_u128_mode_after (mode, insn);
20290 case I387_TRUNC:
20291 case I387_FLOOR:
20292 case I387_CEIL:
20293 case I387_MASK_PM:
20294 return mode;
20295 default:
20296 gcc_unreachable ();
20300 static int
20301 ix86_dirflag_mode_entry (void)
20303 /* For TARGET_CLD or in the interrupt handler we can't assume
20304 direction flag state at function entry. */
20305 if (TARGET_CLD
20306 || cfun->machine->func_type != TYPE_NORMAL)
20307 return X86_DIRFLAG_ANY;
20309 return X86_DIRFLAG_RESET;
20312 static int
20313 ix86_avx_u128_mode_entry (void)
20315 tree arg;
20317 /* Entry mode is set to AVX_U128_DIRTY if there are
20318 256bit modes used in function arguments. */
20319 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
20320 arg = TREE_CHAIN (arg))
20322 rtx incoming = DECL_INCOMING_RTL (arg);
20324 if (incoming && ix86_check_avx256_register (incoming))
20325 return AVX_U128_DIRTY;
20328 return AVX_U128_CLEAN;
20331 /* Return a mode that ENTITY is assumed to be
20332 switched to at function entry. */
20334 static int
20335 ix86_mode_entry (int entity)
20337 switch (entity)
20339 case X86_DIRFLAG:
20340 return ix86_dirflag_mode_entry ();
20341 case AVX_U128:
20342 return ix86_avx_u128_mode_entry ();
20343 case I387_TRUNC:
20344 case I387_FLOOR:
20345 case I387_CEIL:
20346 case I387_MASK_PM:
20347 return I387_CW_ANY;
20348 default:
20349 gcc_unreachable ();
20353 static int
20354 ix86_avx_u128_mode_exit (void)
20356 rtx reg = crtl->return_rtx;
20358 /* Exit mode is set to AVX_U128_DIRTY if there are
20359 256bit modes used in the function return register. */
20360 if (reg && ix86_check_avx256_register (reg))
20361 return AVX_U128_DIRTY;
20363 return AVX_U128_CLEAN;
20366 /* Return a mode that ENTITY is assumed to be
20367 switched to at function exit. */
20369 static int
20370 ix86_mode_exit (int entity)
20372 switch (entity)
20374 case X86_DIRFLAG:
20375 return X86_DIRFLAG_ANY;
20376 case AVX_U128:
20377 return ix86_avx_u128_mode_exit ();
20378 case I387_TRUNC:
20379 case I387_FLOOR:
20380 case I387_CEIL:
20381 case I387_MASK_PM:
20382 return I387_CW_ANY;
20383 default:
20384 gcc_unreachable ();
20388 static int
20389 ix86_mode_priority (int, int n)
20391 return n;
20394 /* Output code to initialize control word copies used by trunc?f?i and
20395 rounding patterns. MODE selects the required variant; the current
20396 control word is saved and an adjusted copy is stored in a stack slot. */
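/* In the x87 control word the rounding-control field is bits 10-11
   (mask 0x0c00): 00 = to nearest, 01 = down, 10 = up, 11 = toward zero
   (truncate).  Bit 5 (0x0020) masks the precision exception.  The
   constants used below set exactly these fields.  */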
20398 static void
20399 emit_i387_cw_initialization (int mode)
20401 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
20402 rtx new_mode;
20404 enum ix86_stack_slot slot;
20406 rtx reg = gen_reg_rtx (HImode);
20408 emit_insn (gen_x86_fnstcw_1 (stored_mode));
20409 emit_move_insn (reg, copy_rtx (stored_mode));
20411 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
20412 || optimize_insn_for_size_p ())
20414 switch (mode)
20416 case I387_CW_TRUNC:
20417 /* round toward zero (truncate) */
20418 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
20419 slot = SLOT_CW_TRUNC;
20420 break;
20422 case I387_CW_FLOOR:
20423 /* round down toward -oo */
20424 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
20425 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
20426 slot = SLOT_CW_FLOOR;
20427 break;
20429 case I387_CW_CEIL:
20430 /* round up toward +oo */
20431 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
20432 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
20433 slot = SLOT_CW_CEIL;
20434 break;
20436 case I387_CW_MASK_PM:
20437 /* mask precision exception for nearbyint() */
20438 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
20439 slot = SLOT_CW_MASK_PM;
20440 break;
20442 default:
20443 gcc_unreachable ();
20446 else
20448 switch (mode)
20450 case I387_CW_TRUNC:
20451 /* round toward zero (truncate) */
20452 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
20453 slot = SLOT_CW_TRUNC;
20454 break;
20456 case I387_CW_FLOOR:
20457 /* round down toward -oo */
20458 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
20459 slot = SLOT_CW_FLOOR;
20460 break;
20462 case I387_CW_CEIL:
20463 /* round up toward +oo */
20464 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
20465 slot = SLOT_CW_CEIL;
20466 break;
20468 case I387_CW_MASK_PM:
20469 /* mask precision exception for nearbyint() */
20470 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
20471 slot = SLOT_CW_MASK_PM;
20472 break;
20474 default:
20475 gcc_unreachable ();
20479 gcc_assert (slot < MAX_386_STACK_LOCALS);
20481 new_mode = assign_386_stack_local (HImode, slot);
20482 emit_move_insn (new_mode, reg);
20485 /* Emit vzeroupper. */
20487 void
20488 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
20490 int i;
20492 /* Cancel automatic vzeroupper insertion if there are
20493 live call-saved SSE registers at the insertion point. */
20495 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
20496 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
20497 return;
20499 if (TARGET_64BIT)
20500 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
20501 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
20502 return;
20504 emit_insn (gen_avx_vzeroupper ());
20509 /* Generate one or more insns to set ENTITY to MODE. HARD_REG_LIVE
20510 is the set of hard registers live at the point where the insn(s)
20511 are to be inserted. */
20513 static void
20514 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
20515 HARD_REG_SET regs_live)
20517 switch (entity)
20519 case X86_DIRFLAG:
20520 if (mode == X86_DIRFLAG_RESET)
20521 emit_insn (gen_cld ());
20522 break;
20523 case AVX_U128:
20524 if (mode == AVX_U128_CLEAN)
20525 ix86_avx_emit_vzeroupper (regs_live);
20526 break;
20527 case I387_TRUNC:
20528 case I387_FLOOR:
20529 case I387_CEIL:
20530 case I387_MASK_PM:
20531 if (mode != I387_CW_ANY
20532 && mode != I387_CW_UNINITIALIZED)
20533 emit_i387_cw_initialization (mode);
20534 break;
20535 default:
20536 gcc_unreachable ();
20540 /* Output code for INSN to convert a float to a signed int. OPERANDS
20541 are the insn operands. The output may be [HSD]Imode and the input
20542 operand may be [SDX]Fmode. */
20544 const char *
20545 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
20547 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
20548 int dimode_p = GET_MODE (operands[0]) == DImode;
20549 int round_mode = get_attr_i387_cw (insn);
20551 /* Jump through a hoop or two for DImode, since the hardware has no
20552 non-popping instruction. We used to do this a different way, but
20553 that was somewhat fragile and broke with post-reload splitters. */
20554 if ((dimode_p || fisttp) && !stack_top_dies)
20555 output_asm_insn ("fld\t%y1", operands);
20557 gcc_assert (STACK_TOP_P (operands[1]));
20558 gcc_assert (MEM_P (operands[0]));
20559 gcc_assert (GET_MODE (operands[1]) != TFmode);
20561 if (fisttp)
20562 output_asm_insn ("fisttp%Z0\t%0", operands);
20563 else
20565 if (round_mode != I387_CW_ANY)
20566 output_asm_insn ("fldcw\t%3", operands);
20567 if (stack_top_dies || dimode_p)
20568 output_asm_insn ("fistp%Z0\t%0", operands);
20569 else
20570 output_asm_insn ("fist%Z0\t%0", operands);
20571 if (round_mode != I387_CW_ANY)
20572 output_asm_insn ("fldcw\t%2", operands);
20575 return "";
20578 /* Output code for x87 ffreep insn. The OPNO argument, which may only
20579 have the values zero or one, indicates the ffreep insn's operand
20580 from the OPERANDS array. */
20582 static const char *
20583 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
20585 if (TARGET_USE_FFREEP)
20586 #ifdef HAVE_AS_IX86_FFREEP
20587 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
20588 #else
20590 static char retval[32];
20591 int regno = REGNO (operands[opno]);
20593 gcc_assert (STACK_REGNO_P (regno));
20595 regno -= FIRST_STACK_REG;
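/* Emit the two-byte ffreep encoding (0xdf, 0xc0 + regno) as a
   little-endian 16-bit word, for assemblers that do not know the
   ffreep mnemonic.  */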
20597 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
20598 return retval;
20600 #endif
20602 return opno ? "fstp\t%y1" : "fstp\t%y0";
20606 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
20607 should be used. UNORDERED_P is true when fucom should be used. */
20609 const char *
20610 output_fp_compare (rtx_insn *insn, rtx *operands, bool eflags_p, bool unordered_p)
20612 int stack_top_dies;
20613 rtx cmp_op0, cmp_op1;
20614 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
20616 if (eflags_p)
20618 cmp_op0 = operands[0];
20619 cmp_op1 = operands[1];
20621 else
20623 cmp_op0 = operands[1];
20624 cmp_op1 = operands[2];
20627 if (is_sse)
20629 if (GET_MODE (operands[0]) == SFmode)
20630 if (unordered_p)
20631 return "%vucomiss\t{%1, %0|%0, %1}";
20632 else
20633 return "%vcomiss\t{%1, %0|%0, %1}";
20634 else
20635 if (unordered_p)
20636 return "%vucomisd\t{%1, %0|%0, %1}";
20637 else
20638 return "%vcomisd\t{%1, %0|%0, %1}";
20641 gcc_assert (STACK_TOP_P (cmp_op0));
20643 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
20645 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
20647 if (stack_top_dies)
20649 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
20650 return output_387_ffreep (operands, 1);
20652 else
20653 return "ftst\n\tfnstsw\t%0";
20656 if (STACK_REG_P (cmp_op1)
20657 && stack_top_dies
20658 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
20659 && REGNO (cmp_op1) != FIRST_STACK_REG)
20661 /* If both the top of the 387 stack and the other operand (also a
20662 stack register) die, then this must be a `fcompp' float
20663 compare. */
20665 if (eflags_p)
20667 /* There is no double popping fcomi variant. Fortunately,
20668 eflags is immune from the fstp's cc clobbering. */
20669 if (unordered_p)
20670 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
20671 else
20672 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
20673 return output_387_ffreep (operands, 0);
20675 else
20677 if (unordered_p)
20678 return "fucompp\n\tfnstsw\t%0";
20679 else
20680 return "fcompp\n\tfnstsw\t%0";
20683 else
20685 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
20687 static const char * const alt[16] =
20689 "fcom%Z2\t%y2\n\tfnstsw\t%0",
20690 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
20691 "fucom%Z2\t%y2\n\tfnstsw\t%0",
20692 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
20694 "ficom%Z2\t%y2\n\tfnstsw\t%0",
20695 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
20696 NULL,
20697 NULL,
20699 "fcomi\t{%y1, %0|%0, %y1}",
20700 "fcomip\t{%y1, %0|%0, %y1}",
20701 "fucomi\t{%y1, %0|%0, %y1}",
20702 "fucomip\t{%y1, %0|%0, %y1}",
20704 NULL,
20705 NULL,
20706 NULL,
20707 NULL
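/* The NULL entries are combinations with no matching x87 instruction:
   ficom/ficomp have no unordered forms, and the fcomi family has no
   integer-operand variants.  */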
20710 int mask;
20711 const char *ret;
20713 mask = eflags_p << 3;
20714 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
20715 mask |= unordered_p << 1;
20716 mask |= stack_top_dies;
20718 gcc_assert (mask < 16);
20719 ret = alt[mask];
20720 gcc_assert (ret);
20722 return ret;
20726 void
20727 ix86_output_addr_vec_elt (FILE *file, int value)
20729 const char *directive = ASM_LONG;
20731 #ifdef ASM_QUAD
20732 if (TARGET_LP64)
20733 directive = ASM_QUAD;
20734 #else
20735 gcc_assert (!TARGET_64BIT);
20736 #endif
20738 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
20741 void
20742 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
20744 const char *directive = ASM_LONG;
20746 #ifdef ASM_QUAD
20747 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
20748 directive = ASM_QUAD;
20749 #else
20750 gcc_assert (!TARGET_64BIT);
20751 #endif
20752 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
20753 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
20754 fprintf (file, "%s%s%d-%s%d\n",
20755 directive, LPREFIX, value, LPREFIX, rel);
20756 else if (HAVE_AS_GOTOFF_IN_DATA)
20757 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
20758 #if TARGET_MACHO
20759 else if (TARGET_MACHO)
20761 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
20762 machopic_output_function_base_name (file);
20763 putc ('\n', file);
20765 #endif
20766 else
20767 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
20768 GOT_SYMBOL_NAME, LPREFIX, value);
20771 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
20772 for the target. */
20774 void
20775 ix86_expand_clear (rtx dest)
20777 rtx tmp;
20779 /* We play register width games, which are only valid after reload. */
20780 gcc_assert (reload_completed);
20782 /* Avoid HImode and its attendant prefix byte. */
20783 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
20784 dest = gen_rtx_REG (SImode, REGNO (dest));
20785 tmp = gen_rtx_SET (dest, const0_rtx);
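/* "xor reg, reg" clobbers the flags, so that form is wrapped in a
   PARALLEL with an explicit FLAGS_REG clobber; "mov $0, reg" leaves
   the flags untouched.  */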
20787 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
20789 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20790 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
20793 emit_insn (tmp);
20796 /* X is an unchanging MEM. If it is a constant pool reference, return
20797 the constant pool rtx, else NULL. */
20800 maybe_get_pool_constant (rtx x)
20802 x = ix86_delegitimize_address (XEXP (x, 0));
20804 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
20805 return get_pool_constant (x);
20807 return NULL_RTX;
20810 void
20811 ix86_expand_move (machine_mode mode, rtx operands[])
20813 rtx op0, op1;
20814 rtx tmp, addend = NULL_RTX;
20815 enum tls_model model;
20817 op0 = operands[0];
20818 op1 = operands[1];
20820 switch (GET_CODE (op1))
20822 case CONST:
20823 tmp = XEXP (op1, 0);
20825 if (GET_CODE (tmp) != PLUS
20826 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
20827 break;
20829 op1 = XEXP (tmp, 0);
20830 addend = XEXP (tmp, 1);
20831 /* FALLTHRU */
20833 case SYMBOL_REF:
20834 model = SYMBOL_REF_TLS_MODEL (op1);
20836 if (model)
20837 op1 = legitimize_tls_address (op1, model, true);
20838 else if (ix86_force_load_from_GOT_p (op1))
20840 /* Load the external function address via GOT slot to avoid PLT. */
20841 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
20842 (TARGET_64BIT
20843 ? UNSPEC_GOTPCREL
20844 : UNSPEC_GOT));
20845 op1 = gen_rtx_CONST (Pmode, op1);
20846 op1 = gen_const_mem (Pmode, op1);
20847 set_mem_alias_set (op1, ix86_GOT_alias_set ());
20849 else
20851 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
20852 if (tmp)
20854 op1 = tmp;
20855 if (!addend)
20856 break;
20858 else
20860 op1 = operands[1];
20861 break;
20865 if (addend)
20867 op1 = force_operand (op1, NULL_RTX);
20868 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
20869 op0, 1, OPTAB_DIRECT);
20871 else
20872 op1 = force_operand (op1, op0);
20874 if (op1 == op0)
20875 return;
20877 op1 = convert_to_mode (mode, op1, 1);
20879 default:
20880 break;
20883 if ((flag_pic || MACHOPIC_INDIRECT)
20884 && symbolic_operand (op1, mode))
20886 if (TARGET_MACHO && !TARGET_64BIT)
20888 #if TARGET_MACHO
20889 /* dynamic-no-pic */
20890 if (MACHOPIC_INDIRECT)
20892 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
20893 ? op0 : gen_reg_rtx (Pmode);
20894 op1 = machopic_indirect_data_reference (op1, temp);
20895 if (MACHOPIC_PURE)
20896 op1 = machopic_legitimize_pic_address (op1, mode,
20897 temp == op1 ? 0 : temp);
20899 if (op0 != op1 && GET_CODE (op0) != MEM)
20901 rtx insn = gen_rtx_SET (op0, op1);
20902 emit_insn (insn);
20903 return;
20905 if (GET_CODE (op0) == MEM)
20906 op1 = force_reg (Pmode, op1);
20907 else
20909 rtx temp = op0;
20910 if (GET_CODE (temp) != REG)
20911 temp = gen_reg_rtx (Pmode);
20912 temp = legitimize_pic_address (op1, temp);
20913 if (temp == op0)
20914 return;
20915 op1 = temp;
20917 /* dynamic-no-pic */
20918 #endif
20920 else
20922 if (MEM_P (op0))
20923 op1 = force_reg (mode, op1);
20924 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
20926 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
20927 op1 = legitimize_pic_address (op1, reg);
20928 if (op0 == op1)
20929 return;
20930 op1 = convert_to_mode (mode, op1, 1);
20934 else
20936 if (MEM_P (op0)
20937 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
20938 || !push_operand (op0, mode))
20939 && MEM_P (op1))
20940 op1 = force_reg (mode, op1);
20942 if (push_operand (op0, mode)
20943 && ! general_no_elim_operand (op1, mode))
20944 op1 = copy_to_mode_reg (mode, op1);
20946 /* Force large constants in 64bit compilation into register
20947 to get them CSEed. */
20948 if (can_create_pseudo_p ()
20949 && (mode == DImode) && TARGET_64BIT
20950 && immediate_operand (op1, mode)
20951 && !x86_64_zext_immediate_operand (op1, VOIDmode)
20952 && !register_operand (op0, mode)
20953 && optimize)
20954 op1 = copy_to_mode_reg (mode, op1);
20956 if (can_create_pseudo_p ()
20957 && CONST_DOUBLE_P (op1))
20959 /* If we are loading a floating point constant to a register,
20960 force the value to memory now, since we'll get better code
20961 out the back end. */
20963 op1 = validize_mem (force_const_mem (mode, op1));
20964 if (!register_operand (op0, mode))
20966 rtx temp = gen_reg_rtx (mode);
20967 emit_insn (gen_rtx_SET (temp, op1));
20968 emit_move_insn (op0, temp);
20969 return;
20974 emit_insn (gen_rtx_SET (op0, op1));
20977 void
20978 ix86_expand_vector_move (machine_mode mode, rtx operands[])
20980 rtx op0 = operands[0], op1 = operands[1];
20981 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
20982 psABI, since its biggest alignment is only 4 bytes. */
20983 unsigned int align = (TARGET_IAMCU
20984 ? GET_MODE_BITSIZE (mode)
20985 : GET_MODE_ALIGNMENT (mode));
20987 if (push_operand (op0, VOIDmode))
20988 op0 = emit_move_resolve_push (mode, op0);
20990 /* Force constants other than zero into memory. We do not know how
20991 the instructions used to build constants modify the upper 64 bits
20992 of the register, once we have that information we may be able
20993 to handle some of them more efficiently. */
20994 if (can_create_pseudo_p ()
20995 && (CONSTANT_P (op1)
20996 || (SUBREG_P (op1)
20997 && CONSTANT_P (SUBREG_REG (op1))))
20998 && ((register_operand (op0, mode)
20999 && !standard_sse_constant_p (op1, mode))
21000 /* ix86_expand_vector_move_misalign() does not like constants. */
21001 || (SSE_REG_MODE_P (mode)
21002 && MEM_P (op0)
21003 && MEM_ALIGN (op0) < align)))
21005 if (SUBREG_P (op1))
21007 machine_mode imode = GET_MODE (SUBREG_REG (op1));
21008 rtx r = force_const_mem (imode, SUBREG_REG (op1));
21009 if (r)
21010 r = validize_mem (r);
21011 else
21012 r = force_reg (imode, SUBREG_REG (op1));
21013 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
21015 else
21016 op1 = validize_mem (force_const_mem (mode, op1));
21019 /* We need to check memory alignment for SSE mode since attribute
21020 can make operands unaligned. */
21021 if (can_create_pseudo_p ()
21022 && SSE_REG_MODE_P (mode)
21023 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
21024 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
21026 rtx tmp[2];
21028 /* ix86_expand_vector_move_misalign() does not like both
21029 arguments in memory. */
21030 if (!register_operand (op0, mode)
21031 && !register_operand (op1, mode))
21032 op1 = force_reg (mode, op1);
21034 tmp[0] = op0; tmp[1] = op1;
21035 ix86_expand_vector_move_misalign (mode, tmp);
21036 return;
21039 /* Make operand1 a register if it isn't already. */
21040 if (can_create_pseudo_p ()
21041 && !register_operand (op0, mode)
21042 && !register_operand (op1, mode))
21044 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
21045 return;
21048 emit_insn (gen_rtx_SET (op0, op1));
21051 /* Split 32-byte AVX unaligned load and store if needed. */
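/* When the TARGET_AVX256_SPLIT_UNALIGNED_LOAD/STORE tuning flags are set,
   the unaligned 256bit access is performed as two 128bit halves: loads read
   both halves and combine them with a vec_concat, stores use two
   vextractf128 stores.  */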
21053 static void
21054 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
21056 rtx m;
21057 rtx (*extract) (rtx, rtx, rtx);
21058 machine_mode mode;
21060 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
21061 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
21063 emit_insn (gen_rtx_SET (op0, op1));
21064 return;
21067 rtx orig_op0 = NULL_RTX;
21068 mode = GET_MODE (op0);
21069 switch (GET_MODE_CLASS (mode))
21071 case MODE_VECTOR_INT:
21072 case MODE_INT:
21073 if (mode != V32QImode)
21075 if (!MEM_P (op0))
21077 orig_op0 = op0;
21078 op0 = gen_reg_rtx (V32QImode);
21080 else
21081 op0 = gen_lowpart (V32QImode, op0);
21082 op1 = gen_lowpart (V32QImode, op1);
21083 mode = V32QImode;
21085 break;
21086 case MODE_VECTOR_FLOAT:
21087 break;
21088 default:
21089 gcc_unreachable ();
21092 switch (mode)
21094 default:
21095 gcc_unreachable ();
21096 case E_V32QImode:
21097 extract = gen_avx_vextractf128v32qi;
21098 mode = V16QImode;
21099 break;
21100 case E_V8SFmode:
21101 extract = gen_avx_vextractf128v8sf;
21102 mode = V4SFmode;
21103 break;
21104 case E_V4DFmode:
21105 extract = gen_avx_vextractf128v4df;
21106 mode = V2DFmode;
21107 break;
21110 if (MEM_P (op1))
21112 rtx r = gen_reg_rtx (mode);
21113 m = adjust_address (op1, mode, 0);
21114 emit_move_insn (r, m);
21115 m = adjust_address (op1, mode, 16);
21116 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
21117 emit_move_insn (op0, r);
21119 else if (MEM_P (op0))
21121 m = adjust_address (op0, mode, 0);
21122 emit_insn (extract (m, op1, const0_rtx));
21123 m = adjust_address (op0, mode, 16);
21124 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
21126 else
21127 gcc_unreachable ();
21129 if (orig_op0)
21130 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
21133 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
21134 straight to ix86_expand_vector_move. */
21135 /* Code generation for scalar reg-reg moves of single and double precision data:
21136 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
21137 movaps reg, reg
21138 else
21139 movss reg, reg
21140 if (x86_sse_partial_reg_dependency == true)
21141 movapd reg, reg
21142 else
21143 movsd reg, reg
21145 Code generation for scalar loads of double precision data:
21146 if (x86_sse_split_regs == true)
21147 movlpd mem, reg (gas syntax)
21148 else
21149 movsd mem, reg
21151 Code generation for unaligned packed loads of single precision data
21152 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
21153 if (x86_sse_unaligned_move_optimal)
21154 movups mem, reg
21156 if (x86_sse_partial_reg_dependency == true)
21158 xorps reg, reg
21159 movlps mem, reg
21160 movhps mem+8, reg
21162 else
21164 movlps mem, reg
21165 movhps mem+8, reg
21168 Code generation for unaligned packed loads of double precision data
21169 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
21170 if (x86_sse_unaligned_move_optimal)
21171 movupd mem, reg
21173 if (x86_sse_split_regs == true)
21175 movlpd mem, reg
21176 movhpd mem+8, reg
21178 else
21180 movsd mem, reg
21181 movhpd mem+8, reg
21185 void
21186 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
21188 rtx op0, op1, m;
21190 op0 = operands[0];
21191 op1 = operands[1];
21193 /* Use unaligned load/store for AVX512 or when optimizing for size. */
21194 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
21196 emit_insn (gen_rtx_SET (op0, op1));
21197 return;
21200 if (TARGET_AVX)
21202 if (GET_MODE_SIZE (mode) == 32)
21203 ix86_avx256_split_vector_move_misalign (op0, op1);
21204 else
21205 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
21206 emit_insn (gen_rtx_SET (op0, op1));
21207 return;
21210 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
21211 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
21213 emit_insn (gen_rtx_SET (op0, op1));
21214 return;
21217 /* ??? If we have typed data, then it would appear that using
21218 movdqu is the only way to get unaligned data loaded with
21219 integer type. */
21220 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
21222 emit_insn (gen_rtx_SET (op0, op1));
21223 return;
21226 if (MEM_P (op1))
21228 if (TARGET_SSE2 && mode == V2DFmode)
21230 rtx zero;
21232 /* When SSE registers are split into halves, we can avoid
21233 writing to the top half twice. */
21234 if (TARGET_SSE_SPLIT_REGS)
21236 emit_clobber (op0);
21237 zero = op0;
21239 else
21241 /* ??? Not sure about the best option for the Intel chips.
21242 The following would seem to satisfy; the register is
21243 entirely cleared, breaking the dependency chain. We
21244 then store to the upper half, with a dependency depth
21245 of one. A rumor has it that Intel recommends two movsd
21246 followed by an unpacklpd, but this is unconfirmed. And
21247 given that the dependency depth of the unpacklpd would
21248 still be one, I'm not sure why this would be better. */
21249 zero = CONST0_RTX (V2DFmode);
21252 m = adjust_address (op1, DFmode, 0);
21253 emit_insn (gen_sse2_loadlpd (op0, zero, m));
21254 m = adjust_address (op1, DFmode, 8);
21255 emit_insn (gen_sse2_loadhpd (op0, op0, m));
21257 else
21259 rtx t;
21261 if (mode != V4SFmode)
21262 t = gen_reg_rtx (V4SFmode);
21263 else
21264 t = op0;
21266 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
21267 emit_move_insn (t, CONST0_RTX (V4SFmode));
21268 else
21269 emit_clobber (t);
21271 m = adjust_address (op1, V2SFmode, 0);
21272 emit_insn (gen_sse_loadlps (t, t, m));
21273 m = adjust_address (op1, V2SFmode, 8);
21274 emit_insn (gen_sse_loadhps (t, t, m));
21275 if (mode != V4SFmode)
21276 emit_move_insn (op0, gen_lowpart (mode, t));
21279 else if (MEM_P (op0))
21281 if (TARGET_SSE2 && mode == V2DFmode)
21283 m = adjust_address (op0, DFmode, 0);
21284 emit_insn (gen_sse2_storelpd (m, op1));
21285 m = adjust_address (op0, DFmode, 8);
21286 emit_insn (gen_sse2_storehpd (m, op1));
21288 else
21290 if (mode != V4SFmode)
21291 op1 = gen_lowpart (V4SFmode, op1);
21293 m = adjust_address (op0, V2SFmode, 0);
21294 emit_insn (gen_sse_storelps (m, op1));
21295 m = adjust_address (op0, V2SFmode, 8);
21296 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
21299 else
21300 gcc_unreachable ();
21303 /* Helper function of ix86_fixup_binary_operands to canonicalize
21304 operand order. Returns true if the operands should be swapped. */
21306 static bool
21307 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
21308 rtx operands[])
21310 rtx dst = operands[0];
21311 rtx src1 = operands[1];
21312 rtx src2 = operands[2];
21314 /* If the operation is not commutative, we can't do anything. */
21315 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
21316 return false;
21318 /* Highest priority is that src1 should match dst. */
21319 if (rtx_equal_p (dst, src1))
21320 return false;
21321 if (rtx_equal_p (dst, src2))
21322 return true;
21324 /* Next highest priority is that immediate constants come second. */
21325 if (immediate_operand (src2, mode))
21326 return false;
21327 if (immediate_operand (src1, mode))
21328 return true;
21330 /* Lowest priority is that memory references should come second. */
21331 if (MEM_P (src2))
21332 return false;
21333 if (MEM_P (src1))
21334 return true;
21336 return false;
21340 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
21341 destination to use for the operation. If different from the true
21342 destination in operands[0], a copy operation will be required. */
21345 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
21346 rtx operands[])
21348 rtx dst = operands[0];
21349 rtx src1 = operands[1];
21350 rtx src2 = operands[2];
21352 /* Canonicalize operand order. */
21353 if (ix86_swap_binary_operands_p (code, mode, operands))
21355 /* It is invalid to swap operands of different modes. */
21356 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
21358 std::swap (src1, src2);
21361 /* Both source operands cannot be in memory. */
21362 if (MEM_P (src1) && MEM_P (src2))
21364 /* Optimization: Only read from memory once. */
21365 if (rtx_equal_p (src1, src2))
21367 src2 = force_reg (mode, src2);
21368 src1 = src2;
21370 else if (rtx_equal_p (dst, src1))
21371 src2 = force_reg (mode, src2);
21372 else
21373 src1 = force_reg (mode, src1);
21376 /* If the destination is memory, and we do not have matching source
21377 operands, do things in registers. */
21378 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
21379 dst = gen_reg_rtx (mode);
21381 /* Source 1 cannot be a constant. */
21382 if (CONSTANT_P (src1))
21383 src1 = force_reg (mode, src1);
21385 /* Source 1 cannot be a non-matching memory. */
21386 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
21387 src1 = force_reg (mode, src1);
21389 /* Improve address combine. */
21390 if (code == PLUS
21391 && GET_MODE_CLASS (mode) == MODE_INT
21392 && MEM_P (src2))
21393 src2 = force_reg (mode, src2);
21395 operands[1] = src1;
21396 operands[2] = src2;
21397 return dst;
21400 /* Similarly, but assume that the destination has already been
21401 set up properly. */
21403 void
21404 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
21405 machine_mode mode, rtx operands[])
21407 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
21408 gcc_assert (dst == operands[0]);
21411 /* Attempt to expand a binary operator. Make the expansion closer to the
21412 actual machine than just general_operand, which will allow 3 separate
21413 memory references (one output, two inputs) in a single insn. */
21415 void
21416 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
21417 rtx operands[])
21419 rtx src1, src2, dst, op, clob;
21421 dst = ix86_fixup_binary_operands (code, mode, operands);
21422 src1 = operands[1];
21423 src2 = operands[2];
21425 /* Emit the instruction. */
21427 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
21429 if (reload_completed
21430 && code == PLUS
21431 && !rtx_equal_p (dst, src1))
21433 /* This is going to be an LEA; avoid splitting it later. */
21434 emit_insn (op);
21436 else
21438 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21439 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21442 /* Fix up the destination if needed. */
21443 if (dst != operands[0])
21444 emit_move_insn (operands[0], dst);
21447 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
21448 the given OPERANDS. */
21450 void
21451 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
21452 rtx operands[])
21454 rtx op1 = NULL_RTX, op2 = NULL_RTX;
21455 if (SUBREG_P (operands[1]))
21457 op1 = operands[1];
21458 op2 = operands[2];
21460 else if (SUBREG_P (operands[2]))
21462 op1 = operands[2];
21463 op2 = operands[1];
21465 /* Optimize (__m128i) d | (__m128i) e and similar code
21466 when d and e are float vectors into float vector logical
21467 insn. In C/C++ without using intrinsics there is no other way
21468 to express vector logical operation on float vectors than
21469 to cast them temporarily to integer vectors. */
21470 if (op1
21471 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
21472 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
21473 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
21474 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
21475 && SUBREG_BYTE (op1) == 0
21476 && (GET_CODE (op2) == CONST_VECTOR
21477 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
21478 && SUBREG_BYTE (op2) == 0))
21479 && can_create_pseudo_p ())
21481 rtx dst;
21482 switch (GET_MODE (SUBREG_REG (op1)))
21484 case E_V4SFmode:
21485 case E_V8SFmode:
21486 case E_V16SFmode:
21487 case E_V2DFmode:
21488 case E_V4DFmode:
21489 case E_V8DFmode:
21490 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
21491 if (GET_CODE (op2) == CONST_VECTOR)
21493 op2 = gen_lowpart (GET_MODE (dst), op2);
21494 op2 = force_reg (GET_MODE (dst), op2);
21496 else
21498 op1 = operands[1];
21499 op2 = SUBREG_REG (operands[2]);
21500 if (!vector_operand (op2, GET_MODE (dst)))
21501 op2 = force_reg (GET_MODE (dst), op2);
21503 op1 = SUBREG_REG (op1);
21504 if (!vector_operand (op1, GET_MODE (dst)))
21505 op1 = force_reg (GET_MODE (dst), op1);
21506 emit_insn (gen_rtx_SET (dst,
21507 gen_rtx_fmt_ee (code, GET_MODE (dst),
21508 op1, op2)));
21509 emit_move_insn (operands[0], gen_lowpart (mode, dst));
21510 return;
21511 default:
21512 break;
21515 if (!vector_operand (operands[1], mode))
21516 operands[1] = force_reg (mode, operands[1]);
21517 if (!vector_operand (operands[2], mode))
21518 operands[2] = force_reg (mode, operands[2]);
21519 ix86_fixup_binary_operands_no_copy (code, mode, operands);
21520 emit_insn (gen_rtx_SET (operands[0],
21521 gen_rtx_fmt_ee (code, mode, operands[1],
21522 operands[2])));
21525 /* Return TRUE or FALSE depending on whether the binary operator meets the
21526 appropriate constraints. */
21528 bool
21529 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
21530 rtx operands[3])
21532 rtx dst = operands[0];
21533 rtx src1 = operands[1];
21534 rtx src2 = operands[2];
21536 /* Both source operands cannot be in memory. */
21537 if (MEM_P (src1) && MEM_P (src2))
21538 return false;
21540 /* Canonicalize operand order for commutative operators. */
21541 if (ix86_swap_binary_operands_p (code, mode, operands))
21542 std::swap (src1, src2);
21544 /* If the destination is memory, we must have a matching source operand. */
21545 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
21546 return false;
21548 /* Source 1 cannot be a constant. */
21549 if (CONSTANT_P (src1))
21550 return false;
21552 /* Source 1 cannot be a non-matching memory. */
21553 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
21554 /* Support "andhi/andsi/anddi" as a zero-extending move. */
21555 return (code == AND
21556 && (mode == HImode
21557 || mode == SImode
21558 || (TARGET_64BIT && mode == DImode))
21559 && satisfies_constraint_L (src2));
21561 return true;
21564 /* Attempt to expand a unary operator. Make the expansion closer to the
21565 actual machine than just general_operand, which will allow 2 separate
21566 memory references (one output, one input) in a single insn. */
21568 void
21569 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
21570 rtx operands[])
21572 bool matching_memory = false;
21573 rtx src, dst, op, clob;
21575 dst = operands[0];
21576 src = operands[1];
21578 /* If the destination is memory, and we do not have matching source
21579 operands, do things in registers. */
21580 if (MEM_P (dst))
21582 if (rtx_equal_p (dst, src))
21583 matching_memory = true;
21584 else
21585 dst = gen_reg_rtx (mode);
21588 /* When source operand is memory, destination must match. */
21589 if (MEM_P (src) && !matching_memory)
21590 src = force_reg (mode, src);
21592 /* Emit the instruction. */
21594 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
21596 if (code == NOT)
21597 emit_insn (op);
21598 else
21600 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21601 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21604 /* Fix up the destination if needed. */
21605 if (dst != operands[0])
21606 emit_move_insn (operands[0], dst);
21609 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
21610 divisor are within the range [0-255]. */
21612 void
21613 ix86_split_idivmod (machine_mode mode, rtx operands[],
21614 bool signed_p)
21616 rtx_code_label *end_label, *qimode_label;
21617 rtx div, mod;
21618 rtx_insn *insn;
21619 rtx scratch, tmp0, tmp1, tmp2;
21620 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
21621 rtx (*gen_zero_extend) (rtx, rtx);
21622 rtx (*gen_test_ccno_1) (rtx, rtx);
21624 switch (mode)
21626 case E_SImode:
21627 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
21628 gen_test_ccno_1 = gen_testsi_ccno_1;
21629 gen_zero_extend = gen_zero_extendqisi2;
21630 break;
21631 case E_DImode:
21632 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
21633 gen_test_ccno_1 = gen_testdi_ccno_1;
21634 gen_zero_extend = gen_zero_extendqidi2;
21635 break;
21636 default:
21637 gcc_unreachable ();
21640 end_label = gen_label_rtx ();
21641 qimode_label = gen_label_rtx ();
21643 scratch = gen_reg_rtx (mode);
21645 /* Use 8bit unsigned divmod if dividend and divisor are within
21646 the range [0-255]. */
21647 emit_move_insn (scratch, operands[2]);
21648 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
21649 scratch, 1, OPTAB_DIRECT);
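/* Testing (operands[2] | operands[3]) against -0x100 (~0xff) sets ZF only
   when no bit above bit 7 is set in either value, i.e. when both fit in
   8 bits.  */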
21650 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
21651 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
21652 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
21653 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
21654 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
21655 pc_rtx);
21656 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
21657 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21658 JUMP_LABEL (insn) = qimode_label;
21660 /* Generate original signed/unsigned divmod. */
21661 div = gen_divmod4_1 (operands[0], operands[1],
21662 operands[2], operands[3]);
21663 emit_insn (div);
21665 /* Branch to the end. */
21666 emit_jump_insn (gen_jump (end_label));
21667 emit_barrier ();
21669 /* Generate 8bit unsigned divide. */
21670 emit_label (qimode_label);
21671 /* Don't use operands[0] for result of 8bit divide since not all
21672 registers support QImode ZERO_EXTRACT. */
21673 tmp0 = lowpart_subreg (HImode, scratch, mode);
21674 tmp1 = lowpart_subreg (HImode, operands[2], mode);
21675 tmp2 = lowpart_subreg (QImode, operands[3], mode);
21676 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
21678 if (signed_p)
21680 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
21681 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
21683 else
21685 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
21686 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
21689 /* Extract remainder from AH. */
21690 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
21691 if (REG_P (operands[1]))
21692 insn = emit_move_insn (operands[1], tmp1);
21693 else
21695 /* Need a new scratch register since the old one has result
21696 of 8bit divide. */
21697 scratch = gen_reg_rtx (mode);
21698 emit_move_insn (scratch, tmp1);
21699 insn = emit_move_insn (operands[1], scratch);
21701 set_unique_reg_note (insn, REG_EQUAL, mod);
21703 /* Zero extend quotient from AL. */
21704 tmp1 = gen_lowpart (QImode, tmp0);
21705 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
21706 set_unique_reg_note (insn, REG_EQUAL, div);
21708 emit_label (end_label);
21711 #define LEA_MAX_STALL (3)
21712 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
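/* LEA_MAX_STALL is the assumed AGU stall in full cycles; the distances
   computed below are kept in half-cycles, so the search window is twice
   that value.  */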
21714 /* Increase given DISTANCE in half-cycles according to
21715 dependencies between PREV and NEXT instructions.
21716 Add 1 half-cycle if there is no dependency and
21717 go to the next cycle if there is a dependency. */
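/* (distance & 1) rounds an odd half-cycle count up to the next full-cycle
   boundary before the extra full cycle (+2 half-cycles) is added for a
   dependent pair; independent insns just add one half-cycle.  */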
21719 static unsigned int
21720 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
21722 df_ref def, use;
21724 if (!prev || !next)
21725 return distance + (distance & 1) + 2;
21727 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
21728 return distance + 1;
21730 FOR_EACH_INSN_USE (use, next)
21731 FOR_EACH_INSN_DEF (def, prev)
21732 if (!DF_REF_IS_ARTIFICIAL (def)
21733 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
21734 return distance + (distance & 1) + 2;
21736 return distance + 1;
21739 /* Return true if instruction INSN defines register number
21740 REGNO1 or REGNO2. */
21742 static bool
21743 insn_defines_reg (unsigned int regno1, unsigned int regno2,
21744 rtx_insn *insn)
21746 df_ref def;
21748 FOR_EACH_INSN_DEF (def, insn)
21749 if (DF_REF_REG_DEF_P (def)
21750 && !DF_REF_IS_ARTIFICIAL (def)
21751 && (regno1 == DF_REF_REGNO (def)
21752 || regno2 == DF_REF_REGNO (def)))
21753 return true;
21755 return false;
21758 /* Return true if instruction INSN uses register number
21759 REGNO as part of an address expression. */
21761 static bool
21762 insn_uses_reg_mem (unsigned int regno, rtx insn)
21764 df_ref use;
21766 FOR_EACH_INSN_USE (use, insn)
21767 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
21768 return true;
21770 return false;
21773 /* Search backward for non-agu definition of register number REGNO1
21774 or register number REGNO2 in basic block starting from instruction
21775 START up to head of basic block or instruction INSN.
21777 Function puts true value into *FOUND var if definition was found
21778 and false otherwise.
21780 Distance in half-cycles between START and found instruction or head
21781 of BB is added to DISTANCE and returned. */
21783 static int
21784 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
21785 rtx_insn *insn, int distance,
21786 rtx_insn *start, bool *found)
21788 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
21789 rtx_insn *prev = start;
21790 rtx_insn *next = NULL;
21792 *found = false;
21794 while (prev
21795 && prev != insn
21796 && distance < LEA_SEARCH_THRESHOLD)
21798 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
21800 distance = increase_distance (prev, next, distance);
21801 if (insn_defines_reg (regno1, regno2, prev))
21803 if (recog_memoized (prev) < 0
21804 || get_attr_type (prev) != TYPE_LEA)
21806 *found = true;
21807 return distance;
21811 next = prev;
21813 if (prev == BB_HEAD (bb))
21814 break;
21816 prev = PREV_INSN (prev);
21819 return distance;
21822 /* Search backward for non-agu definition of register number REGNO1
21823 or register number REGNO2 in INSN's basic block until
21824 1. Pass LEA_SEARCH_THRESHOLD instructions, or
21825 2. Reach neighbor BBs boundary, or
21826 3. Reach agu definition.
21827 Returns the distance between the non-agu definition point and INSN.
21828 If no definition point, returns -1. */
21830 static int
21831 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
21832 rtx_insn *insn)
21834 basic_block bb = BLOCK_FOR_INSN (insn);
21835 int distance = 0;
21836 bool found = false;
21838 if (insn != BB_HEAD (bb))
21839 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
21840 distance, PREV_INSN (insn),
21841 &found);
21843 if (!found && distance < LEA_SEARCH_THRESHOLD)
21845 edge e;
21846 edge_iterator ei;
21847 bool simple_loop = false;
21849 FOR_EACH_EDGE (e, ei, bb->preds)
21850 if (e->src == bb)
21852 simple_loop = true;
21853 break;
21856 if (simple_loop)
21857 distance = distance_non_agu_define_in_bb (regno1, regno2,
21858 insn, distance,
21859 BB_END (bb), &found);
21860 else
21862 int shortest_dist = -1;
21863 bool found_in_bb = false;
21865 FOR_EACH_EDGE (e, ei, bb->preds)
21867 int bb_dist
21868 = distance_non_agu_define_in_bb (regno1, regno2,
21869 insn, distance,
21870 BB_END (e->src),
21871 &found_in_bb);
21872 if (found_in_bb)
21874 if (shortest_dist < 0)
21875 shortest_dist = bb_dist;
21876 else if (bb_dist > 0)
21877 shortest_dist = MIN (bb_dist, shortest_dist);
21879 found = true;
21883 distance = shortest_dist;
21887 /* get_attr_type may modify recog data. We want to make sure
21888 that recog data is valid for instruction INSN, on which
21889 distance_non_agu_define is called. INSN is unchanged here. */
21890 extract_insn_cached (insn);
21892 if (!found)
21893 return -1;
21895 return distance >> 1;
21898 /* Return the distance in half-cycles between INSN and the next
21899 insn that uses register number REGNO in a memory address, added
21900 to DISTANCE. Return -1 if REGNO is set.
21902 Put true value into *FOUND if register usage was found and
21903 false otherwise.
21904 Put true value into *REDEFINED if register redefinition was
21905 found and false otherwise. */
21907 static int
21908 distance_agu_use_in_bb (unsigned int regno,
21909 rtx_insn *insn, int distance, rtx_insn *start,
21910 bool *found, bool *redefined)
21912 basic_block bb = NULL;
21913 rtx_insn *next = start;
21914 rtx_insn *prev = NULL;
21916 *found = false;
21917 *redefined = false;
21919 if (start != NULL_RTX)
21921 bb = BLOCK_FOR_INSN (start);
21922 if (start != BB_HEAD (bb))
21923 /* If insn and start belong to the same bb, set prev to insn,
21924 so the call to increase_distance will increase the distance
21925 between insns by 1. */
21926 prev = insn;
21929 while (next
21930 && next != insn
21931 && distance < LEA_SEARCH_THRESHOLD)
21933 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
21935 distance = increase_distance(prev, next, distance);
21936 if (insn_uses_reg_mem (regno, next))
21938 /* Return DISTANCE if OP0 is used in memory
21939 address in NEXT. */
21940 *found = true;
21941 return distance;
21944 if (insn_defines_reg (regno, INVALID_REGNUM, next))
21946 /* Return -1 if OP0 is set in NEXT. */
21947 *redefined = true;
21948 return -1;
21951 prev = next;
21954 if (next == BB_END (bb))
21955 break;
21957 next = NEXT_INSN (next);
21960 return distance;
21963 /* Return the distance between INSN and the next insn that uses
21964 register number REGNO0 in a memory address. Return -1 if no such
21965 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
21967 static int
21968 distance_agu_use (unsigned int regno0, rtx_insn *insn)
21970 basic_block bb = BLOCK_FOR_INSN (insn);
21971 int distance = 0;
21972 bool found = false;
21973 bool redefined = false;
21975 if (insn != BB_END (bb))
21976 distance = distance_agu_use_in_bb (regno0, insn, distance,
21977 NEXT_INSN (insn),
21978 &found, &redefined);
21980 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
21982 edge e;
21983 edge_iterator ei;
21984 bool simple_loop = false;
21986 FOR_EACH_EDGE (e, ei, bb->succs)
21987 if (e->dest == bb)
21989 simple_loop = true;
21990 break;
21993 if (simple_loop)
21994 distance = distance_agu_use_in_bb (regno0, insn,
21995 distance, BB_HEAD (bb),
21996 &found, &redefined);
21997 else
21999 int shortest_dist = -1;
22000 bool found_in_bb = false;
22001 bool redefined_in_bb = false;
22003 FOR_EACH_EDGE (e, ei, bb->succs)
22005 int bb_dist
22006 = distance_agu_use_in_bb (regno0, insn,
22007 distance, BB_HEAD (e->dest),
22008 &found_in_bb, &redefined_in_bb);
22009 if (found_in_bb)
22011 if (shortest_dist < 0)
22012 shortest_dist = bb_dist;
22013 else if (bb_dist > 0)
22014 shortest_dist = MIN (bb_dist, shortest_dist);
22016 found = true;
22020 distance = shortest_dist;
22024 if (!found || redefined)
22025 return -1;
22027 return distance >> 1;
22030 /* Define this macro to tune LEA priority vs ADD; it takes effect when
22031 there is a dilemma of choosing LEA or ADD.
22032 Negative value: ADD is preferred over LEA
22033 Zero: Neutral
22034 Positive value: LEA is preferred over ADD. */
22035 #define IX86_LEA_PRIORITY 0
22037 /* Return true if use of lea INSN has a performance advantage
22038 over a sequence of instructions. The instruction sequence has
22039 SPLIT_COST cycles higher latency than the lea latency. */
22041 static bool
22042 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
22043 unsigned int regno2, int split_cost, bool has_scale)
22045 int dist_define, dist_use;
22047 /* For Silvermont, if a 2-source or 3-source LEA is used for a
22048 non-destructive destination, or for the ability to use a
22049 SCALE, the use of LEA is justified. */
22050 if (TARGET_SILVERMONT || TARGET_INTEL)
22052 if (has_scale)
22053 return true;
22054 if (split_cost < 1)
22055 return false;
22056 if (regno0 == regno1 || regno0 == regno2)
22057 return false;
22058 return true;
22061 dist_define = distance_non_agu_define (regno1, regno2, insn);
22062 dist_use = distance_agu_use (regno0, insn);
22064 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
22066 /* If there is no non-AGU operand definition, no AGU
22067 operand usage and the split cost is 0, then both the lea
22068 and non-lea variants have the same priority. Currently
22069 we prefer lea for 64-bit code and non-lea on 32-bit
22070 code. */
22071 if (dist_use < 0 && split_cost == 0)
22072 return TARGET_64BIT || IX86_LEA_PRIORITY;
22073 else
22074 return true;
22077 /* With a longer definition distance, lea is preferable.
22078 Here we change it to take into account the splitting cost and
22079 lea priority. */
22080 dist_define += split_cost + IX86_LEA_PRIORITY;
22082 /* If there is no use in a memory address then we just check
22083 that the split cost exceeds the AGU stall. */
22084 if (dist_use < 0)
22085 return dist_define > LEA_MAX_STALL;
22087 /* If this insn has both backward non-agu dependence and forward
22088 agu dependence, the one with short distance takes effect. */
22089 return dist_define >= dist_use;
22092 /* Return true if it is legal to clobber flags by INSN and
22093 false otherwise. */
22095 static bool
22096 ix86_ok_to_clobber_flags (rtx_insn *insn)
22098 basic_block bb = BLOCK_FOR_INSN (insn);
22099 df_ref use;
22100 bitmap live;
22102 while (insn)
22104 if (NONDEBUG_INSN_P (insn))
22106 FOR_EACH_INSN_USE (use, insn)
22107 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
22108 return false;
22110 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
22111 return true;
22114 if (insn == BB_END (bb))
22115 break;
22117 insn = NEXT_INSN (insn);
22120 live = df_get_live_out(bb);
22121 return !REGNO_REG_SET_P (live, FLAGS_REG);
22124 /* Return true if we need to split op0 = op1 + op2 into a sequence of
22125 move and add to avoid AGU stalls. */
22127 bool
22128 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
22130 unsigned int regno0, regno1, regno2;
22132 /* Check if we need to optimize. */
22133 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
22134 return false;
22136 /* Check it is correct to split here. */
22137 if (!ix86_ok_to_clobber_flags(insn))
22138 return false;
22140 regno0 = true_regnum (operands[0]);
22141 regno1 = true_regnum (operands[1]);
22142 regno2 = true_regnum (operands[2]);
22144 /* We need to split only adds with a non-destructive
22145 destination operand. */
22146 if (regno0 == regno1 || regno0 == regno2)
22147 return false;
22148 else
22149 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
22152 /* Return true if we should emit lea instruction instead of mov
22153 instruction. */
22155 bool
22156 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
22158 unsigned int regno0, regno1;
22160 /* Check if we need to optimize. */
22161 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
22162 return false;
22164 /* Use lea for reg to reg moves only. */
22165 if (!REG_P (operands[0]) || !REG_P (operands[1]))
22166 return false;
22168 regno0 = true_regnum (operands[0]);
22169 regno1 = true_regnum (operands[1]);
22171 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
22174 /* Return true if we need to split lea into a sequence of
22175 instructions to avoid AGU stalls. */
22177 bool
22178 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
22180 unsigned int regno0, regno1, regno2;
22181 int split_cost;
22182 struct ix86_address parts;
22183 int ok;
22185 /* Check we need to optimize. */
22186 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
22187 return false;
22189 /* The "at least two components" test below might not catch simple
22190 move or zero extension insns if parts.base is non-NULL and parts.disp
22191 is const0_rtx as the only components in the address, e.g. if the
22192 register is %rbp or %r13. As this test is much cheaper and moves or
22193 zero extensions are the common case, do this check first. */
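/* (%rbp and %r13 cannot be encoded as a base register without a
   displacement byte, which is presumably why a plain register address
   involving them decomposes as base plus a zero displacement.)  */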
22194 if (REG_P (operands[1])
22195 || (SImode_address_operand (operands[1], VOIDmode)
22196 && REG_P (XEXP (operands[1], 0))))
22197 return false;
22199 /* Check if it is OK to split here. */
22200 if (!ix86_ok_to_clobber_flags (insn))
22201 return false;
22203 ok = ix86_decompose_address (operands[1], &parts);
22204 gcc_assert (ok);
22206 /* There should be at least two components in the address. */
22207 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
22208 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
22209 return false;
22211 /* We should not split into add if a non-legitimate PIC
22212 operand is used as the displacement. */
22213 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
22214 return false;
22216 regno0 = true_regnum (operands[0]);
22217 regno1 = INVALID_REGNUM;
22218 regno2 = INVALID_REGNUM;
22220 if (parts.base)
22221 regno1 = true_regnum (parts.base);
22222 if (parts.index)
22223 regno2 = true_regnum (parts.index);
22225 split_cost = 0;
22227 /* Compute how many cycles we will add to the execution time
22228 if we split the lea into a sequence of instructions. */
22229 if (parts.base || parts.index)
22231 /* Have to use a mov instruction if the non-destructive
22232 destination form is used. */
22233 if (regno1 != regno0 && regno2 != regno0)
22234 split_cost += 1;
22236 /* Have to add index to base if both exist. */
22237 if (parts.base && parts.index)
22238 split_cost += 1;
22240 /* Have to use shift and adds if scale is 2 or greater. */
22241 if (parts.scale > 1)
22243 if (regno0 != regno1)
22244 split_cost += 1;
22245 else if (regno2 == regno0)
22246 split_cost += 4;
22247 else
22248 split_cost += parts.scale;
22251 /* Have to use add instruction with immediate if
22252 disp is non zero. */
22253 if (parts.disp && parts.disp != const0_rtx)
22254 split_cost += 1;
22256 /* Subtract the price of lea. */
22257 split_cost -= 1;
22260 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
22261 parts.scale > 1);
22264 /* Emit x86 binary operand CODE in mode MODE, where the first operand
22265 matches destination. RTX includes clobber of FLAGS_REG. */
22267 static void
22268 ix86_emit_binop (enum rtx_code code, machine_mode mode,
22269 rtx dst, rtx src)
22271 rtx op, clob;
22273 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
22274 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
22276 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
22279 /* Return true if regno1 def is nearest to the insn. */
22281 static bool
22282 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
22284 rtx_insn *prev = insn;
22285 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
22287 if (insn == start)
22288 return false;
22289 while (prev && prev != start)
22291 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
22293 prev = PREV_INSN (prev);
22294 continue;
22296 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
22297 return true;
22298 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
22299 return false;
22300 prev = PREV_INSN (prev);
22303 /* None of the regs is defined in the bb. */
22304 return false;
22307 /* Split lea instructions into a sequence of instructions
22308 which are executed on ALU to avoid AGU stalls.
22309 It is assumed that it is allowed to clobber flags register
22310 at lea position. */
22312 void
22313 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
22315 unsigned int regno0, regno1, regno2;
22316 struct ix86_address parts;
22317 rtx target, tmp;
22318 int ok, adds;
22320 ok = ix86_decompose_address (operands[1], &parts);
22321 gcc_assert (ok);
22323 target = gen_lowpart (mode, operands[0]);
22325 regno0 = true_regnum (target);
22326 regno1 = INVALID_REGNUM;
22327 regno2 = INVALID_REGNUM;
22329 if (parts.base)
22331 parts.base = gen_lowpart (mode, parts.base);
22332 regno1 = true_regnum (parts.base);
22335 if (parts.index)
22337 parts.index = gen_lowpart (mode, parts.index);
22338 regno2 = true_regnum (parts.index);
22341 if (parts.disp)
22342 parts.disp = gen_lowpart (mode, parts.disp);
22344 if (parts.scale > 1)
22346 /* Case r1 = r1 + ... */
22347 if (regno1 == regno0)
22349 /* If we have the case r1 = r1 + C * r2 then we would
22350 have to use multiplication, which is very
22351 expensive. Assume the cost model is wrong if we
22352 end up with such a case here. */
22353 gcc_assert (regno2 != regno0);
22355 for (adds = parts.scale; adds > 0; adds--)
22356 ix86_emit_binop (PLUS, mode, target, parts.index);
22358 else
22360 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
22361 if (regno0 != regno2)
22362 emit_insn (gen_rtx_SET (target, parts.index));
22364 /* Use shift for scaling. */
22365 ix86_emit_binop (ASHIFT, mode, target,
22366 GEN_INT (exact_log2 (parts.scale)));
22368 if (parts.base)
22369 ix86_emit_binop (PLUS, mode, target, parts.base);
22371 if (parts.disp && parts.disp != const0_rtx)
22372 ix86_emit_binop (PLUS, mode, target, parts.disp);
22375 else if (!parts.base && !parts.index)
22377 gcc_assert (parts.disp);
22378 emit_insn (gen_rtx_SET (target, parts.disp));
22380 else
22382 if (!parts.base)
22384 if (regno0 != regno2)
22385 emit_insn (gen_rtx_SET (target, parts.index));
22387 else if (!parts.index)
22389 if (regno0 != regno1)
22390 emit_insn (gen_rtx_SET (target, parts.base));
22392 else
22394 if (regno0 == regno1)
22395 tmp = parts.index;
22396 else if (regno0 == regno2)
22397 tmp = parts.base;
22398 else
22400 rtx tmp1;
22402 /* Find better operand for SET instruction, depending
22403 on which definition is farther from the insn. */
22404 if (find_nearest_reg_def (insn, regno1, regno2))
22405 tmp = parts.index, tmp1 = parts.base;
22406 else
22407 tmp = parts.base, tmp1 = parts.index;
22409 emit_insn (gen_rtx_SET (target, tmp));
22411 if (parts.disp && parts.disp != const0_rtx)
22412 ix86_emit_binop (PLUS, mode, target, parts.disp);
22414 ix86_emit_binop (PLUS, mode, target, tmp1);
22415 return;
22418 ix86_emit_binop (PLUS, mode, target, tmp);
22421 if (parts.disp && parts.disp != const0_rtx)
22422 ix86_emit_binop (PLUS, mode, target, parts.disp);
22426 /* Return true if it is ok to optimize an ADD operation to LEA
22427 operation to avoid flag register consumption. For most processors,
22428 ADD is faster than LEA. For processors like BONNELL, if the
22429 destination register of LEA holds an actual address which will be
22430 used soon, LEA is better and otherwise ADD is better. */
22432 bool
22433 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
22435 unsigned int regno0 = true_regnum (operands[0]);
22436 unsigned int regno1 = true_regnum (operands[1]);
22437 unsigned int regno2 = true_regnum (operands[2]);
22439 /* If a = b + c, (a!=b && a!=c), must use lea form. */
22440 if (regno0 != regno1 && regno0 != regno2)
22441 return true;
22443 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
22444 return false;
22446 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
22449 /* Return true if destination reg of SET_BODY is shift count of
22450 USE_BODY. */
22452 static bool
22453 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
22455 rtx set_dest;
22456 rtx shift_rtx;
22457 int i;
22459 /* Retrieve destination of SET_BODY. */
22460 switch (GET_CODE (set_body))
22462 case SET:
22463 set_dest = SET_DEST (set_body);
22464 if (!set_dest || !REG_P (set_dest))
22465 return false;
22466 break;
22467 case PARALLEL:
22468 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
22469 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
22470 use_body))
22471 return true;
22472 /* FALLTHROUGH */
22473 default:
22474 return false;
22477 /* Retrieve shift count of USE_BODY. */
22478 switch (GET_CODE (use_body))
22480 case SET:
22481 shift_rtx = XEXP (use_body, 1);
22482 break;
22483 case PARALLEL:
22484 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
22485 if (ix86_dep_by_shift_count_body (set_body,
22486 XVECEXP (use_body, 0, i)))
22487 return true;
22488 /* FALLTHROUGH */
22489 default:
22490 return false;
22493 if (shift_rtx
22494 && (GET_CODE (shift_rtx) == ASHIFT
22495 || GET_CODE (shift_rtx) == LSHIFTRT
22496 || GET_CODE (shift_rtx) == ASHIFTRT
22497 || GET_CODE (shift_rtx) == ROTATE
22498 || GET_CODE (shift_rtx) == ROTATERT))
22500 rtx shift_count = XEXP (shift_rtx, 1);
22502 /* Return true if shift count is dest of SET_BODY. */
22503 if (REG_P (shift_count))
22505 /* Add this check since the function can be invoked before register
22506 allocation by the pre-reload scheduler. */
22507 if (reload_completed
22508 && true_regnum (set_dest) == true_regnum (shift_count))
22509 return true;
22510 else if (REGNO (set_dest) == REGNO (shift_count))
22511 return true;
22515 return false;
22518 /* Return true if destination reg of SET_INSN is shift count of
22519 USE_INSN. */
22521 bool
22522 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
22524 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
22525 PATTERN (use_insn));
22528 /* Return TRUE or FALSE depending on whether the unary operator meets the
22529 appropriate constraints. */
22531 bool
22532 ix86_unary_operator_ok (enum rtx_code,
22533 machine_mode,
22534 rtx operands[2])
22536 /* If one of the operands is memory, source and destination must match. */
22537 if ((MEM_P (operands[0])
22538 || MEM_P (operands[1]))
22539 && ! rtx_equal_p (operands[0], operands[1]))
22540 return false;
22541 return true;
22544 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
22545 are ok, keeping in mind the possible movddup alternative. */
22547 bool
22548 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
22550 if (MEM_P (operands[0]))
22551 return rtx_equal_p (operands[0], operands[1 + high]);
22552 if (MEM_P (operands[1]) && MEM_P (operands[2]))
22553 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
22554 return true;
22557 /* Post-reload splitter for converting an SF or DFmode value in an
22558 SSE register into an unsigned SImode. */
22560 void
22561 ix86_split_convert_uns_si_sse (rtx operands[])
22563 machine_mode vecmode;
22564 rtx value, large, zero_or_two31, input, two31, x;
22566 large = operands[1];
22567 zero_or_two31 = operands[2];
22568 input = operands[3];
22569 two31 = operands[4];
22570 vecmode = GET_MODE (large);
22571 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
22573 /* Load up the value into the low element. We must ensure that the other
22574 elements are valid floats -- zero is the easiest such value. */
22575 if (MEM_P (input))
22577 if (vecmode == V4SFmode)
22578 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
22579 else
22580 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
22582 else
22584 input = gen_rtx_REG (vecmode, REGNO (input));
22585 emit_move_insn (value, CONST0_RTX (vecmode));
22586 if (vecmode == V4SFmode)
22587 emit_insn (gen_sse_movss (value, value, input));
22588 else
22589 emit_insn (gen_sse2_movsd (value, value, input));
22592 emit_move_insn (large, two31);
22593 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
22595 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
22596 emit_insn (gen_rtx_SET (large, x));
22598 x = gen_rtx_AND (vecmode, zero_or_two31, large);
22599 emit_insn (gen_rtx_SET (zero_or_two31, x));
22601 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
22602 emit_insn (gen_rtx_SET (value, x));
22604 large = gen_rtx_REG (V4SImode, REGNO (large));
22605 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
22607 x = gen_rtx_REG (V4SImode, REGNO (value));
22608 if (vecmode == V4SFmode)
22609 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
22610 else
22611 emit_insn (gen_sse2_cvttpd2dq (x, value));
22612 value = x;
22614 emit_insn (gen_xorv4si3 (value, value, large));
22617 /* Convert an unsigned DImode value into a DFmode, using only SSE.
22618 Expects the 64-bit DImode to be supplied in a pair of integral
22619 registers. Requires SSE2; will use SSE3 if available. For x86_32,
22620 -mfpmath=sse, !optimize_size only. */
22622 void
22623 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
22625 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
22626 rtx int_xmm, fp_xmm;
22627 rtx biases, exponents;
22628 rtx x;
22630 int_xmm = gen_reg_rtx (V4SImode);
22631 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
22632 emit_insn (gen_movdi_to_sse (int_xmm, input));
22633 else if (TARGET_SSE_SPLIT_REGS)
22635 emit_clobber (int_xmm);
22636 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
22638 else
22640 x = gen_reg_rtx (V2DImode);
22641 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
22642 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
22645 x = gen_rtx_CONST_VECTOR (V4SImode,
22646 gen_rtvec (4, GEN_INT (0x43300000UL),
22647 GEN_INT (0x45300000UL),
22648 const0_rtx, const0_rtx));
22649 exponents = validize_mem (force_const_mem (V4SImode, x));
22651 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
22652 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
22654 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
22655 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
22656 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
22657 (0x1.0p84 + double(fp_value_hi_xmm)).
22658 Note these exponents differ by 32. */
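/* As a worked example, take the 64-bit input 2**32 + 5 (hi = 1, lo = 5):
after the interleave with the exponent words, the two double lanes hold
0x1.0p52 + 5 and 0x1.0p84 + 2**32; subtracting the biases below leaves
5 and 2**32, whose sum is the wanted value 2**32 + 5. */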
22660 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
22662 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
22663 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
22664 real_ldexp (&bias_lo_rvt, &dconst1, 52);
22665 real_ldexp (&bias_hi_rvt, &dconst1, 84);
22666 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
22667 x = const_double_from_real_value (bias_hi_rvt, DFmode);
22668 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
22669 biases = validize_mem (force_const_mem (V2DFmode, biases));
22670 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
22672 /* Add the upper and lower DFmode values together. */
22673 if (TARGET_SSE3)
22674 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
22675 else
22677 x = copy_to_mode_reg (V2DFmode, fp_xmm);
22678 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
22679 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
22682 ix86_expand_vector_extract (false, target, fp_xmm, 0);
22685 /* Not used, but eases macroization of patterns. */
22686 void
22687 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
22689 gcc_unreachable ();
22692 /* Convert an unsigned SImode value into a DFmode. Only currently used
22693 for SSE, but applicable anywhere. */
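/* The conversion biases the input into the signed range and compensates in
floating point: e.g. for input 0xffffffff, the PLUS below wraps to
0x7fffffff, floatsidf gives 2147483647.0, and adding 0x1.0p31 yields
4294967295.0. */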
22695 void
22696 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
22698 REAL_VALUE_TYPE TWO31r;
22699 rtx x, fp;
22701 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
22702 NULL, 1, OPTAB_DIRECT);
22704 fp = gen_reg_rtx (DFmode);
22705 emit_insn (gen_floatsidf2 (fp, x));
22707 real_ldexp (&TWO31r, &dconst1, 31);
22708 x = const_double_from_real_value (TWO31r, DFmode);
22710 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
22711 if (x != target)
22712 emit_move_insn (target, x);
22715 /* Convert a signed DImode value into a DFmode. Only used for SSE in
22716 32-bit mode; otherwise we have a direct convert instruction. */
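/* The value is computed as (double) hi * 0x1.0p32 + (double) (unsigned) lo,
with the low half converted via ix86_expand_convert_uns_sidf_sse. */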
22718 void
22719 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
22721 REAL_VALUE_TYPE TWO32r;
22722 rtx fp_lo, fp_hi, x;
22724 fp_lo = gen_reg_rtx (DFmode);
22725 fp_hi = gen_reg_rtx (DFmode);
22727 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
22729 real_ldexp (&TWO32r, &dconst1, 32);
22730 x = const_double_from_real_value (TWO32r, DFmode);
22731 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
22733 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
22735 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
22736 0, OPTAB_DIRECT);
22737 if (x != target)
22738 emit_move_insn (target, x);
22741 /* Convert an unsigned SImode value into a SFmode, using only SSE.
22742 For x86_32, -mfpmath=sse, !optimize_size only. */
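/* The input is split into 16-bit halves so each signed conversion is exact:
e.g. 0x12345678 becomes (float) 0x1234 * 0x1.0p16 + (float) 0x5678. */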
22743 void
22744 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
22746 REAL_VALUE_TYPE ONE16r;
22747 rtx fp_hi, fp_lo, int_hi, int_lo, x;
22749 real_ldexp (&ONE16r, &dconst1, 16);
22750 x = const_double_from_real_value (ONE16r, SFmode);
22751 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
22752 NULL, 0, OPTAB_DIRECT);
22753 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
22754 NULL, 0, OPTAB_DIRECT);
22755 fp_hi = gen_reg_rtx (SFmode);
22756 fp_lo = gen_reg_rtx (SFmode);
22757 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
22758 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
22759 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
22760 0, OPTAB_DIRECT);
22761 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
22762 0, OPTAB_DIRECT);
22763 if (!rtx_equal_p (target, fp_hi))
22764 emit_move_insn (target, fp_hi);
22767 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
22768 a vector of unsigned ints VAL to vector of floats TARGET. */
22770 void
22771 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
22773 rtx tmp[8];
22774 REAL_VALUE_TYPE TWO16r;
22775 machine_mode intmode = GET_MODE (val);
22776 machine_mode fltmode = GET_MODE (target);
22777 rtx (*cvt) (rtx, rtx);
22779 if (intmode == V4SImode)
22780 cvt = gen_floatv4siv4sf2;
22781 else
22782 cvt = gen_floatv8siv8sf2;
22783 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
22784 tmp[0] = force_reg (intmode, tmp[0]);
22785 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
22786 OPTAB_DIRECT);
22787 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
22788 NULL_RTX, 1, OPTAB_DIRECT);
22789 tmp[3] = gen_reg_rtx (fltmode);
22790 emit_insn (cvt (tmp[3], tmp[1]));
22791 tmp[4] = gen_reg_rtx (fltmode);
22792 emit_insn (cvt (tmp[4], tmp[2]));
22793 real_ldexp (&TWO16r, &dconst1, 16);
22794 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
22795 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
22796 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
22797 OPTAB_DIRECT);
22798 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
22799 OPTAB_DIRECT);
22800 if (tmp[7] != target)
22801 emit_move_insn (target, tmp[7]);
22804 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
22805 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
22806 This is done by doing just signed conversion if < 0x1p31, and otherwise by
22807 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
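/* As a worked example, for a lane holding 3e9 (>= 0x1.0p31) the code below
subtracts 0x1.0p31, the signed truncation yields 852516352, and xoring in
0x80000000 afterwards restores the unsigned result 3000000000. */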
22810 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
22812 REAL_VALUE_TYPE TWO31r;
22813 rtx two31r, tmp[4];
22814 machine_mode mode = GET_MODE (val);
22815 machine_mode scalarmode = GET_MODE_INNER (mode);
22816 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
22817 rtx (*cmp) (rtx, rtx, rtx, rtx);
22818 int i;
22820 for (i = 0; i < 3; i++)
22821 tmp[i] = gen_reg_rtx (mode);
22822 real_ldexp (&TWO31r, &dconst1, 31);
22823 two31r = const_double_from_real_value (TWO31r, scalarmode);
22824 two31r = ix86_build_const_vector (mode, 1, two31r);
22825 two31r = force_reg (mode, two31r);
22826 switch (mode)
22828 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
22829 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
22830 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
22831 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
22832 default: gcc_unreachable ();
22834 tmp[3] = gen_rtx_LE (mode, two31r, val);
22835 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
22836 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
22837 0, OPTAB_DIRECT);
22838 if (intmode == V4SImode || TARGET_AVX2)
22839 *xorp = expand_simple_binop (intmode, ASHIFT,
22840 gen_lowpart (intmode, tmp[0]),
22841 GEN_INT (31), NULL_RTX, 0,
22842 OPTAB_DIRECT);
22843 else
22845 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
22846 two31 = ix86_build_const_vector (intmode, 1, two31);
22847 *xorp = expand_simple_binop (intmode, AND,
22848 gen_lowpart (intmode, tmp[0]),
22849 two31, NULL_RTX, 0,
22850 OPTAB_DIRECT);
22852 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
22853 0, OPTAB_DIRECT);
22856 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
22857 then replicate the value for all elements of the vector
22858 register. */
22861 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
22863 int i, n_elt;
22864 rtvec v;
22865 machine_mode scalar_mode;
22867 switch (mode)
22869 case E_V64QImode:
22870 case E_V32QImode:
22871 case E_V16QImode:
22872 case E_V32HImode:
22873 case E_V16HImode:
22874 case E_V8HImode:
22875 case E_V16SImode:
22876 case E_V8SImode:
22877 case E_V4SImode:
22878 case E_V8DImode:
22879 case E_V4DImode:
22880 case E_V2DImode:
22881 gcc_assert (vect);
22882 /* FALLTHRU */
22883 case E_V16SFmode:
22884 case E_V8SFmode:
22885 case E_V4SFmode:
22886 case E_V8DFmode:
22887 case E_V4DFmode:
22888 case E_V2DFmode:
22889 n_elt = GET_MODE_NUNITS (mode);
22890 v = rtvec_alloc (n_elt);
22891 scalar_mode = GET_MODE_INNER (mode);
22893 RTVEC_ELT (v, 0) = value;
22895 for (i = 1; i < n_elt; ++i)
22896 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
22898 return gen_rtx_CONST_VECTOR (mode, v);
22900 default:
22901 gcc_unreachable ();
22905 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
22906 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
22907 for an SSE register. If VECT is true, then replicate the mask for
22908 all elements of the vector register. If INVERT is true, then create
22909 a mask excluding the sign bit. */
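/* E.g. for V4SFmode with VECT set this yields { -0.0, -0.0, -0.0, -0.0 }
(bit pattern 0x80000000 in each element), or 0x7fffffff per element
when INVERT. */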
22912 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
22914 machine_mode vec_mode, imode;
22915 wide_int w;
22916 rtx mask, v;
22918 switch (mode)
22920 case E_V16SImode:
22921 case E_V16SFmode:
22922 case E_V8SImode:
22923 case E_V4SImode:
22924 case E_V8SFmode:
22925 case E_V4SFmode:
22926 vec_mode = mode;
22927 imode = SImode;
22928 break;
22930 case E_V8DImode:
22931 case E_V4DImode:
22932 case E_V2DImode:
22933 case E_V8DFmode:
22934 case E_V4DFmode:
22935 case E_V2DFmode:
22936 vec_mode = mode;
22937 imode = DImode;
22938 break;
22940 case E_TImode:
22941 case E_TFmode:
22942 vec_mode = VOIDmode;
22943 imode = TImode;
22944 break;
22946 default:
22947 gcc_unreachable ();
22950 machine_mode inner_mode = GET_MODE_INNER (mode);
22951 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
22952 GET_MODE_BITSIZE (inner_mode));
22953 if (invert)
22954 w = wi::bit_not (w);
22956 /* Force this value into the low part of a fp vector constant. */
22957 mask = immed_wide_int_const (w, imode);
22958 mask = gen_lowpart (inner_mode, mask);
22960 if (vec_mode == VOIDmode)
22961 return force_reg (inner_mode, mask);
22963 v = ix86_build_const_vector (vec_mode, vect, mask);
22964 return force_reg (vec_mode, v);
22967 /* Generate code for floating point ABS or NEG. */
22969 void
22970 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
22971 rtx operands[])
22973 rtx mask, set, dst, src;
22974 bool use_sse = false;
22975 bool vector_mode = VECTOR_MODE_P (mode);
22976 machine_mode vmode = mode;
22978 if (vector_mode)
22979 use_sse = true;
22980 else if (mode == TFmode)
22981 use_sse = true;
22982 else if (TARGET_SSE_MATH)
22984 use_sse = SSE_FLOAT_MODE_P (mode);
22985 if (mode == SFmode)
22986 vmode = V4SFmode;
22987 else if (mode == DFmode)
22988 vmode = V2DFmode;
22991 /* NEG and ABS performed with SSE use bitwise mask operations.
22992 Create the appropriate mask now. */
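/* The mask is consumed when the insn is later split: ABS is implemented as
an AND with a mask that clears the sign bit (all bits but the sign set),
and NEG as an XOR with a mask containing only the sign bit. */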
22993 if (use_sse)
22994 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
22995 else
22996 mask = NULL_RTX;
22998 dst = operands[0];
22999 src = operands[1];
23001 set = gen_rtx_fmt_e (code, mode, src);
23002 set = gen_rtx_SET (dst, set);
23004 if (mask)
23006 rtx use, clob;
23007 rtvec par;
23009 use = gen_rtx_USE (VOIDmode, mask);
23010 if (vector_mode)
23011 par = gen_rtvec (2, set, use);
23012 else
23014 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
23015 par = gen_rtvec (3, set, use, clob);
23017 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
23019 else
23020 emit_insn (set);
23023 /* Expand a copysign operation. Special case operand 0 being a constant. */
23025 void
23026 ix86_expand_copysign (rtx operands[])
23028 machine_mode mode, vmode;
23029 rtx dest, op0, op1, mask, nmask;
23031 dest = operands[0];
23032 op0 = operands[1];
23033 op1 = operands[2];
23035 mode = GET_MODE (dest);
23037 if (mode == SFmode)
23038 vmode = V4SFmode;
23039 else if (mode == DFmode)
23040 vmode = V2DFmode;
23041 else
23042 vmode = mode;
23044 if (CONST_DOUBLE_P (op0))
23046 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
23048 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
23049 op0 = simplify_unary_operation (ABS, mode, op0, mode);
23051 if (mode == SFmode || mode == DFmode)
23053 if (op0 == CONST0_RTX (mode))
23054 op0 = CONST0_RTX (vmode);
23055 else
23057 rtx v = ix86_build_const_vector (vmode, false, op0);
23059 op0 = force_reg (vmode, v);
23062 else if (op0 != CONST0_RTX (mode))
23063 op0 = force_reg (mode, op0);
23065 mask = ix86_build_signbit_mask (vmode, 0, 0);
23067 if (mode == SFmode)
23068 copysign_insn = gen_copysignsf3_const;
23069 else if (mode == DFmode)
23070 copysign_insn = gen_copysigndf3_const;
23071 else
23072 copysign_insn = gen_copysigntf3_const;
23074 emit_insn (copysign_insn (dest, op0, op1, mask));
23076 else
23078 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
23080 nmask = ix86_build_signbit_mask (vmode, 0, 1);
23081 mask = ix86_build_signbit_mask (vmode, 0, 0);
23083 if (mode == SFmode)
23084 copysign_insn = gen_copysignsf3_var;
23085 else if (mode == DFmode)
23086 copysign_insn = gen_copysigndf3_var;
23087 else
23088 copysign_insn = gen_copysigntf3_var;
23090 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
23094 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
23095 be a constant, and so has already been expanded into a vector constant. */
23097 void
23098 ix86_split_copysign_const (rtx operands[])
23100 machine_mode mode, vmode;
23101 rtx dest, op0, mask, x;
23103 dest = operands[0];
23104 op0 = operands[1];
23105 mask = operands[3];
23107 mode = GET_MODE (dest);
23108 vmode = GET_MODE (mask);
23110 dest = lowpart_subreg (vmode, dest, mode);
23111 x = gen_rtx_AND (vmode, dest, mask);
23112 emit_insn (gen_rtx_SET (dest, x));
23114 if (op0 != CONST0_RTX (vmode))
23116 x = gen_rtx_IOR (vmode, dest, op0);
23117 emit_insn (gen_rtx_SET (dest, x));
23121 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
23122 so we have to do two masks. */
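/* In all alternatives the result is computed as (op0 & nmask) | (op1 & mask),
i.e. the magnitude bits of op0 combined with the sign bit of op1. */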
23124 void
23125 ix86_split_copysign_var (rtx operands[])
23127 machine_mode mode, vmode;
23128 rtx dest, scratch, op0, op1, mask, nmask, x;
23130 dest = operands[0];
23131 scratch = operands[1];
23132 op0 = operands[2];
23133 op1 = operands[3];
23134 nmask = operands[4];
23135 mask = operands[5];
23137 mode = GET_MODE (dest);
23138 vmode = GET_MODE (mask);
23140 if (rtx_equal_p (op0, op1))
23142 /* Shouldn't happen often (it's useless, obviously), but when it does
23143 we'd generate incorrect code if we continue below. */
23144 emit_move_insn (dest, op0);
23145 return;
23148 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
23150 gcc_assert (REGNO (op1) == REGNO (scratch));
23152 x = gen_rtx_AND (vmode, scratch, mask);
23153 emit_insn (gen_rtx_SET (scratch, x));
23155 dest = mask;
23156 op0 = lowpart_subreg (vmode, op0, mode);
23157 x = gen_rtx_NOT (vmode, dest);
23158 x = gen_rtx_AND (vmode, x, op0);
23159 emit_insn (gen_rtx_SET (dest, x));
23161 else
23163 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
23165 x = gen_rtx_AND (vmode, scratch, mask);
23167 else /* alternative 2,4 */
23169 gcc_assert (REGNO (mask) == REGNO (scratch));
23170 op1 = lowpart_subreg (vmode, op1, mode);
23171 x = gen_rtx_AND (vmode, scratch, op1);
23173 emit_insn (gen_rtx_SET (scratch, x));
23175 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
23177 dest = lowpart_subreg (vmode, op0, mode);
23178 x = gen_rtx_AND (vmode, dest, nmask);
23180 else /* alternative 3,4 */
23182 gcc_assert (REGNO (nmask) == REGNO (dest));
23183 dest = nmask;
23184 op0 = lowpart_subreg (vmode, op0, mode);
23185 x = gen_rtx_AND (vmode, dest, op0);
23187 emit_insn (gen_rtx_SET (dest, x));
23190 x = gen_rtx_IOR (vmode, dest, scratch);
23191 emit_insn (gen_rtx_SET (dest, x));
23194 /* Return TRUE or FALSE depending on whether the first SET in INSN
23195 has source and destination with matching CC modes, and that the
23196 CC mode is at least as constrained as REQ_MODE. */
23198 bool
23199 ix86_match_ccmode (rtx insn, machine_mode req_mode)
23201 rtx set;
23202 machine_mode set_mode;
23204 set = PATTERN (insn);
23205 if (GET_CODE (set) == PARALLEL)
23206 set = XVECEXP (set, 0, 0);
23207 gcc_assert (GET_CODE (set) == SET);
23208 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
23210 set_mode = GET_MODE (SET_DEST (set));
23211 switch (set_mode)
23213 case E_CCNOmode:
23214 if (req_mode != CCNOmode
23215 && (req_mode != CCmode
23216 || XEXP (SET_SRC (set), 1) != const0_rtx))
23217 return false;
23218 break;
23219 case E_CCmode:
23220 if (req_mode == CCGCmode)
23221 return false;
23222 /* FALLTHRU */
23223 case E_CCGCmode:
23224 if (req_mode == CCGOCmode || req_mode == CCNOmode)
23225 return false;
23226 /* FALLTHRU */
23227 case E_CCGOCmode:
23228 if (req_mode == CCZmode)
23229 return false;
23230 /* FALLTHRU */
23231 case E_CCZmode:
23232 break;
23234 case E_CCAmode:
23235 case E_CCCmode:
23236 case E_CCOmode:
23237 case E_CCPmode:
23238 case E_CCSmode:
23239 if (set_mode != req_mode)
23240 return false;
23241 break;
23243 default:
23244 gcc_unreachable ();
23247 return GET_MODE (SET_SRC (set)) == set_mode;
23250 /* Generate insn patterns to do an integer compare of OPERANDS. */
23252 static rtx
23253 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
23255 machine_mode cmpmode;
23256 rtx tmp, flags;
23258 cmpmode = SELECT_CC_MODE (code, op0, op1);
23259 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
23261 /* This is very simple, but making the interface the same as in the
23262 FP case makes the rest of the code easier. */
23263 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
23264 emit_insn (gen_rtx_SET (flags, tmp));
23266 /* Return the test that should be put into the flags user, i.e.
23267 the bcc, scc, or cmov instruction. */
23268 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
23271 /* Figure out whether to use ordered or unordered fp comparisons.
23272 Return the appropriate mode to use. */
23274 machine_mode
23275 ix86_fp_compare_mode (enum rtx_code)
23277 /* ??? In order to make all comparisons reversible, we do all comparisons
23278 non-trapping when compiling for IEEE. Once gcc is able to distinguish
23279 all forms of trapping and nontrapping comparisons, we can make inequality
23280 comparisons trapping again, since it results in better code when using
23281 FCOM based compares. */
23282 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
23285 machine_mode
23286 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
23288 machine_mode mode = GET_MODE (op0);
23290 if (SCALAR_FLOAT_MODE_P (mode))
23292 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
23293 return ix86_fp_compare_mode (code);
23296 switch (code)
23298 /* Only zero flag is needed. */
23299 case EQ: /* ZF=0 */
23300 case NE: /* ZF!=0 */
23301 return CCZmode;
23302 /* Codes needing carry flag. */
23303 case GEU: /* CF=0 */
23304 case LTU: /* CF=1 */
23305 /* Detect overflow checks. They need just the carry flag. */
23306 if (GET_CODE (op0) == PLUS
23307 && (rtx_equal_p (op1, XEXP (op0, 0))
23308 || rtx_equal_p (op1, XEXP (op0, 1))))
23309 return CCCmode;
23310 else
23311 return CCmode;
23312 case GTU: /* CF=0 & ZF=0 */
23313 case LEU: /* CF=1 | ZF=1 */
23314 return CCmode;
23315 /* Codes possibly doable only with sign flag when
23316 comparing against zero. */
23317 case GE: /* SF=OF or SF=0 */
23318 case LT: /* SF<>OF or SF=1 */
23319 if (op1 == const0_rtx)
23320 return CCGOCmode;
23321 else
23322 /* For other cases Carry flag is not required. */
23323 return CCGCmode;
23324 /* Codes doable only with the sign flag when comparing
23325 against zero, but we lack a jump instruction for it,
23326 so we need to use relational tests against overflow,
23327 which thus needs to be zero. */
23328 case GT: /* ZF=0 & SF=OF */
23329 case LE: /* ZF=1 | SF<>OF */
23330 if (op1 == const0_rtx)
23331 return CCNOmode;
23332 else
23333 return CCGCmode;
23334 /* The strcmp pattern does (use flags) and combine may ask us for the
23335 proper mode. */
23336 case USE:
23337 return CCmode;
23338 default:
23339 gcc_unreachable ();
23343 /* Return the fixed registers used for condition codes. */
23345 static bool
23346 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
23348 *p1 = FLAGS_REG;
23349 *p2 = FPSR_REG;
23350 return true;
23353 /* If two condition code modes are compatible, return a condition code
23354 mode which is compatible with both. Otherwise, return
23355 VOIDmode. */
23357 static machine_mode
23358 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
23360 if (m1 == m2)
23361 return m1;
23363 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
23364 return VOIDmode;
23366 if ((m1 == CCGCmode && m2 == CCGOCmode)
23367 || (m1 == CCGOCmode && m2 == CCGCmode))
23368 return CCGCmode;
23370 if ((m1 == CCNOmode && m2 == CCGOCmode)
23371 || (m1 == CCGOCmode && m2 == CCNOmode))
23372 return CCNOmode;
23374 if (m1 == CCZmode
23375 && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode))
23376 return m2;
23377 else if (m2 == CCZmode
23378 && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode))
23379 return m1;
23381 switch (m1)
23383 default:
23384 gcc_unreachable ();
23386 case E_CCmode:
23387 case E_CCGCmode:
23388 case E_CCGOCmode:
23389 case E_CCNOmode:
23390 case E_CCAmode:
23391 case E_CCCmode:
23392 case E_CCOmode:
23393 case E_CCPmode:
23394 case E_CCSmode:
23395 case E_CCZmode:
23396 switch (m2)
23398 default:
23399 return VOIDmode;
23401 case E_CCmode:
23402 case E_CCGCmode:
23403 case E_CCGOCmode:
23404 case E_CCNOmode:
23405 case E_CCAmode:
23406 case E_CCCmode:
23407 case E_CCOmode:
23408 case E_CCPmode:
23409 case E_CCSmode:
23410 case E_CCZmode:
23411 return CCmode;
23414 case E_CCFPmode:
23415 case E_CCFPUmode:
23416 /* These are only compatible with themselves, which we already
23417 checked above. */
23418 return VOIDmode;
23423 /* Return a comparison we can do that is equivalent to
23424 swap_condition (code), apart possibly from orderedness.
23425 But, never change orderedness if TARGET_IEEE_FP, returning
23426 UNKNOWN in that case if necessary. */
23428 static enum rtx_code
23429 ix86_fp_swap_condition (enum rtx_code code)
23431 switch (code)
23433 case GT: /* GTU - CF=0 & ZF=0 */
23434 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
23435 case GE: /* GEU - CF=0 */
23436 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
23437 case UNLT: /* LTU - CF=1 */
23438 return TARGET_IEEE_FP ? UNKNOWN : GT;
23439 case UNLE: /* LEU - CF=1 | ZF=1 */
23440 return TARGET_IEEE_FP ? UNKNOWN : GE;
23441 default:
23442 return swap_condition (code);
23446 /* Return cost of comparison CODE using the best strategy for performance.
23447 All following functions use the number of instructions as a cost metric.
23448 In the future this should be tweaked to compute bytes for optimize_size and
23449 take into account performance of various instructions on various CPUs. */
23451 static int
23452 ix86_fp_comparison_cost (enum rtx_code code)
23454 int arith_cost;
23456 /* The cost of code using bit-twiddling on %ah. */
23457 switch (code)
23459 case UNLE:
23460 case UNLT:
23461 case LTGT:
23462 case GT:
23463 case GE:
23464 case UNORDERED:
23465 case ORDERED:
23466 case UNEQ:
23467 arith_cost = 4;
23468 break;
23469 case LT:
23470 case NE:
23471 case EQ:
23472 case UNGE:
23473 arith_cost = TARGET_IEEE_FP ? 5 : 4;
23474 break;
23475 case LE:
23476 case UNGT:
23477 arith_cost = TARGET_IEEE_FP ? 6 : 4;
23478 break;
23479 default:
23480 gcc_unreachable ();
23483 switch (ix86_fp_comparison_strategy (code))
23485 case IX86_FPCMP_COMI:
23486 return arith_cost > 4 ? 3 : 2;
23487 case IX86_FPCMP_SAHF:
23488 return arith_cost > 4 ? 4 : 3;
23489 default:
23490 return arith_cost;
23494 /* Return strategy to use for floating-point. We assume that fcomi is always
23495 preferable where available, since that is also true when looking at size
23496 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
23498 enum ix86_fpcmp_strategy
23499 ix86_fp_comparison_strategy (enum rtx_code)
23501 /* Do fcomi/sahf based test when profitable. */
23503 if (TARGET_CMOVE)
23504 return IX86_FPCMP_COMI;
23506 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
23507 return IX86_FPCMP_SAHF;
23509 return IX86_FPCMP_ARITH;
23512 /* Swap, force into registers, or otherwise massage the two operands
23513 to a fp comparison. The operands are updated in place; the new
23514 comparison code is returned. */
23516 static enum rtx_code
23517 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
23519 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
23520 rtx op0 = *pop0, op1 = *pop1;
23521 machine_mode op_mode = GET_MODE (op0);
23522 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
23524 /* All of the unordered compare instructions only work on registers.
23525 The same is true of the fcomi compare instructions. The XFmode
23526 compare instructions require registers except when comparing
23527 against zero or when converting operand 1 from fixed point to
23528 floating point. */
23530 if (!is_sse
23531 && (fpcmp_mode == CCFPUmode
23532 || (op_mode == XFmode
23533 && ! (standard_80387_constant_p (op0) == 1
23534 || standard_80387_constant_p (op1) == 1)
23535 && GET_CODE (op1) != FLOAT)
23536 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
23538 op0 = force_reg (op_mode, op0);
23539 op1 = force_reg (op_mode, op1);
23541 else
23543 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
23544 things around if they appear profitable, otherwise force op0
23545 into a register. */
23547 if (standard_80387_constant_p (op0) == 0
23548 || (MEM_P (op0)
23549 && ! (standard_80387_constant_p (op1) == 0
23550 || MEM_P (op1))))
23552 enum rtx_code new_code = ix86_fp_swap_condition (code);
23553 if (new_code != UNKNOWN)
23555 std::swap (op0, op1);
23556 code = new_code;
23560 if (!REG_P (op0))
23561 op0 = force_reg (op_mode, op0);
23563 if (CONSTANT_P (op1))
23565 int tmp = standard_80387_constant_p (op1);
23566 if (tmp == 0)
23567 op1 = validize_mem (force_const_mem (op_mode, op1));
23568 else if (tmp == 1)
23570 if (TARGET_CMOVE)
23571 op1 = force_reg (op_mode, op1);
23573 else
23574 op1 = force_reg (op_mode, op1);
23578 /* Try to rearrange the comparison to make it cheaper. */
23579 if (ix86_fp_comparison_cost (code)
23580 > ix86_fp_comparison_cost (swap_condition (code))
23581 && (REG_P (op1) || can_create_pseudo_p ()))
23583 std::swap (op0, op1);
23584 code = swap_condition (code);
23585 if (!REG_P (op0))
23586 op0 = force_reg (op_mode, op0);
23589 *pop0 = op0;
23590 *pop1 = op1;
23591 return code;
23594 /* Convert comparison codes we use to represent FP comparison to integer
23595 code that will result in proper branch. Return UNKNOWN if no such code
23596 is available. */
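/* After a COMI or SAHF based compare, ZF, PF and CF hold C3, C2 and C0, so
each FP condition maps onto the corresponding unsigned integer condition,
e.g. GT is tested as GTU ("ja"). */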
23598 enum rtx_code
23599 ix86_fp_compare_code_to_integer (enum rtx_code code)
23601 switch (code)
23603 case GT:
23604 return GTU;
23605 case GE:
23606 return GEU;
23607 case ORDERED:
23608 case UNORDERED:
23609 return code;
23610 case UNEQ:
23611 return EQ;
23612 case UNLT:
23613 return LTU;
23614 case UNLE:
23615 return LEU;
23616 case LTGT:
23617 return NE;
23618 default:
23619 return UNKNOWN;
23623 /* Generate insn patterns to do a floating point compare of OPERANDS. */
23625 static rtx
23626 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
23628 machine_mode fpcmp_mode, intcmp_mode;
23629 rtx tmp, tmp2;
23631 fpcmp_mode = ix86_fp_compare_mode (code);
23632 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
23634 /* Do fcomi/sahf based test when profitable. */
23635 switch (ix86_fp_comparison_strategy (code))
23637 case IX86_FPCMP_COMI:
23638 intcmp_mode = fpcmp_mode;
23639 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
23640 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
23641 emit_insn (tmp);
23642 break;
23644 case IX86_FPCMP_SAHF:
23645 intcmp_mode = fpcmp_mode;
23646 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
23647 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
23649 if (!scratch)
23650 scratch = gen_reg_rtx (HImode);
23651 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
23652 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
23653 break;
23655 case IX86_FPCMP_ARITH:
23656 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
23657 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
23658 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
23659 if (!scratch)
23660 scratch = gen_reg_rtx (HImode);
23661 emit_insn (gen_rtx_SET (scratch, tmp2));
23663 /* In the unordered case, we have to check C2 for NaN's, which
23664 doesn't happen to work out to anything nice combination-wise.
23665 So do some bit twiddling on the value we've got in AH to come
23666 up with an appropriate set of condition codes. */
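/* After fnstsw the status word is in %ax, so in %ah the condition bits are
C0 = 0x01, C2 = 0x04 and C3 = 0x40; hence masks such as 0x45 (C3|C2|C0)
and 0x40 (C3) below. */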
23668 intcmp_mode = CCNOmode;
23669 switch (code)
23671 case GT:
23672 case UNGT:
23673 if (code == GT || !TARGET_IEEE_FP)
23675 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
23676 code = EQ;
23678 else
23680 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23681 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
23682 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
23683 intcmp_mode = CCmode;
23684 code = GEU;
23686 break;
23687 case LT:
23688 case UNLT:
23689 if (code == LT && TARGET_IEEE_FP)
23691 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23692 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
23693 intcmp_mode = CCmode;
23694 code = EQ;
23696 else
23698 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
23699 code = NE;
23701 break;
23702 case GE:
23703 case UNGE:
23704 if (code == GE || !TARGET_IEEE_FP)
23706 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
23707 code = EQ;
23709 else
23711 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23712 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
23713 code = NE;
23715 break;
23716 case LE:
23717 case UNLE:
23718 if (code == LE && TARGET_IEEE_FP)
23720 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23721 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
23722 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
23723 intcmp_mode = CCmode;
23724 code = LTU;
23726 else
23728 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
23729 code = NE;
23731 break;
23732 case EQ:
23733 case UNEQ:
23734 if (code == EQ && TARGET_IEEE_FP)
23736 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23737 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
23738 intcmp_mode = CCmode;
23739 code = EQ;
23741 else
23743 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
23744 code = NE;
23746 break;
23747 case NE:
23748 case LTGT:
23749 if (code == NE && TARGET_IEEE_FP)
23751 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23752 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
23753 GEN_INT (0x40)));
23754 code = NE;
23756 else
23758 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
23759 code = EQ;
23761 break;
23763 case UNORDERED:
23764 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
23765 code = NE;
23766 break;
23767 case ORDERED:
23768 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
23769 code = EQ;
23770 break;
23772 default:
23773 gcc_unreachable ();
23775 break;
23777 default:
23778 gcc_unreachable ();
23781 /* Return the test that should be put into the flags user, i.e.
23782 the bcc, scc, or cmov instruction. */
23783 return gen_rtx_fmt_ee (code, VOIDmode,
23784 gen_rtx_REG (intcmp_mode, FLAGS_REG),
23785 const0_rtx);
23788 static rtx
23789 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
23791 rtx ret;
23793 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
23794 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
23796 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
23798 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
23799 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
23801 else
23802 ret = ix86_expand_int_compare (code, op0, op1);
23804 return ret;
23807 void
23808 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
23810 machine_mode mode = GET_MODE (op0);
23811 rtx tmp;
23813 /* Handle special case - vector comparison with boolean result, transform
23814 it using ptest instruction. */
23815 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
23817 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
23818 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
23820 gcc_assert (code == EQ || code == NE);
23821 /* Generate XOR since we can't check that one operand is zero vector. */
23822 tmp = gen_reg_rtx (mode);
23823 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
23824 tmp = gen_lowpart (p_mode, tmp);
23825 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
23826 gen_rtx_UNSPEC (CCmode,
23827 gen_rtvec (2, tmp, tmp),
23828 UNSPEC_PTEST)));
23829 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
23830 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23831 gen_rtx_LABEL_REF (VOIDmode, label),
23832 pc_rtx);
23833 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
23834 return;
23837 switch (mode)
23839 case E_SFmode:
23840 case E_DFmode:
23841 case E_XFmode:
23842 case E_QImode:
23843 case E_HImode:
23844 case E_SImode:
23845 simple:
23846 tmp = ix86_expand_compare (code, op0, op1);
23847 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23848 gen_rtx_LABEL_REF (VOIDmode, label),
23849 pc_rtx);
23850 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
23851 return;
23853 case E_DImode:
23854 if (TARGET_64BIT)
23855 goto simple;
23856 /* For a 32-bit target, a DImode comparison may be performed in
23857 SSE registers. To allow this we should avoid splitting
23858 to SImode, which is achieved by doing the xor in DImode
23859 and then comparing with zero (which is recognized by the
23860 STV pass). We don't compare using xor when optimizing
23861 for size. */
23862 if (!optimize_insn_for_size_p ()
23863 && TARGET_STV
23864 && (code == EQ || code == NE))
23866 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
23867 op1 = const0_rtx;
23869 /* FALLTHRU */
23870 case E_TImode:
23871 /* Expand DImode branch into multiple compare+branch. */
23873 rtx lo[2], hi[2];
23874 rtx_code_label *label2;
23875 enum rtx_code code1, code2, code3;
23876 machine_mode submode;
23878 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
23880 std::swap (op0, op1);
23881 code = swap_condition (code);
23884 split_double_mode (mode, &op0, 1, lo+0, hi+0);
23885 split_double_mode (mode, &op1, 1, lo+1, hi+1);
23887 submode = mode == DImode ? SImode : DImode;
23889 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
23890 avoid two branches. This costs one extra insn, so disable when
23891 optimizing for size. */
23893 if ((code == EQ || code == NE)
23894 && (!optimize_insn_for_size_p ()
23895 || hi[1] == const0_rtx || lo[1] == const0_rtx))
23897 rtx xor0, xor1;
23899 xor1 = hi[0];
23900 if (hi[1] != const0_rtx)
23901 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
23902 NULL_RTX, 0, OPTAB_WIDEN);
23904 xor0 = lo[0];
23905 if (lo[1] != const0_rtx)
23906 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
23907 NULL_RTX, 0, OPTAB_WIDEN);
23909 tmp = expand_binop (submode, ior_optab, xor1, xor0,
23910 NULL_RTX, 0, OPTAB_WIDEN);
23912 ix86_expand_branch (code, tmp, const0_rtx, label);
23913 return;
23916 /* Otherwise, if we are doing a less-than or greater-or-equal comparison,
23917 op1 is a constant and the low word is zero, then we can just
23918 examine the high word. Similarly for a low word of -1 and
23919 less-or-equal or greater-than comparisons. */
23921 if (CONST_INT_P (hi[1]))
23922 switch (code)
23924 case LT: case LTU: case GE: case GEU:
23925 if (lo[1] == const0_rtx)
23927 ix86_expand_branch (code, hi[0], hi[1], label);
23928 return;
23930 break;
23931 case LE: case LEU: case GT: case GTU:
23932 if (lo[1] == constm1_rtx)
23934 ix86_expand_branch (code, hi[0], hi[1], label);
23935 return;
23937 break;
23938 default:
23939 break;
23942 /* Otherwise, we need two or three jumps. */
23944 label2 = gen_label_rtx ();
23946 code1 = code;
23947 code2 = swap_condition (code);
23948 code3 = unsigned_condition (code);
23950 switch (code)
23952 case LT: case GT: case LTU: case GTU:
23953 break;
23955 case LE: code1 = LT; code2 = GT; break;
23956 case GE: code1 = GT; code2 = LT; break;
23957 case LEU: code1 = LTU; code2 = GTU; break;
23958 case GEU: code1 = GTU; code2 = LTU; break;
23960 case EQ: code1 = UNKNOWN; code2 = NE; break;
23961 case NE: code2 = UNKNOWN; break;
23963 default:
23964 gcc_unreachable ();
23968 * a < b =>
23969 * if (hi(a) < hi(b)) goto true;
23970 * if (hi(a) > hi(b)) goto false;
23971 * if (lo(a) < lo(b)) goto true;
23972 * false:
23975 if (code1 != UNKNOWN)
23976 ix86_expand_branch (code1, hi[0], hi[1], label);
23977 if (code2 != UNKNOWN)
23978 ix86_expand_branch (code2, hi[0], hi[1], label2);
23980 ix86_expand_branch (code3, lo[0], lo[1], label);
23982 if (code2 != UNKNOWN)
23983 emit_label (label2);
23984 return;
23987 default:
23988 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
23989 goto simple;
23993 /* Split branch based on floating point condition. */
23994 void
23995 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
23996 rtx target1, rtx target2, rtx tmp)
23998 rtx condition;
23999 rtx_insn *i;
24001 if (target2 != pc_rtx)
24003 std::swap (target1, target2);
24004 code = reverse_condition_maybe_unordered (code);
24007 condition = ix86_expand_fp_compare (code, op1, op2,
24008 tmp);
24010 i = emit_jump_insn (gen_rtx_SET
24011 (pc_rtx,
24012 gen_rtx_IF_THEN_ELSE (VOIDmode,
24013 condition, target1, target2)));
24014 if (split_branch_probability.initialized_p ())
24015 add_reg_br_prob_note (i, split_branch_probability);
24018 void
24019 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
24021 rtx ret;
24023 gcc_assert (GET_MODE (dest) == QImode);
24025 ret = ix86_expand_compare (code, op0, op1);
24026 PUT_MODE (ret, QImode);
24027 emit_insn (gen_rtx_SET (dest, ret));
24030 /* Expand comparison setting or clearing carry flag. Return true when
24031 successful and set pop for the operation. */
24032 static bool
24033 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
24035 machine_mode mode =
24036 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
24038 /* Do not handle double-mode compares that go through special path. */
24039 if (mode == (TARGET_64BIT ? TImode : DImode))
24040 return false;
24042 if (SCALAR_FLOAT_MODE_P (mode))
24044 rtx compare_op;
24045 rtx_insn *compare_seq;
24047 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
24049 /* Shortcut: the following common codes never translate
24050 into carry flag compares. */
24051 if (code == EQ || code == NE || code == UNEQ || code == LTGT
24052 || code == ORDERED || code == UNORDERED)
24053 return false;
24055 /* These comparisons require the zero flag; swap operands so they won't. */
24056 if ((code == GT || code == UNLE || code == LE || code == UNGT)
24057 && !TARGET_IEEE_FP)
24059 std::swap (op0, op1);
24060 code = swap_condition (code);
24063 /* Try to expand the comparison and verify that we end up with
24064 a carry flag based comparison. This fails to be true only when
24065 we decide to expand the comparison using arithmetic, which is
24066 not a common scenario. */
24067 start_sequence ();
24068 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
24069 compare_seq = get_insns ();
24070 end_sequence ();
24072 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
24073 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
24074 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
24075 else
24076 code = GET_CODE (compare_op);
24078 if (code != LTU && code != GEU)
24079 return false;
24081 emit_insn (compare_seq);
24082 *pop = compare_op;
24083 return true;
24086 if (!INTEGRAL_MODE_P (mode))
24087 return false;
24089 switch (code)
24091 case LTU:
24092 case GEU:
24093 break;
24095 /* Convert a==0 into (unsigned)a<1. */
24096 case EQ:
24097 case NE:
24098 if (op1 != const0_rtx)
24099 return false;
24100 op1 = const1_rtx;
24101 code = (code == EQ ? LTU : GEU);
24102 break;
24104 /* Convert a>b into b<a or a>=b-1. */
24105 case GTU:
24106 case LEU:
24107 if (CONST_INT_P (op1))
24109 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
24110 /* Bail out on overflow. We still can swap operands but that
24111 would force loading of the constant into a register. */
24112 if (op1 == const0_rtx
24113 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
24114 return false;
24115 code = (code == GTU ? GEU : LTU);
24117 else
24119 std::swap (op0, op1);
24120 code = (code == GTU ? LTU : GEU);
24122 break;
24124 /* Convert a>=0 into (unsigned)a<0x80000000. */
24125 case LT:
24126 case GE:
24127 if (mode == DImode || op1 != const0_rtx)
24128 return false;
24129 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
24130 code = (code == LT ? GEU : LTU);
24131 break;
24132 case LE:
24133 case GT:
24134 if (mode == DImode || op1 != constm1_rtx)
24135 return false;
24136 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
24137 code = (code == LE ? GEU : LTU);
24138 break;
24140 default:
24141 return false;
24143 /* Swapping operands may cause constant to appear as first operand. */
24144 if (!nonimmediate_operand (op0, VOIDmode))
24146 if (!can_create_pseudo_p ())
24147 return false;
24148 op0 = force_reg (mode, op0);
24150 *pop = ix86_expand_compare (code, op0, op1);
24151 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
24152 return true;
24155 bool
24156 ix86_expand_int_movcc (rtx operands[])
24158 enum rtx_code code = GET_CODE (operands[1]), compare_code;
24159 rtx_insn *compare_seq;
24160 rtx compare_op;
24161 machine_mode mode = GET_MODE (operands[0]);
24162 bool sign_bit_compare_p = false;
24163 rtx op0 = XEXP (operands[1], 0);
24164 rtx op1 = XEXP (operands[1], 1);
24166 if (GET_MODE (op0) == TImode
24167 || (GET_MODE (op0) == DImode
24168 && !TARGET_64BIT))
24169 return false;
24171 start_sequence ();
24172 compare_op = ix86_expand_compare (code, op0, op1);
24173 compare_seq = get_insns ();
24174 end_sequence ();
24176 compare_code = GET_CODE (compare_op);
24178 if ((op1 == const0_rtx && (code == GE || code == LT))
24179 || (op1 == constm1_rtx && (code == GT || code == LE)))
24180 sign_bit_compare_p = true;
24182 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
24183 HImode insns, we'd be swallowed in word prefix ops. */
24185 if ((mode != HImode || TARGET_FAST_PREFIX)
24186 && (mode != (TARGET_64BIT ? TImode : DImode))
24187 && CONST_INT_P (operands[2])
24188 && CONST_INT_P (operands[3]))
24190 rtx out = operands[0];
24191 HOST_WIDE_INT ct = INTVAL (operands[2]);
24192 HOST_WIDE_INT cf = INTVAL (operands[3]);
24193 HOST_WIDE_INT diff;
24195 diff = ct - cf;
24196 /* Sign bit compares are better done using shifts than by using
24197 sbb. */
24198 if (sign_bit_compare_p
24199 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
24201 /* Detect overlap between destination and compare sources. */
24202 rtx tmp = out;
24204 if (!sign_bit_compare_p)
24206 rtx flags;
24207 bool fpcmp = false;
24209 compare_code = GET_CODE (compare_op);
24211 flags = XEXP (compare_op, 0);
24213 if (GET_MODE (flags) == CCFPmode
24214 || GET_MODE (flags) == CCFPUmode)
24216 fpcmp = true;
24217 compare_code
24218 = ix86_fp_compare_code_to_integer (compare_code);
24221 /* To simplify rest of code, restrict to the GEU case. */
24222 if (compare_code == LTU)
24224 std::swap (ct, cf);
24225 compare_code = reverse_condition (compare_code);
24226 code = reverse_condition (code);
24228 else
24230 if (fpcmp)
24231 PUT_CODE (compare_op,
24232 reverse_condition_maybe_unordered
24233 (GET_CODE (compare_op)));
24234 else
24235 PUT_CODE (compare_op,
24236 reverse_condition (GET_CODE (compare_op)));
24238 diff = ct - cf;
24240 if (reg_overlap_mentioned_p (out, op0)
24241 || reg_overlap_mentioned_p (out, op1))
24242 tmp = gen_reg_rtx (mode);
24244 if (mode == DImode)
24245 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
24246 else
24247 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
24248 flags, compare_op));
24250 else
24252 if (code == GT || code == GE)
24253 code = reverse_condition (code);
24254 else
24256 std::swap (ct, cf);
24257 diff = ct - cf;
24259 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
24262 if (diff == 1)
24265 * cmpl op0,op1
24266 * sbbl dest,dest
24267 * [addl dest, ct]
24269 * Size 5 - 8.
24271 if (ct)
24272 tmp = expand_simple_binop (mode, PLUS,
24273 tmp, GEN_INT (ct),
24274 copy_rtx (tmp), 1, OPTAB_DIRECT);
24276 else if (cf == -1)
24279 * cmpl op0,op1
24280 * sbbl dest,dest
24281 * orl $ct, dest
24283 * Size 8.
24285 tmp = expand_simple_binop (mode, IOR,
24286 tmp, GEN_INT (ct),
24287 copy_rtx (tmp), 1, OPTAB_DIRECT);
24289 else if (diff == -1 && ct)
24292 * cmpl op0,op1
24293 * sbbl dest,dest
24294 * notl dest
24295 * [addl dest, cf]
24297 * Size 8 - 11.
24299 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
24300 if (cf)
24301 tmp = expand_simple_binop (mode, PLUS,
24302 copy_rtx (tmp), GEN_INT (cf),
24303 copy_rtx (tmp), 1, OPTAB_DIRECT);
24305 else
24308 * cmpl op0,op1
24309 * sbbl dest,dest
24310 * [notl dest]
24311 * andl cf - ct, dest
24312 * [addl dest, ct]
24314 * Size 8 - 11.
24317 if (cf == 0)
24319 cf = ct;
24320 ct = 0;
24321 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
24324 tmp = expand_simple_binop (mode, AND,
24325 copy_rtx (tmp),
24326 gen_int_mode (cf - ct, mode),
24327 copy_rtx (tmp), 1, OPTAB_DIRECT);
24328 if (ct)
24329 tmp = expand_simple_binop (mode, PLUS,
24330 copy_rtx (tmp), GEN_INT (ct),
24331 copy_rtx (tmp), 1, OPTAB_DIRECT);
24334 if (!rtx_equal_p (tmp, out))
24335 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
24337 return true;
24340 if (diff < 0)
24342 machine_mode cmp_mode = GET_MODE (op0);
24343 enum rtx_code new_code;
24345 if (SCALAR_FLOAT_MODE_P (cmp_mode))
24347 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
24349 /* We may be reversing unordered compare to normal compare, that
24350 is not valid in general (we may convert non-trapping condition
24351 to trapping one), however on i386 we currently emit all
24352 comparisons unordered. */
24353 new_code = reverse_condition_maybe_unordered (code);
24355 else
24356 new_code = ix86_reverse_condition (code, cmp_mode);
24357 if (new_code != UNKNOWN)
24359 std::swap (ct, cf);
24360 diff = -diff;
24361 code = new_code;
24365 compare_code = UNKNOWN;
24366 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
24367 && CONST_INT_P (op1))
24369 if (op1 == const0_rtx
24370 && (code == LT || code == GE))
24371 compare_code = code;
24372 else if (op1 == constm1_rtx)
24374 if (code == LE)
24375 compare_code = LT;
24376 else if (code == GT)
24377 compare_code = GE;
24381 /* Optimize dest = (op0 < 0) ? -1 : cf. */
24382 if (compare_code != UNKNOWN
24383 && GET_MODE (op0) == GET_MODE (out)
24384 && (cf == -1 || ct == -1))
24386 /* If lea code below could be used, only optimize
24387 if it results in a 2 insn sequence. */
24389 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
24390 || diff == 3 || diff == 5 || diff == 9)
24391 || (compare_code == LT && ct == -1)
24392 || (compare_code == GE && cf == -1))
24395 * notl op1 (if necessary)
24396 * sarl $31, op1
24397 * orl cf, op1
24399 if (ct != -1)
24401 cf = ct;
24402 ct = -1;
24403 code = reverse_condition (code);
24406 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
24408 out = expand_simple_binop (mode, IOR,
24409 out, GEN_INT (cf),
24410 out, 1, OPTAB_DIRECT);
24411 if (out != operands[0])
24412 emit_move_insn (operands[0], out);
24414 return true;
24419 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
24420 || diff == 3 || diff == 5 || diff == 9)
24421 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
24422 && (mode != DImode
24423 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
24426 * xorl dest,dest
24427 * cmpl op1,op2
24428 * setcc dest
24429 * lea cf(dest*(ct-cf)),dest
24431 * Size 14.
24433 * This also catches the degenerate setcc-only case.
24436 rtx tmp;
24437 int nops;
24439 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
24441 nops = 0;
24442 /* On x86_64 the lea instruction operates on Pmode, so we need
24443 to get the arithmetic done in the proper mode to match. */
24444 if (diff == 1)
24445 tmp = copy_rtx (out);
24446 else
24448 rtx out1;
24449 out1 = copy_rtx (out);
24450 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
24451 nops++;
24452 if (diff & 1)
24454 tmp = gen_rtx_PLUS (mode, tmp, out1);
24455 nops++;
24458 if (cf != 0)
24460 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
24461 nops++;
24463 if (!rtx_equal_p (tmp, out))
24465 if (nops == 1)
24466 out = force_operand (tmp, copy_rtx (out));
24467 else
24468 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
24470 if (!rtx_equal_p (out, operands[0]))
24471 emit_move_insn (operands[0], copy_rtx (out));
24473 return true;
24477 * General case: Jumpful:
24478 * xorl dest,dest cmpl op1, op2
24479 * cmpl op1, op2 movl ct, dest
24480 * setcc dest jcc 1f
24481 * decl dest movl cf, dest
24482 * andl (cf-ct),dest 1:
24483 * addl ct,dest
24485 * Size 20. Size 14.
24487 * This is reasonably steep, but branch mispredict costs are
24488 * high on modern cpus, so consider failing only if optimizing
24489 * for space.
24492 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
24493 && BRANCH_COST (optimize_insn_for_speed_p (),
24494 false) >= 2)
24496 if (cf == 0)
24498 machine_mode cmp_mode = GET_MODE (op0);
24499 enum rtx_code new_code;
24501 if (SCALAR_FLOAT_MODE_P (cmp_mode))
24503 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
24505 /* We may be reversing an unordered compare to a normal compare,
24506 which is not valid in general (we may convert a non-trapping
24507 condition into a trapping one); however, on i386 we currently
24508 emit all comparisons unordered. */
24509 new_code = reverse_condition_maybe_unordered (code);
24511 else
24513 new_code = ix86_reverse_condition (code, cmp_mode);
24514 if (compare_code != UNKNOWN && new_code != UNKNOWN)
24515 compare_code = reverse_condition (compare_code);
24518 if (new_code != UNKNOWN)
24520 cf = ct;
24521 ct = 0;
24522 code = new_code;
24526 if (compare_code != UNKNOWN)
24528 /* notl op1 (if needed)
24529 sarl $31, op1
24530 andl (cf-ct), op1
24531 addl ct, op1
24533 For x < 0 (resp. x <= -1) there will be no notl,
24534 so if possible swap the constants to get rid of the
24535 complement.
24536 True/false will be -1/0 while code below (store flag
24537 followed by decrement) is 0/-1, so the constants need
24538 to be exchanged once more. */
24540 if (compare_code == GE || !cf)
24542 code = reverse_condition (code);
24543 compare_code = LT;
24545 else
24546 std::swap (ct, cf);
24548 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
24550 else
24552 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
24554 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
24555 constm1_rtx,
24556 copy_rtx (out), 1, OPTAB_DIRECT);
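 /* On both store-flag paths above OUT now holds a 0 / -1 mask; the AND
 with (cf - ct) and the ADD of ct below map 0 to ct and -1 to cf. */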
24559 out = expand_simple_binop (mode, AND, copy_rtx (out),
24560 gen_int_mode (cf - ct, mode),
24561 copy_rtx (out), 1, OPTAB_DIRECT);
24562 if (ct)
24563 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
24564 copy_rtx (out), 1, OPTAB_DIRECT);
24565 if (!rtx_equal_p (out, operands[0]))
24566 emit_move_insn (operands[0], copy_rtx (out));
24568 return true;
24572 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
24574 /* Try a few things more with specific constants and a variable. */
24576 optab op;
24577 rtx var, orig_out, out, tmp;
24579 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
24580 return false;
24582 /* If one of the two operands is an interesting constant, load a
24583 constant with the above and mask it in with a logical operation. */
24585 if (CONST_INT_P (operands[2]))
24587 var = operands[3];
24588 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
24589 operands[3] = constm1_rtx, op = and_optab;
24590 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
24591 operands[3] = const0_rtx, op = ior_optab;
24592 else
24593 return false;
24595 else if (CONST_INT_P (operands[3]))
24597 var = operands[2];
24598 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
24599 operands[2] = constm1_rtx, op = and_optab;
24600 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
24601 operands[2] = const0_rtx, op = ior_optab;
24602 else
24603 return false;
24605 else
24606 return false;
24608 orig_out = operands[0];
24609 tmp = gen_reg_rtx (mode);
24610 operands[0] = tmp;
24612 /* Recurse to get the constant loaded. */
24613 if (!ix86_expand_int_movcc (operands))
24614 return false;
24616 /* Mask in the interesting variable. */
24617 out = expand_binop (mode, op, var, tmp, orig_out, 0,
24618 OPTAB_WIDEN);
24619 if (!rtx_equal_p (out, orig_out))
24620 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
24622 return true;
24626 * For comparison with above,
24628 * movl cf,dest
24629 * movl ct,tmp
24630 * cmpl op1,op2
24631 * cmovcc tmp,dest
24633 * Size 15.
24636 if (! nonimmediate_operand (operands[2], mode))
24637 operands[2] = force_reg (mode, operands[2]);
24638 if (! nonimmediate_operand (operands[3], mode))
24639 operands[3] = force_reg (mode, operands[3]);
24641 if (! register_operand (operands[2], VOIDmode)
24642 && (mode == QImode
24643 || ! register_operand (operands[3], VOIDmode)))
24644 operands[2] = force_reg (mode, operands[2]);
24646 if (mode == QImode
24647 && ! register_operand (operands[3], VOIDmode))
24648 operands[3] = force_reg (mode, operands[3]);
24650 emit_insn (compare_seq);
24651 emit_insn (gen_rtx_SET (operands[0],
24652 gen_rtx_IF_THEN_ELSE (mode,
24653 compare_op, operands[2],
24654 operands[3])));
24655 return true;
24658 /* Swap, force into registers, or otherwise massage the two operands
24659 to an sse comparison with a mask result. Thus we differ a bit from
24660 ix86_prepare_fp_compare_args which expects to produce a flags result.
24662 The DEST operand exists to help determine whether to commute commutative
24663 operators. The POP0/POP1 operands are updated in place. The new
24664 comparison code is returned, or UNKNOWN if not implementable. */
24666 static enum rtx_code
24667 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
24668 rtx *pop0, rtx *pop1)
24670 switch (code)
24672 case LTGT:
24673 case UNEQ:
24674 /* AVX supports all the needed comparisons. */
24675 if (TARGET_AVX)
24676 break;
24677 /* We have no LTGT as an operator. We could implement it with
24678 NE & ORDERED, but this requires an extra temporary. It's
24679 not clear that it's worth it. */
24680 return UNKNOWN;
24682 case LT:
24683 case LE:
24684 case UNGT:
24685 case UNGE:
24686 /* These are supported directly. */
24687 break;
24689 case EQ:
24690 case NE:
24691 case UNORDERED:
24692 case ORDERED:
24693 /* AVX has 3 operand comparisons, no need to swap anything. */
24694 if (TARGET_AVX)
24695 break;
24696 /* For commutative operators, try to canonicalize the destination
24697 operand to be first in the comparison - this helps reload to
24698 avoid extra moves. */
24699 if (!dest || !rtx_equal_p (dest, *pop1))
24700 break;
24701 /* FALLTHRU */
24703 case GE:
24704 case GT:
24705 case UNLE:
24706 case UNLT:
24707 /* These are not supported directly before AVX, and furthermore
24708 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
24709 comparison operands to transform into something that is
24710 supported. */
24711 std::swap (*pop0, *pop1);
24712 code = swap_condition (code);
24713 break;
24715 default:
24716 gcc_unreachable ();
24719 return code;
24722 /* Detect conditional moves that exactly match min/max operational
24723 semantics. Note that this is IEEE safe, as long as we don't
24724 interchange the operands.
24726 Returns FALSE if this conditional move doesn't match a MIN/MAX,
24727 and TRUE if the operation is successful and instructions are emitted. */
24729 static bool
24730 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
24731 rtx cmp_op1, rtx if_true, rtx if_false)
24733 machine_mode mode;
24734 bool is_min;
24735 rtx tmp;
24737 if (code == LT)
24739 else if (code == UNGE)
24740 std::swap (if_true, if_false);
24741 else
24742 return false;
24744 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
24745 is_min = true;
24746 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
24747 is_min = false;
24748 else
24749 return false;
24751 mode = GET_MODE (dest);
24753 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
24754 but MODE may be a vector mode and thus not appropriate. */
24755 if (!flag_finite_math_only || flag_signed_zeros)
24757 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
24758 rtvec v;
24760 if_true = force_reg (mode, if_true);
24761 v = gen_rtvec (2, if_true, if_false);
24762 tmp = gen_rtx_UNSPEC (mode, v, u);
24764 else
24766 code = is_min ? SMIN : SMAX;
24767 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
24770 emit_insn (gen_rtx_SET (dest, tmp));
24771 return true;
24774 /* Expand an sse vector comparison. Return the register with the result. */
24776 static rtx
24777 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
24778 rtx op_true, rtx op_false)
24780 machine_mode mode = GET_MODE (dest);
24781 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
24784 /* In the general case the result of the comparison can differ from the operands' type. */
24784 machine_mode cmp_mode;
24786 /* In AVX512F the result of comparison is an integer mask. */
24787 bool maskcmp = false;
24788 rtx x;
24790 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
24792 cmp_mode = mode_for_size (GET_MODE_NUNITS (cmp_ops_mode), MODE_INT, 0);
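 /* E.g. a V16SImode comparison yields a 16-bit (HImode) mask,
 one bit per element. */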
24793 gcc_assert (cmp_mode != BLKmode);
24795 maskcmp = true;
24797 else
24798 cmp_mode = cmp_ops_mode;
24801 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
24802 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
24803 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
24805 if (optimize
24806 || (maskcmp && cmp_mode != mode)
24807 || (op_true && reg_overlap_mentioned_p (dest, op_true))
24808 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
24809 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
24811 /* Compare patterns for int modes are unspec in AVX512F only. */
24812 if (maskcmp && (code == GT || code == EQ))
24814 rtx (*gen)(rtx, rtx, rtx);
24816 switch (cmp_ops_mode)
24818 case E_V64QImode:
24819 gcc_assert (TARGET_AVX512BW);
24820 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
24821 break;
24822 case E_V32HImode:
24823 gcc_assert (TARGET_AVX512BW);
24824 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
24825 break;
24826 case E_V16SImode:
24827 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
24828 break;
24829 case E_V8DImode:
24830 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
24831 break;
24832 default:
24833 gen = NULL;
24836 if (gen)
24838 emit_insn (gen (dest, cmp_op0, cmp_op1));
24839 return dest;
24842 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
24844 if (cmp_mode != mode && !maskcmp)
24846 x = force_reg (cmp_ops_mode, x);
24847 convert_move (dest, x, false);
24849 else
24850 emit_insn (gen_rtx_SET (dest, x));
24852 return dest;
24855 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
24856 operations. This is used for both scalar and vector conditional moves. */
24858 void
24859 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
24861 machine_mode mode = GET_MODE (dest);
24862 machine_mode cmpmode = GET_MODE (cmp);
24864 /* In AVX512F the result of comparison is an integer mask. */
24865 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
24867 rtx t2, t3, x;
24869 /* If we have an integer mask and an FP value then we need
24870 to cast the mask to the FP mode. */
24871 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
24873 cmp = force_reg (cmpmode, cmp);
24874 cmp = gen_rtx_SUBREG (mode, cmp, 0);
24877 if (vector_all_ones_operand (op_true, mode)
24878 && rtx_equal_p (op_false, CONST0_RTX (mode))
24879 && !maskcmp)
24881 emit_insn (gen_rtx_SET (dest, cmp));
24883 else if (op_false == CONST0_RTX (mode)
24884 && !maskcmp)
24886 op_true = force_reg (mode, op_true);
24887 x = gen_rtx_AND (mode, cmp, op_true);
24888 emit_insn (gen_rtx_SET (dest, x));
24890 else if (op_true == CONST0_RTX (mode)
24891 && !maskcmp)
24893 op_false = force_reg (mode, op_false);
24894 x = gen_rtx_NOT (mode, cmp);
24895 x = gen_rtx_AND (mode, x, op_false);
24896 emit_insn (gen_rtx_SET (dest, x));
24898 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
24899 && !maskcmp)
24901 op_false = force_reg (mode, op_false);
24902 x = gen_rtx_IOR (mode, cmp, op_false);
24903 emit_insn (gen_rtx_SET (dest, x));
24905 else if (TARGET_XOP
24906 && !maskcmp)
24908 op_true = force_reg (mode, op_true);
24910 if (!nonimmediate_operand (op_false, mode))
24911 op_false = force_reg (mode, op_false);
24913 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
24914 op_true,
24915 op_false)));
24917 else
24919 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24920 rtx d = dest;
24922 if (!nonimmediate_operand (op_true, mode))
24923 op_true = force_reg (mode, op_true);
24925 op_false = force_reg (mode, op_false);
24927 switch (mode)
24929 case E_V4SFmode:
24930 if (TARGET_SSE4_1)
24931 gen = gen_sse4_1_blendvps;
24932 break;
24933 case E_V2DFmode:
24934 if (TARGET_SSE4_1)
24935 gen = gen_sse4_1_blendvpd;
24936 break;
24937 case E_V16QImode:
24938 case E_V8HImode:
24939 case E_V4SImode:
24940 case E_V2DImode:
24941 if (TARGET_SSE4_1)
24943 gen = gen_sse4_1_pblendvb;
24944 if (mode != V16QImode)
24945 d = gen_reg_rtx (V16QImode);
24946 op_false = gen_lowpart (V16QImode, op_false);
24947 op_true = gen_lowpart (V16QImode, op_true);
24948 cmp = gen_lowpart (V16QImode, cmp);
24950 break;
24951 case E_V8SFmode:
24952 if (TARGET_AVX)
24953 gen = gen_avx_blendvps256;
24954 break;
24955 case E_V4DFmode:
24956 if (TARGET_AVX)
24957 gen = gen_avx_blendvpd256;
24958 break;
24959 case E_V32QImode:
24960 case E_V16HImode:
24961 case E_V8SImode:
24962 case E_V4DImode:
24963 if (TARGET_AVX2)
24965 gen = gen_avx2_pblendvb;
24966 if (mode != V32QImode)
24967 d = gen_reg_rtx (V32QImode);
24968 op_false = gen_lowpart (V32QImode, op_false);
24969 op_true = gen_lowpart (V32QImode, op_true);
24970 cmp = gen_lowpart (V32QImode, cmp);
24972 break;
24974 case E_V64QImode:
24975 gen = gen_avx512bw_blendmv64qi;
24976 break;
24977 case E_V32HImode:
24978 gen = gen_avx512bw_blendmv32hi;
24979 break;
24980 case E_V16SImode:
24981 gen = gen_avx512f_blendmv16si;
24982 break;
24983 case E_V8DImode:
24984 gen = gen_avx512f_blendmv8di;
24985 break;
24986 case E_V8DFmode:
24987 gen = gen_avx512f_blendmv8df;
24988 break;
24989 case E_V16SFmode:
24990 gen = gen_avx512f_blendmv16sf;
24991 break;
24993 default:
24994 break;
24997 if (gen != NULL)
24999 emit_insn (gen (d, op_false, op_true, cmp));
25000 if (d != dest)
25001 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
25003 else
25005 op_true = force_reg (mode, op_true);
25007 t2 = gen_reg_rtx (mode);
25008 if (optimize)
25009 t3 = gen_reg_rtx (mode);
25010 else
25011 t3 = dest;
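 /* No blend instruction is available here, so compute
 dest = (cmp & op_true) | (~cmp & op_false) with three logical ops. */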
25013 x = gen_rtx_AND (mode, op_true, cmp);
25014 emit_insn (gen_rtx_SET (t2, x));
25016 x = gen_rtx_NOT (mode, cmp);
25017 x = gen_rtx_AND (mode, x, op_false);
25018 emit_insn (gen_rtx_SET (t3, x));
25020 x = gen_rtx_IOR (mode, t3, t2);
25021 emit_insn (gen_rtx_SET (dest, x));
25026 /* Expand a floating-point conditional move. Return true if successful. */
25028 bool
25029 ix86_expand_fp_movcc (rtx operands[])
25031 machine_mode mode = GET_MODE (operands[0]);
25032 enum rtx_code code = GET_CODE (operands[1]);
25033 rtx tmp, compare_op;
25034 rtx op0 = XEXP (operands[1], 0);
25035 rtx op1 = XEXP (operands[1], 1);
25037 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
25039 machine_mode cmode;
25041 /* Since we've no cmove for sse registers, don't force bad register
25042 allocation just to gain access to it. Deny movcc when the
25043 comparison mode doesn't match the move mode. */
25044 cmode = GET_MODE (op0);
25045 if (cmode == VOIDmode)
25046 cmode = GET_MODE (op1);
25047 if (cmode != mode)
25048 return false;
25050 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
25051 if (code == UNKNOWN)
25052 return false;
25054 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
25055 operands[2], operands[3]))
25056 return true;
25058 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
25059 operands[2], operands[3]);
25060 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
25061 return true;
25064 if (GET_MODE (op0) == TImode
25065 || (GET_MODE (op0) == DImode
25066 && !TARGET_64BIT))
25067 return false;
25069 /* The floating point conditional move instructions don't directly
25070 support conditions resulting from a signed integer comparison. */
25072 compare_op = ix86_expand_compare (code, op0, op1);
25073 if (!fcmov_comparison_operator (compare_op, VOIDmode))
25075 tmp = gen_reg_rtx (QImode);
25076 ix86_expand_setcc (tmp, code, op0, op1);
25078 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
25081 emit_insn (gen_rtx_SET (operands[0],
25082 gen_rtx_IF_THEN_ELSE (mode, compare_op,
25083 operands[2], operands[3])));
25085 return true;
25088 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
25090 static int
25091 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
25093 switch (code)
25095 case EQ:
25096 return 0;
25097 case LT:
25098 case LTU:
25099 return 1;
25100 case LE:
25101 case LEU:
25102 return 2;
25103 case NE:
25104 return 4;
25105 case GE:
25106 case GEU:
25107 return 5;
25108 case GT:
25109 case GTU:
25110 return 6;
25111 default:
25112 gcc_unreachable ();
25116 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
25118 static int
25119 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
25121 switch (code)
25123 case EQ:
25124 return 0x00;
25125 case NE:
25126 return 0x04;
25127 case GT:
25128 return 0x0e;
25129 case LE:
25130 return 0x02;
25131 case GE:
25132 return 0x0d;
25133 case LT:
25134 return 0x01;
25135 case UNLE:
25136 return 0x0a;
25137 case UNLT:
25138 return 0x09;
25139 case UNGE:
25140 return 0x05;
25141 case UNGT:
25142 return 0x06;
25143 case UNEQ:
25144 return 0x18;
25145 case LTGT:
25146 return 0x0c;
25147 case ORDERED:
25148 return 0x07;
25149 case UNORDERED:
25150 return 0x03;
25151 default:
25152 gcc_unreachable ();
25156 /* Return immediate value to be used in UNSPEC_PCMP
25157 for comparison CODE in MODE. */
25159 static int
25160 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
25162 if (FLOAT_MODE_P (mode))
25163 return ix86_fp_cmp_code_to_pcmp_immediate (code);
25164 return ix86_int_cmp_code_to_pcmp_immediate (code);
25167 /* Expand AVX-512 vector comparison. */
25169 bool
25170 ix86_expand_mask_vec_cmp (rtx operands[])
25172 machine_mode mask_mode = GET_MODE (operands[0]);
25173 machine_mode cmp_mode = GET_MODE (operands[2]);
25174 enum rtx_code code = GET_CODE (operands[1]);
25175 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
25176 int unspec_code;
25177 rtx unspec;
25179 switch (code)
25181 case LEU:
25182 case GTU:
25183 case GEU:
25184 case LTU:
25185 unspec_code = UNSPEC_UNSIGNED_PCMP;
25186 break;
25188 default:
25189 unspec_code = UNSPEC_PCMP;
25192 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
25193 operands[3], imm),
25194 unspec_code);
25195 emit_insn (gen_rtx_SET (operands[0], unspec));
25197 return true;
25200 /* Expand fp vector comparison. */
25202 bool
25203 ix86_expand_fp_vec_cmp (rtx operands[])
25205 enum rtx_code code = GET_CODE (operands[1]);
25206 rtx cmp;
25208 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
25209 &operands[2], &operands[3]);
25210 if (code == UNKNOWN)
25212 rtx temp;
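 /* LTGT and UNEQ have no direct encoding here, so synthesize them from
 two comparisons that do: LTGT = ORDERED & NE, UNEQ = UNORDERED | EQ. */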
25213 switch (GET_CODE (operands[1]))
25215 case LTGT:
25216 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
25217 operands[3], NULL, NULL);
25218 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
25219 operands[3], NULL, NULL);
25220 code = AND;
25221 break;
25222 case UNEQ:
25223 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
25224 operands[3], NULL, NULL);
25225 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
25226 operands[3], NULL, NULL);
25227 code = IOR;
25228 break;
25229 default:
25230 gcc_unreachable ();
25232 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
25233 OPTAB_DIRECT);
25235 else
25236 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
25237 operands[1], operands[2]);
25239 if (operands[0] != cmp)
25240 emit_move_insn (operands[0], cmp);
25242 return true;
25245 static rtx
25246 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
25247 rtx op_true, rtx op_false, bool *negate)
25249 machine_mode data_mode = GET_MODE (dest);
25250 machine_mode mode = GET_MODE (cop0);
25251 rtx x;
25253 *negate = false;
25255 /* XOP supports all of the comparisons on all 128-bit vector int types. */
25256 if (TARGET_XOP
25257 && (mode == V16QImode || mode == V8HImode
25258 || mode == V4SImode || mode == V2DImode))
25260 else
25262 /* Canonicalize the comparison to EQ, GT, GTU. */
25263 switch (code)
25265 case EQ:
25266 case GT:
25267 case GTU:
25268 break;
25270 case NE:
25271 case LE:
25272 case LEU:
25273 code = reverse_condition (code);
25274 *negate = true;
25275 break;
25277 case GE:
25278 case GEU:
25279 code = reverse_condition (code);
25280 *negate = true;
25281 /* FALLTHRU */
25283 case LT:
25284 case LTU:
25285 std::swap (cop0, cop1);
25286 code = swap_condition (code);
25287 break;
25289 default:
25290 gcc_unreachable ();
25293 /* Only SSE4.1/SSE4.2 supports V2DImode. */
25294 if (mode == V2DImode)
25296 switch (code)
25298 case EQ:
25299 /* SSE4.1 supports EQ. */
25300 if (!TARGET_SSE4_1)
25301 return NULL;
25302 break;
25304 case GT:
25305 case GTU:
25306 /* SSE4.2 supports GT/GTU. */
25307 if (!TARGET_SSE4_2)
25308 return NULL;
25309 break;
25311 default:
25312 gcc_unreachable ();
25316 /* Unsigned parallel compare is not supported by the hardware.
25317 Play some tricks to turn this into a signed comparison
25318 against 0. */
25319 if (code == GTU)
25321 cop0 = force_reg (mode, cop0);
25323 switch (mode)
25325 case E_V16SImode:
25326 case E_V8DImode:
25327 case E_V8SImode:
25328 case E_V4DImode:
25329 case E_V4SImode:
25330 case E_V2DImode:
25332 rtx t1, t2, mask;
25333 rtx (*gen_sub3) (rtx, rtx, rtx);
25335 switch (mode)
25337 case E_V16SImode: gen_sub3 = gen_subv16si3; break;
25338 case E_V8DImode: gen_sub3 = gen_subv8di3; break;
25339 case E_V8SImode: gen_sub3 = gen_subv8si3; break;
25340 case E_V4DImode: gen_sub3 = gen_subv4di3; break;
25341 case E_V4SImode: gen_sub3 = gen_subv4si3; break;
25342 case E_V2DImode: gen_sub3 = gen_subv2di3; break;
25343 default:
25344 gcc_unreachable ();
25346 /* Subtract (-(INT MAX) - 1) from both operands to make
25347 them signed. */
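 /* E.g. for V4SImode the mask is { 0x80000000, ... }; subtracting it
 flips the sign bit of each element, so the unsigned comparison can be
 done as the signed GT emitted below. */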
25348 mask = ix86_build_signbit_mask (mode, true, false);
25349 t1 = gen_reg_rtx (mode);
25350 emit_insn (gen_sub3 (t1, cop0, mask));
25352 t2 = gen_reg_rtx (mode);
25353 emit_insn (gen_sub3 (t2, cop1, mask));
25355 cop0 = t1;
25356 cop1 = t2;
25357 code = GT;
25359 break;
25361 case E_V64QImode:
25362 case E_V32HImode:
25363 case E_V32QImode:
25364 case E_V16HImode:
25365 case E_V16QImode:
25366 case E_V8HImode:
25367 /* Perform a parallel unsigned saturating subtraction. */
25368 x = gen_reg_rtx (mode);
25369 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
25370 cop1)));
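 /* cop0 >u cop1 iff the saturating subtraction is nonzero, so test the
 result for equality with zero and let the flipped *negate below undo
 the inversion. */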
25372 cop0 = x;
25373 cop1 = CONST0_RTX (mode);
25374 code = EQ;
25375 *negate = !*negate;
25376 break;
25378 default:
25379 gcc_unreachable ();
25384 if (*negate)
25385 std::swap (op_true, op_false);
25387 /* Allow the comparison to be done in one mode, but the movcc to
25388 happen in another mode. */
25389 if (data_mode == mode)
25391 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
25392 op_true, op_false);
25394 else
25396 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
25397 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
25398 op_true, op_false);
25399 if (GET_MODE (x) == mode)
25400 x = gen_lowpart (data_mode, x);
25403 return x;
25406 /* Expand integer vector comparison. */
25408 bool
25409 ix86_expand_int_vec_cmp (rtx operands[])
25411 rtx_code code = GET_CODE (operands[1]);
25412 bool negate = false;
25413 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
25414 operands[3], NULL, NULL, &negate);
25416 if (!cmp)
25417 return false;
25419 if (negate)
25420 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
25421 CONST0_RTX (GET_MODE (cmp)),
25422 NULL, NULL, &negate);
25424 gcc_assert (!negate);
25426 if (operands[0] != cmp)
25427 emit_move_insn (operands[0], cmp);
25429 return true;
25432 /* Expand a floating-point vector conditional move; a vcond operation
25433 rather than a movcc operation. */
25435 bool
25436 ix86_expand_fp_vcond (rtx operands[])
25438 enum rtx_code code = GET_CODE (operands[3]);
25439 rtx cmp;
25441 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
25442 &operands[4], &operands[5]);
25443 if (code == UNKNOWN)
25445 rtx temp;
25446 switch (GET_CODE (operands[3]))
25448 case LTGT:
25449 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
25450 operands[5], operands[0], operands[0]);
25451 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
25452 operands[5], operands[1], operands[2]);
25453 code = AND;
25454 break;
25455 case UNEQ:
25456 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
25457 operands[5], operands[0], operands[0]);
25458 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
25459 operands[5], operands[1], operands[2]);
25460 code = IOR;
25461 break;
25462 default:
25463 gcc_unreachable ();
25465 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
25466 OPTAB_DIRECT);
25467 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
25468 return true;
25471 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
25472 operands[5], operands[1], operands[2]))
25473 return true;
25475 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
25476 operands[1], operands[2]);
25477 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
25478 return true;
25481 /* Expand a signed/unsigned integral vector conditional move. */
25483 bool
25484 ix86_expand_int_vcond (rtx operands[])
25486 machine_mode data_mode = GET_MODE (operands[0]);
25487 machine_mode mode = GET_MODE (operands[4]);
25488 enum rtx_code code = GET_CODE (operands[3]);
25489 bool negate = false;
25490 rtx x, cop0, cop1;
25492 cop0 = operands[4];
25493 cop1 = operands[5];
25495 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
25496 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
25497 if ((code == LT || code == GE)
25498 && data_mode == mode
25499 && cop1 == CONST0_RTX (mode)
25500 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
25501 && GET_MODE_UNIT_SIZE (data_mode) > 1
25502 && GET_MODE_UNIT_SIZE (data_mode) <= 8
25503 && (GET_MODE_SIZE (data_mode) == 16
25504 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
25506 rtx negop = operands[2 - (code == LT)];
25507 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
25508 if (negop == CONST1_RTX (data_mode))
25510 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
25511 operands[0], 1, OPTAB_DIRECT);
25512 if (res != operands[0])
25513 emit_move_insn (operands[0], res);
25514 return true;
25516 else if (GET_MODE_INNER (data_mode) != DImode
25517 && vector_all_ones_operand (negop, data_mode))
25519 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
25520 operands[0], 0, OPTAB_DIRECT);
25521 if (res != operands[0])
25522 emit_move_insn (operands[0], res);
25523 return true;
25527 if (!nonimmediate_operand (cop1, mode))
25528 cop1 = force_reg (mode, cop1);
25529 if (!general_operand (operands[1], data_mode))
25530 operands[1] = force_reg (data_mode, operands[1]);
25531 if (!general_operand (operands[2], data_mode))
25532 operands[2] = force_reg (data_mode, operands[2]);
25534 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
25535 operands[1], operands[2], &negate);
25537 if (!x)
25538 return false;
25540 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
25541 operands[2-negate]);
25542 return true;
25545 /* AVX512F does support 64-byte integer vector operations,
25546 thus the longest vector we are faced with is V64QImode. */
25547 #define MAX_VECT_LEN 64
25549 struct expand_vec_perm_d
25551 rtx target, op0, op1;
25552 unsigned char perm[MAX_VECT_LEN];
25553 machine_mode vmode;
25554 unsigned char nelt;
25555 bool one_operand_p;
25556 bool testing_p;
25559 static bool
25560 ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
25561 struct expand_vec_perm_d *d)
25563 /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
25564 expanders, so the arguments are either in d, or in op0, op1, etc. */
25565 machine_mode mode = GET_MODE (d ? d->op0 : op0);
25566 machine_mode maskmode = mode;
25567 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
25569 switch (mode)
25571 case E_V8HImode:
25572 if (TARGET_AVX512VL && TARGET_AVX512BW)
25573 gen = gen_avx512vl_vpermi2varv8hi3;
25574 break;
25575 case E_V16HImode:
25576 if (TARGET_AVX512VL && TARGET_AVX512BW)
25577 gen = gen_avx512vl_vpermi2varv16hi3;
25578 break;
25579 case E_V64QImode:
25580 if (TARGET_AVX512VBMI)
25581 gen = gen_avx512bw_vpermi2varv64qi3;
25582 break;
25583 case E_V32HImode:
25584 if (TARGET_AVX512BW)
25585 gen = gen_avx512bw_vpermi2varv32hi3;
25586 break;
25587 case E_V4SImode:
25588 if (TARGET_AVX512VL)
25589 gen = gen_avx512vl_vpermi2varv4si3;
25590 break;
25591 case E_V8SImode:
25592 if (TARGET_AVX512VL)
25593 gen = gen_avx512vl_vpermi2varv8si3;
25594 break;
25595 case E_V16SImode:
25596 if (TARGET_AVX512F)
25597 gen = gen_avx512f_vpermi2varv16si3;
25598 break;
25599 case E_V4SFmode:
25600 if (TARGET_AVX512VL)
25602 gen = gen_avx512vl_vpermi2varv4sf3;
25603 maskmode = V4SImode;
25605 break;
25606 case E_V8SFmode:
25607 if (TARGET_AVX512VL)
25609 gen = gen_avx512vl_vpermi2varv8sf3;
25610 maskmode = V8SImode;
25612 break;
25613 case E_V16SFmode:
25614 if (TARGET_AVX512F)
25616 gen = gen_avx512f_vpermi2varv16sf3;
25617 maskmode = V16SImode;
25619 break;
25620 case E_V2DImode:
25621 if (TARGET_AVX512VL)
25622 gen = gen_avx512vl_vpermi2varv2di3;
25623 break;
25624 case E_V4DImode:
25625 if (TARGET_AVX512VL)
25626 gen = gen_avx512vl_vpermi2varv4di3;
25627 break;
25628 case E_V8DImode:
25629 if (TARGET_AVX512F)
25630 gen = gen_avx512f_vpermi2varv8di3;
25631 break;
25632 case E_V2DFmode:
25633 if (TARGET_AVX512VL)
25635 gen = gen_avx512vl_vpermi2varv2df3;
25636 maskmode = V2DImode;
25638 break;
25639 case E_V4DFmode:
25640 if (TARGET_AVX512VL)
25642 gen = gen_avx512vl_vpermi2varv4df3;
25643 maskmode = V4DImode;
25645 break;
25646 case E_V8DFmode:
25647 if (TARGET_AVX512F)
25649 gen = gen_avx512f_vpermi2varv8df3;
25650 maskmode = V8DImode;
25652 break;
25653 default:
25654 break;
25657 if (gen == NULL)
25658 return false;
25660 /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
25661 expanders, so the arguments are either in d, or in op0, op1, etc. */
25662 if (d)
25664 rtx vec[64];
25665 target = d->target;
25666 op0 = d->op0;
25667 op1 = d->op1;
25668 for (int i = 0; i < d->nelt; ++i)
25669 vec[i] = GEN_INT (d->perm[i]);
25670 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
25673 emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
25674 return true;
25677 /* Expand a variable vector permutation. */
25679 void
25680 ix86_expand_vec_perm (rtx operands[])
25682 rtx target = operands[0];
25683 rtx op0 = operands[1];
25684 rtx op1 = operands[2];
25685 rtx mask = operands[3];
25686 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
25687 machine_mode mode = GET_MODE (op0);
25688 machine_mode maskmode = GET_MODE (mask);
25689 int w, e, i;
25690 bool one_operand_shuffle = rtx_equal_p (op0, op1);
25692 /* Number of elements in the vector. */
25693 w = GET_MODE_NUNITS (mode);
25694 e = GET_MODE_UNIT_SIZE (mode);
25695 gcc_assert (w <= 64);
25697 if (TARGET_AVX512F && one_operand_shuffle)
25699 rtx (*gen) (rtx, rtx, rtx) = NULL;
25700 switch (mode)
25702 case E_V16SImode:
25703 gen = gen_avx512f_permvarv16si;
25704 break;
25705 case E_V16SFmode:
25706 gen = gen_avx512f_permvarv16sf;
25707 break;
25708 case E_V8DImode:
25709 gen = gen_avx512f_permvarv8di;
25710 break;
25711 case E_V8DFmode:
25712 gen = gen_avx512f_permvarv8df;
25713 break;
25714 default:
25715 break;
25717 if (gen != NULL)
25719 emit_insn (gen (target, op0, mask));
25720 return;
25724 if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1, NULL))
25725 return;
25727 if (TARGET_AVX2)
25729 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
25731 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
25732 a constant shuffle operand. With a tiny bit of effort we can
25733 use VPERMD instead. A re-interpretation stall for V4DFmode is
25734 unfortunate but there's no avoiding it.
25735 Similarly for V16HImode we don't have instructions for variable
25736 shuffling, while for V32QImode we can, after preparing suitable
25737 masks, use vpshufb; vpshufb; vpermq; vpor. */
25739 if (mode == V16HImode)
25741 maskmode = mode = V32QImode;
25742 w = 32;
25743 e = 1;
25745 else
25747 maskmode = mode = V8SImode;
25748 w = 8;
25749 e = 4;
25751 t1 = gen_reg_rtx (maskmode);
25753 /* Replicate the low bits of the V4DImode mask into V8SImode:
25754 mask = { A B C D }
25755 t1 = { A A B B C C D D }. */
25756 for (i = 0; i < w / 2; ++i)
25757 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
25758 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25759 vt = force_reg (maskmode, vt);
25760 mask = gen_lowpart (maskmode, mask);
25761 if (maskmode == V8SImode)
25762 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
25763 else
25764 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
25766 /* Multiply the shuffle indices by two. */
25767 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
25768 OPTAB_DIRECT);
25770 /* Add one to the odd shuffle indices:
25771 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
25772 for (i = 0; i < w / 2; ++i)
25774 vec[i * 2] = const0_rtx;
25775 vec[i * 2 + 1] = const1_rtx;
25777 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25778 vt = validize_mem (force_const_mem (maskmode, vt));
25779 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
25780 OPTAB_DIRECT);
25782 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
25783 operands[3] = mask = t1;
25784 target = gen_reg_rtx (mode);
25785 op0 = gen_lowpart (mode, op0);
25786 op1 = gen_lowpart (mode, op1);
25789 switch (mode)
25791 case E_V8SImode:
25792 /* The VPERMD and VPERMPS instructions already properly ignore
25793 the high bits of the shuffle elements. No need for us to
25794 perform an AND ourselves. */
25795 if (one_operand_shuffle)
25797 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
25798 if (target != operands[0])
25799 emit_move_insn (operands[0],
25800 gen_lowpart (GET_MODE (operands[0]), target));
25802 else
25804 t1 = gen_reg_rtx (V8SImode);
25805 t2 = gen_reg_rtx (V8SImode);
25806 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
25807 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
25808 goto merge_two;
25810 return;
25812 case E_V8SFmode:
25813 mask = gen_lowpart (V8SImode, mask);
25814 if (one_operand_shuffle)
25815 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
25816 else
25818 t1 = gen_reg_rtx (V8SFmode);
25819 t2 = gen_reg_rtx (V8SFmode);
25820 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
25821 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
25822 goto merge_two;
25824 return;
25826 case E_V4SImode:
25827 /* By combining the two 128-bit input vectors into one 256-bit
25828 input vector, we can use VPERMD and VPERMPS for the full
25829 two-operand shuffle. */
25830 t1 = gen_reg_rtx (V8SImode);
25831 t2 = gen_reg_rtx (V8SImode);
25832 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
25833 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
25834 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
25835 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
25836 return;
25838 case E_V4SFmode:
25839 t1 = gen_reg_rtx (V8SFmode);
25840 t2 = gen_reg_rtx (V8SImode);
25841 mask = gen_lowpart (V4SImode, mask);
25842 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
25843 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
25844 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
25845 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
25846 return;
25848 case E_V32QImode:
25849 t1 = gen_reg_rtx (V32QImode);
25850 t2 = gen_reg_rtx (V32QImode);
25851 t3 = gen_reg_rtx (V32QImode);
25852 vt2 = GEN_INT (-128);
25853 for (i = 0; i < 32; i++)
25854 vec[i] = vt2;
25855 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
25856 vt = force_reg (V32QImode, vt);
25857 for (i = 0; i < 32; i++)
25858 vec[i] = i < 16 ? vt2 : const0_rtx;
25859 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
25860 vt2 = force_reg (V32QImode, vt2);
25861 /* From mask create two adjusted masks, which contain the same
25862 bits as mask in the low 7 bits of each vector element.
25863 The first mask will have the most significant bit clear
25864 if it requests element from the same 128-bit lane
25865 and MSB set if it requests element from the other 128-bit lane.
25866 The second mask will have the opposite values of the MSB,
25867 and additionally will have its 128-bit lanes swapped.
25868 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
25869 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
25870 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
25871 stands for other 12 bytes. */
25872 /* The bit telling whether an element is from the same lane or the other
25873 lane is bit 4, so shift it up by 3 to the MSB position. */
25874 t5 = gen_reg_rtx (V4DImode);
25875 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
25876 GEN_INT (3)));
25877 /* Clear MSB bits from the mask just in case it had them set. */
25878 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
25879 /* After this t1 will have MSB set for elements from other lane. */
25880 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
25881 /* Clear bits other than MSB. */
25882 emit_insn (gen_andv32qi3 (t1, t1, vt));
25883 /* Or in the lower bits from mask into t3. */
25884 emit_insn (gen_iorv32qi3 (t3, t1, t2));
25885 /* And invert MSB bits in t1, so MSB is set for elements from the same
25886 lane. */
25887 emit_insn (gen_xorv32qi3 (t1, t1, vt));
25888 /* Swap 128-bit lanes in t3. */
25889 t6 = gen_reg_rtx (V4DImode);
25890 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
25891 const2_rtx, GEN_INT (3),
25892 const0_rtx, const1_rtx));
25893 /* And or in the lower bits from mask into t1. */
25894 emit_insn (gen_iorv32qi3 (t1, t1, t2));
25895 if (one_operand_shuffle)
25897 /* Each of these shuffles will put 0s in places where
25898 element from the other 128-bit lane is needed, otherwise
25899 will shuffle in the requested value. */
25900 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
25901 gen_lowpart (V32QImode, t6)));
25902 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
25903 /* For t3 the 128-bit lanes are swapped again. */
25904 t7 = gen_reg_rtx (V4DImode);
25905 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
25906 const2_rtx, GEN_INT (3),
25907 const0_rtx, const1_rtx));
25908 /* And oring both together leads to the result. */
25909 emit_insn (gen_iorv32qi3 (target, t1,
25910 gen_lowpart (V32QImode, t7)));
25911 if (target != operands[0])
25912 emit_move_insn (operands[0],
25913 gen_lowpart (GET_MODE (operands[0]), target));
25914 return;
25917 t4 = gen_reg_rtx (V32QImode);
25918 /* Similar to the one_operand_shuffle code above, just
25919 repeated twice, once for each operand; the merge_two:
25920 code will merge the two results together. */
25921 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
25922 gen_lowpart (V32QImode, t6)));
25923 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
25924 gen_lowpart (V32QImode, t6)));
25925 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
25926 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
25927 t7 = gen_reg_rtx (V4DImode);
25928 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
25929 const2_rtx, GEN_INT (3),
25930 const0_rtx, const1_rtx));
25931 t8 = gen_reg_rtx (V4DImode);
25932 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
25933 const2_rtx, GEN_INT (3),
25934 const0_rtx, const1_rtx));
25935 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
25936 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
25937 t1 = t4;
25938 t2 = t3;
25939 goto merge_two;
25941 default:
25942 gcc_assert (GET_MODE_SIZE (mode) <= 16);
25943 break;
25947 if (TARGET_XOP)
25949 /* The XOP VPPERM insn supports three inputs. By ignoring the
25950 one_operand_shuffle special case, we avoid creating another
25951 set of constant vectors in memory. */
25952 one_operand_shuffle = false;
25954 /* mask = mask & {2*w-1, ...} */
25955 vt = GEN_INT (2*w - 1);
25957 else
25959 /* mask = mask & {w-1, ...} */
25960 vt = GEN_INT (w - 1);
25963 for (i = 0; i < w; i++)
25964 vec[i] = vt;
25965 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25966 mask = expand_simple_binop (maskmode, AND, mask, vt,
25967 NULL_RTX, 0, OPTAB_DIRECT);
25969 /* For non-QImode operations, convert the word permutation control
25970 into a byte permutation control. */
25971 if (mode != V16QImode)
25973 mask = expand_simple_binop (maskmode, ASHIFT, mask,
25974 GEN_INT (exact_log2 (e)),
25975 NULL_RTX, 0, OPTAB_DIRECT);
25977 /* Convert mask to vector of chars. */
25978 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
25980 /* Replicate each of the input bytes into byte positions:
25981 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
25982 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
25983 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
25984 for (i = 0; i < 16; ++i)
25985 vec[i] = GEN_INT (i/e * e);
25986 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25987 vt = validize_mem (force_const_mem (V16QImode, vt));
25988 if (TARGET_XOP)
25989 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
25990 else
25991 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
25993 /* Convert it into the byte positions by doing
25994 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
25995 for (i = 0; i < 16; ++i)
25996 vec[i] = GEN_INT (i % e);
25997 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25998 vt = validize_mem (force_const_mem (V16QImode, vt));
25999 emit_insn (gen_addv16qi3 (mask, mask, vt));
26002 /* The actual shuffle operations all operate on V16QImode. */
26003 op0 = gen_lowpart (V16QImode, op0);
26004 op1 = gen_lowpart (V16QImode, op1);
26006 if (TARGET_XOP)
26008 if (GET_MODE (target) != V16QImode)
26009 target = gen_reg_rtx (V16QImode);
26010 emit_insn (gen_xop_pperm (target, op0, op1, mask));
26011 if (target != operands[0])
26012 emit_move_insn (operands[0],
26013 gen_lowpart (GET_MODE (operands[0]), target));
26015 else if (one_operand_shuffle)
26017 if (GET_MODE (target) != V16QImode)
26018 target = gen_reg_rtx (V16QImode);
26019 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
26020 if (target != operands[0])
26021 emit_move_insn (operands[0],
26022 gen_lowpart (GET_MODE (operands[0]), target));
26024 else
26026 rtx xops[6];
26027 bool ok;
26029 /* Shuffle the two input vectors independently. */
26030 t1 = gen_reg_rtx (V16QImode);
26031 t2 = gen_reg_rtx (V16QImode);
26032 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
26033 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
26035 merge_two:
26036 /* Then merge them together. The key is whether any given control
26037 element contained a bit set that indicates the second word. */
26038 mask = operands[3];
26039 vt = GEN_INT (w);
26040 if (maskmode == V2DImode && !TARGET_SSE4_1)
26042 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
26043 more shuffle to convert the V2DI input mask into a V4SI
26044 input mask, at which point the masking done by expand_int_vcond
26045 will work as desired. */
26046 rtx t3 = gen_reg_rtx (V4SImode);
26047 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
26048 const0_rtx, const0_rtx,
26049 const2_rtx, const2_rtx));
26050 mask = t3;
26051 maskmode = V4SImode;
26052 e = w = 4;
26055 for (i = 0; i < w; i++)
26056 vec[i] = vt;
26057 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
26058 vt = force_reg (maskmode, vt);
26059 mask = expand_simple_binop (maskmode, AND, mask, vt,
26060 NULL_RTX, 0, OPTAB_DIRECT);
26062 if (GET_MODE (target) != mode)
26063 target = gen_reg_rtx (mode);
26064 xops[0] = target;
26065 xops[1] = gen_lowpart (mode, t2);
26066 xops[2] = gen_lowpart (mode, t1);
26067 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
26068 xops[4] = mask;
26069 xops[5] = vt;
26070 ok = ix86_expand_int_vcond (xops);
26071 gcc_assert (ok);
26072 if (target != operands[0])
26073 emit_move_insn (operands[0],
26074 gen_lowpart (GET_MODE (operands[0]), target));
26078 /* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
26079 true if we should do zero extension, else sign extension. HIGH_P is
26080 true if we want the N/2 high elements, else the low elements. */
26082 void
26083 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
26085 machine_mode imode = GET_MODE (src);
26086 rtx tmp;
26088 if (TARGET_SSE4_1)
26090 rtx (*unpack)(rtx, rtx);
26091 rtx (*extract)(rtx, rtx) = NULL;
26092 machine_mode halfmode = BLKmode;
26094 switch (imode)
26096 case E_V64QImode:
26097 if (unsigned_p)
26098 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
26099 else
26100 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
26101 halfmode = V32QImode;
26102 extract
26103 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
26104 break;
26105 case E_V32QImode:
26106 if (unsigned_p)
26107 unpack = gen_avx2_zero_extendv16qiv16hi2;
26108 else
26109 unpack = gen_avx2_sign_extendv16qiv16hi2;
26110 halfmode = V16QImode;
26111 extract
26112 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
26113 break;
26114 case E_V32HImode:
26115 if (unsigned_p)
26116 unpack = gen_avx512f_zero_extendv16hiv16si2;
26117 else
26118 unpack = gen_avx512f_sign_extendv16hiv16si2;
26119 halfmode = V16HImode;
26120 extract
26121 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
26122 break;
26123 case E_V16HImode:
26124 if (unsigned_p)
26125 unpack = gen_avx2_zero_extendv8hiv8si2;
26126 else
26127 unpack = gen_avx2_sign_extendv8hiv8si2;
26128 halfmode = V8HImode;
26129 extract
26130 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
26131 break;
26132 case E_V16SImode:
26133 if (unsigned_p)
26134 unpack = gen_avx512f_zero_extendv8siv8di2;
26135 else
26136 unpack = gen_avx512f_sign_extendv8siv8di2;
26137 halfmode = V8SImode;
26138 extract
26139 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
26140 break;
26141 case E_V8SImode:
26142 if (unsigned_p)
26143 unpack = gen_avx2_zero_extendv4siv4di2;
26144 else
26145 unpack = gen_avx2_sign_extendv4siv4di2;
26146 halfmode = V4SImode;
26147 extract
26148 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
26149 break;
26150 case E_V16QImode:
26151 if (unsigned_p)
26152 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
26153 else
26154 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
26155 break;
26156 case E_V8HImode:
26157 if (unsigned_p)
26158 unpack = gen_sse4_1_zero_extendv4hiv4si2;
26159 else
26160 unpack = gen_sse4_1_sign_extendv4hiv4si2;
26161 break;
26162 case E_V4SImode:
26163 if (unsigned_p)
26164 unpack = gen_sse4_1_zero_extendv2siv2di2;
26165 else
26166 unpack = gen_sse4_1_sign_extendv2siv2di2;
26167 break;
26168 default:
26169 gcc_unreachable ();
26172 if (GET_MODE_SIZE (imode) >= 32)
26174 tmp = gen_reg_rtx (halfmode);
26175 emit_insn (extract (tmp, src));
26177 else if (high_p)
26179 /* Shift higher 8 bytes to lower 8 bytes. */
26180 tmp = gen_reg_rtx (V1TImode);
26181 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
26182 GEN_INT (64)));
26183 tmp = gen_lowpart (imode, tmp);
26185 else
26186 tmp = src;
26188 emit_insn (unpack (dest, tmp));
26190 else
26192 rtx (*unpack)(rtx, rtx, rtx);
26194 switch (imode)
26196 case E_V16QImode:
26197 if (high_p)
26198 unpack = gen_vec_interleave_highv16qi;
26199 else
26200 unpack = gen_vec_interleave_lowv16qi;
26201 break;
26202 case E_V8HImode:
26203 if (high_p)
26204 unpack = gen_vec_interleave_highv8hi;
26205 else
26206 unpack = gen_vec_interleave_lowv8hi;
26207 break;
26208 case E_V4SImode:
26209 if (high_p)
26210 unpack = gen_vec_interleave_highv4si;
26211 else
26212 unpack = gen_vec_interleave_lowv4si;
26213 break;
26214 default:
26215 gcc_unreachable ();
26218 if (unsigned_p)
26219 tmp = force_reg (imode, CONST0_RTX (imode));
26220 else
26221 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
26222 src, pc_rtx, pc_rtx);
26224 rtx tmp2 = gen_reg_rtx (imode);
26225 emit_insn (unpack (tmp2, src, tmp));
26226 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
26230 /* Expand conditional increment or decrement using adc/sbb instructions.
26231 The default case using setcc followed by the conditional move can be
26232 done by generic code. */
26233 bool
26234 ix86_expand_int_addcc (rtx operands[])
26236 enum rtx_code code = GET_CODE (operands[1]);
26237 rtx flags;
26238 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
26239 rtx compare_op;
26240 rtx val = const0_rtx;
26241 bool fpcmp = false;
26242 machine_mode mode;
26243 rtx op0 = XEXP (operands[1], 0);
26244 rtx op1 = XEXP (operands[1], 1);
26246 if (operands[3] != const1_rtx
26247 && operands[3] != constm1_rtx)
26248 return false;
26249 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
26250 return false;
26251 code = GET_CODE (compare_op);
26253 flags = XEXP (compare_op, 0);
26255 if (GET_MODE (flags) == CCFPmode
26256 || GET_MODE (flags) == CCFPUmode)
26258 fpcmp = true;
26259 code = ix86_fp_compare_code_to_integer (code);
26262 if (code != LTU)
26264 val = constm1_rtx;
26265 if (fpcmp)
26266 PUT_CODE (compare_op,
26267 reverse_condition_maybe_unordered
26268 (GET_CODE (compare_op)));
26269 else
26270 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
26273 mode = GET_MODE (operands[0]);
26275 /* Construct either adc or sbb insn. */
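 /* The comparison was massaged above so that the carry flag holds the
 condition; adc/sbb then adds or subtracts that carry, performing the
 conditional increment or decrement without a branch. */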
26276 if ((code == LTU) == (operands[3] == constm1_rtx))
26278 switch (mode)
26280 case E_QImode:
26281 insn = gen_subqi3_carry;
26282 break;
26283 case E_HImode:
26284 insn = gen_subhi3_carry;
26285 break;
26286 case E_SImode:
26287 insn = gen_subsi3_carry;
26288 break;
26289 case E_DImode:
26290 insn = gen_subdi3_carry;
26291 break;
26292 default:
26293 gcc_unreachable ();
26296 else
26298 switch (mode)
26300 case E_QImode:
26301 insn = gen_addqi3_carry;
26302 break;
26303 case E_HImode:
26304 insn = gen_addhi3_carry;
26305 break;
26306 case E_SImode:
26307 insn = gen_addsi3_carry;
26308 break;
26309 case E_DImode:
26310 insn = gen_adddi3_carry;
26311 break;
26312 default:
26313 gcc_unreachable ();
26316 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
26318 return true;
26322 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
26323 but works for floating point parameters and non-offsettable memories.
26324 For pushes, it returns just stack offsets; the values will be saved
26325 in the right order. Maximally four parts are generated. */
26327 static int
26328 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
26330 int size;
26332 if (!TARGET_64BIT)
26333 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
26334 else
26335 size = (GET_MODE_SIZE (mode) + 4) / 8;
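 /* E.g. on ia32 an XFmode value splits into 3 SImode parts and TFmode
 into 4, while on x86-64 both split into 2 parts (DImode plus an
 upper part). */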
26337 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
26338 gcc_assert (size >= 2 && size <= 4);
26340 /* Optimize constant pool reference to immediates. This is used by fp
26341 moves, that force all constants to memory to allow combining. */
26342 if (MEM_P (operand) && MEM_READONLY_P (operand))
26344 rtx tmp = maybe_get_pool_constant (operand);
26345 if (tmp)
26346 operand = tmp;
26349 if (MEM_P (operand) && !offsettable_memref_p (operand))
26351 /* The only non-offsettable memories we handle are pushes. */
26352 int ok = push_operand (operand, VOIDmode);
26354 gcc_assert (ok);
26356 operand = copy_rtx (operand);
26357 PUT_MODE (operand, word_mode);
26358 parts[0] = parts[1] = parts[2] = parts[3] = operand;
26359 return size;
26362 if (GET_CODE (operand) == CONST_VECTOR)
26364 machine_mode imode = int_mode_for_mode (mode);
26365 /* Caution: if we looked through a constant pool memory above,
26366 the operand may actually have a different mode now. That's
26367 ok, since we want to pun this all the way back to an integer. */
26368 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
26369 gcc_assert (operand != NULL);
26370 mode = imode;
26373 if (!TARGET_64BIT)
26375 if (mode == DImode)
26376 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
26377 else
26379 int i;
26381 if (REG_P (operand))
26383 gcc_assert (reload_completed);
26384 for (i = 0; i < size; i++)
26385 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
26387 else if (offsettable_memref_p (operand))
26389 operand = adjust_address (operand, SImode, 0);
26390 parts[0] = operand;
26391 for (i = 1; i < size; i++)
26392 parts[i] = adjust_address (operand, SImode, 4 * i);
26394 else if (CONST_DOUBLE_P (operand))
26396 const REAL_VALUE_TYPE *r;
26397 long l[4];
26399 r = CONST_DOUBLE_REAL_VALUE (operand);
26400 switch (mode)
26402 case E_TFmode:
26403 real_to_target (l, r, mode);
26404 parts[3] = gen_int_mode (l[3], SImode);
26405 parts[2] = gen_int_mode (l[2], SImode);
26406 break;
26407 case E_XFmode:
26408 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
26409 long double may not be 80-bit. */
26410 real_to_target (l, r, mode);
26411 parts[2] = gen_int_mode (l[2], SImode);
26412 break;
26413 case E_DFmode:
26414 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
26415 break;
26416 default:
26417 gcc_unreachable ();
26419 parts[1] = gen_int_mode (l[1], SImode);
26420 parts[0] = gen_int_mode (l[0], SImode);
26422 else
26423 gcc_unreachable ();
26426 else
26428 if (mode == TImode)
26429 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
26430 if (mode == XFmode || mode == TFmode)
26432 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
26433 if (REG_P (operand))
26435 gcc_assert (reload_completed);
26436 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
26437 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
26439 else if (offsettable_memref_p (operand))
26441 operand = adjust_address (operand, DImode, 0);
26442 parts[0] = operand;
26443 parts[1] = adjust_address (operand, upper_mode, 8);
26445 else if (CONST_DOUBLE_P (operand))
26447 long l[4];
26449 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
26451 /* real_to_target puts 32-bit pieces in each long. */
26452 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
26453 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
26454 << 32), DImode);
26456 if (upper_mode == SImode)
26457 parts[1] = gen_int_mode (l[2], SImode);
26458 else
26459 parts[1]
26460 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
26461 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
26462 << 32), DImode);
26464 else
26465 gcc_unreachable ();
26469 return size;
26472 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
26473 All required insns are emitted by this function itself. Operands 2-4
26474 contain the input values in the correct order; operands 5-7 contain
26475 the output values. */
26477 void
26478 ix86_split_long_move (rtx operands[])
26480 rtx part[2][4];
26481 int nparts, i, j;
26482 int push = 0;
26483 int collisions = 0;
26484 machine_mode mode = GET_MODE (operands[0]);
26485 bool collisionparts[4];
26487 /* The DFmode expanders may ask us to move a double.
26488 For a 64-bit target this is a single move. By hiding the fact
26489 here we simplify i386.md splitters. */
26490 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
26492 /* Optimize constant pool reference to immediates. This is used by
26493 fp moves, that force all constants to memory to allow combining. */
26495 if (MEM_P (operands[1])
26496 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
26497 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
26498 operands[1] = get_pool_constant (XEXP (operands[1], 0));
26499 if (push_operand (operands[0], VOIDmode))
26501 operands[0] = copy_rtx (operands[0]);
26502 PUT_MODE (operands[0], word_mode);
26504 else
26505 operands[0] = gen_lowpart (DImode, operands[0]);
26506 operands[1] = gen_lowpart (DImode, operands[1]);
26507 emit_move_insn (operands[0], operands[1]);
26508 return;
26511 /* The only non-offsettable memory we handle is push. */
26512 if (push_operand (operands[0], VOIDmode))
26513 push = 1;
26514 else
26515 gcc_assert (!MEM_P (operands[0])
26516 || offsettable_memref_p (operands[0]));
26518 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
26519 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
26521 /* When emitting a push, take care of source operands on the stack. */
26522 if (push && MEM_P (operands[1])
26523 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
26525 rtx src_base = XEXP (part[1][nparts - 1], 0);
26527 /* Compensate for the stack decrement by 4. */
26528 if (!TARGET_64BIT && nparts == 3
26529 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
26530 src_base = plus_constant (Pmode, src_base, 4);
26532 /* src_base refers to the stack pointer and is
26533 automatically decreased by emitted push. */
26534 for (i = 0; i < nparts; i++)
26535 part[1][i] = change_address (part[1][i],
26536 GET_MODE (part[1][i]), src_base);
26539 /* We need to do the copy in the right order in case an address register
26540 of the source overlaps the destination. */
26541 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
26543 rtx tmp;
26545 for (i = 0; i < nparts; i++)
26547 collisionparts[i]
26548 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
26549 if (collisionparts[i])
26550 collisions++;
26553 /* Collision in the middle part can be handled by reordering. */
26554 if (collisions == 1 && nparts == 3 && collisionparts [1])
26556 std::swap (part[0][1], part[0][2]);
26557 std::swap (part[1][1], part[1][2]);
26559 else if (collisions == 1
26560 && nparts == 4
26561 && (collisionparts [1] || collisionparts [2]))
26563 if (collisionparts [1])
26565 std::swap (part[0][1], part[0][2]);
26566 std::swap (part[1][1], part[1][2]);
26568 else
26570 std::swap (part[0][2], part[0][3]);
26571 std::swap (part[1][2], part[1][3]);
26575 /* If there are more collisions, we can't handle it by reordering.
26576 Do an lea to the last part and use only one colliding move. */
26577 else if (collisions > 1)
26579 rtx base, addr, tls_base = NULL_RTX;
26581 collisions = 1;
26583 base = part[0][nparts - 1];
26585 /* Handle the case when the last part isn't valid for lea.
26586 Happens in 64-bit mode storing the 12-byte XFmode. */
26587 if (GET_MODE (base) != Pmode)
26588 base = gen_rtx_REG (Pmode, REGNO (base));
26590 addr = XEXP (part[1][0], 0);
26591 if (TARGET_TLS_DIRECT_SEG_REFS)
26593 struct ix86_address parts;
26594 int ok = ix86_decompose_address (addr, &parts);
26595 gcc_assert (ok);
26596 if (parts.seg == DEFAULT_TLS_SEG_REG)
26598 /* It is not valid to use %gs: or %fs: in
26599 lea though, so we need to remove it from the
26600 address used for lea and add it to each individual
26601 memory load instead. */
26602 addr = copy_rtx (addr);
26603 rtx *x = &addr;
26604 while (GET_CODE (*x) == PLUS)
26606 for (i = 0; i < 2; i++)
26608 rtx u = XEXP (*x, i);
26609 if (GET_CODE (u) == ZERO_EXTEND)
26610 u = XEXP (u, 0);
26611 if (GET_CODE (u) == UNSPEC
26612 && XINT (u, 1) == UNSPEC_TP)
26614 tls_base = XEXP (*x, i);
26615 *x = XEXP (*x, 1 - i);
26616 break;
26619 if (tls_base)
26620 break;
26621 x = &XEXP (*x, 0);
26623 gcc_assert (tls_base);
26626 emit_insn (gen_rtx_SET (base, addr));
26627 if (tls_base)
26628 base = gen_rtx_PLUS (GET_MODE (base), base, tls_base);
26629 part[1][0] = replace_equiv_address (part[1][0], base);
26630 for (i = 1; i < nparts; i++)
26632 if (tls_base)
26633 base = copy_rtx (base);
26634 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
26635 part[1][i] = replace_equiv_address (part[1][i], tmp);
26640 if (push)
26642 if (!TARGET_64BIT)
26644 if (nparts == 3)
26646 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
26647 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
26648 stack_pointer_rtx, GEN_INT (-4)));
26649 emit_move_insn (part[0][2], part[1][2]);
26651 else if (nparts == 4)
26653 emit_move_insn (part[0][3], part[1][3]);
26654 emit_move_insn (part[0][2], part[1][2]);
26657 else
26659 /* In 64bit mode we don't have a 32bit push available. In case this is
26660 a register, it is OK - we will just use the larger counterpart. We also
26661 retype memory - this comes from an attempt to avoid a REX prefix on
26662 moving the second half of a TFmode value. */
26663 if (GET_MODE (part[1][1]) == SImode)
26665 switch (GET_CODE (part[1][1]))
26667 case MEM:
26668 part[1][1] = adjust_address (part[1][1], DImode, 0);
26669 break;
26671 case REG:
26672 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
26673 break;
26675 default:
26676 gcc_unreachable ();
26679 if (GET_MODE (part[1][0]) == SImode)
26680 part[1][0] = part[1][1];
26683 emit_move_insn (part[0][1], part[1][1]);
26684 emit_move_insn (part[0][0], part[1][0]);
26685 return;
26688 /* Choose correct order to not overwrite the source before it is copied. */
26689 if ((REG_P (part[0][0])
26690 && REG_P (part[1][1])
26691 && (REGNO (part[0][0]) == REGNO (part[1][1])
26692 || (nparts == 3
26693 && REGNO (part[0][0]) == REGNO (part[1][2]))
26694 || (nparts == 4
26695 && REGNO (part[0][0]) == REGNO (part[1][3]))))
26696 || (collisions > 0
26697 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
26699 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
26701 operands[2 + i] = part[0][j];
26702 operands[6 + i] = part[1][j];
26705 else
26707 for (i = 0; i < nparts; i++)
26709 operands[2 + i] = part[0][i];
26710 operands[6 + i] = part[1][i];
26714 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
26715 if (optimize_insn_for_size_p ())
26717 for (j = 0; j < nparts - 1; j++)
26718 if (CONST_INT_P (operands[6 + j])
26719 && operands[6 + j] != const0_rtx
26720 && REG_P (operands[2 + j]))
26721 for (i = j; i < nparts - 1; i++)
26722 if (CONST_INT_P (operands[7 + i])
26723 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
26724 operands[7 + i] = operands[2 + j];
26727 for (i = 0; i < nparts; i++)
26728 emit_move_insn (operands[2 + i], operands[6 + i]);
26730 return;
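/* [Editorial note, not part of the original file.]  A concrete instance of
   the ordering logic above: splitting the 32-bit move
   (set (reg:DI ax) (mem:DI (reg:SI ax))) must emit the high word first,
   since the low destination register is also the address register of the
   source; copying the low word first would clobber the address before the
   high word is loaded.  The collision check therefore reverses the part
   order before the final emit_move_insn loop.  */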
26733 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
26734 left shift by a constant, either using a single shift or
26735 a sequence of add instructions. */
26737 static void
26738 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
26740 rtx (*insn)(rtx, rtx, rtx);
26742 if (count == 1
26743 || (count * ix86_cost->add <= ix86_cost->shift_const
26744 && !optimize_insn_for_size_p ()))
26746 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
26747 while (count-- > 0)
26748 emit_insn (insn (operand, operand, operand));
26750 else
26752 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
26753 emit_insn (insn (operand, operand, GEN_INT (count)));
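/* [Editorial sketch, not part of the original file.]  The choice made by
   ix86_expand_ashl_const, restated as plain C on hypothetical cost numbers
   so the trade-off is easier to see: a shift by COUNT is replaced by COUNT
   self-additions only when the additions are no more expensive than one
   constant shift and we are not optimizing for size.  */
#if 0
static int
example_use_adds_for_shift (int count, int add_cost, int shift_const_cost,
                            int optimizing_for_size)
{
  /* Mirrors the condition at the top of ix86_expand_ashl_const.  */
  return count == 1
         || (count * add_cost <= shift_const_cost && !optimizing_for_size);
}
#endif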
26757 void
26758 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
26760 rtx (*gen_ashl3)(rtx, rtx, rtx);
26761 rtx (*gen_shld)(rtx, rtx, rtx);
26762 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26764 rtx low[2], high[2];
26765 int count;
26767 if (CONST_INT_P (operands[2]))
26769 split_double_mode (mode, operands, 2, low, high);
26770 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26772 if (count >= half_width)
26774 emit_move_insn (high[0], low[1]);
26775 emit_move_insn (low[0], const0_rtx);
26777 if (count > half_width)
26778 ix86_expand_ashl_const (high[0], count - half_width, mode);
26780 else
26782 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
26784 if (!rtx_equal_p (operands[0], operands[1]))
26785 emit_move_insn (operands[0], operands[1]);
26787 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
26788 ix86_expand_ashl_const (low[0], count, mode);
26790 return;
26793 split_double_mode (mode, operands, 1, low, high);
26795 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
26797 if (operands[1] == const1_rtx)
26799 /* Assuming we've chosen QImode capable registers, 1 << N
26800 can be done with two 32/64-bit shifts, no branches, no cmoves. */
26801 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
26803 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
26805 ix86_expand_clear (low[0]);
26806 ix86_expand_clear (high[0]);
26807 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
26809 d = gen_lowpart (QImode, low[0]);
26810 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
26811 s = gen_rtx_EQ (QImode, flags, const0_rtx);
26812 emit_insn (gen_rtx_SET (d, s));
26814 d = gen_lowpart (QImode, high[0]);
26815 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
26816 s = gen_rtx_NE (QImode, flags, const0_rtx);
26817 emit_insn (gen_rtx_SET (d, s));
26820 /* Otherwise, we can get the same results by manually performing
26821 a bit extract operation on bit 5/6, and then performing the two
26822 shifts. The two methods of getting 0/1 into low/high are exactly
26823 the same size. Avoiding the shift in the bit extract case helps
26824 pentium4 a bit; no one else seems to care much either way. */
26825 else
26827 machine_mode half_mode;
26828 rtx (*gen_lshr3)(rtx, rtx, rtx);
26829 rtx (*gen_and3)(rtx, rtx, rtx);
26830 rtx (*gen_xor3)(rtx, rtx, rtx);
26831 HOST_WIDE_INT bits;
26832 rtx x;
26834 if (mode == DImode)
26836 half_mode = SImode;
26837 gen_lshr3 = gen_lshrsi3;
26838 gen_and3 = gen_andsi3;
26839 gen_xor3 = gen_xorsi3;
26840 bits = 5;
26842 else
26844 half_mode = DImode;
26845 gen_lshr3 = gen_lshrdi3;
26846 gen_and3 = gen_anddi3;
26847 gen_xor3 = gen_xordi3;
26848 bits = 6;
26851 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
26852 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
26853 else
26854 x = gen_lowpart (half_mode, operands[2]);
26855 emit_insn (gen_rtx_SET (high[0], x));
26857 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
26858 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
26859 emit_move_insn (low[0], high[0]);
26860 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
26863 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
26864 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
26865 return;
26868 if (operands[1] == constm1_rtx)
26870 /* For -1 << N, we can avoid the shld instruction, because we
26871 know that we're shifting 0...31/63 ones into a -1. */
26872 emit_move_insn (low[0], constm1_rtx);
26873 if (optimize_insn_for_size_p ())
26874 emit_move_insn (high[0], low[0]);
26875 else
26876 emit_move_insn (high[0], constm1_rtx);
26878 else
26880 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
26882 if (!rtx_equal_p (operands[0], operands[1]))
26883 emit_move_insn (operands[0], operands[1]);
26885 split_double_mode (mode, operands, 1, low, high);
26886 emit_insn (gen_shld (high[0], low[0], operands[2]));
26889 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
26891 if (TARGET_CMOVE && scratch)
26893 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26894 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26896 ix86_expand_clear (scratch);
26897 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
26899 else
26901 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
26902 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
26904 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
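/* [Editorial note, not part of the original file.]  Worked example of the
   constant-count path above: a DImode shift left by 40 on a 32-bit target
   (half_width == 32) becomes "high = low_input; low = 0" followed by
   ix86_expand_ashl_const (high, 8, DImode), so the result's high word is
   low_input << 8 and no shld instruction is needed.  */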
26908 void
26909 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
26911 rtx (*gen_ashr3)(rtx, rtx, rtx)
26912 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
26913 rtx (*gen_shrd)(rtx, rtx, rtx);
26914 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26916 rtx low[2], high[2];
26917 int count;
26919 if (CONST_INT_P (operands[2]))
26921 split_double_mode (mode, operands, 2, low, high);
26922 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26924 if (count == GET_MODE_BITSIZE (mode) - 1)
26926 emit_move_insn (high[0], high[1]);
26927 emit_insn (gen_ashr3 (high[0], high[0],
26928 GEN_INT (half_width - 1)));
26929 emit_move_insn (low[0], high[0]);
26932 else if (count >= half_width)
26934 emit_move_insn (low[0], high[1]);
26935 emit_move_insn (high[0], low[0]);
26936 emit_insn (gen_ashr3 (high[0], high[0],
26937 GEN_INT (half_width - 1)));
26939 if (count > half_width)
26940 emit_insn (gen_ashr3 (low[0], low[0],
26941 GEN_INT (count - half_width)));
26943 else
26945 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26947 if (!rtx_equal_p (operands[0], operands[1]))
26948 emit_move_insn (operands[0], operands[1]);
26950 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
26951 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
26954 else
26956 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26958 if (!rtx_equal_p (operands[0], operands[1]))
26959 emit_move_insn (operands[0], operands[1]);
26961 split_double_mode (mode, operands, 1, low, high);
26963 emit_insn (gen_shrd (low[0], high[0], operands[2]));
26964 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
26966 if (TARGET_CMOVE && scratch)
26968 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26969 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26971 emit_move_insn (scratch, high[0]);
26972 emit_insn (gen_ashr3 (scratch, scratch,
26973 GEN_INT (half_width - 1)));
26974 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26975 scratch));
26977 else
26979 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
26980 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
26982 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
26987 void
26988 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
26990 rtx (*gen_lshr3)(rtx, rtx, rtx)
26991 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
26992 rtx (*gen_shrd)(rtx, rtx, rtx);
26993 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26995 rtx low[2], high[2];
26996 int count;
26998 if (CONST_INT_P (operands[2]))
27000 split_double_mode (mode, operands, 2, low, high);
27001 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
27003 if (count >= half_width)
27005 emit_move_insn (low[0], high[1]);
27006 ix86_expand_clear (high[0]);
27008 if (count > half_width)
27009 emit_insn (gen_lshr3 (low[0], low[0],
27010 GEN_INT (count - half_width)));
27012 else
27014 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
27016 if (!rtx_equal_p (operands[0], operands[1]))
27017 emit_move_insn (operands[0], operands[1]);
27019 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
27020 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
27023 else
27025 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
27027 if (!rtx_equal_p (operands[0], operands[1]))
27028 emit_move_insn (operands[0], operands[1]);
27030 split_double_mode (mode, operands, 1, low, high);
27032 emit_insn (gen_shrd (low[0], high[0], operands[2]));
27033 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
27035 if (TARGET_CMOVE && scratch)
27037 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
27038 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
27040 ix86_expand_clear (scratch);
27041 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
27042 scratch));
27044 else
27046 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
27047 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
27049 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
27054 /* Predict just emitted jump instruction to be taken with probability PROB. */
27055 static void
27056 predict_jump (int prob)
27058 rtx_insn *insn = get_last_insn ();
27059 gcc_assert (JUMP_P (insn));
27060 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
27063 /* Helper function for the string operations below. Test whether VARIABLE
27064 is aligned to VALUE bytes. If true, jump to the label. */
27065 static rtx_code_label *
27066 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
27068 rtx_code_label *label = gen_label_rtx ();
27069 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
27070 if (GET_MODE (variable) == DImode)
27071 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
27072 else
27073 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
27074 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
27075 1, label);
27076 if (epilogue)
27077 predict_jump (REG_BR_PROB_BASE * 50 / 100);
27078 else
27079 predict_jump (REG_BR_PROB_BASE * 90 / 100);
27080 return label;
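/* [Editorial sketch, not part of the original file.]  What the helper above
   emits, expressed as plain C: the caller places a copy of VALUE bytes
   between the call and emit_label (LABEL), and that copy only runs when the
   tested bit of VARIABLE is set.  */
#if 0
static int
example_aligntest (unsigned long long variable, unsigned long long value)
{
  /* Mirrors the and + je sequence: nonzero means the guarded copy runs.  */
  return (variable & value) != 0;
}
#endif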
27083 /* Decrease COUNTREG by VALUE. */
27084 static void
27085 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
27087 rtx (*gen_add)(rtx, rtx, rtx)
27088 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
27090 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
27093 /* Zero extend possibly SImode EXP to Pmode register. */
27095 ix86_zero_extend_to_Pmode (rtx exp)
27097 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
27100 /* Divide COUNTREG by SCALE. */
27101 static rtx
27102 scale_counter (rtx countreg, int scale)
27104 rtx sc;
27106 if (scale == 1)
27107 return countreg;
27108 if (CONST_INT_P (countreg))
27109 return GEN_INT (INTVAL (countreg) / scale);
27110 gcc_assert (REG_P (countreg));
27112 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
27113 GEN_INT (exact_log2 (scale)),
27114 NULL, 1, OPTAB_DIRECT);
27115 return sc;
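/* [Editorial sketch, not part of the original file.]  scale_counter turns a
   byte count into an element count.  Constant counts are divided directly;
   register counts are shifted right, which relies on SCALE being a power of
   two (the callers pass a mode size).  A plain-C analogue:  */
#if 0
static unsigned long long
example_scale_counter (unsigned long long count, int scale)
{
  int shift = 0;

  if (scale == 1)
    return count;
  while ((1 << shift) < scale)  /* exact_log2 (scale) */
    shift++;
  return count >> shift;
}
#endif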
27118 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
27119 DImode for constant loop counts. */
27121 static machine_mode
27122 counter_mode (rtx count_exp)
27124 if (GET_MODE (count_exp) != VOIDmode)
27125 return GET_MODE (count_exp);
27126 if (!CONST_INT_P (count_exp))
27127 return Pmode;
27128 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
27129 return DImode;
27130 return SImode;
27133 /* Copy the address to a Pmode register. This is used for x32 to
27134 truncate DImode TLS address to a SImode register. */
27136 static rtx
27137 ix86_copy_addr_to_reg (rtx addr)
27139 rtx reg;
27140 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
27142 reg = copy_addr_to_reg (addr);
27143 REG_POINTER (reg) = 1;
27144 return reg;
27146 else
27148 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
27149 reg = copy_to_mode_reg (DImode, addr);
27150 REG_POINTER (reg) = 1;
27151 return gen_rtx_SUBREG (SImode, reg, 0);
27155 /* When ISSETMEM is FALSE, output a simple loop to copy memory from SRCPTR
27156 to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall size is
27157 COUNT, specified in bytes. When ISSETMEM is TRUE, output the equivalent
27158 loop to set memory to VALUE (supposed to be in MODE).
27160 The size is rounded down to a whole number of chunks moved at once.
27161 SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info. */
27164 static void
27165 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
27166 rtx destptr, rtx srcptr, rtx value,
27167 rtx count, machine_mode mode, int unroll,
27168 int expected_size, bool issetmem)
27170 rtx_code_label *out_label, *top_label;
27171 rtx iter, tmp;
27172 machine_mode iter_mode = counter_mode (count);
27173 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
27174 rtx piece_size = GEN_INT (piece_size_n);
27175 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
27176 rtx size;
27177 int i;
27179 top_label = gen_label_rtx ();
27180 out_label = gen_label_rtx ();
27181 iter = gen_reg_rtx (iter_mode);
27183 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
27184 NULL, 1, OPTAB_DIRECT);
27185 /* Those two should combine. */
27186 if (piece_size == const1_rtx)
27188 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
27189 true, out_label);
27190 predict_jump (REG_BR_PROB_BASE * 10 / 100);
27192 emit_move_insn (iter, const0_rtx);
27194 emit_label (top_label);
27196 tmp = convert_modes (Pmode, iter_mode, iter, true);
27198 /* This assert could be relaxed - in this case we'll need to compute
27199 the smallest power of two containing PIECE_SIZE_N and pass it to
27200 offset_address. */
27201 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
27202 destmem = offset_address (destmem, tmp, piece_size_n);
27203 destmem = adjust_address (destmem, mode, 0);
27205 if (!issetmem)
27207 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
27208 srcmem = adjust_address (srcmem, mode, 0);
27210 /* When unrolling for chips that reorder memory reads and writes,
27211 we can save registers by using a single temporary.
27212 Also, using 4 temporaries is overkill in 32bit mode. */
27213 if (!TARGET_64BIT && 0)
27215 for (i = 0; i < unroll; i++)
27217 if (i)
27219 destmem =
27220 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
27221 srcmem =
27222 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
27224 emit_move_insn (destmem, srcmem);
27227 else
27229 rtx tmpreg[4];
27230 gcc_assert (unroll <= 4);
27231 for (i = 0; i < unroll; i++)
27233 tmpreg[i] = gen_reg_rtx (mode);
27234 if (i)
27236 srcmem =
27237 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
27239 emit_move_insn (tmpreg[i], srcmem);
27241 for (i = 0; i < unroll; i++)
27243 if (i)
27245 destmem =
27246 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
27248 emit_move_insn (destmem, tmpreg[i]);
27252 else
27253 for (i = 0; i < unroll; i++)
27255 if (i)
27256 destmem =
27257 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
27258 emit_move_insn (destmem, value);
27261 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
27262 true, OPTAB_LIB_WIDEN);
27263 if (tmp != iter)
27264 emit_move_insn (iter, tmp);
27266 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
27267 true, top_label);
27268 if (expected_size != -1)
27270 expected_size /= GET_MODE_SIZE (mode) * unroll;
27271 if (expected_size == 0)
27272 predict_jump (0);
27273 else if (expected_size > REG_BR_PROB_BASE)
27274 predict_jump (REG_BR_PROB_BASE - 1);
27275 else
27276 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
27278 else
27279 predict_jump (REG_BR_PROB_BASE * 80 / 100);
27280 iter = ix86_zero_extend_to_Pmode (iter);
27281 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
27282 true, OPTAB_LIB_WIDEN);
27283 if (tmp != destptr)
27284 emit_move_insn (destptr, tmp);
27285 if (!issetmem)
27287 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
27288 true, OPTAB_LIB_WIDEN);
27289 if (tmp != srcptr)
27290 emit_move_insn (srcptr, tmp);
27292 emit_label (out_label);
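/* [Editorial sketch, not part of the original file.]  Rough shape of the
   loop emitted above for the memcpy case, in plain C.  PIECE stands for
   GET_MODE_SIZE (mode) * unroll and is assumed to be a power of two; the
   real expansion keeps the counter in ITER_MODE and lets later epilogue
   code handle the remaining COUNT - SIZE bytes.  */
#if 0
static void
example_copy_loop (char *dest, const char *src, unsigned long long count,
                   int piece)
{
  unsigned long long size = count & ~((unsigned long long) piece - 1);
  unsigned long long iter = 0;

  while (iter < size)
    {
      int i;
      for (i = 0; i < piece; i++)  /* UNROLL moves of mode-sized chunks.  */
        dest[iter + i] = src[iter + i];
      iter += piece;
    }
  /* destptr/srcptr are then advanced by ITER.  */
}
#endif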
27295 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
27296 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
27297 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
27298 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
27299 ORIG_VALUE is the original value passed to memset to fill the memory with.
27300 Other arguments have the same meaning as for the previous function. */
27302 static void
27303 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
27304 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
27305 rtx count,
27306 machine_mode mode, bool issetmem)
27308 rtx destexp;
27309 rtx srcexp;
27310 rtx countreg;
27311 HOST_WIDE_INT rounded_count;
27313 /* If possible, it is shorter to use rep movs.
27314 TODO: Maybe it is better to move this logic to decide_alg. */
27315 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
27316 && (!issetmem || orig_value == const0_rtx))
27317 mode = SImode;
27319 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
27320 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
27322 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
27323 GET_MODE_SIZE (mode)));
27324 if (mode != QImode)
27326 destexp = gen_rtx_ASHIFT (Pmode, countreg,
27327 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
27328 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
27330 else
27331 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
27332 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
27334 rounded_count
27335 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
27336 destmem = shallow_copy_rtx (destmem);
27337 set_mem_size (destmem, rounded_count);
27339 else if (MEM_SIZE_KNOWN_P (destmem))
27340 clear_mem_size (destmem);
27342 if (issetmem)
27344 value = force_reg (mode, gen_lowpart (mode, value));
27345 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
27347 else
27349 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
27350 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
27351 if (mode != QImode)
27353 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
27354 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
27355 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
27357 else
27358 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
27359 if (CONST_INT_P (count))
27361 rounded_count
27362 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
27363 srcmem = shallow_copy_rtx (srcmem);
27364 set_mem_size (srcmem, rounded_count);
27366 else
27368 if (MEM_SIZE_KNOWN_P (srcmem))
27369 clear_mem_size (srcmem);
27371 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
27372 destexp, srcexp));
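/* [Editorial note, not part of the original file.]  Example of the expansion
   above: a memcpy of a constant COUNT with MODE == SImode scales the count
   register by 4 and emits a single "rep movsd", with DESTEXP/SRCEXP
   describing the final pointer values (ptr + (count << 2)) so the RTL knows
   how far the pointers advance.  Any COUNT & 3 trailing bytes are left to
   the epilogue code.  */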
27376 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
27377 DESTMEM.
27378 SRCMEM is passed by pointer to be updated on return.
27379 The return value is the updated DESTMEM. */
27380 static rtx
27381 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
27382 HOST_WIDE_INT size_to_move)
27384 rtx dst = destmem, src = *srcmem, adjust, tempreg;
27385 enum insn_code code;
27386 machine_mode move_mode;
27387 int piece_size, i;
27389 /* Find the widest mode in which we could perform moves.
27390 Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
27391 it until a move of such size is supported. */
27392 piece_size = 1 << floor_log2 (size_to_move);
27393 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
27394 code = optab_handler (mov_optab, move_mode);
27395 while (code == CODE_FOR_nothing && piece_size > 1)
27397 piece_size >>= 1;
27398 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
27399 code = optab_handler (mov_optab, move_mode);
27402 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27403 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27404 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27406 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27407 move_mode = mode_for_vector (word_mode, nunits);
27408 code = optab_handler (mov_optab, move_mode);
27409 if (code == CODE_FOR_nothing)
27411 move_mode = word_mode;
27412 piece_size = GET_MODE_SIZE (move_mode);
27413 code = optab_handler (mov_optab, move_mode);
27416 gcc_assert (code != CODE_FOR_nothing);
27418 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
27419 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
27421 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
27422 gcc_assert (size_to_move % piece_size == 0);
27423 adjust = GEN_INT (piece_size);
27424 for (i = 0; i < size_to_move; i += piece_size)
27426 /* We move from memory to memory, so we'll need to do it via
27427 a temporary register. */
27428 tempreg = gen_reg_rtx (move_mode);
27429 emit_insn (GEN_FCN (code) (tempreg, src));
27430 emit_insn (GEN_FCN (code) (dst, tempreg));
27432 emit_move_insn (destptr,
27433 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
27434 emit_move_insn (srcptr,
27435 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
27437 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
27438 piece_size);
27439 src = adjust_automodify_address_nv (src, move_mode, srcptr,
27440 piece_size);
27443 /* Update DST and SRC rtx. */
27444 *srcmem = src;
27445 return dst;
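/* [Editorial sketch, not part of the original file.]  The mode selection in
   emit_memmov, restated: start from the largest power of two not exceeding
   SIZE_TO_MOVE and halve until a supported move width is found; for example
   size_to_move == 24 tries 16 bytes first (a vector mode when SSE is
   available), otherwise falls back to 8, 4, ... bytes.  */
#if 0
static int
example_initial_piece_size (int size_to_move)
{
  int piece = 1;

  while (piece * 2 <= size_to_move)  /* 1 << floor_log2 (size_to_move) */
    piece *= 2;
  return piece;
}
#endif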
27448 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
27449 static void
27450 expand_movmem_epilogue (rtx destmem, rtx srcmem,
27451 rtx destptr, rtx srcptr, rtx count, int max_size)
27453 rtx src, dest;
27454 if (CONST_INT_P (count))
27456 HOST_WIDE_INT countval = INTVAL (count);
27457 HOST_WIDE_INT epilogue_size = countval % max_size;
27458 int i;
27460 /* For now MAX_SIZE should be a power of 2. This assert could be
27461 relaxed, but it'll require a bit more complicated epilogue
27462 expanding. */
27463 gcc_assert ((max_size & (max_size - 1)) == 0);
27464 for (i = max_size; i >= 1; i >>= 1)
27466 if (epilogue_size & i)
27467 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
27469 return;
27471 if (max_size > 8)
27473 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
27474 count, 1, OPTAB_DIRECT);
27475 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
27476 count, QImode, 1, 4, false);
27477 return;
27480 /* When single string operations are available, we can cheaply increase
27481 dest and src pointers. Otherwise we save code size by maintaining an
27482 offset (zero is readily available from the preceding rep operation) and
27483 using x86 addressing modes. */
27484 if (TARGET_SINGLE_STRINGOP)
27486 if (max_size > 4)
27488 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
27489 src = change_address (srcmem, SImode, srcptr);
27490 dest = change_address (destmem, SImode, destptr);
27491 emit_insn (gen_strmov (destptr, dest, srcptr, src));
27492 emit_label (label);
27493 LABEL_NUSES (label) = 1;
27495 if (max_size > 2)
27497 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
27498 src = change_address (srcmem, HImode, srcptr);
27499 dest = change_address (destmem, HImode, destptr);
27500 emit_insn (gen_strmov (destptr, dest, srcptr, src));
27501 emit_label (label);
27502 LABEL_NUSES (label) = 1;
27504 if (max_size > 1)
27506 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
27507 src = change_address (srcmem, QImode, srcptr);
27508 dest = change_address (destmem, QImode, destptr);
27509 emit_insn (gen_strmov (destptr, dest, srcptr, src));
27510 emit_label (label);
27511 LABEL_NUSES (label) = 1;
27514 else
27516 rtx offset = force_reg (Pmode, const0_rtx);
27517 rtx tmp;
27519 if (max_size > 4)
27521 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
27522 src = change_address (srcmem, SImode, srcptr);
27523 dest = change_address (destmem, SImode, destptr);
27524 emit_move_insn (dest, src);
27525 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
27526 true, OPTAB_LIB_WIDEN);
27527 if (tmp != offset)
27528 emit_move_insn (offset, tmp);
27529 emit_label (label);
27530 LABEL_NUSES (label) = 1;
27532 if (max_size > 2)
27534 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
27535 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
27536 src = change_address (srcmem, HImode, tmp);
27537 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
27538 dest = change_address (destmem, HImode, tmp);
27539 emit_move_insn (dest, src);
27540 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
27541 true, OPTAB_LIB_WIDEN);
27542 if (tmp != offset)
27543 emit_move_insn (offset, tmp);
27544 emit_label (label);
27545 LABEL_NUSES (label) = 1;
27547 if (max_size > 1)
27549 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
27550 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
27551 src = change_address (srcmem, QImode, tmp);
27552 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
27553 dest = change_address (destmem, QImode, tmp);
27554 emit_move_insn (dest, src);
27555 emit_label (label);
27556 LABEL_NUSES (label) = 1;
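/* [Editorial note, not part of the original file.]  Worked example of the
   constant-count path in expand_movmem_epilogue: with COUNT == 23 and
   MAX_SIZE == 16 the epilogue size is 23 % 16 == 7, and the descending
   power-of-two loop emits one 4-byte, one 2-byte and one 1-byte move
   (bits 4, 2 and 1 of 7), each through emit_memmov.  */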
27561 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
27562 with value PROMOTED_VAL.
27563 The return value is the updated destination MEM. */
27565 static rtx
27566 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
27567 HOST_WIDE_INT size_to_move)
27569 rtx dst = destmem, adjust;
27570 enum insn_code code;
27571 machine_mode move_mode;
27572 int piece_size, i;
27574 /* Find the widest mode in which we could perform moves.
27575 Start with the biggest power of 2 less than SIZE_TO_MOVE and halve
27576 it until a move of such size is supported. */
27577 move_mode = GET_MODE (promoted_val);
27578 if (move_mode == VOIDmode)
27579 move_mode = QImode;
27580 if (size_to_move < GET_MODE_SIZE (move_mode))
27582 move_mode = mode_for_size (size_to_move * BITS_PER_UNIT, MODE_INT, 0);
27583 promoted_val = gen_lowpart (move_mode, promoted_val);
27585 piece_size = GET_MODE_SIZE (move_mode);
27586 code = optab_handler (mov_optab, move_mode);
27587 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
27589 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
27591 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
27592 gcc_assert (size_to_move % piece_size == 0);
27593 adjust = GEN_INT (piece_size);
27594 for (i = 0; i < size_to_move; i += piece_size)
27596 if (piece_size <= GET_MODE_SIZE (word_mode))
27598 emit_insn (gen_strset (destptr, dst, promoted_val));
27599 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
27600 piece_size);
27601 continue;
27604 emit_insn (GEN_FCN (code) (dst, promoted_val));
27606 emit_move_insn (destptr,
27607 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
27609 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
27610 piece_size);
27613 /* Update DST rtx. */
27614 return dst;
27616 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
27617 static void
27618 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
27619 rtx count, int max_size)
27621 count =
27622 expand_simple_binop (counter_mode (count), AND, count,
27623 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
27624 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
27625 gen_lowpart (QImode, value), count, QImode,
27626 1, max_size / 2, true);
27629 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
27630 static void
27631 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
27632 rtx count, int max_size)
27634 rtx dest;
27636 if (CONST_INT_P (count))
27638 HOST_WIDE_INT countval = INTVAL (count);
27639 HOST_WIDE_INT epilogue_size = countval % max_size;
27640 int i;
27642 /* For now MAX_SIZE should be a power of 2. This assert could be
27643 relaxed, but it'll require a bit more complicated epilogue
27644 expanding. */
27645 gcc_assert ((max_size & (max_size - 1)) == 0);
27646 for (i = max_size; i >= 1; i >>= 1)
27648 if (epilogue_size & i)
27650 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
27651 destmem = emit_memset (destmem, destptr, vec_value, i);
27652 else
27653 destmem = emit_memset (destmem, destptr, value, i);
27656 return;
27658 if (max_size > 32)
27660 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
27661 return;
27663 if (max_size > 16)
27665 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
27666 if (TARGET_64BIT)
27668 dest = change_address (destmem, DImode, destptr);
27669 emit_insn (gen_strset (destptr, dest, value));
27670 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
27671 emit_insn (gen_strset (destptr, dest, value));
27673 else
27675 dest = change_address (destmem, SImode, destptr);
27676 emit_insn (gen_strset (destptr, dest, value));
27677 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
27678 emit_insn (gen_strset (destptr, dest, value));
27679 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
27680 emit_insn (gen_strset (destptr, dest, value));
27681 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
27682 emit_insn (gen_strset (destptr, dest, value));
27684 emit_label (label);
27685 LABEL_NUSES (label) = 1;
27687 if (max_size > 8)
27689 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
27690 if (TARGET_64BIT)
27692 dest = change_address (destmem, DImode, destptr);
27693 emit_insn (gen_strset (destptr, dest, value));
27695 else
27697 dest = change_address (destmem, SImode, destptr);
27698 emit_insn (gen_strset (destptr, dest, value));
27699 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
27700 emit_insn (gen_strset (destptr, dest, value));
27702 emit_label (label);
27703 LABEL_NUSES (label) = 1;
27705 if (max_size > 4)
27707 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
27708 dest = change_address (destmem, SImode, destptr);
27709 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
27710 emit_label (label);
27711 LABEL_NUSES (label) = 1;
27713 if (max_size > 2)
27715 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
27716 dest = change_address (destmem, HImode, destptr);
27717 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
27718 emit_label (label);
27719 LABEL_NUSES (label) = 1;
27721 if (max_size > 1)
27723 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
27724 dest = change_address (destmem, QImode, destptr);
27725 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
27726 emit_label (label);
27727 LABEL_NUSES (label) = 1;
27731 /* Depending on ISSETMEM, copy enough bytes from SRCMEM to DESTMEM or store
27732 enough bytes to DESTMEM to align it to DESIRED_ALIGNMENT. The original
27733 alignment is ALIGN. Depending on ISSETMEM, either arguments SRCMEM/SRCPTR
27734 or VALUE/VEC_VALUE are ignored.
27735 The return value is the updated DESTMEM. */
27736 static rtx
27737 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
27738 rtx destptr, rtx srcptr, rtx value,
27739 rtx vec_value, rtx count, int align,
27740 int desired_alignment, bool issetmem)
27742 int i;
27743 for (i = 1; i < desired_alignment; i <<= 1)
27745 if (align <= i)
27747 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
27748 if (issetmem)
27750 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
27751 destmem = emit_memset (destmem, destptr, vec_value, i);
27752 else
27753 destmem = emit_memset (destmem, destptr, value, i);
27755 else
27756 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
27757 ix86_adjust_counter (count, i);
27758 emit_label (label);
27759 LABEL_NUSES (label) = 1;
27760 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
27763 return destmem;
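/* [Editorial note, not part of the original file.]  Example of the prologue
   above: with ALIGN == 1 and DESIRED_ALIGNMENT == 16 it emits four
   conditional copies (1, 2, 4 and 8 bytes), each guarded by an
   ix86_expand_aligntest on DESTPTR, so that afterwards the destination is
   16-byte aligned and COUNT has been decreased accordingly.  */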
27766 /* Test if COUNT & SIZE is nonzero and if so, expand a movmem
27767 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
27768 and jump to DONE_LABEL. */
27769 static void
27770 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
27771 rtx destptr, rtx srcptr,
27772 rtx value, rtx vec_value,
27773 rtx count, int size,
27774 rtx done_label, bool issetmem)
27776 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
27777 machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
27778 rtx modesize;
27779 int n;
27781 /* If we do not have vector value to copy, we must reduce size. */
27782 if (issetmem)
27784 if (!vec_value)
27786 if (GET_MODE (value) == VOIDmode && size > 8)
27787 mode = Pmode;
27788 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
27789 mode = GET_MODE (value);
27791 else
27792 mode = GET_MODE (vec_value), value = vec_value;
27794 else
27796 /* Choose appropriate vector mode. */
27797 if (size >= 32)
27798 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
27799 else if (size >= 16)
27800 mode = TARGET_SSE ? V16QImode : DImode;
27801 srcmem = change_address (srcmem, mode, srcptr);
27803 destmem = change_address (destmem, mode, destptr);
27804 modesize = GEN_INT (GET_MODE_SIZE (mode));
27805 gcc_assert (GET_MODE_SIZE (mode) <= size);
27806 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
27808 if (issetmem)
27809 emit_move_insn (destmem, gen_lowpart (mode, value));
27810 else
27812 emit_move_insn (destmem, srcmem);
27813 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27815 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27818 destmem = offset_address (destmem, count, 1);
27819 destmem = offset_address (destmem, GEN_INT (-2 * size),
27820 GET_MODE_SIZE (mode));
27821 if (!issetmem)
27823 srcmem = offset_address (srcmem, count, 1);
27824 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
27825 GET_MODE_SIZE (mode));
27827 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
27829 if (issetmem)
27830 emit_move_insn (destmem, gen_lowpart (mode, value));
27831 else
27833 emit_move_insn (destmem, srcmem);
27834 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27836 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27838 emit_jump_insn (gen_jump (done_label));
27839 emit_barrier ();
27841 emit_label (label);
27842 LABEL_NUSES (label) = 1;
27845 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
27846 and get ready for the main memcpy loop by copying initial DESIRED_ALIGN-ALIGN
27847 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way we can
27848 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
27849 DONE_LABEL is a label after the whole copying sequence. The label is created
27850 on demand if *DONE_LABEL is NULL.
27851 MIN_SIZE is minimal size of block copied. This value gets adjusted for new
27852 bounds after the initial copies.
27854 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
27855 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
27856 we will dispatch to a library call for large blocks.
27858 In pseudocode we do:
27860 if (COUNT < SIZE)
27862 Assume that SIZE is 4. Bigger sizes are handled analogously
27863 if (COUNT & 4)
27865 copy 4 bytes from SRCPTR to DESTPTR
27866 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
27867 goto done_label
27869 if (!COUNT)
27870 goto done_label;
27871 copy 1 byte from SRCPTR to DESTPTR
27872 if (COUNT & 2)
27874 copy 2 bytes from SRCPTR to DESTPTR
27875 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
27878 else
27880 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
27881 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
27883 OLD_DESTPTR = DESTPTR;
27884 Align DESTPTR up to DESIRED_ALIGN
27885 SRCPTR += DESTPTR - OLD_DESTPTR
27886 COUNT -= DESTPTR - OLD_DESTPTR
27887 if (DYNAMIC_CHECK)
27888 Round COUNT down to multiple of SIZE
27889 << optional caller supplied zero size guard is here >>
27890 << optional caller supplied dynamic check is here >>
27891 << caller supplied main copy loop is here >>
27893 done_label:
27895 static void
27896 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
27897 rtx *destptr, rtx *srcptr,
27898 machine_mode mode,
27899 rtx value, rtx vec_value,
27900 rtx *count,
27901 rtx_code_label **done_label,
27902 int size,
27903 int desired_align,
27904 int align,
27905 unsigned HOST_WIDE_INT *min_size,
27906 bool dynamic_check,
27907 bool issetmem)
27909 rtx_code_label *loop_label = NULL, *label;
27910 int n;
27911 rtx modesize;
27912 int prolog_size = 0;
27913 rtx mode_value;
27915 /* Choose the proper value to copy. */
27916 if (issetmem && VECTOR_MODE_P (mode))
27917 mode_value = vec_value;
27918 else
27919 mode_value = value;
27920 gcc_assert (GET_MODE_SIZE (mode) <= size);
27922 /* See if block is big or small, handle small blocks. */
27923 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
27925 int size2 = size;
27926 loop_label = gen_label_rtx ();
27928 if (!*done_label)
27929 *done_label = gen_label_rtx ();
27931 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
27932 1, loop_label);
27933 size2 >>= 1;
27935 /* Handle sizes > 3. */
27936 for (;size2 > 2; size2 >>= 1)
27937 expand_small_movmem_or_setmem (destmem, srcmem,
27938 *destptr, *srcptr,
27939 value, vec_value,
27940 *count,
27941 size2, *done_label, issetmem);
27942 /* Nothing to copy? Jump to DONE_LABEL if so */
27943 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
27944 1, *done_label);
27946 /* Do a byte copy. */
27947 destmem = change_address (destmem, QImode, *destptr);
27948 if (issetmem)
27949 emit_move_insn (destmem, gen_lowpart (QImode, value));
27950 else
27952 srcmem = change_address (srcmem, QImode, *srcptr);
27953 emit_move_insn (destmem, srcmem);
27956 /* Handle sizes 2 and 3. */
27957 label = ix86_expand_aligntest (*count, 2, false);
27958 destmem = change_address (destmem, HImode, *destptr);
27959 destmem = offset_address (destmem, *count, 1);
27960 destmem = offset_address (destmem, GEN_INT (-2), 2);
27961 if (issetmem)
27962 emit_move_insn (destmem, gen_lowpart (HImode, value));
27963 else
27965 srcmem = change_address (srcmem, HImode, *srcptr);
27966 srcmem = offset_address (srcmem, *count, 1);
27967 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
27968 emit_move_insn (destmem, srcmem);
27971 emit_label (label);
27972 LABEL_NUSES (label) = 1;
27973 emit_jump_insn (gen_jump (*done_label));
27974 emit_barrier ();
27976 else
27977 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
27978 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
27980 /* Start memcpy for COUNT >= SIZE. */
27981 if (loop_label)
27983 emit_label (loop_label);
27984 LABEL_NUSES (loop_label) = 1;
27987 /* Copy first desired_align bytes. */
27988 if (!issetmem)
27989 srcmem = change_address (srcmem, mode, *srcptr);
27990 destmem = change_address (destmem, mode, *destptr);
27991 modesize = GEN_INT (GET_MODE_SIZE (mode));
27992 for (n = 0; prolog_size < desired_align - align; n++)
27994 if (issetmem)
27995 emit_move_insn (destmem, mode_value);
27996 else
27998 emit_move_insn (destmem, srcmem);
27999 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
28001 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
28002 prolog_size += GET_MODE_SIZE (mode);
28006 /* Copy last SIZE bytes. */
28007 destmem = offset_address (destmem, *count, 1);
28008 destmem = offset_address (destmem,
28009 GEN_INT (-size - prolog_size),
28011 if (issetmem)
28012 emit_move_insn (destmem, mode_value);
28013 else
28015 srcmem = offset_address (srcmem, *count, 1);
28016 srcmem = offset_address (srcmem,
28017 GEN_INT (-size - prolog_size),
28019 emit_move_insn (destmem, srcmem);
28021 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
28023 destmem = offset_address (destmem, modesize, 1);
28024 if (issetmem)
28025 emit_move_insn (destmem, mode_value);
28026 else
28028 srcmem = offset_address (srcmem, modesize, 1);
28029 emit_move_insn (destmem, srcmem);
28033 /* Align destination. */
28034 if (desired_align > 1 && desired_align > align)
28036 rtx saveddest = *destptr;
28038 gcc_assert (desired_align <= size);
28039 /* Align destptr up, place it to new register. */
28040 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
28041 GEN_INT (prolog_size),
28042 NULL_RTX, 1, OPTAB_DIRECT);
28043 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
28044 REG_POINTER (*destptr) = 1;
28045 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
28046 GEN_INT (-desired_align),
28047 *destptr, 1, OPTAB_DIRECT);
28048 /* See how many bytes we skipped. */
28049 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
28050 *destptr,
28051 saveddest, 1, OPTAB_DIRECT);
28052 /* Adjust srcptr and count. */
28053 if (!issetmem)
28054 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
28055 saveddest, *srcptr, 1, OPTAB_DIRECT);
28056 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
28057 saveddest, *count, 1, OPTAB_DIRECT);
28058 /* We copied at most size + prolog_size. */
28059 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
28060 *min_size
28061 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
28062 else
28063 *min_size = 0;
28065 /* Our loops always round down the block size, but for dispatch to
28066 library we need precise value. */
28067 if (dynamic_check)
28068 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
28069 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
28071 else
28073 gcc_assert (prolog_size == 0);
28074 /* Decrease count, so we won't end up copying last word twice. */
28075 if (!CONST_INT_P (*count))
28076 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
28077 constm1_rtx, *count, 1, OPTAB_DIRECT);
28078 else
28079 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
28080 (unsigned HOST_WIDE_INT)size));
28081 if (*min_size)
28082 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
28087 /* This function is like the previous one, except here we know how many bytes
28088 need to be copied. That allows us to update alignment not only of DST, which
28089 is returned, but also of SRC, which is passed as a pointer for that
28090 reason. */
28091 static rtx
28092 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
28093 rtx srcreg, rtx value, rtx vec_value,
28094 int desired_align, int align_bytes,
28095 bool issetmem)
28097 rtx src = NULL;
28098 rtx orig_dst = dst;
28099 rtx orig_src = NULL;
28100 int piece_size = 1;
28101 int copied_bytes = 0;
28103 if (!issetmem)
28105 gcc_assert (srcp != NULL);
28106 src = *srcp;
28107 orig_src = src;
28110 for (piece_size = 1;
28111 piece_size <= desired_align && copied_bytes < align_bytes;
28112 piece_size <<= 1)
28114 if (align_bytes & piece_size)
28116 if (issetmem)
28118 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
28119 dst = emit_memset (dst, destreg, vec_value, piece_size);
28120 else
28121 dst = emit_memset (dst, destreg, value, piece_size);
28123 else
28124 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
28125 copied_bytes += piece_size;
28128 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
28129 set_mem_align (dst, desired_align * BITS_PER_UNIT);
28130 if (MEM_SIZE_KNOWN_P (orig_dst))
28131 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
28133 if (!issetmem)
28135 int src_align_bytes = get_mem_align_offset (src, desired_align
28136 * BITS_PER_UNIT);
28137 if (src_align_bytes >= 0)
28138 src_align_bytes = desired_align - src_align_bytes;
28139 if (src_align_bytes >= 0)
28141 unsigned int src_align;
28142 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
28144 if ((src_align_bytes & (src_align - 1))
28145 == (align_bytes & (src_align - 1)))
28146 break;
28148 if (src_align > (unsigned int) desired_align)
28149 src_align = desired_align;
28150 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
28151 set_mem_align (src, src_align * BITS_PER_UNIT);
28153 if (MEM_SIZE_KNOWN_P (orig_src))
28154 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
28155 *srcp = src;
28158 return dst;
28161 /* Return true if ALG can be used in current context.
28162 Assume we expand memset if MEMSET is true. */
28163 static bool
28164 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
28166 if (alg == no_stringop)
28167 return false;
28168 if (alg == vector_loop)
28169 return TARGET_SSE || TARGET_AVX;
28170 /* Algorithms using the rep prefix want at least edi and ecx;
28171 additionally, memset wants eax and memcpy wants esi. Don't
28172 consider such algorithms if the user has appropriated those
28173 registers for their own purposes, or if we have a non-default
28174 address space, since some string insns cannot override the segment. */
28175 if (alg == rep_prefix_1_byte
28176 || alg == rep_prefix_4_byte
28177 || alg == rep_prefix_8_byte)
28179 if (have_as)
28180 return false;
28181 if (fixed_regs[CX_REG]
28182 || fixed_regs[DI_REG]
28183 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
28184 return false;
28186 return true;
28189 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
28190 static enum stringop_alg
28191 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
28192 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
28193 bool memset, bool zero_memset, bool have_as,
28194 int *dynamic_check, bool *noalign, bool recur)
28196 const struct stringop_algs *algs;
28197 bool optimize_for_speed;
28198 int max = 0;
28199 const struct processor_costs *cost;
28200 int i;
28201 bool any_alg_usable_p = false;
28203 *noalign = false;
28204 *dynamic_check = -1;
28206 /* Even if the string operation call is cold, we still might spend a lot
28207 of time processing large blocks. */
28208 if (optimize_function_for_size_p (cfun)
28209 || (optimize_insn_for_size_p ()
28210 && (max_size < 256
28211 || (expected_size != -1 && expected_size < 256))))
28212 optimize_for_speed = false;
28213 else
28214 optimize_for_speed = true;
28216 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
28217 if (memset)
28218 algs = &cost->memset[TARGET_64BIT != 0];
28219 else
28220 algs = &cost->memcpy[TARGET_64BIT != 0];
28222 /* See maximal size for user defined algorithm. */
28223 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
28225 enum stringop_alg candidate = algs->size[i].alg;
28226 bool usable = alg_usable_p (candidate, memset, have_as);
28227 any_alg_usable_p |= usable;
28229 if (candidate != libcall && candidate && usable)
28230 max = algs->size[i].max;
28233 /* If the expected size is not known but the max size is small enough
28234 so that the inline version is a win, set the expected size into
28235 the range. */
28236 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
28237 && expected_size == -1)
28238 expected_size = min_size / 2 + max_size / 2;
28240 /* If user specified the algorithm, honor it if possible. */
28241 if (ix86_stringop_alg != no_stringop
28242 && alg_usable_p (ix86_stringop_alg, memset, have_as))
28243 return ix86_stringop_alg;
28244 /* rep; movq or rep; movl is the smallest variant. */
28245 else if (!optimize_for_speed)
28247 *noalign = true;
28248 if (!count || (count & 3) || (memset && !zero_memset))
28249 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
28250 ? rep_prefix_1_byte : loop_1_byte;
28251 else
28252 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
28253 ? rep_prefix_4_byte : loop;
28255 /* Very tiny blocks are best handled via the loop, REP is expensive to
28256 setup. */
28257 else if (expected_size != -1 && expected_size < 4)
28258 return loop_1_byte;
28259 else if (expected_size != -1)
28261 enum stringop_alg alg = libcall;
28262 bool alg_noalign = false;
28263 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
28265 /* We get here if the algorithms that were not libcall-based
28266 were rep-prefix based and we are unable to use rep prefixes
28267 based on global register usage. Break out of the loop and
28268 use the heuristic below. */
28269 if (algs->size[i].max == 0)
28270 break;
28271 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
28273 enum stringop_alg candidate = algs->size[i].alg;
28275 if (candidate != libcall
28276 && alg_usable_p (candidate, memset, have_as))
28278 alg = candidate;
28279 alg_noalign = algs->size[i].noalign;
28281 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
28282 last non-libcall inline algorithm. */
28283 if (TARGET_INLINE_ALL_STRINGOPS)
28285 /* When the current size is best to be copied by a libcall,
28286 but we are still forced to inline, run the heuristic below
28287 that will pick code for medium sized blocks. */
28288 if (alg != libcall)
28290 *noalign = alg_noalign;
28291 return alg;
28293 else if (!any_alg_usable_p)
28294 break;
28296 else if (alg_usable_p (candidate, memset, have_as))
28298 *noalign = algs->size[i].noalign;
28299 return candidate;
28304 /* When asked to inline the call anyway, try to pick a meaningful choice.
28305 We look for the maximal size of block that is faster to copy by hand and
28306 take blocks of at most that size, guessing that the average size will
28307 be roughly half of the block.
28309 If this turns out to be bad, we might simply specify the preferred
28310 choice in ix86_costs. */
28311 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
28312 && (algs->unknown_size == libcall
28313 || !alg_usable_p (algs->unknown_size, memset, have_as)))
28315 enum stringop_alg alg;
28316 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
28318 /* If there aren't any usable algorithms or if recursing already,
28319 then recursing on smaller sizes or same size isn't going to
28320 find anything. Just return the simple byte-at-a-time copy loop. */
28321 if (!any_alg_usable_p || recur)
28323 /* Pick something reasonable. */
28324 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
28325 *dynamic_check = 128;
28326 return loop_1_byte;
28328 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
28329 zero_memset, have_as, dynamic_check, noalign, true);
28330 gcc_assert (*dynamic_check == -1);
28331 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
28332 *dynamic_check = max;
28333 else
28334 gcc_assert (alg != libcall);
28335 return alg;
28337 return (alg_usable_p (algs->unknown_size, memset, have_as)
28338 ? algs->unknown_size : libcall);
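/* [Editorial note, not part of the original file.]  Example of the -Os path
   in decide_alg: a memcpy of a constant 256 bytes picks rep_prefix_4_byte
   when it is usable (the count is a multiple of 4), while a memset of 255
   bytes with a non-zero value falls back to rep_prefix_1_byte (or
   loop_1_byte when the rep form is unusable), since neither the count nor
   the value allows the 4-byte variant.  */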
28341 /* Decide on alignment. We know that the operand is already aligned to ALIGN
28342 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
28343 static int
28344 decide_alignment (int align,
28345 enum stringop_alg alg,
28346 int expected_size,
28347 machine_mode move_mode)
28349 int desired_align = 0;
28351 gcc_assert (alg != no_stringop);
28353 if (alg == libcall)
28354 return 0;
28355 if (move_mode == VOIDmode)
28356 return 0;
28358 desired_align = GET_MODE_SIZE (move_mode);
28359 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
28360 copying the whole cacheline at once. */
28361 if (TARGET_PENTIUMPRO
28362 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
28363 desired_align = 8;
28365 if (optimize_size)
28366 desired_align = 1;
28367 if (desired_align < align)
28368 desired_align = align;
28369 if (expected_size != -1 && expected_size < 4)
28370 desired_align = align;
28372 return desired_align;
28376 /* Helper function for memset. For QImode value 0xXY produce
28377 0xXYXYXYXY of the width specified by MODE. This is essentially
28378 a * 0x01010101, but we can do slightly better than
28379 synth_mult by unwinding the sequence by hand on CPUs with
28380 slow multiply. */
28381 static rtx
28382 promote_duplicated_reg (machine_mode mode, rtx val)
28384 machine_mode valmode = GET_MODE (val);
28385 rtx tmp;
28386 int nops = mode == DImode ? 3 : 2;
28388 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
28389 if (val == const0_rtx)
28390 return copy_to_mode_reg (mode, CONST0_RTX (mode));
28391 if (CONST_INT_P (val))
28393 HOST_WIDE_INT v = INTVAL (val) & 255;
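/* Replicate the low byte into every byte position, e.g. 0xAB becomes
   0xABABABAB for SImode and 0xABABABABABABABAB for DImode. */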
28395 v |= v << 8;
28396 v |= v << 16;
28397 if (mode == DImode)
28398 v |= (v << 16) << 16;
28399 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
28402 if (valmode == VOIDmode)
28403 valmode = QImode;
28404 if (valmode != QImode)
28405 val = gen_lowpart (QImode, val);
28406 if (mode == QImode)
28407 return val;
28408 if (!TARGET_PARTIAL_REG_STALL)
28409 nops--;
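/* Prefer a multiply by 0x01010101 (0x0101010101010101 for DImode) when it
   is no more expensive than the shift-and-or sequence in the else branch. */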
28410 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
28411 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
28412 <= (ix86_cost->shift_const + ix86_cost->add) * nops
28413 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
28415 rtx reg = convert_modes (mode, QImode, val, true);
28416 tmp = promote_duplicated_reg (mode, const1_rtx);
28417 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
28418 OPTAB_DIRECT);
28420 else
28422 rtx reg = convert_modes (mode, QImode, val, true);
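/* The multiply is too slow here: duplicate the low byte into the second
   byte (via insv when partial register stalls are not a concern, otherwise
   a shift and or), then widen the replicated pattern with 16-bit and, for
   DImode, 32-bit shift/or steps. */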
28424 if (!TARGET_PARTIAL_REG_STALL)
28425 if (mode == SImode)
28426 emit_insn (gen_insvsi_1 (reg, reg));
28427 else
28428 emit_insn (gen_insvdi_1 (reg, reg));
28429 else
28431 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
28432 NULL, 1, OPTAB_DIRECT);
28433 reg =
28434 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
28436 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
28437 NULL, 1, OPTAB_DIRECT);
28438 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
28439 if (mode == SImode)
28440 return reg;
28441 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
28442 NULL, 1, OPTAB_DIRECT);
28443 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
28444 return reg;
28448 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
28449 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
28450 getting alignment from ALIGN to DESIRED_ALIGN. */
28451 static rtx
28452 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
28453 int align)
28455 rtx promoted_val;
28457 if (TARGET_64BIT
28458 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
28459 promoted_val = promote_duplicated_reg (DImode, val);
28460 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
28461 promoted_val = promote_duplicated_reg (SImode, val);
28462 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
28463 promoted_val = promote_duplicated_reg (HImode, val);
28464 else
28465 promoted_val = val;
28467 return promoted_val;
28470 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
28471 operations when profitable. The code depends upon architecture, block size
28472 and alignment, but always has one of the following overall structures:
28474 Aligned move sequence:
28476 1) Prologue guard: Conditional that jumps up to epilogues for small
28477 blocks that can be handled by the epilogue alone. This is faster
28478 but also needed for correctness, since the prologue assumes the block
28479 is larger than the desired alignment.
28481 Optional dynamic check for size and libcall for large
28482 blocks is emitted here too, with -minline-stringops-dynamically.
28484 2) Prologue: copy first few bytes in order to get destination
28485 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
28486 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
28487 copied. We emit either a jump tree on power of two sized
28488 blocks, or a byte loop.
28490 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
28491 with specified algorithm.
28493 4) Epilogue: code copying tail of the block that is too small to be
28494 handled by main body (or up to size guarded by prologue guard).
28496 Misaligned move sequence
28498 1) misaligned move prologue/epilogue containing:
28499 a) Prologue handling small memory blocks and jumping to done_label
28500 (skipped if blocks are known to be large enough)
28501 b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
28502 needed by single possibly misaligned move
28503 (skipped if alignment is not needed)
28504 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
28506 2) Zero size guard dispatching to done_label, if needed
28508 3) dispatch to library call, if needed,
28510 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
28511 with specified algorithm. */
28512 bool
28513 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
28514 rtx align_exp, rtx expected_align_exp,
28515 rtx expected_size_exp, rtx min_size_exp,
28516 rtx max_size_exp, rtx probable_max_size_exp,
28517 bool issetmem)
28519 rtx destreg;
28520 rtx srcreg = NULL;
28521 rtx_code_label *label = NULL;
28522 rtx tmp;
28523 rtx_code_label *jump_around_label = NULL;
28524 HOST_WIDE_INT align = 1;
28525 unsigned HOST_WIDE_INT count = 0;
28526 HOST_WIDE_INT expected_size = -1;
28527 int size_needed = 0, epilogue_size_needed;
28528 int desired_align = 0, align_bytes = 0;
28529 enum stringop_alg alg;
28530 rtx promoted_val = NULL;
28531 rtx vec_promoted_val = NULL;
28532 bool force_loopy_epilogue = false;
28533 int dynamic_check;
28534 bool need_zero_guard = false;
28535 bool noalign;
28536 machine_mode move_mode = VOIDmode;
28537 machine_mode wider_mode;
28538 int unroll_factor = 1;
28539 /* TODO: Once value ranges are available, fill in proper data. */
28540 unsigned HOST_WIDE_INT min_size = 0;
28541 unsigned HOST_WIDE_INT max_size = -1;
28542 unsigned HOST_WIDE_INT probable_max_size = -1;
28543 bool misaligned_prologue_used = false;
28544 bool have_as;
28546 if (CONST_INT_P (align_exp))
28547 align = INTVAL (align_exp);
28548 /* i386 can do misaligned access at reasonably increased cost. */
28549 if (CONST_INT_P (expected_align_exp)
28550 && INTVAL (expected_align_exp) > align)
28551 align = INTVAL (expected_align_exp);
28552 /* ALIGN is the minimum of destination and source alignment, but we care here
28553 just about destination alignment. */
28554 else if (!issetmem
28555 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
28556 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
28558 if (CONST_INT_P (count_exp))
28560 min_size = max_size = probable_max_size = count = expected_size
28561 = INTVAL (count_exp);
28562 /* When COUNT is 0, there is nothing to do. */
28563 if (!count)
28564 return true;
28566 else
28568 if (min_size_exp)
28569 min_size = INTVAL (min_size_exp);
28570 if (max_size_exp)
28571 max_size = INTVAL (max_size_exp);
28572 if (probable_max_size_exp)
28573 probable_max_size = INTVAL (probable_max_size_exp);
28574 if (CONST_INT_P (expected_size_exp))
28575 expected_size = INTVAL (expected_size_exp);
28578 /* Make sure we don't need to care about overflow later on. */
28579 if (count > (HOST_WIDE_INT_1U << 30))
28580 return false;
28582 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
28583 if (!issetmem)
28584 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
28586 /* Step 0: Decide on preferred algorithm, desired alignment and
28587 size of chunks to be copied by main loop. */
28588 alg = decide_alg (count, expected_size, min_size, probable_max_size,
28589 issetmem,
28590 issetmem && val_exp == const0_rtx, have_as,
28591 &dynamic_check, &noalign, false);
28592 if (alg == libcall)
28593 return false;
28594 gcc_assert (alg != no_stringop);
28596 /* For now the vector version of memset is generated only for memory zeroing, as
28597 creating the promoted vector value is very cheap in this case. */
28598 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
28599 alg = unrolled_loop;
28601 if (!count)
28602 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
28603 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
28604 if (!issetmem)
28605 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
28607 unroll_factor = 1;
28608 move_mode = word_mode;
28609 switch (alg)
28611 case libcall:
28612 case no_stringop:
28613 case last_alg:
28614 gcc_unreachable ();
28615 case loop_1_byte:
28616 need_zero_guard = true;
28617 move_mode = QImode;
28618 break;
28619 case loop:
28620 need_zero_guard = true;
28621 break;
28622 case unrolled_loop:
28623 need_zero_guard = true;
28624 unroll_factor = (TARGET_64BIT ? 4 : 2);
28625 break;
28626 case vector_loop:
28627 need_zero_guard = true;
28628 unroll_factor = 4;
28629 /* Find the widest supported mode. */
28630 move_mode = word_mode;
28631 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
28632 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
28633 move_mode = wider_mode;
28635 /* Find the corresponding vector mode with the same size as MOVE_MODE.
28636 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
28637 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
28639 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
28640 move_mode = mode_for_vector (word_mode, nunits);
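/* E.g. a 16-byte TImode MOVE_MODE on x86-64 is turned into the
   two-element V2DImode vector mode here. */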
28641 if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
28642 move_mode = word_mode;
28644 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
28645 break;
28646 case rep_prefix_8_byte:
28647 move_mode = DImode;
28648 break;
28649 case rep_prefix_4_byte:
28650 move_mode = SImode;
28651 break;
28652 case rep_prefix_1_byte:
28653 move_mode = QImode;
28654 break;
28656 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
28657 epilogue_size_needed = size_needed;
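/* SIZE_NEEDED is the number of bytes handled by one iteration of the main
   loop; the epilogue must be able to cover any remainder smaller than that. */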
28659 /* If we are going to call any library calls conditionally, make sure any
28660 pending stack adjustments happen before the first conditional branch;
28661 otherwise they will be emitted before the library call only and won't
28662 happen on the other branches. */
28663 if (dynamic_check != -1)
28664 do_pending_stack_adjust ();
28666 desired_align = decide_alignment (align, alg, expected_size, move_mode);
28667 if (!TARGET_ALIGN_STRINGOPS || noalign)
28668 align = desired_align;
28670 /* Step 1: Prologue guard. */
28672 /* Alignment code needs count to be in register. */
28673 if (CONST_INT_P (count_exp) && desired_align > align)
28675 if (INTVAL (count_exp) > desired_align
28676 && INTVAL (count_exp) > size_needed)
28678 align_bytes
28679 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
28680 if (align_bytes <= 0)
28681 align_bytes = 0;
28682 else
28683 align_bytes = desired_align - align_bytes;
28685 if (align_bytes == 0)
28686 count_exp = force_reg (counter_mode (count_exp), count_exp);
28688 gcc_assert (desired_align >= 1 && align >= 1);
28690 /* Misaligned move sequences handle both prologue and epilogue at once.
28691 Default code generation results in smaller code for large alignments
28692 and also avoids redundant work when sizes are known precisely. */
28693 misaligned_prologue_used
28694 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
28695 && MAX (desired_align, epilogue_size_needed) <= 32
28696 && desired_align <= epilogue_size_needed
28697 && ((desired_align > align && !align_bytes)
28698 || (!count && epilogue_size_needed > 1)));
28700 /* Do the cheap promotion to allow better CSE across the
28701 main loop and epilogue (i.e. one load of the big constant in
28702 front of all code).
28703 For now the misaligned move sequences do not have a fast path
28704 without broadcasting. */
28705 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
28707 if (alg == vector_loop)
28709 gcc_assert (val_exp == const0_rtx);
28710 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
28711 promoted_val = promote_duplicated_reg_to_size (val_exp,
28712 GET_MODE_SIZE (word_mode),
28713 desired_align, align);
28715 else
28717 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
28718 desired_align, align);
28721 /* Misaligned move sequences handle both prologues and epilogues at once.
28722 Default code generation results in smaller code for large alignments and
28723 also avoids redundant work when sizes are known precisely. */
28724 if (misaligned_prologue_used)
28726 /* The misaligned move prologue handles small blocks by itself. */
28727 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
28728 (dst, src, &destreg, &srcreg,
28729 move_mode, promoted_val, vec_promoted_val,
28730 &count_exp,
28731 &jump_around_label,
28732 desired_align < align
28733 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
28734 desired_align, align, &min_size, dynamic_check, issetmem);
28735 if (!issetmem)
28736 src = change_address (src, BLKmode, srcreg);
28737 dst = change_address (dst, BLKmode, destreg);
28738 set_mem_align (dst, desired_align * BITS_PER_UNIT);
28739 epilogue_size_needed = 0;
28740 if (need_zero_guard
28741 && min_size < (unsigned HOST_WIDE_INT) size_needed)
28743 /* It is possible that we copied enough so the main loop will not
28744 execute. */
28745 gcc_assert (size_needed > 1);
28746 if (jump_around_label == NULL_RTX)
28747 jump_around_label = gen_label_rtx ();
28748 emit_cmp_and_jump_insns (count_exp,
28749 GEN_INT (size_needed),
28750 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
28751 if (expected_size == -1
28752 || expected_size < (desired_align - align) / 2 + size_needed)
28753 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28754 else
28755 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28758 /* Ensure that alignment prologue won't copy past end of block. */
28759 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
28761 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
28762 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
28763 Make sure it is a power of 2. */
28764 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
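/* E.g. SIZE_NEEDED == 16 and DESIRED_ALIGN - ALIGN == 3 give
   MAX (15, 3) == 15, which becomes 16 here. */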
28766 /* To improve performance of small blocks, we jump around the VAL
28767 promoting code. This means that if the promoted VAL is not constant,
28768 we might not use it in the epilogue and have to use the byte
28769 loop variant. */
28770 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
28771 force_loopy_epilogue = true;
28772 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28773 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28775 /* If main algorithm works on QImode, no epilogue is needed.
28776 For small sizes just don't align anything. */
28777 if (size_needed == 1)
28778 desired_align = align;
28779 else
28780 goto epilogue;
28782 else if (!count
28783 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28785 label = gen_label_rtx ();
28786 emit_cmp_and_jump_insns (count_exp,
28787 GEN_INT (epilogue_size_needed),
28788 LTU, 0, counter_mode (count_exp), 1, label);
28789 if (expected_size == -1 || expected_size < epilogue_size_needed)
28790 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28791 else
28792 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28796 /* Emit code to decide on runtime whether library call or inline should be
28797 used. */
28798 if (dynamic_check != -1)
28800 if (!issetmem && CONST_INT_P (count_exp))
28802 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
28804 emit_block_copy_via_libcall (dst, src, count_exp);
28805 count_exp = const0_rtx;
28806 goto epilogue;
28809 else
28811 rtx_code_label *hot_label = gen_label_rtx ();
28812 if (jump_around_label == NULL_RTX)
28813 jump_around_label = gen_label_rtx ();
28814 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
28815 LEU, 0, counter_mode (count_exp),
28816 1, hot_label);
28817 predict_jump (REG_BR_PROB_BASE * 90 / 100);
28818 if (issetmem)
28819 set_storage_via_libcall (dst, count_exp, val_exp);
28820 else
28821 emit_block_copy_via_libcall (dst, src, count_exp);
28822 emit_jump (jump_around_label);
28823 emit_label (hot_label);
28827 /* Step 2: Alignment prologue. */
28828 /* Do the expensive promotion once we branched off the small blocks. */
28829 if (issetmem && !promoted_val)
28830 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
28831 desired_align, align);
28833 if (desired_align > align && !misaligned_prologue_used)
28835 if (align_bytes == 0)
28837 /* Except for the first move in the prologue, we no longer know
28838 the constant offset in the aliasing info. It doesn't seem worth
28839 the pain to maintain it for the first move, so throw away
28840 the info early. */
28841 dst = change_address (dst, BLKmode, destreg);
28842 if (!issetmem)
28843 src = change_address (src, BLKmode, srcreg);
28844 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
28845 promoted_val, vec_promoted_val,
28846 count_exp, align, desired_align,
28847 issetmem);
28848 /* At most desired_align - align bytes are copied. */
28849 if (min_size < (unsigned)(desired_align - align))
28850 min_size = 0;
28851 else
28852 min_size -= desired_align - align;
28854 else
28856 /* If we know how many bytes need to be stored before dst is
28857 sufficiently aligned, maintain aliasing info accurately. */
28858 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
28859 srcreg,
28860 promoted_val,
28861 vec_promoted_val,
28862 desired_align,
28863 align_bytes,
28864 issetmem);
28866 count_exp = plus_constant (counter_mode (count_exp),
28867 count_exp, -align_bytes);
28868 count -= align_bytes;
28869 min_size -= align_bytes;
28870 max_size -= align_bytes;
28872 if (need_zero_guard
28873 && min_size < (unsigned HOST_WIDE_INT) size_needed
28874 && (count < (unsigned HOST_WIDE_INT) size_needed
28875 || (align_bytes == 0
28876 && count < ((unsigned HOST_WIDE_INT) size_needed
28877 + desired_align - align))))
28879 /* It is possible that we copied enough so the main loop will not
28880 execute. */
28881 gcc_assert (size_needed > 1);
28882 if (label == NULL_RTX)
28883 label = gen_label_rtx ();
28884 emit_cmp_and_jump_insns (count_exp,
28885 GEN_INT (size_needed),
28886 LTU, 0, counter_mode (count_exp), 1, label);
28887 if (expected_size == -1
28888 || expected_size < (desired_align - align) / 2 + size_needed)
28889 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28890 else
28891 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28894 if (label && size_needed == 1)
28896 emit_label (label);
28897 LABEL_NUSES (label) = 1;
28898 label = NULL;
28899 epilogue_size_needed = 1;
28900 if (issetmem)
28901 promoted_val = val_exp;
28903 else if (label == NULL_RTX && !misaligned_prologue_used)
28904 epilogue_size_needed = size_needed;
28906 /* Step 3: Main loop. */
28908 switch (alg)
28910 case libcall:
28911 case no_stringop:
28912 case last_alg:
28913 gcc_unreachable ();
28914 case loop_1_byte:
28915 case loop:
28916 case unrolled_loop:
28917 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
28918 count_exp, move_mode, unroll_factor,
28919 expected_size, issetmem);
28920 break;
28921 case vector_loop:
28922 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
28923 vec_promoted_val, count_exp, move_mode,
28924 unroll_factor, expected_size, issetmem);
28925 break;
28926 case rep_prefix_8_byte:
28927 case rep_prefix_4_byte:
28928 case rep_prefix_1_byte:
28929 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
28930 val_exp, count_exp, move_mode, issetmem);
28931 break;
28933 /* Properly adjust the offsets of src and dest memory for aliasing. */
28934 if (CONST_INT_P (count_exp))
28936 if (!issetmem)
28937 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
28938 (count / size_needed) * size_needed);
28939 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
28940 (count / size_needed) * size_needed);
28942 else
28944 if (!issetmem)
28945 src = change_address (src, BLKmode, srcreg);
28946 dst = change_address (dst, BLKmode, destreg);
28949 /* Step 4: Epilogue to copy the remaining bytes. */
28950 epilogue:
28951 if (label)
28953 /* When the main loop is done, COUNT_EXP might hold the original count,
28954 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
28955 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
28956 bytes. Compensate if needed. */
28958 if (size_needed < epilogue_size_needed)
28960 tmp =
28961 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
28962 GEN_INT (size_needed - 1), count_exp, 1,
28963 OPTAB_DIRECT);
28964 if (tmp != count_exp)
28965 emit_move_insn (count_exp, tmp);
28967 emit_label (label);
28968 LABEL_NUSES (label) = 1;
28971 if (count_exp != const0_rtx && epilogue_size_needed > 1)
28973 if (force_loopy_epilogue)
28974 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
28975 epilogue_size_needed);
28976 else
28978 if (issetmem)
28979 expand_setmem_epilogue (dst, destreg, promoted_val,
28980 vec_promoted_val, count_exp,
28981 epilogue_size_needed);
28982 else
28983 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
28984 epilogue_size_needed);
28987 if (jump_around_label)
28988 emit_label (jump_around_label);
28989 return true;
28993 /* Expand the appropriate insns for doing strlen if not just doing
28994 repnz; scasb
28996 out = result, initialized with the start address
28997 align_rtx = alignment of the address.
28998 scratch = scratch register, initialized with the start address when
28999 not aligned, otherwise undefined
29001 This is just the body. It needs the initializations mentioned above and
29002 some address computing at the end. These things are done in i386.md. */
29004 static void
29005 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
29007 int align;
29008 rtx tmp;
29009 rtx_code_label *align_2_label = NULL;
29010 rtx_code_label *align_3_label = NULL;
29011 rtx_code_label *align_4_label = gen_label_rtx ();
29012 rtx_code_label *end_0_label = gen_label_rtx ();
29013 rtx mem;
29014 rtx tmpreg = gen_reg_rtx (SImode);
29015 rtx scratch = gen_reg_rtx (SImode);
29016 rtx cmp;
29018 align = 0;
29019 if (CONST_INT_P (align_rtx))
29020 align = INTVAL (align_rtx);
29022 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
29024 /* Is there a known alignment and is it less than 4? */
29025 if (align < 4)
29027 rtx scratch1 = gen_reg_rtx (Pmode);
29028 emit_move_insn (scratch1, out);
29029 /* Is there a known alignment and is it not 2? */
29030 if (align != 2)
29032 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
29033 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
29035 /* Leave just the 3 lower bits. */
29036 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
29037 NULL_RTX, 0, OPTAB_WIDEN);
29039 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
29040 Pmode, 1, align_4_label);
29041 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
29042 Pmode, 1, align_2_label);
29043 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
29044 Pmode, 1, align_3_label);
29046 else
29048 /* Since the alignment is 2, we have to check 2 or 0 bytes;
29049 check whether it is aligned to 4 bytes. */
29051 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
29052 NULL_RTX, 0, OPTAB_WIDEN);
29054 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
29055 Pmode, 1, align_4_label);
29058 mem = change_address (src, QImode, out);
29060 /* Now compare the bytes. */
29062 /* Compare the first n unaligned bytes on a byte-per-byte basis. */
29063 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
29064 QImode, 1, end_0_label);
29066 /* Increment the address. */
29067 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
29069 /* Not needed with an alignment of 2 */
29070 if (align != 2)
29072 emit_label (align_2_label);
29074 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
29075 end_0_label);
29077 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
29079 emit_label (align_3_label);
29082 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
29083 end_0_label);
29085 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
29088 /* Generate a loop to check 4 bytes at a time. Aligning this loop is not
29089 a good idea: it only makes the program larger and does not help
29090 speed it up. */
29091 emit_label (align_4_label);
29093 mem = change_address (src, SImode, out);
29094 emit_move_insn (scratch, mem);
29095 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
29097 /* This formula yields a nonzero result iff one of the bytes is zero.
29098 This saves three branches inside the loop and many cycles. */
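/* Concretely, TMPREG becomes (x - 0x01010101) & ~x & 0x80808080 for the
   loaded word x: subtracting 1 wraps a zero byte to 0xff, setting its 0x80
   bit, while ~x masks out bytes whose top bit was already set, so the
   result is nonzero exactly when x contains a zero byte. */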
29100 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
29101 emit_insn (gen_one_cmplsi2 (scratch, scratch));
29102 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
29103 emit_insn (gen_andsi3 (tmpreg, tmpreg,
29104 gen_int_mode (0x80808080, SImode)));
29105 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
29106 align_4_label);
29108 if (TARGET_CMOVE)
29110 rtx reg = gen_reg_rtx (SImode);
29111 rtx reg2 = gen_reg_rtx (Pmode);
29112 emit_move_insn (reg, tmpreg);
29113 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
29115 /* If zero is not in the first two bytes, move two bytes forward. */
29116 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
29117 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
29118 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
29119 emit_insn (gen_rtx_SET (tmpreg,
29120 gen_rtx_IF_THEN_ELSE (SImode, tmp,
29121 reg,
29122 tmpreg)));
29123 /* Emit lea manually to avoid clobbering of flags. */
29124 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
29126 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
29127 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
29128 emit_insn (gen_rtx_SET (out,
29129 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
29130 reg2,
29131 out)));
29133 else
29135 rtx_code_label *end_2_label = gen_label_rtx ();
29136 /* Is zero in the first two bytes? */
29138 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
29139 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
29140 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
29141 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
29142 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
29143 pc_rtx);
29144 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
29145 JUMP_LABEL (tmp) = end_2_label;
29147 /* Not in the first two. Move two bytes forward. */
29148 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
29149 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
29151 emit_label (end_2_label);
29155 /* Avoid branch in fixing the byte. */
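/* TMPREG's low byte has its high bit set exactly when the first byte of
   the remaining pair is the zero byte; the QImode add below copies that
   bit into the carry flag, and the sbb then subtracts 4 or 3 from OUT so
   that OUT ends up pointing at the terminating zero byte. */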
29156 tmpreg = gen_lowpart (QImode, tmpreg);
29157 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
29158 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
29159 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
29160 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
29162 emit_label (end_0_label);
29165 /* Expand strlen. */
29167 bool
29168 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
29170 rtx addr, scratch1, scratch2, scratch3, scratch4;
29172 /* The generic case of the strlen expander is long. Avoid its
29173 expansion unless TARGET_INLINE_ALL_STRINGOPS. */
29175 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
29176 && !TARGET_INLINE_ALL_STRINGOPS
29177 && !optimize_insn_for_size_p ()
29178 && (!CONST_INT_P (align) || INTVAL (align) < 4))
29179 return false;
29181 addr = force_reg (Pmode, XEXP (src, 0));
29182 scratch1 = gen_reg_rtx (Pmode);
29184 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
29185 && !optimize_insn_for_size_p ())
29187 /* Well it seems that some optimizer does not combine a call like
29188 foo(strlen(bar), strlen(bar));
29189 when the move and the subtraction are done here. It does calculate
29190 the length just once when these instructions are done inside of
29191 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
29192 often used and I use one fewer register for the lifetime of
29193 output_strlen_unroll() this is better. */
29195 emit_move_insn (out, addr);
29197 ix86_expand_strlensi_unroll_1 (out, src, align);
29199 /* strlensi_unroll_1 returns the address of the zero at the end of
29200 the string, like memchr(), so compute the length by subtracting
29201 the start address. */
29202 emit_insn (ix86_gen_sub3 (out, out, addr));
29204 else
29206 rtx unspec;
29208 /* Can't use this if the user has appropriated eax, ecx, or edi. */
29209 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
29210 return false;
29211 /* Can't use this for non-default address spaces. */
29212 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
29213 return false;
29215 scratch2 = gen_reg_rtx (Pmode);
29216 scratch3 = gen_reg_rtx (Pmode);
29217 scratch4 = force_reg (Pmode, constm1_rtx);
29219 emit_move_insn (scratch3, addr);
29220 eoschar = force_reg (QImode, eoschar);
29222 src = replace_equiv_address_nv (src, scratch3);
29224 /* If .md starts supporting :P, this can be done in .md. */
29225 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
29226 scratch4), UNSPEC_SCAS);
29227 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
29228 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
29229 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
29231 return true;
29234 /* For a given symbol (function) construct code to compute the address of its
29235 PLT entry in the large x86-64 PIC model. */
29236 static rtx
29237 construct_plt_address (rtx symbol)
29239 rtx tmp, unspec;
29241 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
29242 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
29243 gcc_assert (Pmode == DImode);
29245 tmp = gen_reg_rtx (Pmode);
29246 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
29248 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
29249 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
29250 return tmp;
29254 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
29255 rtx callarg2,
29256 rtx pop, bool sibcall)
29258 rtx vec[3];
29259 rtx use = NULL, call;
29260 unsigned int vec_len = 0;
29261 tree fndecl;
29263 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
29265 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
29266 if (fndecl
29267 && (lookup_attribute ("interrupt",
29268 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
29269 error ("interrupt service routine can't be called directly");
29271 else
29272 fndecl = NULL_TREE;
29274 if (pop == const0_rtx)
29275 pop = NULL;
29276 gcc_assert (!TARGET_64BIT || !pop);
29278 if (TARGET_MACHO && !TARGET_64BIT)
29280 #if TARGET_MACHO
29281 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
29282 fnaddr = machopic_indirect_call_target (fnaddr);
29283 #endif
29285 else
29287 /* Static functions and indirect calls don't need the pic register. Also,
29288 check whether the PLT was explicitly avoided via -fno-plt or the "noplt"
29289 attribute, making it an indirect call. */
29290 rtx addr = XEXP (fnaddr, 0);
29291 if (flag_pic
29292 && GET_CODE (addr) == SYMBOL_REF
29293 && !SYMBOL_REF_LOCAL_P (addr))
29295 if (flag_plt
29296 && (SYMBOL_REF_DECL (addr) == NULL_TREE
29297 || !lookup_attribute ("noplt",
29298 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
29300 if (!TARGET_64BIT
29301 || (ix86_cmodel == CM_LARGE_PIC
29302 && DEFAULT_ABI != MS_ABI))
29304 use_reg (&use, gen_rtx_REG (Pmode,
29305 REAL_PIC_OFFSET_TABLE_REGNUM));
29306 if (ix86_use_pseudo_pic_reg ())
29307 emit_move_insn (gen_rtx_REG (Pmode,
29308 REAL_PIC_OFFSET_TABLE_REGNUM),
29309 pic_offset_table_rtx);
29312 else if (!TARGET_PECOFF && !TARGET_MACHO)
29314 if (TARGET_64BIT)
29316 fnaddr = gen_rtx_UNSPEC (Pmode,
29317 gen_rtvec (1, addr),
29318 UNSPEC_GOTPCREL);
29319 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
29321 else
29323 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
29324 UNSPEC_GOT);
29325 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
29326 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
29327 fnaddr);
29329 fnaddr = gen_const_mem (Pmode, fnaddr);
29330 /* Pmode may not be the same as word_mode for x32, which
29331 doesn't support indirect branch via 32-bit memory slot.
29332 Since x32 GOT slot is 64 bit with zero upper 32 bits,
29333 indirect branch via x32 GOT slot is OK. */
29334 if (GET_MODE (fnaddr) != word_mode)
29335 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
29336 fnaddr = gen_rtx_MEM (QImode, fnaddr);
29341 /* Skip setting up RAX register for -mskip-rax-setup when there are no
29342 parameters passed in vector registers. */
29343 if (TARGET_64BIT
29344 && (INTVAL (callarg2) > 0
29345 || (INTVAL (callarg2) == 0
29346 && (TARGET_SSE || !flag_skip_rax_setup))))
29348 rtx al = gen_rtx_REG (QImode, AX_REG);
29349 emit_move_insn (al, callarg2);
29350 use_reg (&use, al);
29353 if (ix86_cmodel == CM_LARGE_PIC
29354 && !TARGET_PECOFF
29355 && MEM_P (fnaddr)
29356 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
29357 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
29358 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
29359 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
29360 branch via x32 GOT slot is OK. */
29361 else if (!(TARGET_X32
29362 && MEM_P (fnaddr)
29363 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
29364 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
29365 && (sibcall
29366 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
29367 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
29369 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
29370 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
29373 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
29375 if (retval)
29377 /* We should add bounds as a destination register in case
29378 a pointer with bounds may be returned. */
29379 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
29381 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
29382 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
29383 if (GET_CODE (retval) == PARALLEL)
29385 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
29386 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
29387 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
29388 retval = chkp_join_splitted_slot (retval, par);
29390 else
29392 retval = gen_rtx_PARALLEL (VOIDmode,
29393 gen_rtvec (3, retval, b0, b1));
29394 chkp_put_regs_to_expr_list (retval);
29398 call = gen_rtx_SET (retval, call);
29400 vec[vec_len++] = call;
29402 if (pop)
29404 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
29405 pop = gen_rtx_SET (stack_pointer_rtx, pop);
29406 vec[vec_len++] = pop;
29409 if (cfun->machine->no_caller_saved_registers
29410 && (!fndecl
29411 || (!TREE_THIS_VOLATILE (fndecl)
29412 && !lookup_attribute ("no_caller_saved_registers",
29413 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
29415 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
29416 bool is_64bit_ms_abi = (TARGET_64BIT
29417 && ix86_function_abi (fndecl) == MS_ABI);
29418 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
29420 /* If there are no caller-saved registers, add all registers
29421 that are clobbered by the call which returns. */
29422 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
29423 if (!fixed_regs[i]
29424 && (ix86_call_used_regs[i] == 1
29425 || (ix86_call_used_regs[i] & c_mask))
29426 && !STACK_REGNO_P (i)
29427 && !MMX_REGNO_P (i))
29428 clobber_reg (&use,
29429 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
29431 else if (TARGET_64BIT_MS_ABI
29432 && (!callarg2 || INTVAL (callarg2) != -2))
29434 unsigned i;
29436 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
29438 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
29439 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
29441 clobber_reg (&use, gen_rtx_REG (mode, regno));
29444 /* Set here, but it may get cleared later. */
29445 if (TARGET_CALL_MS2SYSV_XLOGUES)
29447 if (!TARGET_SSE)
29450 /* Don't break hot-patched functions. */
29451 else if (ix86_function_ms_hook_prologue (current_function_decl))
29454 /* TODO: Cases not yet examined. */
29455 else if (flag_split_stack)
29456 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
29458 else
29460 gcc_assert (!reload_completed);
29461 cfun->machine->call_ms2sysv = true;
29466 if (vec_len > 1)
29467 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
29468 call = emit_call_insn (call);
29469 if (use)
29470 CALL_INSN_FUNCTION_USAGE (call) = use;
29472 return call;
29475 /* Return true if the function being called was marked with attribute
29476 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
29477 to handle the non-PIC case in the backend because there is no easy
29478 interface for the front-end to force non-PLT calls to use the GOT.
29479 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
29480 to call the function marked "noplt" indirectly. */
29482 static bool
29483 ix86_nopic_noplt_attribute_p (rtx call_op)
29485 if (flag_pic || ix86_cmodel == CM_LARGE
29486 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
29487 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
29488 || SYMBOL_REF_LOCAL_P (call_op))
29489 return false;
29491 tree symbol_decl = SYMBOL_REF_DECL (call_op);
29493 if (!flag_plt
29494 || (symbol_decl != NULL_TREE
29495 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
29496 return true;
29498 return false;
29501 /* Output the assembly for a call instruction. */
29503 const char *
29504 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
29506 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
29507 bool seh_nop_p = false;
29508 const char *xasm;
29510 if (SIBLING_CALL_P (insn))
29512 if (direct_p)
29514 if (ix86_nopic_noplt_attribute_p (call_op))
29516 if (TARGET_64BIT)
29517 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29518 else
29519 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29521 else
29522 xasm = "%!jmp\t%P0";
29524 /* SEH epilogue detection requires the indirect branch case
29525 to include REX.W. */
29526 else if (TARGET_SEH)
29527 xasm = "%!rex.W jmp\t%A0";
29528 else
29529 xasm = "%!jmp\t%A0";
29531 output_asm_insn (xasm, &call_op);
29532 return "";
29535 /* SEH unwinding can require an extra nop to be emitted in several
29536 circumstances. Determine if we have one of those. */
29537 if (TARGET_SEH)
29539 rtx_insn *i;
29541 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
29543 /* If we get to another real insn, we don't need the nop. */
29544 if (INSN_P (i))
29545 break;
29547 /* If we get to the epilogue note, prevent a catch region from
29548 being adjacent to the standard epilogue sequence. If non-
29549 call-exceptions, we'll have done this during epilogue emission. */
29550 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
29551 && !flag_non_call_exceptions
29552 && !can_throw_internal (insn))
29554 seh_nop_p = true;
29555 break;
29559 /* If we didn't find a real insn following the call, prevent the
29560 unwinder from looking into the next function. */
29561 if (i == NULL)
29562 seh_nop_p = true;
29565 if (direct_p)
29567 if (ix86_nopic_noplt_attribute_p (call_op))
29569 if (TARGET_64BIT)
29570 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29571 else
29572 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29574 else
29575 xasm = "%!call\t%P0";
29577 else
29578 xasm = "%!call\t%A0";
29580 output_asm_insn (xasm, &call_op);
29582 if (seh_nop_p)
29583 return "nop";
29585 return "";
29588 /* Clear stack slot assignments remembered from previous functions.
29589 This is called from INIT_EXPANDERS once before RTL is emitted for each
29590 function. */
29592 static struct machine_function *
29593 ix86_init_machine_status (void)
29595 struct machine_function *f;
29597 f = ggc_cleared_alloc<machine_function> ();
29598 f->call_abi = ix86_abi;
29600 return f;
29603 /* Return a MEM corresponding to a stack slot with mode MODE.
29604 Allocate a new slot if necessary.
29606 The RTL for a function can have several slots available: N is
29607 which slot to use. */
29610 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
29612 struct stack_local_entry *s;
29614 gcc_assert (n < MAX_386_STACK_LOCALS);
29616 for (s = ix86_stack_locals; s; s = s->next)
29617 if (s->mode == mode && s->n == n)
29618 return validize_mem (copy_rtx (s->rtl));
29620 s = ggc_alloc<stack_local_entry> ();
29621 s->n = n;
29622 s->mode = mode;
29623 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
29625 s->next = ix86_stack_locals;
29626 ix86_stack_locals = s;
29627 return validize_mem (copy_rtx (s->rtl));
29630 static void
29631 ix86_instantiate_decls (void)
29633 struct stack_local_entry *s;
29635 for (s = ix86_stack_locals; s; s = s->next)
29636 if (s->rtl != NULL_RTX)
29637 instantiate_decl_rtl (s->rtl);
29640 /* Return the number used for encoding REG, in the range 0..7. */
29642 static int
29643 reg_encoded_number (rtx reg)
29645 unsigned regno = REGNO (reg);
29646 switch (regno)
29648 case AX_REG:
29649 return 0;
29650 case CX_REG:
29651 return 1;
29652 case DX_REG:
29653 return 2;
29654 case BX_REG:
29655 return 3;
29656 case SP_REG:
29657 return 4;
29658 case BP_REG:
29659 return 5;
29660 case SI_REG:
29661 return 6;
29662 case DI_REG:
29663 return 7;
29664 default:
29665 break;
29667 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
29668 return regno - FIRST_STACK_REG;
29669 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
29670 return regno - FIRST_SSE_REG;
29671 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
29672 return regno - FIRST_MMX_REG;
29673 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
29674 return regno - FIRST_REX_SSE_REG;
29675 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
29676 return regno - FIRST_REX_INT_REG;
29677 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
29678 return regno - FIRST_MASK_REG;
29679 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
29680 return regno - FIRST_BND_REG;
29681 return -1;
29684 /* Given an insn INSN with NOPERANDS OPERANDS, return the modr/m byte used
29685 in its encoding if it could be relevant for ROP mitigation, otherwise
29686 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
29687 used for calculating it into them. */
29689 static int
29690 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
29691 int *popno0 = 0, int *popno1 = 0)
29693 if (asm_noperands (PATTERN (insn)) >= 0)
29694 return -1;
29695 int has_modrm = get_attr_modrm (insn);
29696 if (!has_modrm)
29697 return -1;
29698 enum attr_modrm_class cls = get_attr_modrm_class (insn);
29699 rtx op0, op1;
29700 switch (cls)
29702 case MODRM_CLASS_OP02:
29703 gcc_assert (noperands >= 3);
29704 if (popno0)
29706 *popno0 = 0;
29707 *popno1 = 2;
29709 op0 = operands[0];
29710 op1 = operands[2];
29711 break;
29712 case MODRM_CLASS_OP01:
29713 gcc_assert (noperands >= 2);
29714 if (popno0)
29716 *popno0 = 0;
29717 *popno1 = 1;
29719 op0 = operands[0];
29720 op1 = operands[1];
29721 break;
29722 default:
29723 return -1;
29725 if (REG_P (op0) && REG_P (op1))
29727 int enc0 = reg_encoded_number (op0);
29728 int enc1 = reg_encoded_number (op1);
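/* Register-direct form of the modrm byte: mod field 0b11 (0xc0), reg
   field taken from OP1, r/m field taken from OP0. */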
29729 return 0xc0 + (enc1 << 3) + enc0;
29731 return -1;
29734 /* Check whether x86 address PARTS is a pc-relative address. */
29736 static bool
29737 rip_relative_addr_p (struct ix86_address *parts)
29739 rtx base, index, disp;
29741 base = parts->base;
29742 index = parts->index;
29743 disp = parts->disp;
29745 if (disp && !base && !index)
29747 if (TARGET_64BIT)
29749 rtx symbol = disp;
29751 if (GET_CODE (disp) == CONST)
29752 symbol = XEXP (disp, 0);
29753 if (GET_CODE (symbol) == PLUS
29754 && CONST_INT_P (XEXP (symbol, 1)))
29755 symbol = XEXP (symbol, 0);
29757 if (GET_CODE (symbol) == LABEL_REF
29758 || (GET_CODE (symbol) == SYMBOL_REF
29759 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
29760 || (GET_CODE (symbol) == UNSPEC
29761 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
29762 || XINT (symbol, 1) == UNSPEC_PCREL
29763 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
29764 return true;
29767 return false;
29770 /* Calculate the length of the memory address in the instruction encoding.
29771 Includes the addr32 prefix; does not include the one-byte modrm, opcode,
29772 or other prefixes. We never generate the addr32 prefix for LEA insns. */
29775 memory_address_length (rtx addr, bool lea)
29777 struct ix86_address parts;
29778 rtx base, index, disp;
29779 int len;
29780 int ok;
29782 if (GET_CODE (addr) == PRE_DEC
29783 || GET_CODE (addr) == POST_INC
29784 || GET_CODE (addr) == PRE_MODIFY
29785 || GET_CODE (addr) == POST_MODIFY)
29786 return 0;
29788 ok = ix86_decompose_address (addr, &parts);
29789 gcc_assert (ok);
29791 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
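/* A non-default segment needs a one-byte segment override prefix. */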
29793 /* If this is not LEA instruction, add the length of addr32 prefix. */
29794 if (TARGET_64BIT && !lea
29795 && (SImode_address_operand (addr, VOIDmode)
29796 || (parts.base && GET_MODE (parts.base) == SImode)
29797 || (parts.index && GET_MODE (parts.index) == SImode)))
29798 len++;
29800 base = parts.base;
29801 index = parts.index;
29802 disp = parts.disp;
29804 if (base && SUBREG_P (base))
29805 base = SUBREG_REG (base);
29806 if (index && SUBREG_P (index))
29807 index = SUBREG_REG (index);
29809 gcc_assert (base == NULL_RTX || REG_P (base));
29810 gcc_assert (index == NULL_RTX || REG_P (index));
29812 /* Rule of thumb:
29813 - esp as the base always wants an index,
29814 - ebp as the base always wants a displacement,
29815 - r12 as the base always wants an index,
29816 - r13 as the base always wants a displacement. */
29818 /* Register Indirect. */
29819 if (base && !index && !disp)
29821 /* esp (for its index) and ebp (for its displacement) need
29822 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
29823 code. */
29824 if (base == arg_pointer_rtx
29825 || base == frame_pointer_rtx
29826 || REGNO (base) == SP_REG
29827 || REGNO (base) == BP_REG
29828 || REGNO (base) == R12_REG
29829 || REGNO (base) == R13_REG)
29830 len++;
29833 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
29834 is not disp32, but disp32(%rip), so for disp32
29835 SIB byte is needed, unless print_operand_address
29836 optimizes it into disp32(%rip) or (%rip) is implied
29837 by UNSPEC. */
29838 else if (disp && !base && !index)
29840 len += 4;
29841 if (!rip_relative_addr_p (&parts))
29842 len++;
29844 else
29846 /* Find the length of the displacement constant. */
29847 if (disp)
29849 if (base && satisfies_constraint_K (disp))
29850 len += 1;
29851 else
29852 len += 4;
29854 /* ebp always wants a displacement. Similarly r13. */
29855 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
29856 len++;
29858 /* An index requires the two-byte modrm form.... */
29859 if (index
29860 /* ...like esp (or r12), which always wants an index. */
29861 || base == arg_pointer_rtx
29862 || base == frame_pointer_rtx
29863 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
29864 len++;
29867 return len;
29870 /* Compute default value for "length_immediate" attribute. When SHORTFORM
29871 is set, expect that the insn has an 8-bit immediate alternative. */
29873 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
29875 int len = 0;
29876 int i;
29877 extract_insn_cached (insn);
29878 for (i = recog_data.n_operands - 1; i >= 0; --i)
29879 if (CONSTANT_P (recog_data.operand[i]))
29881 enum attr_mode mode = get_attr_mode (insn);
29883 gcc_assert (!len);
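/* With a short-form alternative, a constant that fits in a sign-extended
   8-bit immediate needs only a single byte. */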
29884 if (shortform && CONST_INT_P (recog_data.operand[i]))
29886 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
29887 switch (mode)
29889 case MODE_QI:
29890 len = 1;
29891 continue;
29892 case MODE_HI:
29893 ival = trunc_int_for_mode (ival, HImode);
29894 break;
29895 case MODE_SI:
29896 ival = trunc_int_for_mode (ival, SImode);
29897 break;
29898 default:
29899 break;
29901 if (IN_RANGE (ival, -128, 127))
29903 len = 1;
29904 continue;
29907 switch (mode)
29909 case MODE_QI:
29910 len = 1;
29911 break;
29912 case MODE_HI:
29913 len = 2;
29914 break;
29915 case MODE_SI:
29916 len = 4;
29917 break;
29918 /* Immediates for DImode instructions are encoded
29919 as 32-bit sign-extended values. */
29920 case MODE_DI:
29921 len = 4;
29922 break;
29923 default:
29924 fatal_insn ("unknown insn mode", insn);
29927 return len;
29930 /* Compute default value for "length_address" attribute. */
29932 ix86_attr_length_address_default (rtx_insn *insn)
29934 int i;
29936 if (get_attr_type (insn) == TYPE_LEA)
29938 rtx set = PATTERN (insn), addr;
29940 if (GET_CODE (set) == PARALLEL)
29941 set = XVECEXP (set, 0, 0);
29943 gcc_assert (GET_CODE (set) == SET);
29945 addr = SET_SRC (set);
29947 return memory_address_length (addr, true);
29950 extract_insn_cached (insn);
29951 for (i = recog_data.n_operands - 1; i >= 0; --i)
29953 rtx op = recog_data.operand[i];
29954 if (MEM_P (op))
29956 constrain_operands_cached (insn, reload_completed);
29957 if (which_alternative != -1)
29959 const char *constraints = recog_data.constraints[i];
29960 int alt = which_alternative;
29962 while (*constraints == '=' || *constraints == '+')
29963 constraints++;
29964 while (alt-- > 0)
29965 while (*constraints++ != ',')
29967 /* Skip ignored operands. */
29968 if (*constraints == 'X')
29969 continue;
29972 int len = memory_address_length (XEXP (op, 0), false);
29974 /* Account for segment prefix for non-default addr spaces. */
29975 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
29976 len++;
29978 return len;
29981 return 0;
29984 /* Compute default value for "length_vex" attribute. It includes
29985 2 or 3 byte VEX prefix and 1 opcode byte. */
29988 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
29989 bool has_vex_w)
29991 int i;
29993 /* Only the 0f opcode can use the 2-byte VEX prefix, and the VEX W bit
29994 requires the 3-byte VEX prefix. */
29995 if (!has_0f_opcode || has_vex_w)
29996 return 3 + 1;
29998 /* We can always use 2 byte VEX prefix in 32bit. */
29999 if (!TARGET_64BIT)
30000 return 2 + 1;
30002 extract_insn_cached (insn);
30004 for (i = recog_data.n_operands - 1; i >= 0; --i)
30005 if (REG_P (recog_data.operand[i]))
30007 /* REX.W bit uses 3 byte VEX prefix. */
30008 if (GET_MODE (recog_data.operand[i]) == DImode
30009 && GENERAL_REG_P (recog_data.operand[i]))
30010 return 3 + 1;
30012 else
30014 /* REX.X or REX.B bits use 3 byte VEX prefix. */
30015 if (MEM_P (recog_data.operand[i])
30016 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
30017 return 3 + 1;
30020 return 2 + 1;
30023 /* Return the maximum number of instructions a cpu can issue. */
30025 static int
30026 ix86_issue_rate (void)
30028 switch (ix86_tune)
30030 case PROCESSOR_PENTIUM:
30031 case PROCESSOR_LAKEMONT:
30032 case PROCESSOR_BONNELL:
30033 case PROCESSOR_SILVERMONT:
30034 case PROCESSOR_KNL:
30035 case PROCESSOR_INTEL:
30036 case PROCESSOR_K6:
30037 case PROCESSOR_BTVER2:
30038 case PROCESSOR_PENTIUM4:
30039 case PROCESSOR_NOCONA:
30040 return 2;
30042 case PROCESSOR_PENTIUMPRO:
30043 case PROCESSOR_ATHLON:
30044 case PROCESSOR_K8:
30045 case PROCESSOR_AMDFAM10:
30046 case PROCESSOR_GENERIC:
30047 case PROCESSOR_BTVER1:
30048 return 3;
30050 case PROCESSOR_BDVER1:
30051 case PROCESSOR_BDVER2:
30052 case PROCESSOR_BDVER3:
30053 case PROCESSOR_BDVER4:
30054 case PROCESSOR_ZNVER1:
30055 case PROCESSOR_CORE2:
30056 case PROCESSOR_NEHALEM:
30057 case PROCESSOR_SANDYBRIDGE:
30058 case PROCESSOR_HASWELL:
30059 return 4;
30061 default:
30062 return 1;
30066 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
30067 by DEP_INSN and nothing else set by DEP_INSN. */
30069 static bool
30070 ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
30072 rtx set, set2;
30074 /* Simplify the test for uninteresting insns. */
30075 if (insn_type != TYPE_SETCC
30076 && insn_type != TYPE_ICMOV
30077 && insn_type != TYPE_FCMOV
30078 && insn_type != TYPE_IBR)
30079 return false;
30081 if ((set = single_set (dep_insn)) != 0)
30083 set = SET_DEST (set);
30084 set2 = NULL_RTX;
30086 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
30087 && XVECLEN (PATTERN (dep_insn), 0) == 2
30088 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
30089 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
30091 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
30092 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
30094 else
30095 return false;
30097 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
30098 return false;
30100 /* This test is true if the dependent insn reads the flags but
30101 not any other potentially set register. */
30102 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
30103 return false;
30105 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
30106 return false;
30108 return true;
30111 /* Return true iff USE_INSN has a memory address with operands set by
30112 SET_INSN. */
30114 bool
30115 ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
30117 int i;
30118 extract_insn_cached (use_insn);
30119 for (i = recog_data.n_operands - 1; i >= 0; --i)
30120 if (MEM_P (recog_data.operand[i]))
30122 rtx addr = XEXP (recog_data.operand[i], 0);
30123 if (modified_in_p (addr, set_insn) != 0)
30125 /* No AGI stall if SET_INSN is a push or pop and USE_INSN
30126 has SP based memory (unless index reg is modified in a pop). */
30127 rtx set = single_set (set_insn);
30128 if (set
30129 && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
30130 || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
30132 struct ix86_address parts;
30133 if (ix86_decompose_address (addr, &parts)
30134 && parts.base == stack_pointer_rtx
30135 && (parts.index == NULL_RTX
30136 || MEM_P (SET_DEST (set))
30137 || !modified_in_p (parts.index, set_insn)))
30138 return false;
30140 return true;
30142 return false;
30144 return false;
30147 /* Helper function for exact_store_load_dependency.
30148 Return true if addr is found in insn. */
30149 static bool
30150 exact_dependency_1 (rtx addr, rtx insn)
30152 enum rtx_code code;
30153 const char *format_ptr;
30154 int i, j;
30156 code = GET_CODE (insn);
30157 switch (code)
30159 case MEM:
30160 if (rtx_equal_p (addr, insn))
30161 return true;
30162 break;
30163 case REG:
30164 CASE_CONST_ANY:
30165 case SYMBOL_REF:
30166 case CODE_LABEL:
30167 case PC:
30168 case CC0:
30169 case EXPR_LIST:
30170 return false;
30171 default:
30172 break;
30175 format_ptr = GET_RTX_FORMAT (code);
30176 for (i = 0; i < GET_RTX_LENGTH (code); i++)
30178 switch (*format_ptr++)
30180 case 'e':
30181 if (exact_dependency_1 (addr, XEXP (insn, i)))
30182 return true;
30183 break;
30184 case 'E':
30185 for (j = 0; j < XVECLEN (insn, i); j++)
30186 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
30187 return true;
30188 break;
30191 return false;
30194 /* Return true if there exists an exact dependency between store and load,
30195 i.e. the same memory address is used in them. */
30196 static bool
30197 exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
30199 rtx set1, set2;
30201 set1 = single_set (store);
30202 if (!set1)
30203 return false;
30204 if (!MEM_P (SET_DEST (set1)))
30205 return false;
30206 set2 = single_set (load);
30207 if (!set2)
30208 return false;
30209 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
30210 return true;
30211 return false;
30214 static int
30215 ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
30216 unsigned int)
30218 enum attr_type insn_type, dep_insn_type;
30219 enum attr_memory memory;
30220 rtx set, set2;
30221 int dep_insn_code_number;
30223 /* Anti and output dependencies have zero cost on all CPUs. */
30224 if (dep_type != 0)
30225 return 0;
30227 dep_insn_code_number = recog_memoized (dep_insn);
30229 /* If we can't recognize the insns, we can't really do anything. */
30230 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
30231 return cost;
30233 insn_type = get_attr_type (insn);
30234 dep_insn_type = get_attr_type (dep_insn);
30236 switch (ix86_tune)
30238 case PROCESSOR_PENTIUM:
30239 case PROCESSOR_LAKEMONT:
30240 /* Address Generation Interlock adds a cycle of latency. */
30241 if (insn_type == TYPE_LEA)
30243 rtx addr = PATTERN (insn);
30245 if (GET_CODE (addr) == PARALLEL)
30246 addr = XVECEXP (addr, 0, 0);
30248 gcc_assert (GET_CODE (addr) == SET);
30250 addr = SET_SRC (addr);
30251 if (modified_in_p (addr, dep_insn))
30252 cost += 1;
30254 else if (ix86_agi_dependent (dep_insn, insn))
30255 cost += 1;
30257 /* ??? Compares pair with jump/setcc. */
30258 if (ix86_flags_dependent (insn, dep_insn, insn_type))
30259 cost = 0;
30261 /* Floating point stores require value to be ready one cycle earlier. */
30262 if (insn_type == TYPE_FMOV
30263 && get_attr_memory (insn) == MEMORY_STORE
30264 && !ix86_agi_dependent (dep_insn, insn))
30265 cost += 1;
30266 break;
30268 case PROCESSOR_PENTIUMPRO:
30269 /* INT->FP conversion is expensive. */
30270 if (get_attr_fp_int_src (dep_insn))
30271 cost += 5;
30273 /* There is one cycle extra latency between an FP op and a store. */
30274 if (insn_type == TYPE_FMOV
30275 && (set = single_set (dep_insn)) != NULL_RTX
30276 && (set2 = single_set (insn)) != NULL_RTX
30277 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
30278 && MEM_P (SET_DEST (set2)))
30279 cost += 1;
30281 memory = get_attr_memory (insn);
30283 /* Show the ability of the reorder buffer to hide the latency of a load
30284 by executing it in parallel with the previous instruction when the
30285 previous instruction is not needed to compute the address. */
30286 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30287 && !ix86_agi_dependent (dep_insn, insn))
30289 /* Claim moves to take one cycle, as the core can issue one load
30290 at a time and the next load can start a cycle later. */
30291 if (dep_insn_type == TYPE_IMOV
30292 || dep_insn_type == TYPE_FMOV)
30293 cost = 1;
30294 else if (cost > 1)
30295 cost--;
30297 break;
30299 case PROCESSOR_K6:
30300 /* The esp dependency is resolved before
30301 the instruction is really finished. */
30302 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
30303 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
30304 return 1;
30306 /* INT->FP conversion is expensive. */
30307 if (get_attr_fp_int_src (dep_insn))
30308 cost += 5;
30310 memory = get_attr_memory (insn);
30312 /* Show the ability of the reorder buffer to hide the latency of a load
30313 by executing it in parallel with the previous instruction when the
30314 previous instruction is not needed to compute the address. */
30315 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30316 && !ix86_agi_dependent (dep_insn, insn))
30318 /* Claim moves to take one cycle, as the core can issue one load
30319 at a time and the next load can start a cycle later. */
30320 if (dep_insn_type == TYPE_IMOV
30321 || dep_insn_type == TYPE_FMOV)
30322 cost = 1;
30323 else if (cost > 2)
30324 cost -= 2;
30325 else
30326 cost = 1;
30328 break;
30330 case PROCESSOR_AMDFAM10:
30331 case PROCESSOR_BDVER1:
30332 case PROCESSOR_BDVER2:
30333 case PROCESSOR_BDVER3:
30334 case PROCESSOR_BDVER4:
30335 case PROCESSOR_ZNVER1:
30336 case PROCESSOR_BTVER1:
30337 case PROCESSOR_BTVER2:
30338 case PROCESSOR_GENERIC:
30339 /* The stack engine allows push&pop instructions to execute in parallel. */
30340 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
30341 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
30342 return 0;
30343 /* FALLTHRU */
30345 case PROCESSOR_ATHLON:
30346 case PROCESSOR_K8:
30347 memory = get_attr_memory (insn);
30349 /* Show the ability of the reorder buffer to hide the latency of a load
30350 by executing it in parallel with the previous instruction when the
30351 previous instruction is not needed to compute the address. */
30352 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30353 && !ix86_agi_dependent (dep_insn, insn))
30355 enum attr_unit unit = get_attr_unit (insn);
30356 int loadcost = 3;
30358 /* Because of the difference between the length of integer and
30359 floating unit pipeline preparation stages, the memory operands
30360 for floating point are cheaper.
30362 ??? For Athlon the difference is most probably 2. */
30363 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
30364 loadcost = 3;
30365 else
30366 loadcost = TARGET_ATHLON ? 2 : 0;
30368 if (cost >= loadcost)
30369 cost -= loadcost;
30370 else
30371 cost = 0;
30373 break;
30375 case PROCESSOR_CORE2:
30376 case PROCESSOR_NEHALEM:
30377 case PROCESSOR_SANDYBRIDGE:
30378 case PROCESSOR_HASWELL:
30379 /* The stack engine allows push&pop instructions to execute in parallel. */
30380 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
30381 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
30382 return 0;
30384 memory = get_attr_memory (insn);
30386 /* Show the ability of the reorder buffer to hide the latency of a load
30387 by executing it in parallel with the previous instruction when the
30388 previous instruction is not needed to compute the address. */
30389 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30390 && !ix86_agi_dependent (dep_insn, insn))
30392 if (cost >= 4)
30393 cost -= 4;
30394 else
30395 cost = 0;
30397 break;
30399 case PROCESSOR_SILVERMONT:
30400 case PROCESSOR_KNL:
30401 case PROCESSOR_INTEL:
30402 if (!reload_completed)
30403 return cost;
30405 /* Increase cost of integer loads. */
30406 memory = get_attr_memory (dep_insn);
30407 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30409 enum attr_unit unit = get_attr_unit (dep_insn);
30410 if (unit == UNIT_INTEGER && cost == 1)
30412 if (memory == MEMORY_LOAD)
30413 cost = 3;
30414 else
30416 /* Increase cost of ld/st for short int types only
30417 because of store forwarding issue. */
30418 rtx set = single_set (dep_insn);
30419 if (set && (GET_MODE (SET_DEST (set)) == QImode
30420 || GET_MODE (SET_DEST (set)) == HImode))
30422 /* Increase cost of store/load insn if exact
30423 dependence exists and it is load insn. */
30424 enum attr_memory insn_memory = get_attr_memory (insn);
30425 if (insn_memory == MEMORY_LOAD
30426 && exact_store_load_dependency (dep_insn, insn))
30427 cost = 3;
30433 default:
30434 break;
30437 return cost;
30440 /* How many alternative schedules to try. This should be as wide as the
30441 scheduling freedom in the DFA, but no wider. Making this value too
30442 large results in extra work for the scheduler. */
30444 static int
30445 ia32_multipass_dfa_lookahead (void)
30447 switch (ix86_tune)
30449 case PROCESSOR_PENTIUM:
30450 case PROCESSOR_LAKEMONT:
30451 return 2;
30453 case PROCESSOR_PENTIUMPRO:
30454 case PROCESSOR_K6:
30455 return 1;
30457 case PROCESSOR_BDVER1:
30458 case PROCESSOR_BDVER2:
30459 case PROCESSOR_BDVER3:
30460 case PROCESSOR_BDVER4:
30461 /* We use lookahead value 4 for BD both before and after reload
30462 schedules. Plan is to have value 8 included for O3. */
30463 return 4;
30465 case PROCESSOR_CORE2:
30466 case PROCESSOR_NEHALEM:
30467 case PROCESSOR_SANDYBRIDGE:
30468 case PROCESSOR_HASWELL:
30469 case PROCESSOR_BONNELL:
30470 case PROCESSOR_SILVERMONT:
30471 case PROCESSOR_KNL:
30472 case PROCESSOR_INTEL:
30473 /* Generally, we want haifa-sched:max_issue() to look ahead as far as
30474 the number of instructions that can be executed in a cycle, i.e.,
30475 issue_rate. I wonder why tuning for many CPUs does not do this. */
30476 if (reload_completed)
30477 return ix86_issue_rate ();
30478 /* Don't use lookahead for pre-reload schedule to save compile time. */
30479 return 0;
30481 default:
30482 return 0;
30486 /* Return true if target platform supports macro-fusion. */
30488 static bool
30489 ix86_macro_fusion_p ()
30491 return TARGET_FUSE_CMP_AND_BRANCH;
30494 /* Check whether the current microarchitecture supports macro fusion
30495 for insn pair "CONDGEN + CONDJMP". Refer to
30496 "Intel Architectures Optimization Reference Manual". */
30498 static bool
30499 ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
30501 rtx src, dest;
30502 enum rtx_code ccode;
30503 rtx compare_set = NULL_RTX, test_if, cond;
30504 rtx alu_set = NULL_RTX, addr = NULL_RTX;
30506 if (!any_condjump_p (condjmp))
30507 return false;
30509 unsigned int condreg1, condreg2;
30510 rtx cc_reg_1;
30511 ix86_fixed_condition_code_regs (&condreg1, &condreg2);
30512 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
30513 if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
30514 || !condgen
30515 || !modified_in_p (cc_reg_1, condgen))
30516 return false;
30518 if (get_attr_type (condgen) != TYPE_TEST
30519 && get_attr_type (condgen) != TYPE_ICMP
30520 && get_attr_type (condgen) != TYPE_INCDEC
30521 && get_attr_type (condgen) != TYPE_ALU)
30522 return false;
30524 compare_set = single_set (condgen);
30525 if (compare_set == NULL_RTX
30526 && !TARGET_FUSE_ALU_AND_BRANCH)
30527 return false;
30529 if (compare_set == NULL_RTX)
30531 int i;
30532 rtx pat = PATTERN (condgen);
30533 for (i = 0; i < XVECLEN (pat, 0); i++)
30534 if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
30536 rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
30537 if (GET_CODE (set_src) == COMPARE)
30538 compare_set = XVECEXP (pat, 0, i);
30539 else
30540 alu_set = XVECEXP (pat, 0, i);
30543 if (compare_set == NULL_RTX)
30544 return false;
30545 src = SET_SRC (compare_set);
30546 if (GET_CODE (src) != COMPARE)
30547 return false;
30549 /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
30550 supported. */
30551 if ((MEM_P (XEXP (src, 0))
30552 && CONST_INT_P (XEXP (src, 1)))
30553 || (MEM_P (XEXP (src, 1))
30554 && CONST_INT_P (XEXP (src, 0))))
30555 return false;
30557 /* No fusion for RIP-relative address. */
30558 if (MEM_P (XEXP (src, 0)))
30559 addr = XEXP (XEXP (src, 0), 0);
30560 else if (MEM_P (XEXP (src, 1)))
30561 addr = XEXP (XEXP (src, 1), 0);
30563 if (addr) {
30564 ix86_address parts;
30565 int ok = ix86_decompose_address (addr, &parts);
30566 gcc_assert (ok);
30568 if (rip_relative_addr_p (&parts))
30569 return false;
30572 test_if = SET_SRC (pc_set (condjmp));
30573 cond = XEXP (test_if, 0);
30574 ccode = GET_CODE (cond);
30575 /* Check whether the conditional jump uses the Sign or Overflow flags. */
30576 if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
30577 && (ccode == GE
30578 || ccode == GT
30579 || ccode == LE
30580 || ccode == LT))
30581 return false;
30583 /* Return true for TYPE_TEST and TYPE_ICMP. */
30584 if (get_attr_type (condgen) == TYPE_TEST
30585 || get_attr_type (condgen) == TYPE_ICMP)
30586 return true;
30588 /* The following handles the case of macro-fusion for alu + jmp. */
30589 if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
30590 return false;
30592 /* No fusion for alu op with memory destination operand. */
30593 dest = SET_DEST (alu_set);
30594 if (MEM_P (dest))
30595 return false;
30597 /* Macro-fusion for inc/dec + unsigned conditional jump is not
30598 supported. */
30599 if (get_attr_type (condgen) == TYPE_INCDEC
30600 && (ccode == GEU
30601 || ccode == GTU
30602 || ccode == LEU
30603 || ccode == LTU))
30604 return false;
30606 return true;
30609 /* Try to reorder ready list to take advantage of Atom pipelined IMUL
30610 execution. It is applied if
30611 (1) IMUL instruction is on the top of list;
30612 (2) There exists exactly one producer of an independent IMUL
30613 instruction in the ready list.
30614 Return index of IMUL producer if it was found and -1 otherwise. */
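/* Illustrative sketch (assumption): with a ready list such as

       ... , load r2, ... , imul r1 = r0 * r3     (top of list)

   where "load r2" is the sole producer feeding a second, independent
   imul r4 = r2 * r5, the load is moved to the top so Bonnell's pipelined
   multiplier can start the next IMUL on a following cycle.  */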
30615 static int
30616 do_reorder_for_imul (rtx_insn **ready, int n_ready)
30618 rtx_insn *insn;
30619 rtx set, insn1, insn2;
30620 sd_iterator_def sd_it;
30621 dep_t dep;
30622 int index = -1;
30623 int i;
30625 if (!TARGET_BONNELL)
30626 return index;
30628 /* Check that IMUL instruction is on the top of ready list. */
30629 insn = ready[n_ready - 1];
30630 set = single_set (insn);
30631 if (!set)
30632 return index;
30633 if (!(GET_CODE (SET_SRC (set)) == MULT
30634 && GET_MODE (SET_SRC (set)) == SImode))
30635 return index;
30637 /* Search for producer of independent IMUL instruction. */
30638 for (i = n_ready - 2; i >= 0; i--)
30640 insn = ready[i];
30641 if (!NONDEBUG_INSN_P (insn))
30642 continue;
30643 /* Skip IMUL instruction. */
30644 insn2 = PATTERN (insn);
30645 if (GET_CODE (insn2) == PARALLEL)
30646 insn2 = XVECEXP (insn2, 0, 0);
30647 if (GET_CODE (insn2) == SET
30648 && GET_CODE (SET_SRC (insn2)) == MULT
30649 && GET_MODE (SET_SRC (insn2)) == SImode)
30650 continue;
30652 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
30654 rtx con;
30655 con = DEP_CON (dep);
30656 if (!NONDEBUG_INSN_P (con))
30657 continue;
30658 insn1 = PATTERN (con);
30659 if (GET_CODE (insn1) == PARALLEL)
30660 insn1 = XVECEXP (insn1, 0, 0);
30662 if (GET_CODE (insn1) == SET
30663 && GET_CODE (SET_SRC (insn1)) == MULT
30664 && GET_MODE (SET_SRC (insn1)) == SImode)
30666 sd_iterator_def sd_it1;
30667 dep_t dep1;
30668 /* Check if there is no other dependee for IMUL. */
30669 index = i;
30670 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
30672 rtx pro;
30673 pro = DEP_PRO (dep1);
30674 if (!NONDEBUG_INSN_P (pro))
30675 continue;
30676 if (pro != insn)
30677 index = -1;
30679 if (index >= 0)
30680 break;
30683 if (index >= 0)
30684 break;
30686 return index;
30689 /* Try to find the best candidate at the top of the ready list if two
30690 insns have the same priority - the candidate is best if its dependencies
30691 were scheduled earlier. Applied for Silvermont only.
30692 Return true if the top 2 insns must be interchanged. */
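/* Illustrative tie-break (assumption): when both candidates have equal
   priority and their producers completed on the same tick, a load such
   as "movl (%esi), %eax" is preferred at the top of the ready list over
   an ALU insn, so the load is issued as early as possible.  */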
30693 static bool
30694 swap_top_of_ready_list (rtx_insn **ready, int n_ready)
30696 rtx_insn *top = ready[n_ready - 1];
30697 rtx_insn *next = ready[n_ready - 2];
30698 rtx set;
30699 sd_iterator_def sd_it;
30700 dep_t dep;
30701 int clock1 = -1;
30702 int clock2 = -1;
30703 #define INSN_TICK(INSN) (HID (INSN)->tick)
30705 if (!TARGET_SILVERMONT && !TARGET_INTEL)
30706 return false;
30708 if (!NONDEBUG_INSN_P (top))
30709 return false;
30710 if (!NONJUMP_INSN_P (top))
30711 return false;
30712 if (!NONDEBUG_INSN_P (next))
30713 return false;
30714 if (!NONJUMP_INSN_P (next))
30715 return false;
30716 set = single_set (top);
30717 if (!set)
30718 return false;
30719 set = single_set (next);
30720 if (!set)
30721 return false;
30723 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
30725 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
30726 return false;
30727 /* Determine the winner more precisely. */
30728 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
30730 rtx pro;
30731 pro = DEP_PRO (dep);
30732 if (!NONDEBUG_INSN_P (pro))
30733 continue;
30734 if (INSN_TICK (pro) > clock1)
30735 clock1 = INSN_TICK (pro);
30737 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
30739 rtx pro;
30740 pro = DEP_PRO (dep);
30741 if (!NONDEBUG_INSN_P (pro))
30742 continue;
30743 if (INSN_TICK (pro) > clock2)
30744 clock2 = INSN_TICK (pro);
30747 if (clock1 == clock2)
30749 /* Determine winner - load must win. */
30750 enum attr_memory memory1, memory2;
30751 memory1 = get_attr_memory (top);
30752 memory2 = get_attr_memory (next);
30753 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
30754 return true;
30756 return (bool) (clock2 < clock1);
30758 return false;
30759 #undef INSN_TICK
30762 /* Perform possible reordering of the ready list for Atom/Silvermont only.
30763 Return issue rate. */
30764 static int
30765 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx_insn **ready,
30766 int *pn_ready, int clock_var)
30768 int issue_rate = -1;
30769 int n_ready = *pn_ready;
30770 int i;
30771 rtx_insn *insn;
30772 int index = -1;
30774 /* Set up issue rate. */
30775 issue_rate = ix86_issue_rate ();
30777 /* Do reordering for BONNELL/SILVERMONT only. */
30778 if (!TARGET_BONNELL && !TARGET_SILVERMONT && !TARGET_INTEL)
30779 return issue_rate;
30781 /* Nothing to do if ready list contains only 1 instruction. */
30782 if (n_ready <= 1)
30783 return issue_rate;
30785 /* Do reordering for the post-reload scheduler only. */
30786 if (!reload_completed)
30787 return issue_rate;
30789 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
30791 if (sched_verbose > 1)
30792 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
30793 INSN_UID (ready[index]));
30795 /* Put IMUL producer (ready[index]) at the top of ready list. */
30796 insn = ready[index];
30797 for (i = index; i < n_ready - 1; i++)
30798 ready[i] = ready[i + 1];
30799 ready[n_ready - 1] = insn;
30800 return issue_rate;
30803 /* Skip selective scheduling since HID is not populated in it. */
30804 if (clock_var != 0
30805 && !sel_sched_p ()
30806 && swap_top_of_ready_list (ready, n_ready))
30808 if (sched_verbose > 1)
30809 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
30810 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
30811 /* Swap 2 top elements of ready list. */
30812 insn = ready[n_ready - 1];
30813 ready[n_ready - 1] = ready[n_ready - 2];
30814 ready[n_ready - 2] = insn;
30816 return issue_rate;
30819 static bool
30820 ix86_class_likely_spilled_p (reg_class_t);
30822 /* Return true if the lhs of INSN is a HW function argument register, and
30823 set *IS_SPILLED to true if it is a likely spilled HW register. */
30824 static bool
30825 insn_is_function_arg (rtx insn, bool* is_spilled)
30827 rtx dst;
30829 if (!NONDEBUG_INSN_P (insn))
30830 return false;
30831 /* Call instructions are not movable, ignore them. */
30832 if (CALL_P (insn))
30833 return false;
30834 insn = PATTERN (insn);
30835 if (GET_CODE (insn) == PARALLEL)
30836 insn = XVECEXP (insn, 0, 0);
30837 if (GET_CODE (insn) != SET)
30838 return false;
30839 dst = SET_DEST (insn);
30840 if (REG_P (dst) && HARD_REGISTER_P (dst)
30841 && ix86_function_arg_regno_p (REGNO (dst)))
30843 /* Is it likely spilled HW register? */
30844 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
30845 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
30846 *is_spilled = true;
30847 return true;
30849 return false;
30852 /* Add output dependencies for a chain of adjacent function arguments, but
30853 only if there is a move to a likely spilled HW register. Return the first
30854 argument if at least one dependence was added, or NULL otherwise. */
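/* Illustrative example (assumed 64-bit calling sequence): for

       movl  $1, %edi              # argument in a likely spilled hard reg
       movl  $2, %esi              # next argument
       call  foo

   output dependencies are added along the chain of argument moves so the
   pre-reload scheduler keeps the argument moves together ahead of the
   call.  */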
30855 static rtx_insn *
30856 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
30858 rtx_insn *insn;
30859 rtx_insn *last = call;
30860 rtx_insn *first_arg = NULL;
30861 bool is_spilled = false;
30863 head = PREV_INSN (head);
30865 /* Find the argument-passing instruction nearest to the call. */
30866 while (true)
30868 last = PREV_INSN (last);
30869 if (last == head)
30870 return NULL;
30871 if (!NONDEBUG_INSN_P (last))
30872 continue;
30873 if (insn_is_function_arg (last, &is_spilled))
30874 break;
30875 return NULL;
30878 first_arg = last;
30879 while (true)
30881 insn = PREV_INSN (last);
30882 if (!INSN_P (insn))
30883 break;
30884 if (insn == head)
30885 break;
30886 if (!NONDEBUG_INSN_P (insn))
30888 last = insn;
30889 continue;
30891 if (insn_is_function_arg (insn, &is_spilled))
30893 /* Add an output dependence between two function arguments if the chain
30894 of output arguments contains likely spilled HW registers. */
30895 if (is_spilled)
30896 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
30897 first_arg = last = insn;
30899 else
30900 break;
30902 if (!is_spilled)
30903 return NULL;
30904 return first_arg;
30907 /* Add output or anti dependency from insn to first_arg to restrict its code
30908 motion. */
30909 static void
30910 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
30912 rtx set;
30913 rtx tmp;
30915 /* Add anti dependencies for bounds stores. */
30916 if (INSN_P (insn)
30917 && GET_CODE (PATTERN (insn)) == PARALLEL
30918 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
30919 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
30921 add_dependence (first_arg, insn, REG_DEP_ANTI);
30922 return;
30925 set = single_set (insn);
30926 if (!set)
30927 return;
30928 tmp = SET_DEST (set);
30929 if (REG_P (tmp))
30931 /* Add output dependency to the first function argument. */
30932 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
30933 return;
30935 /* Add anti dependency. */
30936 add_dependence (first_arg, insn, REG_DEP_ANTI);
30939 /* Avoid cross-block motion of a function argument by adding a dependency
30940 from the first non-jump instruction in bb. */
30941 static void
30942 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
30944 rtx_insn *insn = BB_END (bb);
30946 while (insn)
30948 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
30950 rtx set = single_set (insn);
30951 if (set)
30953 avoid_func_arg_motion (arg, insn);
30954 return;
30957 if (insn == BB_HEAD (bb))
30958 return;
30959 insn = PREV_INSN (insn);
30963 /* Hook for pre-reload schedule - avoid motion of function arguments
30964 passed in likely spilled HW registers. */
30965 static void
30966 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
30968 rtx_insn *insn;
30969 rtx_insn *first_arg = NULL;
30970 if (reload_completed)
30971 return;
30972 while (head != tail && DEBUG_INSN_P (head))
30973 head = NEXT_INSN (head);
30974 for (insn = tail; insn != head; insn = PREV_INSN (insn))
30975 if (INSN_P (insn) && CALL_P (insn))
30977 first_arg = add_parameter_dependencies (insn, head);
30978 if (first_arg)
30980 /* Add a dependee for the first argument to predecessors only if
30981 the region contains more than one block. */
30982 basic_block bb = BLOCK_FOR_INSN (insn);
30983 int rgn = CONTAINING_RGN (bb->index);
30984 int nr_blks = RGN_NR_BLOCKS (rgn);
30985 /* Skip trivial regions and region head blocks that can have
30986 predecessors outside of region. */
30987 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
30989 edge e;
30990 edge_iterator ei;
30992 /* Regions are SCCs with the exception of selective
30993 scheduling with pipelining of outer blocks enabled.
30994 So also check that immediate predecessors of a non-head
30995 block are in the same region. */
30996 FOR_EACH_EDGE (e, ei, bb->preds)
30998 /* Avoid creating loop-carried dependencies by using the
30999 topological ordering of the region. */
31000 if (rgn == CONTAINING_RGN (e->src->index)
31001 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
31002 add_dependee_for_func_arg (first_arg, e->src);
31005 insn = first_arg;
31006 if (insn == head)
31007 break;
31010 else if (first_arg)
31011 avoid_func_arg_motion (first_arg, insn);
31014 /* Hook for pre-reload schedule - set priority of moves from likely spilled
31015 HW registers to maximum, to schedule them as soon as possible. These are
31016 moves from function argument registers at the top of the function entry
31017 and moves from function return value registers after call. */
31018 static int
31019 ix86_adjust_priority (rtx_insn *insn, int priority)
31021 rtx set;
31023 if (reload_completed)
31024 return priority;
31026 if (!NONDEBUG_INSN_P (insn))
31027 return priority;
31029 set = single_set (insn);
31030 if (set)
31032 rtx tmp = SET_SRC (set);
31033 if (REG_P (tmp)
31034 && HARD_REGISTER_P (tmp)
31035 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
31036 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
31037 return current_sched_info->sched_max_insns_priority;
31040 return priority;
31043 /* Model decoder of Core 2/i7.
31044 Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
31045 track the instruction fetch block boundaries and make sure that long
31046 (9+ bytes) instructions are assigned to D0. */
31048 /* Maximum length of an insn that can be handled by
31049 a secondary decoder unit. '8' for Core 2/i7. */
31050 static int core2i7_secondary_decoder_max_insn_size;
31052 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
31053 '16' for Core 2/i7. */
31054 static int core2i7_ifetch_block_size;
31056 /* Maximum number of instructions decoder can handle per cycle.
31057 '6' for Core 2/i7. */
31058 static int core2i7_ifetch_block_max_insns;
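/* Worked example (assumption): with a 16-byte ifetch block and at most 6
   insns decoded per cycle, a ready 9-byte insn is filtered out whenever
   it would not be the first insn of the cycle (only decoder D0 handles
   insns longer than 8 bytes), and any insn is filtered out once the
   bytes already issued plus its own length would exceed 16.  */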
31060 typedef struct ix86_first_cycle_multipass_data_ *
31061 ix86_first_cycle_multipass_data_t;
31062 typedef const struct ix86_first_cycle_multipass_data_ *
31063 const_ix86_first_cycle_multipass_data_t;
31065 /* A variable to store target state across calls to max_issue within
31066 one cycle. */
31067 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
31068 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
31070 /* Initialize DATA. */
31071 static void
31072 core2i7_first_cycle_multipass_init (void *_data)
31074 ix86_first_cycle_multipass_data_t data
31075 = (ix86_first_cycle_multipass_data_t) _data;
31077 data->ifetch_block_len = 0;
31078 data->ifetch_block_n_insns = 0;
31079 data->ready_try_change = NULL;
31080 data->ready_try_change_size = 0;
31083 /* Advancing the cycle; reset ifetch block counts. */
31084 static void
31085 core2i7_dfa_post_advance_cycle (void)
31087 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
31089 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
31091 data->ifetch_block_len = 0;
31092 data->ifetch_block_n_insns = 0;
31095 static int min_insn_size (rtx_insn *);
31097 /* Filter out insns from ready_try that the core will not be able to issue
31098 on current cycle due to decoder. */
31099 static void
31100 core2i7_first_cycle_multipass_filter_ready_try
31101 (const_ix86_first_cycle_multipass_data_t data,
31102 signed char *ready_try, int n_ready, bool first_cycle_insn_p)
31104 while (n_ready--)
31106 rtx_insn *insn;
31107 int insn_size;
31109 if (ready_try[n_ready])
31110 continue;
31112 insn = get_ready_element (n_ready);
31113 insn_size = min_insn_size (insn);
31115 if (/* If this insn is too long for a secondary decoder ... */
31116 (!first_cycle_insn_p
31117 && insn_size > core2i7_secondary_decoder_max_insn_size)
31118 /* ... or it would not fit into the ifetch block ... */
31119 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
31120 /* ... or the decoder is full already ... */
31121 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
31122 /* ... mask the insn out. */
31124 ready_try[n_ready] = 1;
31126 if (data->ready_try_change)
31127 bitmap_set_bit (data->ready_try_change, n_ready);
31132 /* Prepare for a new round of multipass lookahead scheduling. */
31133 static void
31134 core2i7_first_cycle_multipass_begin (void *_data,
31135 signed char *ready_try, int n_ready,
31136 bool first_cycle_insn_p)
31138 ix86_first_cycle_multipass_data_t data
31139 = (ix86_first_cycle_multipass_data_t) _data;
31140 const_ix86_first_cycle_multipass_data_t prev_data
31141 = ix86_first_cycle_multipass_data;
31143 /* Restore the state from the end of the previous round. */
31144 data->ifetch_block_len = prev_data->ifetch_block_len;
31145 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
31147 /* Filter instructions that cannot be issued on current cycle due to
31148 decoder restrictions. */
31149 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
31150 first_cycle_insn_p);
31153 /* INSN is being issued in current solution. Account for its impact on
31154 the decoder model. */
31155 static void
31156 core2i7_first_cycle_multipass_issue (void *_data,
31157 signed char *ready_try, int n_ready,
31158 rtx_insn *insn, const void *_prev_data)
31160 ix86_first_cycle_multipass_data_t data
31161 = (ix86_first_cycle_multipass_data_t) _data;
31162 const_ix86_first_cycle_multipass_data_t prev_data
31163 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
31165 int insn_size = min_insn_size (insn);
31167 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
31168 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
31169 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
31170 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
31172 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
31173 if (!data->ready_try_change)
31175 data->ready_try_change = sbitmap_alloc (n_ready);
31176 data->ready_try_change_size = n_ready;
31178 else if (data->ready_try_change_size < n_ready)
31180 data->ready_try_change = sbitmap_resize (data->ready_try_change,
31181 n_ready, 0);
31182 data->ready_try_change_size = n_ready;
31184 bitmap_clear (data->ready_try_change);
31186 /* Filter out insns from ready_try that the core will not be able to issue
31187 on current cycle due to decoder. */
31188 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
31189 false);
31192 /* Revert the effect on ready_try. */
31193 static void
31194 core2i7_first_cycle_multipass_backtrack (const void *_data,
31195 signed char *ready_try,
31196 int n_ready ATTRIBUTE_UNUSED)
31198 const_ix86_first_cycle_multipass_data_t data
31199 = (const_ix86_first_cycle_multipass_data_t) _data;
31200 unsigned int i = 0;
31201 sbitmap_iterator sbi;
31203 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
31204 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
31206 ready_try[i] = 0;
31210 /* Save the result of multipass lookahead scheduling for the next round. */
31211 static void
31212 core2i7_first_cycle_multipass_end (const void *_data)
31214 const_ix86_first_cycle_multipass_data_t data
31215 = (const_ix86_first_cycle_multipass_data_t) _data;
31216 ix86_first_cycle_multipass_data_t next_data
31217 = ix86_first_cycle_multipass_data;
31219 if (data != NULL)
31221 next_data->ifetch_block_len = data->ifetch_block_len;
31222 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
31226 /* Deallocate target data. */
31227 static void
31228 core2i7_first_cycle_multipass_fini (void *_data)
31230 ix86_first_cycle_multipass_data_t data
31231 = (ix86_first_cycle_multipass_data_t) _data;
31233 if (data->ready_try_change)
31235 sbitmap_free (data->ready_try_change);
31236 data->ready_try_change = NULL;
31237 data->ready_try_change_size = 0;
31241 /* Prepare for scheduling pass. */
31242 static void
31243 ix86_sched_init_global (FILE *, int, int)
31245 /* Install scheduling hooks for current CPU. Some of these hooks are used
31246 in time-critical parts of the scheduler, so we only set them up when
31247 they are actually used. */
31248 switch (ix86_tune)
31250 case PROCESSOR_CORE2:
31251 case PROCESSOR_NEHALEM:
31252 case PROCESSOR_SANDYBRIDGE:
31253 case PROCESSOR_HASWELL:
31254 /* Do not perform multipass scheduling for pre-reload schedule
31255 to save compile time. */
31256 if (reload_completed)
31258 targetm.sched.dfa_post_advance_cycle
31259 = core2i7_dfa_post_advance_cycle;
31260 targetm.sched.first_cycle_multipass_init
31261 = core2i7_first_cycle_multipass_init;
31262 targetm.sched.first_cycle_multipass_begin
31263 = core2i7_first_cycle_multipass_begin;
31264 targetm.sched.first_cycle_multipass_issue
31265 = core2i7_first_cycle_multipass_issue;
31266 targetm.sched.first_cycle_multipass_backtrack
31267 = core2i7_first_cycle_multipass_backtrack;
31268 targetm.sched.first_cycle_multipass_end
31269 = core2i7_first_cycle_multipass_end;
31270 targetm.sched.first_cycle_multipass_fini
31271 = core2i7_first_cycle_multipass_fini;
31273 /* Set decoder parameters. */
31274 core2i7_secondary_decoder_max_insn_size = 8;
31275 core2i7_ifetch_block_size = 16;
31276 core2i7_ifetch_block_max_insns = 6;
31277 break;
31279 /* Fall through. */
31280 default:
31281 targetm.sched.dfa_post_advance_cycle = NULL;
31282 targetm.sched.first_cycle_multipass_init = NULL;
31283 targetm.sched.first_cycle_multipass_begin = NULL;
31284 targetm.sched.first_cycle_multipass_issue = NULL;
31285 targetm.sched.first_cycle_multipass_backtrack = NULL;
31286 targetm.sched.first_cycle_multipass_end = NULL;
31287 targetm.sched.first_cycle_multipass_fini = NULL;
31288 break;
31293 /* Compute the alignment given to a constant that is being placed in memory.
31294 EXP is the constant and ALIGN is the alignment that the object would
31295 ordinarily have.
31296 The value of this function is used instead of that alignment to align
31297 the object. */
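/* For illustration (assumption): a DFmode constant whose requested
   alignment is below 64 bits is raised to 64-bit alignment, and a string
   constant of at least 31 characters gets word alignment unless
   optimizing for size, per the checks below.  */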
31300 ix86_constant_alignment (tree exp, int align)
31302 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
31303 || TREE_CODE (exp) == INTEGER_CST)
31305 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
31306 return 64;
31307 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
31308 return 128;
31310 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
31311 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
31312 return BITS_PER_WORD;
31314 return align;
31317 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
31318 the data type, and ALIGN is the alignment that the object would
31319 ordinarily have. */
31321 static int
31322 iamcu_alignment (tree type, int align)
31324 machine_mode mode;
31326 if (align < 32 || TYPE_USER_ALIGN (type))
31327 return align;
31329 /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4
31330 bytes. */
31331 mode = TYPE_MODE (strip_array_types (type));
31332 switch (GET_MODE_CLASS (mode))
31334 case MODE_INT:
31335 case MODE_COMPLEX_INT:
31336 case MODE_COMPLEX_FLOAT:
31337 case MODE_FLOAT:
31338 case MODE_DECIMAL_FLOAT:
31339 return 32;
31340 default:
31341 return align;
31345 /* Compute the alignment for a static variable.
31346 TYPE is the data type, and ALIGN is the alignment that
31347 the object would ordinarily have. The value of this function is used
31348 instead of that alignment to align the object. */
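/* For illustration (assumption): an aggregate of 32 bytes or more keeps
   the 256-bit alignment that GCC 4.8 used to assume, with
   -malign-data=cacheline a large object may instead be aligned to the
   prefetch block (typically 512 bits), and on x86-64 any array of at
   least 16 bytes is given 128-bit alignment, per the code below.  */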
31351 ix86_data_alignment (tree type, int align, bool opt)
31353 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
31354 for symbols from other compilation units or symbols that don't need
31355 to bind locally. In order to preserve some ABI compatibility with
31356 those compilers, ensure we don't decrease alignment from what we
31357 used to assume. */
31359 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
31361 /* A data structure equal to or greater than the size of a cache line
31362 (64 bytes in the Pentium 4 and other recent Intel processors, including
31363 processors based on the Intel Core microarchitecture) should be aligned
31364 so that its base address is a multiple of the cache line size. */
31366 int max_align
31367 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
31369 if (max_align < BITS_PER_WORD)
31370 max_align = BITS_PER_WORD;
31372 switch (ix86_align_data_type)
31374 case ix86_align_data_type_abi: opt = false; break;
31375 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
31376 case ix86_align_data_type_cacheline: break;
31379 if (TARGET_IAMCU)
31380 align = iamcu_alignment (type, align);
31382 if (opt
31383 && AGGREGATE_TYPE_P (type)
31384 && TYPE_SIZE (type)
31385 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
31387 if (wi::geu_p (TYPE_SIZE (type), max_align_compat)
31388 && align < max_align_compat)
31389 align = max_align_compat;
31390 if (wi::geu_p (TYPE_SIZE (type), max_align)
31391 && align < max_align)
31392 align = max_align;
31395 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
31396 to a 16-byte boundary. */
31397 if (TARGET_64BIT)
31399 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
31400 && TYPE_SIZE (type)
31401 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
31402 && wi::geu_p (TYPE_SIZE (type), 128)
31403 && align < 128)
31404 return 128;
31407 if (!opt)
31408 return align;
31410 if (TREE_CODE (type) == ARRAY_TYPE)
31412 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
31413 return 64;
31414 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
31415 return 128;
31417 else if (TREE_CODE (type) == COMPLEX_TYPE)
31420 if (TYPE_MODE (type) == DCmode && align < 64)
31421 return 64;
31422 if ((TYPE_MODE (type) == XCmode
31423 || TYPE_MODE (type) == TCmode) && align < 128)
31424 return 128;
31426 else if ((TREE_CODE (type) == RECORD_TYPE
31427 || TREE_CODE (type) == UNION_TYPE
31428 || TREE_CODE (type) == QUAL_UNION_TYPE)
31429 && TYPE_FIELDS (type))
31431 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
31432 return 64;
31433 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
31434 return 128;
31436 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
31437 || TREE_CODE (type) == INTEGER_TYPE)
31439 if (TYPE_MODE (type) == DFmode && align < 64)
31440 return 64;
31441 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
31442 return 128;
31445 return align;
31448 /* Compute the alignment for a local variable or a stack slot. EXP is
31449 the data type or decl itself, MODE is the widest mode available and
31450 ALIGN is the alignment that the object would ordinarily have. The
31451 value of this macro is used instead of that alignment to align the
31452 object. */
31454 unsigned int
31455 ix86_local_alignment (tree exp, machine_mode mode,
31456 unsigned int align)
31458 tree type, decl;
31460 if (exp && DECL_P (exp))
31462 type = TREE_TYPE (exp);
31463 decl = exp;
31465 else
31467 type = exp;
31468 decl = NULL;
31471 /* Don't do dynamic stack realignment for long long objects with
31472 -mpreferred-stack-boundary=2. */
31473 if (!TARGET_64BIT
31474 && align == 64
31475 && ix86_preferred_stack_boundary < 64
31476 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
31477 && (!type || !TYPE_USER_ALIGN (type))
31478 && (!decl || !DECL_USER_ALIGN (decl)))
31479 align = 32;
31481 /* If TYPE is NULL, we are allocating a stack slot for caller-save
31482 register in MODE. We will return the largest alignment of XF
31483 and DF. */
31484 if (!type)
31486 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
31487 align = GET_MODE_ALIGNMENT (DFmode);
31488 return align;
31491 /* Don't increase alignment for Intel MCU psABI. */
31492 if (TARGET_IAMCU)
31493 return align;
31495 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
31496 to a 16-byte boundary. The exact wording is:
31498 An array uses the same alignment as its elements, except that a local or
31499 global array variable of length at least 16 bytes or
31500 a C99 variable-length array variable always has alignment of at least 16 bytes.
31502 This was added to allow use of aligned SSE instructions on arrays. The
31503 rule is meant for static storage (where the compiler cannot do the analysis
31504 by itself). We follow it for automatic variables only when convenient.
31505 We fully control everything in the function being compiled, and functions
31506 from other units cannot rely on the alignment.
31508 Exclude the va_list type. It is the common case of a local array where
31509 we cannot benefit from the alignment.
31511 TODO: Probably one should optimize for size only when the variable does not escape. */
31512 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
31513 && TARGET_SSE)
31515 if (AGGREGATE_TYPE_P (type)
31516 && (va_list_type_node == NULL_TREE
31517 || (TYPE_MAIN_VARIANT (type)
31518 != TYPE_MAIN_VARIANT (va_list_type_node)))
31519 && TYPE_SIZE (type)
31520 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
31521 && wi::geu_p (TYPE_SIZE (type), 128)
31522 && align < 128)
31523 return 128;
31525 if (TREE_CODE (type) == ARRAY_TYPE)
31527 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
31528 return 64;
31529 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
31530 return 128;
31532 else if (TREE_CODE (type) == COMPLEX_TYPE)
31534 if (TYPE_MODE (type) == DCmode && align < 64)
31535 return 64;
31536 if ((TYPE_MODE (type) == XCmode
31537 || TYPE_MODE (type) == TCmode) && align < 128)
31538 return 128;
31540 else if ((TREE_CODE (type) == RECORD_TYPE
31541 || TREE_CODE (type) == UNION_TYPE
31542 || TREE_CODE (type) == QUAL_UNION_TYPE)
31543 && TYPE_FIELDS (type))
31545 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
31546 return 64;
31547 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
31548 return 128;
31550 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
31551 || TREE_CODE (type) == INTEGER_TYPE)
31554 if (TYPE_MODE (type) == DFmode && align < 64)
31555 return 64;
31556 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
31557 return 128;
31559 return align;
31562 /* Compute the minimum required alignment for dynamic stack realignment
31563 purposes for a local variable, parameter or a stack slot. EXP is
31564 the data type or decl itself, MODE is its mode and ALIGN is the
31565 alignment that the object would ordinarily have. */
31567 unsigned int
31568 ix86_minimum_alignment (tree exp, machine_mode mode,
31569 unsigned int align)
31571 tree type, decl;
31573 if (exp && DECL_P (exp))
31575 type = TREE_TYPE (exp);
31576 decl = exp;
31578 else
31580 type = exp;
31581 decl = NULL;
31584 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
31585 return align;
31587 /* Don't do dynamic stack realignment for long long objects with
31588 -mpreferred-stack-boundary=2. */
31589 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
31590 && (!type || !TYPE_USER_ALIGN (type))
31591 && (!decl || !DECL_USER_ALIGN (decl)))
31593 gcc_checking_assert (!TARGET_STV);
31594 return 32;
31597 return align;
31600 /* Find a location for the static chain incoming to a nested function.
31601 This is a register, unless all free registers are used by arguments. */
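/* Illustrative summary (assumption): in 64-bit mode the static chain
   lives in %r10; in 32-bit mode it defaults to %ecx, falls back to %eax
   for fastcall/thiscall functions, and for regparm(3) functions the
   trampoline pushes it on the stack instead, as detailed below.  */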
31603 static rtx
31604 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
31606 unsigned regno;
31608 /* While this function won't be called by the middle-end when a static
31609 chain isn't needed, it's also used throughout the backend so it's
31610 easiest to keep this check centralized. */
31611 if (DECL_P (fndecl_or_type) && !DECL_STATIC_CHAIN (fndecl_or_type))
31612 return NULL;
31614 if (TARGET_64BIT)
31616 /* We always use R10 in 64-bit mode. */
31617 regno = R10_REG;
31619 else
31621 const_tree fntype, fndecl;
31622 unsigned int ccvt;
31624 /* By default in 32-bit mode we use ECX to pass the static chain. */
31625 regno = CX_REG;
31627 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
31629 fntype = TREE_TYPE (fndecl_or_type);
31630 fndecl = fndecl_or_type;
31632 else
31634 fntype = fndecl_or_type;
31635 fndecl = NULL;
31638 ccvt = ix86_get_callcvt (fntype);
31639 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
31641 /* Fastcall functions use ecx/edx for arguments, which leaves
31642 us with EAX for the static chain.
31643 Thiscall functions use ecx for arguments, which also
31644 leaves us with EAX for the static chain. */
31645 regno = AX_REG;
31647 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
31649 /* Thiscall functions use ecx for arguments, which leaves
31650 us with EAX and EDX for the static chain.
31651 For ABI compatibility we use EAX. */
31652 regno = AX_REG;
31654 else if (ix86_function_regparm (fntype, fndecl) == 3)
31656 /* For regparm 3, we have no free call-clobbered registers in
31657 which to store the static chain. In order to implement this,
31658 we have the trampoline push the static chain to the stack.
31659 However, we can't push a value below the return address when
31660 we call the nested function directly, so we have to use an
31661 alternate entry point. For this we use ESI, and have the
31662 alternate entry point push ESI, so that things appear the
31663 same once we're executing the nested function. */
31664 if (incoming_p)
31666 if (fndecl == current_function_decl
31667 && !ix86_static_chain_on_stack)
31669 gcc_assert (!reload_completed);
31670 ix86_static_chain_on_stack = true;
31672 return gen_frame_mem (SImode,
31673 plus_constant (Pmode,
31674 arg_pointer_rtx, -8));
31676 regno = SI_REG;
31680 return gen_rtx_REG (Pmode, regno);
31683 /* Emit RTL insns to initialize the variable parts of a trampoline.
31684 FNDECL is the decl of the target address; M_TRAMP is a MEM for
31685 the trampoline, and CHAIN_VALUE is an RTX for the static chain
31686 to be passed to the target function. */
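/* Illustrative byte layout (assumption) of the 64-bit trampoline emitted
   below, when the target address does not fit the shorter movl form:

       49 bb <8-byte fnaddr>       movabs $fnaddr, %r11
       49 ba <8-byte chain>        movabs $chain,  %r10
       49 ff e3 90                 jmp *%r11; nop (pads the final store)

   The 32-bit variant instead stores a mov-immediate (or push) of the
   static chain followed by a relative jmp to the target.  */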
31688 static void
31689 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
31691 rtx mem, fnaddr;
31692 int opcode;
31693 int offset = 0;
31695 fnaddr = XEXP (DECL_RTL (fndecl), 0);
31697 if (TARGET_64BIT)
31699 int size;
31701 /* Load the function address to r11. Try to load address using
31702 the shorter movl instead of movabs. We may want to support
31703 movq for kernel mode, but kernel does not use trampolines at
31704 the moment. FNADDR is a 32-bit address and may not be in
31705 DImode when ptr_mode == SImode. Always use movl in this
31706 case. */
31707 if (ptr_mode == SImode
31708 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
31710 fnaddr = copy_addr_to_reg (fnaddr);
31712 mem = adjust_address (m_tramp, HImode, offset);
31713 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
31715 mem = adjust_address (m_tramp, SImode, offset + 2);
31716 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
31717 offset += 6;
31719 else
31721 mem = adjust_address (m_tramp, HImode, offset);
31722 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
31724 mem = adjust_address (m_tramp, DImode, offset + 2);
31725 emit_move_insn (mem, fnaddr);
31726 offset += 10;
31729 /* Load static chain using movabs to r10. Use the shorter movl
31730 instead of movabs when ptr_mode == SImode. */
31731 if (ptr_mode == SImode)
31733 opcode = 0xba41;
31734 size = 6;
31736 else
31738 opcode = 0xba49;
31739 size = 10;
31742 mem = adjust_address (m_tramp, HImode, offset);
31743 emit_move_insn (mem, gen_int_mode (opcode, HImode));
31745 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
31746 emit_move_insn (mem, chain_value);
31747 offset += size;
31749 /* Jump to r11; the last (unused) byte is a nop, only there to
31750 pad the write out to a single 32-bit store. */
31751 mem = adjust_address (m_tramp, SImode, offset);
31752 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
31753 offset += 4;
31755 else
31757 rtx disp, chain;
31759 /* Depending on the static chain location, either load a register
31760 with a constant, or push the constant to the stack. All of the
31761 instructions are the same size. */
31762 chain = ix86_static_chain (fndecl, true);
31763 if (REG_P (chain))
31765 switch (REGNO (chain))
31767 case AX_REG:
31768 opcode = 0xb8; break;
31769 case CX_REG:
31770 opcode = 0xb9; break;
31771 default:
31772 gcc_unreachable ();
31775 else
31776 opcode = 0x68;
31778 mem = adjust_address (m_tramp, QImode, offset);
31779 emit_move_insn (mem, gen_int_mode (opcode, QImode));
31781 mem = adjust_address (m_tramp, SImode, offset + 1);
31782 emit_move_insn (mem, chain_value);
31783 offset += 5;
31785 mem = adjust_address (m_tramp, QImode, offset);
31786 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
31788 mem = adjust_address (m_tramp, SImode, offset + 1);
31790 /* Compute offset from the end of the jmp to the target function.
31791 In the case in which the trampoline stores the static chain on
31792 the stack, we need to skip the first insn which pushes the
31793 (call-saved) register static chain; this push is 1 byte. */
31794 offset += 5;
31795 disp = expand_binop (SImode, sub_optab, fnaddr,
31796 plus_constant (Pmode, XEXP (m_tramp, 0),
31797 offset - (MEM_P (chain) ? 1 : 0)),
31798 NULL_RTX, 1, OPTAB_DIRECT);
31799 emit_move_insn (mem, disp);
31802 gcc_assert (offset <= TRAMPOLINE_SIZE);
31804 #ifdef HAVE_ENABLE_EXECUTE_STACK
31805 #ifdef CHECK_EXECUTE_STACK_ENABLED
31806 if (CHECK_EXECUTE_STACK_ENABLED)
31807 #endif
31808 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
31809 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
31810 #endif
31813 static bool
31814 ix86_allocate_stack_slots_for_args (void)
31816 /* Naked functions should not allocate stack slots for arguments. */
31817 return !ix86_function_naked (current_function_decl);
31820 static bool
31821 ix86_warn_func_return (tree decl)
31823 /* Naked functions are implemented entirely in assembly, including the
31824 return sequence, so suppress warnings about this. */
31825 return !ix86_function_naked (decl);
31828 /* The following file contains several enumerations and data structures
31829 built from the definitions in i386-builtin-types.def. */
31831 #include "i386-builtin-types.inc"
31833 /* Table for the ix86 builtin non-function types. */
31834 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
31836 /* Retrieve an element from the above table, building some of
31837 the types lazily. */
31839 static tree
31840 ix86_get_builtin_type (enum ix86_builtin_type tcode)
31842 unsigned int index;
31843 tree type, itype;
31845 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
31847 type = ix86_builtin_type_tab[(int) tcode];
31848 if (type != NULL)
31849 return type;
31851 gcc_assert (tcode > IX86_BT_LAST_PRIM);
31852 if (tcode <= IX86_BT_LAST_VECT)
31854 machine_mode mode;
31856 index = tcode - IX86_BT_LAST_PRIM - 1;
31857 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
31858 mode = ix86_builtin_type_vect_mode[index];
31860 type = build_vector_type_for_mode (itype, mode);
31862 else
31864 int quals;
31866 index = tcode - IX86_BT_LAST_VECT - 1;
31867 if (tcode <= IX86_BT_LAST_PTR)
31868 quals = TYPE_UNQUALIFIED;
31869 else
31870 quals = TYPE_QUAL_CONST;
31872 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
31873 if (quals != TYPE_UNQUALIFIED)
31874 itype = build_qualified_type (itype, quals);
31876 type = build_pointer_type (itype);
31879 ix86_builtin_type_tab[(int) tcode] = type;
31880 return type;
31883 /* Table for the ix86 builtin function types. */
31884 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
31886 /* Retrieve an element from the above table, building some of
31887 the types lazily. */
31889 static tree
31890 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
31892 tree type;
31894 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
31896 type = ix86_builtin_func_type_tab[(int) tcode];
31897 if (type != NULL)
31898 return type;
31900 if (tcode <= IX86_BT_LAST_FUNC)
31902 unsigned start = ix86_builtin_func_start[(int) tcode];
31903 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
31904 tree rtype, atype, args = void_list_node;
31905 unsigned i;
31907 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
31908 for (i = after - 1; i > start; --i)
31910 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
31911 args = tree_cons (NULL, atype, args);
31914 type = build_function_type (rtype, args);
31916 else
31918 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
31919 enum ix86_builtin_func_type icode;
31921 icode = ix86_builtin_func_alias_base[index];
31922 type = ix86_get_builtin_func_type (icode);
31925 ix86_builtin_func_type_tab[(int) tcode] = type;
31926 return type;
31930 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
31931 bdesc_* arrays below should come first, then builtins for each bdesc_*
31932 array in ascending order, so that we can use direct array accesses. */
31933 enum ix86_builtins
31935 IX86_BUILTIN_MASKMOVQ,
31936 IX86_BUILTIN_LDMXCSR,
31937 IX86_BUILTIN_STMXCSR,
31938 IX86_BUILTIN_MASKMOVDQU,
31939 IX86_BUILTIN_PSLLDQ128,
31940 IX86_BUILTIN_CLFLUSH,
31941 IX86_BUILTIN_MONITOR,
31942 IX86_BUILTIN_MWAIT,
31943 IX86_BUILTIN_CLZERO,
31944 IX86_BUILTIN_VEC_INIT_V2SI,
31945 IX86_BUILTIN_VEC_INIT_V4HI,
31946 IX86_BUILTIN_VEC_INIT_V8QI,
31947 IX86_BUILTIN_VEC_EXT_V2DF,
31948 IX86_BUILTIN_VEC_EXT_V2DI,
31949 IX86_BUILTIN_VEC_EXT_V4SF,
31950 IX86_BUILTIN_VEC_EXT_V4SI,
31951 IX86_BUILTIN_VEC_EXT_V8HI,
31952 IX86_BUILTIN_VEC_EXT_V2SI,
31953 IX86_BUILTIN_VEC_EXT_V4HI,
31954 IX86_BUILTIN_VEC_EXT_V16QI,
31955 IX86_BUILTIN_VEC_SET_V2DI,
31956 IX86_BUILTIN_VEC_SET_V4SF,
31957 IX86_BUILTIN_VEC_SET_V4SI,
31958 IX86_BUILTIN_VEC_SET_V8HI,
31959 IX86_BUILTIN_VEC_SET_V4HI,
31960 IX86_BUILTIN_VEC_SET_V16QI,
31961 IX86_BUILTIN_GATHERSIV2DF,
31962 IX86_BUILTIN_GATHERSIV4DF,
31963 IX86_BUILTIN_GATHERDIV2DF,
31964 IX86_BUILTIN_GATHERDIV4DF,
31965 IX86_BUILTIN_GATHERSIV4SF,
31966 IX86_BUILTIN_GATHERSIV8SF,
31967 IX86_BUILTIN_GATHERDIV4SF,
31968 IX86_BUILTIN_GATHERDIV8SF,
31969 IX86_BUILTIN_GATHERSIV2DI,
31970 IX86_BUILTIN_GATHERSIV4DI,
31971 IX86_BUILTIN_GATHERDIV2DI,
31972 IX86_BUILTIN_GATHERDIV4DI,
31973 IX86_BUILTIN_GATHERSIV4SI,
31974 IX86_BUILTIN_GATHERSIV8SI,
31975 IX86_BUILTIN_GATHERDIV4SI,
31976 IX86_BUILTIN_GATHERDIV8SI,
31977 IX86_BUILTIN_VFMSUBSD3_MASK3,
31978 IX86_BUILTIN_VFMSUBSS3_MASK3,
31979 IX86_BUILTIN_GATHER3SIV8SF,
31980 IX86_BUILTIN_GATHER3SIV4SF,
31981 IX86_BUILTIN_GATHER3SIV4DF,
31982 IX86_BUILTIN_GATHER3SIV2DF,
31983 IX86_BUILTIN_GATHER3DIV8SF,
31984 IX86_BUILTIN_GATHER3DIV4SF,
31985 IX86_BUILTIN_GATHER3DIV4DF,
31986 IX86_BUILTIN_GATHER3DIV2DF,
31987 IX86_BUILTIN_GATHER3SIV8SI,
31988 IX86_BUILTIN_GATHER3SIV4SI,
31989 IX86_BUILTIN_GATHER3SIV4DI,
31990 IX86_BUILTIN_GATHER3SIV2DI,
31991 IX86_BUILTIN_GATHER3DIV8SI,
31992 IX86_BUILTIN_GATHER3DIV4SI,
31993 IX86_BUILTIN_GATHER3DIV4DI,
31994 IX86_BUILTIN_GATHER3DIV2DI,
31995 IX86_BUILTIN_SCATTERSIV8SF,
31996 IX86_BUILTIN_SCATTERSIV4SF,
31997 IX86_BUILTIN_SCATTERSIV4DF,
31998 IX86_BUILTIN_SCATTERSIV2DF,
31999 IX86_BUILTIN_SCATTERDIV8SF,
32000 IX86_BUILTIN_SCATTERDIV4SF,
32001 IX86_BUILTIN_SCATTERDIV4DF,
32002 IX86_BUILTIN_SCATTERDIV2DF,
32003 IX86_BUILTIN_SCATTERSIV8SI,
32004 IX86_BUILTIN_SCATTERSIV4SI,
32005 IX86_BUILTIN_SCATTERSIV4DI,
32006 IX86_BUILTIN_SCATTERSIV2DI,
32007 IX86_BUILTIN_SCATTERDIV8SI,
32008 IX86_BUILTIN_SCATTERDIV4SI,
32009 IX86_BUILTIN_SCATTERDIV4DI,
32010 IX86_BUILTIN_SCATTERDIV2DI,
32011 /* Alternate 4 and 8 element gather/scatter for the vectorizer
32012 where all operands are 32-byte or 64-byte wide respectively. */
32013 IX86_BUILTIN_GATHERALTSIV4DF,
32014 IX86_BUILTIN_GATHERALTDIV8SF,
32015 IX86_BUILTIN_GATHERALTSIV4DI,
32016 IX86_BUILTIN_GATHERALTDIV8SI,
32017 IX86_BUILTIN_GATHER3ALTDIV16SF,
32018 IX86_BUILTIN_GATHER3ALTDIV16SI,
32019 IX86_BUILTIN_GATHER3ALTSIV4DF,
32020 IX86_BUILTIN_GATHER3ALTDIV8SF,
32021 IX86_BUILTIN_GATHER3ALTSIV4DI,
32022 IX86_BUILTIN_GATHER3ALTDIV8SI,
32023 IX86_BUILTIN_GATHER3ALTSIV8DF,
32024 IX86_BUILTIN_GATHER3ALTSIV8DI,
32025 IX86_BUILTIN_GATHER3DIV16SF,
32026 IX86_BUILTIN_GATHER3DIV16SI,
32027 IX86_BUILTIN_GATHER3DIV8DF,
32028 IX86_BUILTIN_GATHER3DIV8DI,
32029 IX86_BUILTIN_GATHER3SIV16SF,
32030 IX86_BUILTIN_GATHER3SIV16SI,
32031 IX86_BUILTIN_GATHER3SIV8DF,
32032 IX86_BUILTIN_GATHER3SIV8DI,
32033 IX86_BUILTIN_SCATTERALTSIV8DF,
32034 IX86_BUILTIN_SCATTERALTDIV16SF,
32035 IX86_BUILTIN_SCATTERALTSIV8DI,
32036 IX86_BUILTIN_SCATTERALTDIV16SI,
32037 IX86_BUILTIN_SCATTERDIV16SF,
32038 IX86_BUILTIN_SCATTERDIV16SI,
32039 IX86_BUILTIN_SCATTERDIV8DF,
32040 IX86_BUILTIN_SCATTERDIV8DI,
32041 IX86_BUILTIN_SCATTERSIV16SF,
32042 IX86_BUILTIN_SCATTERSIV16SI,
32043 IX86_BUILTIN_SCATTERSIV8DF,
32044 IX86_BUILTIN_SCATTERSIV8DI,
32045 IX86_BUILTIN_GATHERPFQPD,
32046 IX86_BUILTIN_GATHERPFDPS,
32047 IX86_BUILTIN_GATHERPFDPD,
32048 IX86_BUILTIN_GATHERPFQPS,
32049 IX86_BUILTIN_SCATTERPFDPD,
32050 IX86_BUILTIN_SCATTERPFDPS,
32051 IX86_BUILTIN_SCATTERPFQPD,
32052 IX86_BUILTIN_SCATTERPFQPS,
32053 IX86_BUILTIN_CLWB,
32054 IX86_BUILTIN_CLFLUSHOPT,
32055 IX86_BUILTIN_INFQ,
32056 IX86_BUILTIN_HUGE_VALQ,
32057 IX86_BUILTIN_NANQ,
32058 IX86_BUILTIN_NANSQ,
32059 IX86_BUILTIN_XABORT,
32060 IX86_BUILTIN_ADDCARRYX32,
32061 IX86_BUILTIN_ADDCARRYX64,
32062 IX86_BUILTIN_SBB32,
32063 IX86_BUILTIN_SBB64,
32064 IX86_BUILTIN_RDRAND16_STEP,
32065 IX86_BUILTIN_RDRAND32_STEP,
32066 IX86_BUILTIN_RDRAND64_STEP,
32067 IX86_BUILTIN_RDSEED16_STEP,
32068 IX86_BUILTIN_RDSEED32_STEP,
32069 IX86_BUILTIN_RDSEED64_STEP,
32070 IX86_BUILTIN_MONITORX,
32071 IX86_BUILTIN_MWAITX,
32072 IX86_BUILTIN_CFSTRING,
32073 IX86_BUILTIN_CPU_INIT,
32074 IX86_BUILTIN_CPU_IS,
32075 IX86_BUILTIN_CPU_SUPPORTS,
32076 IX86_BUILTIN_READ_FLAGS,
32077 IX86_BUILTIN_WRITE_FLAGS,
32079 /* All the remaining builtins are tracked in bdesc_* arrays in
32080 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
32081 this point. */
32082 #define BDESC(mask, icode, name, code, comparison, flag) \
32083 code,
32084 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
32085 code, \
32086 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
32087 #define BDESC_END(kind, next_kind)
32089 #include "i386-builtin.def"
32091 #undef BDESC
32092 #undef BDESC_FIRST
32093 #undef BDESC_END
32095 IX86_BUILTIN_MAX,
32097 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
32099 /* Now just the aliases for bdesc_* start/end. */
32100 #define BDESC(mask, icode, name, code, comparison, flag)
32101 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
32102 #define BDESC_END(kind, next_kind) \
32103 IX86_BUILTIN__BDESC_##kind##_LAST \
32104 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
32106 #include "i386-builtin.def"
32108 #undef BDESC
32109 #undef BDESC_FIRST
32110 #undef BDESC_END
32112 /* Just to make sure there is no comma after the last enumerator. */
32113 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
32116 /* Table for the ix86 builtin decls. */
32117 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
32119 /* Table of all of the builtin functions that are possible with different ISAs
32120 but are waiting to be built until a function is declared to use that
32121 ISA. */
32122 struct builtin_isa {
32123 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
32124 HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
32125 const char *name; /* function name */
32126 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
32127 unsigned char const_p:1; /* true if the declaration is constant */
32128 unsigned char pure_p:1; /* true if the declaration has pure attribute */
32129 bool leaf_p; /* true if the declaration has leaf attribute */
32130 bool nothrow_p; /* true if the declaration has nothrow attribute */
32131 bool set_and_not_built_p;
32134 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
32136 /* Bits that can still enable any inclusion of a builtin. */
32137 static HOST_WIDE_INT deferred_isa_values = 0;
32138 static HOST_WIDE_INT deferred_isa_values2 = 0;
32140 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
32141 of which isa_flags to use in the ix86_builtins_isa array. Stores the
32142 function decl in the ix86_builtins array. Returns the function decl or
32143 NULL_TREE, if the builtin was not added.
32145 If the front end has a special hook for builtin functions, delay adding
32146 builtin functions that aren't in the current ISA until the ISA is changed
32147 with function specific optimization. Doing so can save about 300K for the
32148 default compiler. When the builtin is expanded, check at that time whether
32149 it is valid.
32151 If the front end doesn't have a special hook, record all builtins, even if
32152 they aren't in the current ISA, in case the user uses
32153 function specific options for a different ISA, so that we don't get scope
32154 errors if a builtin is added in the middle of a function scope. */
32156 static inline tree
32157 def_builtin (HOST_WIDE_INT mask, const char *name,
32158 enum ix86_builtin_func_type tcode,
32159 enum ix86_builtins code)
32161 tree decl = NULL_TREE;
32163 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
32165 ix86_builtins_isa[(int) code].isa = mask;
32167 /* OPTION_MASK_ISA_AVX512VL has a special meaning. Unlike the generic case,
32168 where any set bit means that the built-in is enabled, this bit must be *and-ed*
32169 with another one. E.g.: OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL
32170 means that *both* cpuid bits must be set for the built-in to be available.
32171 Handle this here. */
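/* Illustrative sketch only (the builtin name and IX86_BUILTIN_EXAMPLE are
   hypothetical, not entries in i386-builtin.def): a built-in gated on both
   AVX512DQ and AVX512VL would be registered as

       def_builtin (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL,
		    "__builtin_ia32_example", V4SI_FTYPE_V4SI_V4SI,
		    IX86_BUILTIN_EXAMPLE);

   meaning *both* feature bits are required for the built-in to be
   available, as described above.  */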
32172 if (mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
32173 mask &= ~OPTION_MASK_ISA_AVX512VL;
32175 mask &= ~OPTION_MASK_ISA_64BIT;
32176 if (mask == 0
32177 || (mask & ix86_isa_flags) != 0
32178 || (lang_hooks.builtin_function
32179 == lang_hooks.builtin_function_ext_scope))
32182 tree type = ix86_get_builtin_func_type (tcode);
32183 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
32184 NULL, NULL_TREE);
32185 ix86_builtins[(int) code] = decl;
32186 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
32188 else
32190 /* Only a MASK for which set_and_not_built_p == true can still potentially
32191 enable inclusion of a builtin. */
32192 deferred_isa_values |= mask;
32193 ix86_builtins[(int) code] = NULL_TREE;
32194 ix86_builtins_isa[(int) code].tcode = tcode;
32195 ix86_builtins_isa[(int) code].name = name;
32196 ix86_builtins_isa[(int) code].leaf_p = false;
32197 ix86_builtins_isa[(int) code].nothrow_p = false;
32198 ix86_builtins_isa[(int) code].const_p = false;
32199 ix86_builtins_isa[(int) code].pure_p = false;
32200 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
32204 return decl;
32207 /* Like def_builtin, but also marks the function decl "const". */
32209 static inline tree
32210 def_builtin_const (HOST_WIDE_INT mask, const char *name,
32211 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32213 tree decl = def_builtin (mask, name, tcode, code);
32214 if (decl)
32215 TREE_READONLY (decl) = 1;
32216 else
32217 ix86_builtins_isa[(int) code].const_p = true;
32219 return decl;
32222 /* Like def_builtin, but also marks the function decl "pure". */
32224 static inline tree
32225 def_builtin_pure (HOST_WIDE_INT mask, const char *name,
32226 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32228 tree decl = def_builtin (mask, name, tcode, code);
32229 if (decl)
32230 DECL_PURE_P (decl) = 1;
32231 else
32232 ix86_builtins_isa[(int) code].pure_p = true;
32234 return decl;
32237 /* Like def_builtin, but for additional isa2 flags. */
32239 static inline tree
32240 def_builtin2 (HOST_WIDE_INT mask, const char *name,
32241 enum ix86_builtin_func_type tcode,
32242 enum ix86_builtins code)
32244 tree decl = NULL_TREE;
32246 ix86_builtins_isa[(int) code].isa2 = mask;
32248 if (mask == 0
32249 || (mask & ix86_isa_flags2) != 0
32250 || (lang_hooks.builtin_function
32251 == lang_hooks.builtin_function_ext_scope))
32254 tree type = ix86_get_builtin_func_type (tcode);
32255 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
32256 NULL, NULL_TREE);
32257 ix86_builtins[(int) code] = decl;
32258 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
32260 else
32262 /* Only a MASK for which set_and_not_built_p == true can still potentially
32263 enable inclusion of a builtin. */
32264 deferred_isa_values2 |= mask;
32265 ix86_builtins[(int) code] = NULL_TREE;
32266 ix86_builtins_isa[(int) code].tcode = tcode;
32267 ix86_builtins_isa[(int) code].name = name;
32268 ix86_builtins_isa[(int) code].leaf_p = false;
32269 ix86_builtins_isa[(int) code].nothrow_p = false;
32270 ix86_builtins_isa[(int) code].const_p = false;
32271 ix86_builtins_isa[(int) code].pure_p = false;
32272 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
32275 return decl;
32278 /* Like def_builtin, but also marks the function decl "const". */
32280 static inline tree
32281 def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
32282 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32284 tree decl = def_builtin2 (mask, name, tcode, code);
32285 if (decl)
32286 TREE_READONLY (decl) = 1;
32287 else
32288 ix86_builtins_isa[(int) code].const_p = true;
32290 return decl;
32293 /* Like def_builtin, but also marks the function decl "pure". */
32295 static inline tree
32296 def_builtin_pure2 (HOST_WIDE_INT mask, const char *name,
32297 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32299 tree decl = def_builtin2 (mask, name, tcode, code);
32300 if (decl)
32301 DECL_PURE_P (decl) = 1;
32302 else
32303 ix86_builtins_isa[(int) code].pure_p = true;
32305 return decl;
32308 /* Add any new builtin functions for a given ISA that may not have been
32309 declared. This saves a bit of space compared to adding all of the
32310 declarations to the tree, even if we didn't use them. */
32312 static void
32313 ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
32315 if ((isa & deferred_isa_values) == 0
32316 && (isa2 & deferred_isa_values2) == 0)
32317 return;
32319 /* Bits in ISA value can be removed from potential isa values. */
32320 deferred_isa_values &= ~isa;
32321 deferred_isa_values2 &= ~isa2;
32323 int i;
32324 tree saved_current_target_pragma = current_target_pragma;
32325 current_target_pragma = NULL_TREE;
32327 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
32329 if (((ix86_builtins_isa[i].isa & isa) != 0
32330 || (ix86_builtins_isa[i].isa2 & isa2) != 0)
32331 && ix86_builtins_isa[i].set_and_not_built_p)
32333 tree decl, type;
32335 /* Don't define the builtin again. */
32336 ix86_builtins_isa[i].set_and_not_built_p = false;
32338 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
32339 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
32340 type, i, BUILT_IN_MD, NULL,
32341 NULL_TREE);
32343 ix86_builtins[i] = decl;
32344 if (ix86_builtins_isa[i].const_p)
32345 TREE_READONLY (decl) = 1;
32346 if (ix86_builtins_isa[i].pure_p)
32347 DECL_PURE_P (decl) = 1;
32348 if (ix86_builtins_isa[i].leaf_p)
32349 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
32350 NULL_TREE);
32351 if (ix86_builtins_isa[i].nothrow_p)
32352 TREE_NOTHROW (decl) = 1;
32356 current_target_pragma = saved_current_target_pragma;
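/* Illustrative example (user code, not part of this file): builtins that
   were deferred by def_builtin stay undeclared until a function switches
   to the required ISA, e.g.

       __attribute__ ((target ("avx2")))
       void f (void);

   Processing such a target attribute (or #pragma GCC target) eventually
   calls ix86_add_new_builtins with the newly enabled isa flags, which
   declares the still-missing builtins at extern scope.  */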
32359 /* Bits for builtin_description.flag. */
32361 /* Set when we don't support the comparison natively, and should
32362 swap_comparison in order to support it. */
32363 #define BUILTIN_DESC_SWAP_OPERANDS 1
32365 struct builtin_description
32367 const HOST_WIDE_INT mask;
32368 const enum insn_code icode;
32369 const char *const name;
32370 const enum ix86_builtins code;
32371 const enum rtx_code comparison;
32372 const int flag;
32375 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
32376 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
32377 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
32378 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
32379 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
32380 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
32381 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
32382 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
32383 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
32384 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
32385 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
32386 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
32387 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
32388 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
32389 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
32390 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
32391 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
32392 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
32393 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
32394 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
32395 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
32396 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
32397 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
32398 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
32399 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
32400 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
32401 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
32402 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
32403 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
32404 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
32405 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
32406 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
32407 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
32408 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
32409 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
32410 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
32411 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
32412 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
32413 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
32414 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
32415 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
32416 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
32417 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
32418 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
32419 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
32420 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
32421 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
32422 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
32423 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
32424 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
32425 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
32426 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
32428 #define BDESC(mask, icode, name, code, comparison, flag) \
32429 { mask, icode, name, code, comparison, flag },
32430 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
32431 static const struct builtin_description bdesc_##kind[] = \
32433 BDESC (mask, icode, name, code, comparison, flag)
32434 #define BDESC_END(kind, next_kind) \
32437 #include "i386-builtin.def"
32439 #undef BDESC
32440 #undef BDESC_FIRST
32441 #undef BDESC_END
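/* For reference, a sketch of what the include above expands to (shape
   only; the actual entries come from i386-builtin.def): each kind gets
   its own table, e.g.

       static const struct builtin_description bdesc_args[] =
       {
	 { mask, icode, name, code, comparison, flag },
	 ...
       };

   with BDESC_FIRST opening one array and BDESC_END closing it before the
   next kind starts.  */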
32443 /* TM vector builtins. */
32445 /* Reuse the existing x86-specific `struct builtin_description' because
32446 we're lazy. Add casts to make them fit. */
32447 static const struct builtin_description bdesc_tm[] =
32449 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
32450 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
32451 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
32452 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32453 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32454 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32455 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32457 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
32458 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
32459 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
32460 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32461 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32462 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32463 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32465 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
32466 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
32467 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
32468 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32469 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32470 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32471 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32473 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
32474 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
32475 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
32478 /* Initialize the transactional memory vector load/store builtins. */
32480 static void
32481 ix86_init_tm_builtins (void)
32483 enum ix86_builtin_func_type ftype;
32484 const struct builtin_description *d;
32485 size_t i;
32486 tree decl;
32487 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
32488 tree attrs_log, attrs_type_log;
32490 if (!flag_tm)
32491 return;
32493 /* If there are no builtins defined, we must be compiling in a
32494 language without trans-mem support. */
32495 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
32496 return;
32498 /* Use whatever attributes a normal TM load has. */
32499 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
32500 attrs_load = DECL_ATTRIBUTES (decl);
32501 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
32502 /* Use whatever attributes a normal TM store has. */
32503 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
32504 attrs_store = DECL_ATTRIBUTES (decl);
32505 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
32506 /* Use whatever attributes a normal TM log has. */
32507 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
32508 attrs_log = DECL_ATTRIBUTES (decl);
32509 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
32511 for (i = 0, d = bdesc_tm;
32512 i < ARRAY_SIZE (bdesc_tm);
32513 i++, d++)
32515 if ((d->mask & ix86_isa_flags) != 0
32516 || (lang_hooks.builtin_function
32517 == lang_hooks.builtin_function_ext_scope))
32519 tree type, attrs, attrs_type;
32520 enum built_in_function code = (enum built_in_function) d->code;
32522 ftype = (enum ix86_builtin_func_type) d->flag;
32523 type = ix86_get_builtin_func_type (ftype);
32525 if (BUILTIN_TM_LOAD_P (code))
32527 attrs = attrs_load;
32528 attrs_type = attrs_type_load;
32530 else if (BUILTIN_TM_STORE_P (code))
32532 attrs = attrs_store;
32533 attrs_type = attrs_type_store;
32535 else
32537 attrs = attrs_log;
32538 attrs_type = attrs_type_log;
32540 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
32541 /* The builtin without the prefix for
32542 calling it directly. */
32543 d->name + strlen ("__builtin_"),
32544 attrs);
32545 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
32546 set the TYPE_ATTRIBUTES. */
32547 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
32549 set_builtin_decl (code, decl, false);
32554 /* Macros for verification of enum ix86_builtins order. */
32555 #define BDESC_VERIFY(x, y, z) \
32556 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
32557 #define BDESC_VERIFYS(x, y, z) \
32558 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
32560 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
32561 IX86_BUILTIN__BDESC_COMI_LAST, 1);
32562 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
32563 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
32564 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
32565 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
32566 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
32567 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
32568 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
32569 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
32570 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
32571 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
32572 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
32573 IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
32574 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
32575 IX86_BUILTIN__BDESC_MPX_LAST, 1);
32576 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
32577 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
32578 BDESC_VERIFYS (IX86_BUILTIN_MAX,
32579 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
32581 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
32582 in the current target ISA, to allow the user to compile particular modules
32583 with target specific options that differ from the command line
32584 options. */
32585 static void
32586 ix86_init_mmx_sse_builtins (void)
32588 const struct builtin_description * d;
32589 enum ix86_builtin_func_type ftype;
32590 size_t i;
32592 /* Add all special builtins with variable number of operands. */
32593 for (i = 0, d = bdesc_special_args;
32594 i < ARRAY_SIZE (bdesc_special_args);
32595 i++, d++)
32597 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
32598 if (d->name == 0)
32599 continue;
32601 ftype = (enum ix86_builtin_func_type) d->flag;
32602 def_builtin (d->mask, d->name, ftype, d->code);
32604 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
32605 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
32606 ARRAY_SIZE (bdesc_special_args) - 1);
32608 /* Add all builtins with variable number of operands. */
32609 for (i = 0, d = bdesc_args;
32610 i < ARRAY_SIZE (bdesc_args);
32611 i++, d++)
32613 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
32614 if (d->name == 0)
32615 continue;
32617 ftype = (enum ix86_builtin_func_type) d->flag;
32618 def_builtin_const (d->mask, d->name, ftype, d->code);
32620 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
32621 IX86_BUILTIN__BDESC_ARGS_FIRST,
32622 ARRAY_SIZE (bdesc_args) - 1);
32624 /* Add all builtins with variable number of operands gated by the isa2 flags. */
32625 for (i = 0, d = bdesc_args2;
32626 i < ARRAY_SIZE (bdesc_args2);
32627 i++, d++)
32629 if (d->name == 0)
32630 continue;
32632 ftype = (enum ix86_builtin_func_type) d->flag;
32633 def_builtin_const2 (d->mask, d->name, ftype, d->code);
32636 /* Add all builtins with rounding. */
32637 for (i = 0, d = bdesc_round_args;
32638 i < ARRAY_SIZE (bdesc_round_args);
32639 i++, d++)
32641 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
32642 if (d->name == 0)
32643 continue;
32645 ftype = (enum ix86_builtin_func_type) d->flag;
32646 def_builtin_const (d->mask, d->name, ftype, d->code);
32648 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
32649 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
32650 ARRAY_SIZE (bdesc_round_args) - 1);
32652 /* pcmpestr[im] insns. */
32653 for (i = 0, d = bdesc_pcmpestr;
32654 i < ARRAY_SIZE (bdesc_pcmpestr);
32655 i++, d++)
32657 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
32658 if (d->code == IX86_BUILTIN_PCMPESTRM128)
32659 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
32660 else
32661 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
32662 def_builtin_const (d->mask, d->name, ftype, d->code);
32664 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
32665 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
32666 ARRAY_SIZE (bdesc_pcmpestr) - 1);
32668 /* pcmpistr[im] insns. */
32669 for (i = 0, d = bdesc_pcmpistr;
32670 i < ARRAY_SIZE (bdesc_pcmpistr);
32671 i++, d++)
32673 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
32674 if (d->code == IX86_BUILTIN_PCMPISTRM128)
32675 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
32676 else
32677 ftype = INT_FTYPE_V16QI_V16QI_INT;
32678 def_builtin_const (d->mask, d->name, ftype, d->code);
32680 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
32681 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
32682 ARRAY_SIZE (bdesc_pcmpistr) - 1);
32684 /* comi/ucomi insns. */
32685 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
32687 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
32688 if (d->mask == OPTION_MASK_ISA_SSE2)
32689 ftype = INT_FTYPE_V2DF_V2DF;
32690 else
32691 ftype = INT_FTYPE_V4SF_V4SF;
32692 def_builtin_const (d->mask, d->name, ftype, d->code);
32694 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
32695 IX86_BUILTIN__BDESC_COMI_FIRST,
32696 ARRAY_SIZE (bdesc_comi) - 1);
32698 /* SSE */
32699 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
32700 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
32701 def_builtin_pure (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
32702 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
32704 /* SSE or 3DNow!A */
32705 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
32706 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
32707 IX86_BUILTIN_MASKMOVQ);
32709 /* SSE2 */
32710 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
32711 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
32713 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
32714 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
32715 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
32716 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
32718 /* SSE3. */
32719 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
32720 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
32721 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
32722 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
32724 /* AES */
32725 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
32726 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
32727 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
32728 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
32729 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
32730 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
32731 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
32732 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
32733 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
32734 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
32735 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
32736 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
32738 /* PCLMUL */
32739 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
32740 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
32742 /* RDRND */
32743 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
32744 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
32745 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
32746 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
32747 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
32748 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
32749 IX86_BUILTIN_RDRAND64_STEP);
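/* Illustrative use of the step builtins just defined (user code; use_value
   is a hypothetical function): __builtin_ia32_rdrand32_step writes a random
   value through its pointer argument and returns nonzero on success, e.g.

       unsigned int r;
       if (__builtin_ia32_rdrand32_step (&r))
	 use_value (r);
 */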
32751 /* AVX2 */
32752 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
32753 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
32754 IX86_BUILTIN_GATHERSIV2DF);
32756 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
32757 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
32758 IX86_BUILTIN_GATHERSIV4DF);
32760 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
32761 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
32762 IX86_BUILTIN_GATHERDIV2DF);
32764 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
32765 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
32766 IX86_BUILTIN_GATHERDIV4DF);
32768 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
32769 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
32770 IX86_BUILTIN_GATHERSIV4SF);
32772 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
32773 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
32774 IX86_BUILTIN_GATHERSIV8SF);
32776 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
32777 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
32778 IX86_BUILTIN_GATHERDIV4SF);
32780 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
32781 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
32782 IX86_BUILTIN_GATHERDIV8SF);
32784 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
32785 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
32786 IX86_BUILTIN_GATHERSIV2DI);
32788 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
32789 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
32790 IX86_BUILTIN_GATHERSIV4DI);
32792 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
32793 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
32794 IX86_BUILTIN_GATHERDIV2DI);
32796 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
32797 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
32798 IX86_BUILTIN_GATHERDIV4DI);
32800 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
32801 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
32802 IX86_BUILTIN_GATHERSIV4SI);
32804 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
32805 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
32806 IX86_BUILTIN_GATHERSIV8SI);
32808 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
32809 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
32810 IX86_BUILTIN_GATHERDIV4SI);
32812 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
32813 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
32814 IX86_BUILTIN_GATHERDIV8SI);
32816 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
32817 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
32818 IX86_BUILTIN_GATHERALTSIV4DF);
32820 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
32821 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
32822 IX86_BUILTIN_GATHERALTDIV8SF);
32824 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
32825 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
32826 IX86_BUILTIN_GATHERALTSIV4DI);
32828 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
32829 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
32830 IX86_BUILTIN_GATHERALTDIV8SI);
32832 /* AVX512F */
32833 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
32834 V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT,
32835 IX86_BUILTIN_GATHER3SIV16SF);
32837 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
32838 V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT,
32839 IX86_BUILTIN_GATHER3SIV8DF);
32841 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
32842 V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT,
32843 IX86_BUILTIN_GATHER3DIV16SF);
32845 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
32846 V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT,
32847 IX86_BUILTIN_GATHER3DIV8DF);
32849 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
32850 V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT,
32851 IX86_BUILTIN_GATHER3SIV16SI);
32853 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
32854 V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT,
32855 IX86_BUILTIN_GATHER3SIV8DI);
32857 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
32858 V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT,
32859 IX86_BUILTIN_GATHER3DIV16SI);
32861 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
32862 V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT,
32863 IX86_BUILTIN_GATHER3DIV8DI);
32865 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
32866 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
32867 IX86_BUILTIN_GATHER3ALTSIV8DF);
32869 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
32870 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
32871 IX86_BUILTIN_GATHER3ALTDIV16SF);
32873 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
32874 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
32875 IX86_BUILTIN_GATHER3ALTSIV8DI);
32877 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
32878 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
32879 IX86_BUILTIN_GATHER3ALTDIV16SI);
32881 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
32882 VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT,
32883 IX86_BUILTIN_SCATTERSIV16SF);
32885 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
32886 VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT,
32887 IX86_BUILTIN_SCATTERSIV8DF);
32889 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
32890 VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT,
32891 IX86_BUILTIN_SCATTERDIV16SF);
32893 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
32894 VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT,
32895 IX86_BUILTIN_SCATTERDIV8DF);
32897 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
32898 VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT,
32899 IX86_BUILTIN_SCATTERSIV16SI);
32901 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
32902 VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT,
32903 IX86_BUILTIN_SCATTERSIV8DI);
32905 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
32906 VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT,
32907 IX86_BUILTIN_SCATTERDIV16SI);
32909 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
32910 VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT,
32911 IX86_BUILTIN_SCATTERDIV8DI);
32913 /* AVX512VL */
32914 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
32915 V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT,
32916 IX86_BUILTIN_GATHER3SIV2DF);
32918 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
32919 V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT,
32920 IX86_BUILTIN_GATHER3SIV4DF);
32922 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
32923 V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT,
32924 IX86_BUILTIN_GATHER3DIV2DF);
32926 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
32927 V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT,
32928 IX86_BUILTIN_GATHER3DIV4DF);
32930 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
32931 V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT,
32932 IX86_BUILTIN_GATHER3SIV4SF);
32934 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
32935 V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT,
32936 IX86_BUILTIN_GATHER3SIV8SF);
32938 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
32939 V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT,
32940 IX86_BUILTIN_GATHER3DIV4SF);
32942 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
32943 V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT,
32944 IX86_BUILTIN_GATHER3DIV8SF);
32946 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
32947 V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT,
32948 IX86_BUILTIN_GATHER3SIV2DI);
32950 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
32951 V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT,
32952 IX86_BUILTIN_GATHER3SIV4DI);
32954 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
32955 V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT,
32956 IX86_BUILTIN_GATHER3DIV2DI);
32958 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
32959 V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT,
32960 IX86_BUILTIN_GATHER3DIV4DI);
32962 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
32963 V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT,
32964 IX86_BUILTIN_GATHER3SIV4SI);
32966 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
32967 V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT,
32968 IX86_BUILTIN_GATHER3SIV8SI);
32970 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
32971 V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT,
32972 IX86_BUILTIN_GATHER3DIV4SI);
32974 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
32975 V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT,
32976 IX86_BUILTIN_GATHER3DIV8SI);
32978 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
32979 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
32980 IX86_BUILTIN_GATHER3ALTSIV4DF);
32982 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
32983 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
32984 IX86_BUILTIN_GATHER3ALTDIV8SF);
32986 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
32987 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
32988 IX86_BUILTIN_GATHER3ALTSIV4DI);
32990 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
32991 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
32992 IX86_BUILTIN_GATHER3ALTDIV8SI);
32994 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
32995 VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT,
32996 IX86_BUILTIN_SCATTERSIV8SF);
32998 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
32999 VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT,
33000 IX86_BUILTIN_SCATTERSIV4SF);
33002 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
33003 VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT,
33004 IX86_BUILTIN_SCATTERSIV4DF);
33006 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
33007 VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT,
33008 IX86_BUILTIN_SCATTERSIV2DF);
33010 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
33011 VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT,
33012 IX86_BUILTIN_SCATTERDIV8SF);
33014 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
33015 VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT,
33016 IX86_BUILTIN_SCATTERDIV4SF);
33018 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
33019 VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT,
33020 IX86_BUILTIN_SCATTERDIV4DF);
33022 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
33023 VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT,
33024 IX86_BUILTIN_SCATTERDIV2DF);
33026 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
33027 VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT,
33028 IX86_BUILTIN_SCATTERSIV8SI);
33030 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
33031 VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT,
33032 IX86_BUILTIN_SCATTERSIV4SI);
33034 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
33035 VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT,
33036 IX86_BUILTIN_SCATTERSIV4DI);
33038 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
33039 VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT,
33040 IX86_BUILTIN_SCATTERSIV2DI);
33042 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
33043 VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT,
33044 IX86_BUILTIN_SCATTERDIV8SI);
33046 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
33047 VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT,
33048 IX86_BUILTIN_SCATTERDIV4SI);
33050 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
33051 VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT,
33052 IX86_BUILTIN_SCATTERDIV4DI);
33054 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
33055 VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT,
33056 IX86_BUILTIN_SCATTERDIV2DI);
33057 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
33058 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
33059 IX86_BUILTIN_SCATTERALTSIV8DF);
33061 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
33062 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
33063 IX86_BUILTIN_SCATTERALTDIV16SF);
33065 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
33066 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
33067 IX86_BUILTIN_SCATTERALTSIV8DI);
33069 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
33070 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
33071 IX86_BUILTIN_SCATTERALTDIV16SI);
33073 /* AVX512PF */
33074 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
33075 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
33076 IX86_BUILTIN_GATHERPFDPD);
33077 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
33078 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
33079 IX86_BUILTIN_GATHERPFDPS);
33080 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
33081 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
33082 IX86_BUILTIN_GATHERPFQPD);
33083 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
33084 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
33085 IX86_BUILTIN_GATHERPFQPS);
33086 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
33087 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
33088 IX86_BUILTIN_SCATTERPFDPD);
33089 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
33090 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
33091 IX86_BUILTIN_SCATTERPFDPS);
33092 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
33093 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
33094 IX86_BUILTIN_SCATTERPFQPD);
33095 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
33096 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
33097 IX86_BUILTIN_SCATTERPFQPS);
33099 /* SHA */
33100 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
33101 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
33102 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
33103 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
33104 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
33105 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
33106 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
33107 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
33108 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
33109 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
33110 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
33111 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
33112 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
33113 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
33115 /* RTM. */
33116 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
33117 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
33119 /* MMX access to the vec_init patterns. */
33120 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
33121 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
33123 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
33124 V4HI_FTYPE_HI_HI_HI_HI,
33125 IX86_BUILTIN_VEC_INIT_V4HI);
33127 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
33128 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
33129 IX86_BUILTIN_VEC_INIT_V8QI);
33131 /* Access to the vec_extract patterns. */
33132 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
33133 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
33134 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
33135 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
33136 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
33137 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
33138 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
33139 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
33140 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
33141 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
33143 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
33144 "__builtin_ia32_vec_ext_v4hi",
33145 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
33147 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
33148 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
33150 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
33151 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
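/* Illustrative use (user code; v is assumed to be a V4SF vector variable,
   e.g. __v4sf from the intrinsics headers): with the signatures above,
   element extraction is a plain builtin call with a constant index, e.g.

       float x = __builtin_ia32_vec_ext_v4sf (v, 0);
 */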
33153 /* Access to the vec_set patterns. */
33154 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
33155 "__builtin_ia32_vec_set_v2di",
33156 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
33158 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
33159 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
33161 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
33162 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
33164 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
33165 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
33167 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
33168 "__builtin_ia32_vec_set_v4hi",
33169 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
33171 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
33172 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
33174 /* RDSEED */
33175 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
33176 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
33177 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
33178 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
33179 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
33180 "__builtin_ia32_rdseed_di_step",
33181 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
33183 /* ADCX */
33184 def_builtin (0, "__builtin_ia32_addcarryx_u32",
33185 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
33186 def_builtin (OPTION_MASK_ISA_64BIT,
33187 "__builtin_ia32_addcarryx_u64",
33188 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
33189 IX86_BUILTIN_ADDCARRYX64);
33191 /* SBB */
33192 def_builtin (0, "__builtin_ia32_sbb_u32",
33193 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
33194 def_builtin (OPTION_MASK_ISA_64BIT,
33195 "__builtin_ia32_sbb_u64",
33196 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
33197 IX86_BUILTIN_SBB64);
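/* Sketch of how the ADCX/SBB builtins chain (illustrative user code;
   a0/a1/b0/b1 are hypothetical operands): the first argument and the
   return value carry the flag between limbs, e.g.

       unsigned int lo, hi;
       unsigned char c = __builtin_ia32_addcarryx_u32 (0, a0, b0, &lo);
       c = __builtin_ia32_addcarryx_u32 (c, a1, b1, &hi);
 */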
33199 /* Read/write FLAGS. */
33200 def_builtin (0, "__builtin_ia32_readeflags_u32",
33201 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
33202 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
33203 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
33204 def_builtin (0, "__builtin_ia32_writeeflags_u32",
33205 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
33206 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
33207 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
33209 /* CLFLUSHOPT. */
33210 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
33211 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
33213 /* CLWB. */
33214 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
33215 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
33217 /* MONITORX and MWAITX. */
33218 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
33219 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
33220 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
33221 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
33223 /* CLZERO. */
33224 def_builtin (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
33225 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
33227 /* Add FMA4 multi-arg instructions. */
33228 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
33230 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
33231 if (d->name == 0)
33232 continue;
33234 ftype = (enum ix86_builtin_func_type) d->flag;
33235 def_builtin_const (d->mask, d->name, ftype, d->code);
33237 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
33238 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
33239 ARRAY_SIZE (bdesc_multi_arg) - 1);
33242 static void
33243 ix86_init_mpx_builtins ()
33245 const struct builtin_description * d;
33246 enum ix86_builtin_func_type ftype;
33247 tree decl;
33248 size_t i;
33250 for (i = 0, d = bdesc_mpx;
33251 i < ARRAY_SIZE (bdesc_mpx);
33252 i++, d++)
33254 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
33255 if (d->name == 0)
33256 continue;
33258 ftype = (enum ix86_builtin_func_type) d->flag;
33259 decl = def_builtin (d->mask, d->name, ftype, d->code);
33261 /* Without leaf and nothrow flags for MPX builtins,
33262 abnormal edges may follow their calls when setjmp
33263 is present in the function. Since we may have a lot
33264 of MPX builtin calls, this causes lots of useless
33265 edges and enormous PHI nodes. To avoid this we mark
33266 MPX builtins as leaf and nothrow. */
33267 if (decl)
33269 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
33270 NULL_TREE);
33271 TREE_NOTHROW (decl) = 1;
33273 else
33275 ix86_builtins_isa[(int)d->code].leaf_p = true;
33276 ix86_builtins_isa[(int)d->code].nothrow_p = true;
33279 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
33280 IX86_BUILTIN__BDESC_MPX_FIRST,
33281 ARRAY_SIZE (bdesc_mpx) - 1);
33283 for (i = 0, d = bdesc_mpx_const;
33284 i < ARRAY_SIZE (bdesc_mpx_const);
33285 i++, d++)
33287 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
33288 if (d->name == 0)
33289 continue;
33291 ftype = (enum ix86_builtin_func_type) d->flag;
33292 decl = def_builtin_const (d->mask, d->name, ftype, d->code);
33294 if (decl)
33296 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
33297 NULL_TREE);
33298 TREE_NOTHROW (decl) = 1;
33300 else
33302 ix86_builtins_isa[(int)d->code].leaf_p = true;
33303 ix86_builtins_isa[(int)d->code].nothrow_p = true;
33306 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
33307 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
33308 ARRAY_SIZE (bdesc_mpx_const) - 1);
33310 #undef BDESC_VERIFY
33311 #undef BDESC_VERIFYS
33313 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
33314 to return a pointer to VERSION_DECL if the outcome of the expression
33315 formed by PREDICATE_CHAIN is true. This function will be called during
33316 version dispatch to decide which function version to execute. It returns
33317 the basic block at the end, to which more conditions can be added. */
33319 static basic_block
33320 add_condition_to_bb (tree function_decl, tree version_decl,
33321 tree predicate_chain, basic_block new_bb)
33323 gimple *return_stmt;
33324 tree convert_expr, result_var;
33325 gimple *convert_stmt;
33326 gimple *call_cond_stmt;
33327 gimple *if_else_stmt;
33329 basic_block bb1, bb2, bb3;
33330 edge e12, e23;
33332 tree cond_var, and_expr_var = NULL_TREE;
33333 gimple_seq gseq;
33335 tree predicate_decl, predicate_arg;
33337 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
33339 gcc_assert (new_bb != NULL);
33340 gseq = bb_seq (new_bb);
33343 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
33344 build_fold_addr_expr (version_decl));
33345 result_var = create_tmp_var (ptr_type_node);
33346 convert_stmt = gimple_build_assign (result_var, convert_expr);
33347 return_stmt = gimple_build_return (result_var);
33349 if (predicate_chain == NULL_TREE)
33351 gimple_seq_add_stmt (&gseq, convert_stmt);
33352 gimple_seq_add_stmt (&gseq, return_stmt);
33353 set_bb_seq (new_bb, gseq);
33354 gimple_set_bb (convert_stmt, new_bb);
33355 gimple_set_bb (return_stmt, new_bb);
33356 pop_cfun ();
33357 return new_bb;
33360 while (predicate_chain != NULL)
33362 cond_var = create_tmp_var (integer_type_node);
33363 predicate_decl = TREE_PURPOSE (predicate_chain);
33364 predicate_arg = TREE_VALUE (predicate_chain);
33365 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
33366 gimple_call_set_lhs (call_cond_stmt, cond_var);
33368 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
33369 gimple_set_bb (call_cond_stmt, new_bb);
33370 gimple_seq_add_stmt (&gseq, call_cond_stmt);
33372 predicate_chain = TREE_CHAIN (predicate_chain);
33374 if (and_expr_var == NULL)
33375 and_expr_var = cond_var;
33376 else
33378 gimple *assign_stmt;
33379 /* Use MIN_EXPR to check if any integer is zero:
33380 and_expr_var = min_expr <cond_var, and_expr_var>. */
33381 assign_stmt = gimple_build_assign (and_expr_var,
33382 build2 (MIN_EXPR, integer_type_node,
33383 cond_var, and_expr_var));
33385 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
33386 gimple_set_bb (assign_stmt, new_bb);
33387 gimple_seq_add_stmt (&gseq, assign_stmt);
33391 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
33392 integer_zero_node,
33393 NULL_TREE, NULL_TREE);
33394 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
33395 gimple_set_bb (if_else_stmt, new_bb);
33396 gimple_seq_add_stmt (&gseq, if_else_stmt);
33398 gimple_seq_add_stmt (&gseq, convert_stmt);
33399 gimple_seq_add_stmt (&gseq, return_stmt);
33400 set_bb_seq (new_bb, gseq);
33402 bb1 = new_bb;
33403 e12 = split_block (bb1, if_else_stmt);
33404 bb2 = e12->dest;
33405 e12->flags &= ~EDGE_FALLTHRU;
33406 e12->flags |= EDGE_TRUE_VALUE;
33408 e23 = split_block (bb2, return_stmt);
33410 gimple_set_bb (convert_stmt, bb2);
33411 gimple_set_bb (return_stmt, bb2);
33413 bb3 = e23->dest;
33414 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
33416 remove_edge (e23);
33417 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
33419 pop_cfun ();
33421 return bb3;
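/* Roughly, each call to add_condition_to_bb appends a fragment of the
   following shape to the dispatcher body (illustrative pseudo-C for the
   GIMPLE that is built; the names are placeholders):

       cond = predicate (arg);           // e.g. __builtin_cpu_supports
       and_expr = MIN (and_expr, cond);  // every predicate must be nonzero
       if (and_expr > 0)
	 return (void *) &version_decl;

   and falls through to the condition added for the next version.  */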
33424 /* This parses the attribute arguments to target in DECL and determines
33425 the right builtin to use to match the platform specification.
33426 It returns the priority value for this version decl. If PREDICATE_LIST
33427 is not NULL, it stores the list of cpu features that need to be checked
33428 before dispatching this function. */
33430 static unsigned int
33431 get_builtin_code_for_version (tree decl, tree *predicate_list)
33433 tree attrs;
33434 struct cl_target_option cur_target;
33435 tree target_node;
33436 struct cl_target_option *new_target;
33437 const char *arg_str = NULL;
33438 const char *attrs_str = NULL;
33439 char *tok_str = NULL;
33440 char *token;
33442 /* Priority of i386 features; a greater value means a higher priority. This is
33443 used to decide the order in which function dispatch must happen. For
33444 instance, a version specialized for SSE4.2 should be checked for dispatch
33445 before a version for SSE3, as SSE4.2 implies SSE3. */
33446 enum feature_priority
33448 P_ZERO = 0,
33449 P_MMX,
33450 P_SSE,
33451 P_SSE2,
33452 P_SSE3,
33453 P_SSSE3,
33454 P_PROC_SSSE3,
33455 P_SSE4_A,
33456 P_PROC_SSE4_A,
33457 P_SSE4_1,
33458 P_SSE4_2,
33459 P_PROC_SSE4_2,
33460 P_POPCNT,
33461 P_AES,
33462 P_PCLMUL,
33463 P_AVX,
33464 P_PROC_AVX,
33465 P_BMI,
33466 P_PROC_BMI,
33467 P_FMA4,
33468 P_XOP,
33469 P_PROC_XOP,
33470 P_FMA,
33471 P_PROC_FMA,
33472 P_BMI2,
33473 P_AVX2,
33474 P_PROC_AVX2,
33475 P_AVX512F,
33476 P_PROC_AVX512F
33479 enum feature_priority priority = P_ZERO;
33481 /* These are the target attribute strings for which a dispatcher is
33482 available, from fold_builtin_cpu. */
33484 static struct _feature_list
33486 const char *const name;
33487 const enum feature_priority priority;
33489 const feature_list[] =
33491 {"mmx", P_MMX},
33492 {"sse", P_SSE},
33493 {"sse2", P_SSE2},
33494 {"sse3", P_SSE3},
33495 {"sse4a", P_SSE4_A},
33496 {"ssse3", P_SSSE3},
33497 {"sse4.1", P_SSE4_1},
33498 {"sse4.2", P_SSE4_2},
33499 {"popcnt", P_POPCNT},
33500 {"aes", P_AES},
33501 {"pclmul", P_PCLMUL},
33502 {"avx", P_AVX},
33503 {"bmi", P_BMI},
33504 {"fma4", P_FMA4},
33505 {"xop", P_XOP},
33506 {"fma", P_FMA},
33507 {"bmi2", P_BMI2},
33508 {"avx2", P_AVX2},
33509 {"avx512f", P_AVX512F}
33513 static unsigned int NUM_FEATURES
33514 = sizeof (feature_list) / sizeof (struct _feature_list);
33516 unsigned int i;
33518 tree predicate_chain = NULL_TREE;
33519 tree predicate_decl, predicate_arg;
33521 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
33522 gcc_assert (attrs != NULL);
33524 attrs = TREE_VALUE (TREE_VALUE (attrs));
33526 gcc_assert (TREE_CODE (attrs) == STRING_CST);
33527 attrs_str = TREE_STRING_POINTER (attrs);
33529 /* Return priority zero for default function. */
33530 if (strcmp (attrs_str, "default") == 0)
33531 return 0;
33533 /* Handle arch= if specified. For priority, set it to be 1 more than
33534 the best instruction set the processor can handle. For instance, if
33535 there is a version for atom and a version for ssse3 (the highest ISA
33536 priority for atom), the atom version must be checked for dispatch
33537 before the ssse3 version. */
33538 if (strstr (attrs_str, "arch=") != NULL)
33540 cl_target_option_save (&cur_target, &global_options);
33541 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
33542 &global_options_set);
33544 gcc_assert (target_node);
33545 new_target = TREE_TARGET_OPTION (target_node);
33546 gcc_assert (new_target);
33548 if (new_target->arch_specified && new_target->arch > 0)
33550 switch (new_target->arch)
33552 case PROCESSOR_CORE2:
33553 arg_str = "core2";
33554 priority = P_PROC_SSSE3;
33555 break;
33556 case PROCESSOR_NEHALEM:
33557 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
33559 arg_str = "westmere";
33560 priority = P_AES;
33562 else
33564 /* We translate "arch=corei7" and "arch=nehalem" to
33565 "corei7" so that it will be mapped to M_INTEL_COREI7
33566 as cpu type to cover all M_INTEL_COREI7_XXXs. */
33567 arg_str = "corei7";
33568 priority = P_PROC_SSE4_2;
33570 break;
33571 case PROCESSOR_SANDYBRIDGE:
33572 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
33573 arg_str = "ivybridge";
33574 else
33575 arg_str = "sandybridge";
33576 priority = P_PROC_AVX;
33577 break;
33578 case PROCESSOR_HASWELL:
33579 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
33580 arg_str = "skylake-avx512";
33581 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
33582 arg_str = "skylake";
33583 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
33584 arg_str = "broadwell";
33585 else
33586 arg_str = "haswell";
33587 priority = P_PROC_AVX2;
33588 break;
33589 case PROCESSOR_BONNELL:
33590 arg_str = "bonnell";
33591 priority = P_PROC_SSSE3;
33592 break;
33593 case PROCESSOR_KNL:
33594 arg_str = "knl";
33595 priority = P_PROC_AVX512F;
33596 break;
33597 case PROCESSOR_SILVERMONT:
33598 arg_str = "silvermont";
33599 priority = P_PROC_SSE4_2;
33600 break;
33601 case PROCESSOR_AMDFAM10:
33602 arg_str = "amdfam10h";
33603 priority = P_PROC_SSE4_A;
33604 break;
33605 case PROCESSOR_BTVER1:
33606 arg_str = "btver1";
33607 priority = P_PROC_SSE4_A;
33608 break;
33609 case PROCESSOR_BTVER2:
33610 arg_str = "btver2";
33611 priority = P_PROC_BMI;
33612 break;
33613 case PROCESSOR_BDVER1:
33614 arg_str = "bdver1";
33615 priority = P_PROC_XOP;
33616 break;
33617 case PROCESSOR_BDVER2:
33618 arg_str = "bdver2";
33619 priority = P_PROC_FMA;
33620 break;
33621 case PROCESSOR_BDVER3:
33622 arg_str = "bdver3";
33623 priority = P_PROC_FMA;
33624 break;
33625 case PROCESSOR_BDVER4:
33626 arg_str = "bdver4";
33627 priority = P_PROC_AVX2;
33628 break;
33629 case PROCESSOR_ZNVER1:
33630 arg_str = "znver1";
33631 priority = P_PROC_AVX2;
33632 break;
33636 cl_target_option_restore (&global_options, &cur_target);
33638 if (predicate_list && arg_str == NULL)
33640 error_at (DECL_SOURCE_LOCATION (decl),
33641 "No dispatcher found for the versioning attributes");
33642 return 0;
33645 if (predicate_list)
33647 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
33648 /* For a C string literal the length includes the trailing NUL. */
33649 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
33650 predicate_chain = tree_cons (predicate_decl, predicate_arg,
33651 predicate_chain);
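/* The chain built here is a TREE_LIST of (predicate builtin, string
   argument) pairs; each pair later becomes a guard call such as
   __builtin_cpu_is ("corei7") for the corresponding function version.  */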
33655 /* Process feature name. */
33656 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
33657 strcpy (tok_str, attrs_str);
33658 token = strtok (tok_str, ",");
33659 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
33661 while (token != NULL)
33663 /* Do not process "arch=" */
33664 if (strncmp (token, "arch=", 5) == 0)
33666 token = strtok (NULL, ",");
33667 continue;
33669 for (i = 0; i < NUM_FEATURES; ++i)
33671 if (strcmp (token, feature_list[i].name) == 0)
33673 if (predicate_list)
33675 predicate_arg = build_string_literal (
33676 strlen (feature_list[i].name) + 1,
33677 feature_list[i].name);
33678 predicate_chain = tree_cons (predicate_decl, predicate_arg,
33679 predicate_chain);
33681 /* Find the maximum priority feature. */
33682 if (feature_list[i].priority > priority)
33683 priority = feature_list[i].priority;
33685 break;
33688 if (predicate_list && i == NUM_FEATURES)
33690 error_at (DECL_SOURCE_LOCATION (decl),
33691 "No dispatcher found for %s", token);
33692 return 0;
33694 token = strtok (NULL, ",");
33696 free (tok_str);
33698 if (predicate_list && predicate_chain == NULL_TREE)
33700 error_at (DECL_SOURCE_LOCATION (decl),
33701 "No dispatcher found for the versioning attributes : %s",
33702 attrs_str);
33703 return 0;
33705 else if (predicate_list)
33707 predicate_chain = nreverse (predicate_chain);
33708 *predicate_list = predicate_chain;
33711 return priority;
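/* Illustrative example (hypothetical user code, not part of GCC): for
   C++ function versions such as

     __attribute__ ((target ("default"))) int foo ();
     __attribute__ ((target ("arch=haswell"))) int foo ();
     __attribute__ ((target ("avx2"))) int foo ();

   the routine above gives the "arch=haswell" version priority
   P_PROC_AVX2 and the "avx2" version priority P_AVX2, so the dispatcher
   tests the arch-specific version first.  This is a sketch; the exact
   values come from the priority enum and the feature_list table.  */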
33714 /* This compares the priority of target features in function DECL1
33715 and DECL2. It returns positive value if DECL1 is higher priority,
33716 negative value if DECL2 is higher priority and 0 if they are the
33717 same. */
33719 static int
33720 ix86_compare_version_priority (tree decl1, tree decl2)
33722 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
33723 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
33725 return (int)priority1 - (int)priority2;
33728 /* V1 and V2 point to function versions with different priorities
33729 based on the target ISA. This function compares their priorities. */
33731 static int
33732 feature_compare (const void *v1, const void *v2)
33734 typedef struct _function_version_info
33736 tree version_decl;
33737 tree predicate_chain;
33738 unsigned int dispatch_priority;
33739 } function_version_info;
33741 const function_version_info c1 = *(const function_version_info *)v1;
33742 const function_version_info c2 = *(const function_version_info *)v2;
33743 return (c2.dispatch_priority - c1.dispatch_priority);
33746 /* This function generates the dispatch function for
33747 multi-versioned functions. DISPATCH_DECL is the function which will
33748 contain the dispatch logic. FNDECLS are the function choices for
33749 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
33750 in DISPATCH_DECL in which the dispatch code is generated. */
33752 static int
33753 dispatch_function_versions (tree dispatch_decl,
33754 void *fndecls_p,
33755 basic_block *empty_bb)
33757 tree default_decl;
33758 gimple *ifunc_cpu_init_stmt;
33759 gimple_seq gseq;
33760 int ix;
33761 tree ele;
33762 vec<tree> *fndecls;
33763 unsigned int num_versions = 0;
33764 unsigned int actual_versions = 0;
33765 unsigned int i;
33767 struct _function_version_info
33769 tree version_decl;
33770 tree predicate_chain;
33771 unsigned int dispatch_priority;
33772 }*function_version_info;
33774 gcc_assert (dispatch_decl != NULL
33775 && fndecls_p != NULL
33776 && empty_bb != NULL);
33778 /* fndecls_p is actually a vector. */
33779 fndecls = static_cast<vec<tree> *> (fndecls_p);
33781 /* There must be at least one version in addition to the default. */
33782 num_versions = fndecls->length ();
33783 gcc_assert (num_versions >= 2);
33785 function_version_info = (struct _function_version_info *)
33786 XNEWVEC (struct _function_version_info, (num_versions - 1));
33788 /* The first version in the vector is the default decl. */
33789 default_decl = (*fndecls)[0];
33791 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
33793 gseq = bb_seq (*empty_bb);
33794 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
33795 constructors, so explicitly call __builtin_cpu_init here. */
33796 ifunc_cpu_init_stmt = gimple_build_call_vec (
33797 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
33798 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
33799 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
33800 set_bb_seq (*empty_bb, gseq);
33802 pop_cfun ();
33805 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
33807 tree version_decl = ele;
33808 tree predicate_chain = NULL_TREE;
33809 unsigned int priority;
33810 /* Get attribute string, parse it and find the right predicate decl.
33811 The predicate function could be a lengthy combination of many
33812 features, like arch-type and various isa-variants. */
33813 priority = get_builtin_code_for_version (version_decl,
33814 &predicate_chain);
33816 if (predicate_chain == NULL_TREE)
33817 continue;
33819 function_version_info [actual_versions].version_decl = version_decl;
33820 function_version_info [actual_versions].predicate_chain
33821 = predicate_chain;
33822 function_version_info [actual_versions].dispatch_priority = priority;
33823 actual_versions++;
33826 /* Sort the versions according to descending order of dispatch priority. The
33827 priority is based on the ISA. This is not a perfect solution. There
33828 could still be ambiguity. If more than one function version is suitable
33829 to execute, which one should be dispatched? In the future, allow the user
33830 to specify a dispatch priority next to the version. */
33831 qsort (function_version_info, actual_versions,
33832 sizeof (struct _function_version_info), feature_compare);
33834 for (i = 0; i < actual_versions; ++i)
33835 *empty_bb = add_condition_to_bb (dispatch_decl,
33836 function_version_info[i].version_decl,
33837 function_version_info[i].predicate_chain,
33838 *empty_bb);
33840 /* Dispatch the default version at the end. */
33841 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
33842 NULL, *empty_bb);
33844 free (function_version_info);
33845 return 0;
33848 /* This function changes the assembler name for functions that are
33849 versions. If DECL is a function version and has a "target"
33850 attribute, it appends the attribute string to its assembler name. */
33852 static tree
33853 ix86_mangle_function_version_assembler_name (tree decl, tree id)
33855 tree version_attr;
33856 const char *orig_name, *version_string;
33857 char *attr_str, *assembler_name;
33859 if (DECL_DECLARED_INLINE_P (decl)
33860 && lookup_attribute ("gnu_inline",
33861 DECL_ATTRIBUTES (decl)))
33862 error_at (DECL_SOURCE_LOCATION (decl),
33863 "Function versions cannot be marked as gnu_inline,"
33864 " bodies have to be generated");
33866 if (DECL_VIRTUAL_P (decl)
33867 || DECL_VINDEX (decl))
33868 sorry ("Virtual function multiversioning not supported");
33870 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
33872 /* The target attribute string cannot be NULL. */
33873 gcc_assert (version_attr != NULL_TREE);
33875 orig_name = IDENTIFIER_POINTER (id);
33876 version_string
33877 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
33879 if (strcmp (version_string, "default") == 0)
33880 return id;
33882 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
33883 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
33885 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
33887 /* Allow assembler name to be modified if already set. */
33888 if (DECL_ASSEMBLER_NAME_SET_P (decl))
33889 SET_DECL_RTL (decl, NULL);
33891 tree ret = get_identifier (assembler_name);
33892 XDELETEVEC (attr_str);
33893 XDELETEVEC (assembler_name);
33894 return ret;
33898 static tree
33899 ix86_mangle_decl_assembler_name (tree decl, tree id)
33901 /* For function version, add the target suffix to the assembler name. */
33902 if (TREE_CODE (decl) == FUNCTION_DECL
33903 && DECL_FUNCTION_VERSIONED (decl))
33904 id = ix86_mangle_function_version_assembler_name (decl, id);
33905 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
33906 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
33907 #endif
33909 return id;
33912 /* Make a dispatcher declaration for the multi-versioned function DECL.
33913 Calls to DECL function will be replaced with calls to the dispatcher
33914 by the front-end. Returns the decl of the dispatcher function. */
33916 static tree
33917 ix86_get_function_versions_dispatcher (void *decl)
33919 tree fn = (tree) decl;
33920 struct cgraph_node *node = NULL;
33921 struct cgraph_node *default_node = NULL;
33922 struct cgraph_function_version_info *node_v = NULL;
33923 struct cgraph_function_version_info *first_v = NULL;
33925 tree dispatch_decl = NULL;
33927 struct cgraph_function_version_info *default_version_info = NULL;
33929 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
33931 node = cgraph_node::get (fn);
33932 gcc_assert (node != NULL);
33934 node_v = node->function_version ();
33935 gcc_assert (node_v != NULL);
33937 if (node_v->dispatcher_resolver != NULL)
33938 return node_v->dispatcher_resolver;
33940 /* Find the default version and make it the first node. */
33941 first_v = node_v;
33942 /* Go to the beginning of the chain. */
33943 while (first_v->prev != NULL)
33944 first_v = first_v->prev;
33945 default_version_info = first_v;
33946 while (default_version_info != NULL)
33948 if (is_function_default_version
33949 (default_version_info->this_node->decl))
33950 break;
33951 default_version_info = default_version_info->next;
33954 /* If there is no default node, just return NULL. */
33955 if (default_version_info == NULL)
33956 return NULL;
33958 /* Make default info the first node. */
33959 if (first_v != default_version_info)
33961 default_version_info->prev->next = default_version_info->next;
33962 if (default_version_info->next)
33963 default_version_info->next->prev = default_version_info->prev;
33964 first_v->prev = default_version_info;
33965 default_version_info->next = first_v;
33966 default_version_info->prev = NULL;
33969 default_node = default_version_info->this_node;
33971 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
33972 if (targetm.has_ifunc_p ())
33974 struct cgraph_function_version_info *it_v = NULL;
33975 struct cgraph_node *dispatcher_node = NULL;
33976 struct cgraph_function_version_info *dispatcher_version_info = NULL;
33978 /* Right now, the dispatching is done via ifunc. */
33979 dispatch_decl = make_dispatcher_decl (default_node->decl);
33981 dispatcher_node = cgraph_node::get_create (dispatch_decl);
33982 gcc_assert (dispatcher_node != NULL);
33983 dispatcher_node->dispatcher_function = 1;
33984 dispatcher_version_info
33985 = dispatcher_node->insert_new_function_version ();
33986 dispatcher_version_info->next = default_version_info;
33987 dispatcher_node->definition = 1;
33989 /* Set the dispatcher for all the versions. */
33990 it_v = default_version_info;
33991 while (it_v != NULL)
33993 it_v->dispatcher_resolver = dispatch_decl;
33994 it_v = it_v->next;
33997 else
33998 #endif
34000 error_at (DECL_SOURCE_LOCATION (default_node->decl),
34001 "multiversioning needs ifunc which is not supported "
34002 "on this target");
34005 return dispatch_decl;
34008 /* Make the resolver function decl to dispatch the versions of
34009 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
34010 ifunc alias that will point to the created resolver. Create an
34011 empty basic block in the resolver and store the pointer in
34012 EMPTY_BB. Return the decl of the resolver function. */
34014 static tree
34015 make_resolver_func (const tree default_decl,
34016 const tree ifunc_alias_decl,
34017 basic_block *empty_bb)
34019 char *resolver_name;
34020 tree decl, type, decl_name, t;
34022 /* IFUNCs have to be globally visible, so if the default_decl is
34023 not, the name of the IFUNC should be made unique. */
34024 if (TREE_PUBLIC (default_decl) == 0)
34026 char *ifunc_name = make_unique_name (default_decl, "ifunc", true);
34027 symtab->change_decl_assembler_name (ifunc_alias_decl,
34028 get_identifier (ifunc_name));
34029 XDELETEVEC (ifunc_name);
34032 resolver_name = make_unique_name (default_decl, "resolver", false);
34034 /* The resolver function should return a (void *). */
34035 type = build_function_type_list (ptr_type_node, NULL_TREE);
34037 decl = build_fn_decl (resolver_name, type);
34038 decl_name = get_identifier (resolver_name);
34039 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
34041 DECL_NAME (decl) = decl_name;
34042 TREE_USED (decl) = 1;
34043 DECL_ARTIFICIAL (decl) = 1;
34044 DECL_IGNORED_P (decl) = 1;
34045 TREE_PUBLIC (decl) = 0;
34046 DECL_UNINLINABLE (decl) = 1;
34048 /* Resolver is not external, body is generated. */
34049 DECL_EXTERNAL (decl) = 0;
34050 DECL_EXTERNAL (ifunc_alias_decl) = 0;
34052 DECL_CONTEXT (decl) = NULL_TREE;
34053 DECL_INITIAL (decl) = make_node (BLOCK);
34054 DECL_STATIC_CONSTRUCTOR (decl) = 0;
34056 if (DECL_COMDAT_GROUP (default_decl)
34057 || TREE_PUBLIC (default_decl))
34059 /* In this case, each translation unit with a call to this
34060 versioned function will emit a resolver. Make it
34061 comdat to keep just one copy. */
34062 DECL_COMDAT (decl) = 1;
34063 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
34065 /* Build the result decl and add it to the function decl. */
34066 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
34067 DECL_ARTIFICIAL (t) = 1;
34068 DECL_IGNORED_P (t) = 1;
34069 DECL_RESULT (decl) = t;
34071 gimplify_function_tree (decl);
34072 push_cfun (DECL_STRUCT_FUNCTION (decl));
34073 *empty_bb = init_lowered_empty_function (decl, false,
34074 profile_count::uninitialized ());
34076 cgraph_node::add_new_function (decl, true);
34077 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
34079 pop_cfun ();
34081 gcc_assert (ifunc_alias_decl != NULL);
34082 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
34083 DECL_ATTRIBUTES (ifunc_alias_decl)
34084 = make_attribute ("ifunc", resolver_name,
34085 DECL_ATTRIBUTES (ifunc_alias_decl));
34087 /* Create the alias for dispatch to resolver here. */
34088 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
34089 XDELETEVEC (resolver_name);
34090 return decl;
34093 /* Generate the dispatching code body to dispatch multi-versioned function
34094 DECL. The target hook is called to process the "target" attributes and
34095 provide the code to dispatch the right function at run-time. NODE points
34096 to the dispatcher decl whose body will be created. */
34098 static tree
34099 ix86_generate_version_dispatcher_body (void *node_p)
34101 tree resolver_decl;
34102 basic_block empty_bb;
34103 tree default_ver_decl;
34104 struct cgraph_node *versn;
34105 struct cgraph_node *node;
34107 struct cgraph_function_version_info *node_version_info = NULL;
34108 struct cgraph_function_version_info *versn_info = NULL;
34110 node = (cgraph_node *)node_p;
34112 node_version_info = node->function_version ();
34113 gcc_assert (node->dispatcher_function
34114 && node_version_info != NULL);
34116 if (node_version_info->dispatcher_resolver)
34117 return node_version_info->dispatcher_resolver;
34119 /* The first version in the chain corresponds to the default version. */
34120 default_ver_decl = node_version_info->next->this_node->decl;
34122 /* node is going to be an alias, so remove the finalized bit. */
34123 node->definition = false;
34125 resolver_decl = make_resolver_func (default_ver_decl,
34126 node->decl, &empty_bb);
34128 node_version_info->dispatcher_resolver = resolver_decl;
34130 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
34132 auto_vec<tree, 2> fn_ver_vec;
34134 for (versn_info = node_version_info->next; versn_info;
34135 versn_info = versn_info->next)
34137 versn = versn_info->this_node;
34138 /* Check for virtual functions here again, as by this time it should
34139 have been determined if this function needs a vtable index or
34140 not. This happens for methods in derived classes that override
34141 virtual methods in base classes but are not explicitly marked as
34142 virtual. */
34143 if (DECL_VINDEX (versn->decl))
34144 sorry ("Virtual function multiversioning not supported");
34146 fn_ver_vec.safe_push (versn->decl);
34149 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
34150 cgraph_edge::rebuild_edges ();
34151 pop_cfun ();
34152 return resolver_decl;
34154 /* This builds the processor_model struct type defined in
34155 libgcc/config/i386/cpuinfo.c. */
34157 static tree
34158 build_processor_model_struct (void)
34160 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
34161 "__cpu_features"};
34162 tree field = NULL_TREE, field_chain = NULL_TREE;
34163 int i;
34164 tree type = make_node (RECORD_TYPE);
34166 /* The first 3 fields are unsigned int. */
34167 for (i = 0; i < 3; ++i)
34169 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
34170 get_identifier (field_name[i]), unsigned_type_node);
34171 if (field_chain != NULL_TREE)
34172 DECL_CHAIN (field) = field_chain;
34173 field_chain = field;
34176 /* The last field is an array of unsigned integers of size one. */
34177 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
34178 get_identifier (field_name[3]),
34179 build_array_type (unsigned_type_node,
34180 build_index_type (size_one_node)));
34181 if (field_chain != NULL_TREE)
34182 DECL_CHAIN (field) = field_chain;
34183 field_chain = field;
34185 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
34186 return type;
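/* For reference, the record built above is meant to match the
   __processor_model struct that libgcc defines; roughly (a sketch, not a
   verbatim copy of libgcc/config/i386/cpuinfo.c):

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };  */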
34189 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
34191 static tree
34192 make_var_decl (tree type, const char *name)
34194 tree new_decl;
34196 new_decl = build_decl (UNKNOWN_LOCATION,
34197 VAR_DECL,
34198 get_identifier(name),
34199 type);
34201 DECL_EXTERNAL (new_decl) = 1;
34202 TREE_STATIC (new_decl) = 1;
34203 TREE_PUBLIC (new_decl) = 1;
34204 DECL_INITIAL (new_decl) = 0;
34205 DECL_ARTIFICIAL (new_decl) = 0;
34206 DECL_PRESERVE_P (new_decl) = 1;
34208 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
34209 assemble_variable (new_decl, 0, 0, 0);
34211 return new_decl;
34214 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
34215 into a check against the __cpu_model data defined in libgcc/config/i386/cpuinfo.c. */
34217 static tree
34218 fold_builtin_cpu (tree fndecl, tree *args)
34220 unsigned int i;
34221 enum ix86_builtins fn_code = (enum ix86_builtins)
34222 DECL_FUNCTION_CODE (fndecl);
34223 tree param_string_cst = NULL;
34225 /* This is the order of bit-fields in __processor_features in cpuinfo.c. */
34226 enum processor_features
34228 F_CMOV = 0,
34229 F_MMX,
34230 F_POPCNT,
34231 F_SSE,
34232 F_SSE2,
34233 F_SSE3,
34234 F_SSSE3,
34235 F_SSE4_1,
34236 F_SSE4_2,
34237 F_AVX,
34238 F_AVX2,
34239 F_SSE4_A,
34240 F_FMA4,
34241 F_XOP,
34242 F_FMA,
34243 F_AVX512F,
34244 F_BMI,
34245 F_BMI2,
34246 F_AES,
34247 F_PCLMUL,
34248 F_AVX512VL,
34249 F_AVX512BW,
34250 F_AVX512DQ,
34251 F_AVX512CD,
34252 F_AVX512ER,
34253 F_AVX512PF,
34254 F_AVX512VBMI,
34255 F_AVX512IFMA,
34256 F_AVX5124VNNIW,
34257 F_AVX5124FMAPS,
34258 F_AVX512VPOPCNTDQ,
34259 F_MAX
34262 /* These are the values for vendor types and cpu types and subtypes
34263 in cpuinfo.c. CPU types and subtypes should have the corresponding
34264 start value subtracted. */
34265 enum processor_model
34267 M_INTEL = 1,
34268 M_AMD,
34269 M_CPU_TYPE_START,
34270 M_INTEL_BONNELL,
34271 M_INTEL_CORE2,
34272 M_INTEL_COREI7,
34273 M_AMDFAM10H,
34274 M_AMDFAM15H,
34275 M_INTEL_SILVERMONT,
34276 M_INTEL_KNL,
34277 M_AMD_BTVER1,
34278 M_AMD_BTVER2,
34279 M_CPU_SUBTYPE_START,
34280 M_INTEL_COREI7_NEHALEM,
34281 M_INTEL_COREI7_WESTMERE,
34282 M_INTEL_COREI7_SANDYBRIDGE,
34283 M_AMDFAM10H_BARCELONA,
34284 M_AMDFAM10H_SHANGHAI,
34285 M_AMDFAM10H_ISTANBUL,
34286 M_AMDFAM15H_BDVER1,
34287 M_AMDFAM15H_BDVER2,
34288 M_AMDFAM15H_BDVER3,
34289 M_AMDFAM15H_BDVER4,
34290 M_AMDFAM17H_ZNVER1,
34291 M_INTEL_COREI7_IVYBRIDGE,
34292 M_INTEL_COREI7_HASWELL,
34293 M_INTEL_COREI7_BROADWELL,
34294 M_INTEL_COREI7_SKYLAKE,
34295 M_INTEL_COREI7_SKYLAKE_AVX512
34298 static struct _arch_names_table
34300 const char *const name;
34301 const enum processor_model model;
34303 const arch_names_table[] =
34305 {"amd", M_AMD},
34306 {"intel", M_INTEL},
34307 {"atom", M_INTEL_BONNELL},
34308 {"slm", M_INTEL_SILVERMONT},
34309 {"core2", M_INTEL_CORE2},
34310 {"corei7", M_INTEL_COREI7},
34311 {"nehalem", M_INTEL_COREI7_NEHALEM},
34312 {"westmere", M_INTEL_COREI7_WESTMERE},
34313 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
34314 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
34315 {"haswell", M_INTEL_COREI7_HASWELL},
34316 {"broadwell", M_INTEL_COREI7_BROADWELL},
34317 {"skylake", M_INTEL_COREI7_SKYLAKE},
34318 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
34319 {"bonnell", M_INTEL_BONNELL},
34320 {"silvermont", M_INTEL_SILVERMONT},
34321 {"knl", M_INTEL_KNL},
34322 {"amdfam10h", M_AMDFAM10H},
34323 {"barcelona", M_AMDFAM10H_BARCELONA},
34324 {"shanghai", M_AMDFAM10H_SHANGHAI},
34325 {"istanbul", M_AMDFAM10H_ISTANBUL},
34326 {"btver1", M_AMD_BTVER1},
34327 {"amdfam15h", M_AMDFAM15H},
34328 {"bdver1", M_AMDFAM15H_BDVER1},
34329 {"bdver2", M_AMDFAM15H_BDVER2},
34330 {"bdver3", M_AMDFAM15H_BDVER3},
34331 {"bdver4", M_AMDFAM15H_BDVER4},
34332 {"btver2", M_AMD_BTVER2},
34333 {"znver1", M_AMDFAM17H_ZNVER1},
34336 static struct _isa_names_table
34338 const char *const name;
34339 const enum processor_features feature;
34341 const isa_names_table[] =
34343 {"cmov", F_CMOV},
34344 {"mmx", F_MMX},
34345 {"popcnt", F_POPCNT},
34346 {"sse", F_SSE},
34347 {"sse2", F_SSE2},
34348 {"sse3", F_SSE3},
34349 {"ssse3", F_SSSE3},
34350 {"sse4a", F_SSE4_A},
34351 {"sse4.1", F_SSE4_1},
34352 {"sse4.2", F_SSE4_2},
34353 {"avx", F_AVX},
34354 {"fma4", F_FMA4},
34355 {"xop", F_XOP},
34356 {"fma", F_FMA},
34357 {"avx2", F_AVX2},
34358 {"avx512f", F_AVX512F},
34359 {"bmi", F_BMI},
34360 {"bmi2", F_BMI2},
34361 {"aes", F_AES},
34362 {"pclmul", F_PCLMUL},
34363 {"avx512vl", F_AVX512VL},
34364 {"avx512bw", F_AVX512BW},
34365 {"avx512dq", F_AVX512DQ},
34366 {"avx512cd", F_AVX512CD},
34367 {"avx512er", F_AVX512ER},
34368 {"avx512pf", F_AVX512PF},
34369 {"avx512vbmi", F_AVX512VBMI},
34370 {"avx512ifma", F_AVX512IFMA},
34371 {"avx5124vnniw", F_AVX5124VNNIW},
34372 {"avx5124fmaps", F_AVX5124FMAPS},
34373 {"avx512vpopcntdq", F_AVX512VPOPCNTDQ}
34376 tree __processor_model_type = build_processor_model_struct ();
34377 tree __cpu_model_var = make_var_decl (__processor_model_type,
34378 "__cpu_model");
34381 varpool_node::add (__cpu_model_var);
34383 gcc_assert ((args != NULL) && (*args != NULL));
34385 param_string_cst = *args;
34386 while (param_string_cst
34387 && TREE_CODE (param_string_cst) != STRING_CST)
34389 /* *args must be an expr that can contain other EXPRs leading to a
34390 STRING_CST. */
34391 if (!EXPR_P (param_string_cst))
34393 error ("Parameter to builtin must be a string constant or literal");
34394 return integer_zero_node;
34396 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
34399 gcc_assert (param_string_cst);
34401 if (fn_code == IX86_BUILTIN_CPU_IS)
34403 tree ref;
34404 tree field;
34405 tree final;
34407 unsigned int field_val = 0;
34408 unsigned int NUM_ARCH_NAMES
34409 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
34411 for (i = 0; i < NUM_ARCH_NAMES; i++)
34412 if (strcmp (arch_names_table[i].name,
34413 TREE_STRING_POINTER (param_string_cst)) == 0)
34414 break;
34416 if (i == NUM_ARCH_NAMES)
34418 error ("Parameter to builtin not valid: %s",
34419 TREE_STRING_POINTER (param_string_cst));
34420 return integer_zero_node;
34423 field = TYPE_FIELDS (__processor_model_type);
34424 field_val = arch_names_table[i].model;
34426 /* CPU types are stored in the next field. */
34427 if (field_val > M_CPU_TYPE_START
34428 && field_val < M_CPU_SUBTYPE_START)
34430 field = DECL_CHAIN (field);
34431 field_val -= M_CPU_TYPE_START;
34434 /* CPU subtypes are stored in the next field. */
34435 if (field_val > M_CPU_SUBTYPE_START)
34437 field = DECL_CHAIN (DECL_CHAIN (field));
34438 field_val -= M_CPU_SUBTYPE_START;
34441 /* Get the appropriate field in __cpu_model. */
34442 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
34443 field, NULL_TREE);
34445 /* Check the value. */
34446 final = build2 (EQ_EXPR, unsigned_type_node, ref,
34447 build_int_cstu (unsigned_type_node, field_val));
34448 return build1 (CONVERT_EXPR, integer_type_node, final);
34450 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
34452 tree ref;
34453 tree array_elt;
34454 tree field;
34455 tree final;
34457 unsigned int field_val = 0;
34458 unsigned int NUM_ISA_NAMES
34459 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
34461 for (i = 0; i < NUM_ISA_NAMES; i++)
34462 if (strcmp (isa_names_table[i].name,
34463 TREE_STRING_POINTER (param_string_cst)) == 0)
34464 break;
34466 if (i == NUM_ISA_NAMES)
34468 error ("Parameter to builtin not valid: %s",
34469 TREE_STRING_POINTER (param_string_cst));
34470 return integer_zero_node;
34473 field = TYPE_FIELDS (__processor_model_type);
34474 /* Get the last field, which is __cpu_features. */
34475 while (DECL_CHAIN (field))
34476 field = DECL_CHAIN (field);
34478 /* Get the appropriate field: __cpu_model.__cpu_features */
34479 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
34480 field, NULL_TREE);
34482 /* Access the 0th element of __cpu_features array. */
34483 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
34484 integer_zero_node, NULL_TREE, NULL_TREE);
34486 field_val = (1 << isa_names_table[i].feature);
34487 /* Return __cpu_model.__cpu_features[0] & field_val */
34488 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
34489 build_int_cstu (unsigned_type_node, field_val));
34490 return build1 (CONVERT_EXPR, integer_type_node, final);
34492 gcc_unreachable ();
34495 static tree
34496 ix86_fold_builtin (tree fndecl, int n_args,
34497 tree *args, bool ignore ATTRIBUTE_UNUSED)
34499 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
34501 enum ix86_builtins fn_code = (enum ix86_builtins)
34502 DECL_FUNCTION_CODE (fndecl);
34503 switch (fn_code)
34505 case IX86_BUILTIN_CPU_IS:
34506 case IX86_BUILTIN_CPU_SUPPORTS:
34507 gcc_assert (n_args == 1);
34508 return fold_builtin_cpu (fndecl, args);
34510 case IX86_BUILTIN_NANQ:
34511 case IX86_BUILTIN_NANSQ:
34513 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34514 const char *str = c_getstr (*args);
34515 int quiet = fn_code == IX86_BUILTIN_NANQ;
34516 REAL_VALUE_TYPE real;
34518 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
34519 return build_real (type, real);
34520 return NULL_TREE;
34523 case IX86_BUILTIN_INFQ:
34524 case IX86_BUILTIN_HUGE_VALQ:
34526 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34527 REAL_VALUE_TYPE inf;
34528 real_inf (&inf);
34529 return build_real (type, inf);
34532 case IX86_BUILTIN_TZCNT16:
34533 case IX86_BUILTIN_CTZS:
34534 case IX86_BUILTIN_TZCNT32:
34535 case IX86_BUILTIN_TZCNT64:
34536 gcc_assert (n_args == 1);
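/* Unlike __builtin_ctz, tzcnt is defined for a zero input: it returns
   the operand width, which is what the constant folding below implements.  */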
34537 if (TREE_CODE (args[0]) == INTEGER_CST)
34539 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34540 tree arg = args[0];
34541 if (fn_code == IX86_BUILTIN_TZCNT16
34542 || fn_code == IX86_BUILTIN_CTZS)
34543 arg = fold_convert (short_unsigned_type_node, arg);
34544 if (integer_zerop (arg))
34545 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
34546 else
34547 return fold_const_call (CFN_CTZ, type, arg);
34549 break;
34551 case IX86_BUILTIN_LZCNT16:
34552 case IX86_BUILTIN_CLZS:
34553 case IX86_BUILTIN_LZCNT32:
34554 case IX86_BUILTIN_LZCNT64:
34555 gcc_assert (n_args == 1);
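/* Likewise, lzcnt of zero is defined and yields the operand width.  */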
34556 if (TREE_CODE (args[0]) == INTEGER_CST)
34558 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34559 tree arg = args[0];
34560 if (fn_code == IX86_BUILTIN_LZCNT16
34561 || fn_code == IX86_BUILTIN_CLZS)
34562 arg = fold_convert (short_unsigned_type_node, arg);
34563 if (integer_zerop (arg))
34564 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
34565 else
34566 return fold_const_call (CFN_CLZ, type, arg);
34568 break;
34570 case IX86_BUILTIN_BEXTR32:
34571 case IX86_BUILTIN_BEXTR64:
34572 case IX86_BUILTIN_BEXTRI32:
34573 case IX86_BUILTIN_BEXTRI64:
34574 gcc_assert (n_args == 2);
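/* BEXTR extracts LEN bits starting at bit START, where START is the low
   byte and LEN the second byte of the control operand; a START past the
   operand precision or a zero LEN yields 0.  */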
34575 if (tree_fits_uhwi_p (args[1]))
34577 unsigned HOST_WIDE_INT res = 0;
34578 unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
34579 unsigned int start = tree_to_uhwi (args[1]);
34580 unsigned int len = (start & 0xff00) >> 8;
34581 start &= 0xff;
34582 if (start >= prec || len == 0)
34583 res = 0;
34584 else if (!tree_fits_uhwi_p (args[0]))
34585 break;
34586 else
34587 res = tree_to_uhwi (args[0]) >> start;
34588 if (len > prec)
34589 len = prec;
34590 if (len < HOST_BITS_PER_WIDE_INT)
34591 res &= (HOST_WIDE_INT_1U << len) - 1;
34592 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34594 break;
34596 case IX86_BUILTIN_BZHI32:
34597 case IX86_BUILTIN_BZHI64:
34598 gcc_assert (n_args == 2);
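/* BZHI zeroes all bits of the source from the index in the low byte of
   the second operand upwards; an index >= the operand precision leaves
   the source unchanged.  */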
34599 if (tree_fits_uhwi_p (args[1]))
34601 unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
34602 if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
34603 return args[0];
34604 if (!tree_fits_uhwi_p (args[0]))
34605 break;
34606 unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
34607 res &= ~(HOST_WIDE_INT_M1U << idx);
34608 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34610 break;
34612 case IX86_BUILTIN_PDEP32:
34613 case IX86_BUILTIN_PDEP64:
34614 gcc_assert (n_args == 2);
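/* PDEP deposits the low bits of the source at the positions of the set
   bits of the mask, scanning both from the least significant bit.  */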
34615 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
34617 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
34618 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
34619 unsigned HOST_WIDE_INT res = 0;
34620 unsigned HOST_WIDE_INT m, k = 1;
34621 for (m = 1; m; m <<= 1)
34622 if ((mask & m) != 0)
34624 if ((src & k) != 0)
34625 res |= m;
34626 k <<= 1;
34628 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34630 break;
34632 case IX86_BUILTIN_PEXT32:
34633 case IX86_BUILTIN_PEXT64:
34634 gcc_assert (n_args == 2);
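/* PEXT gathers the source bits selected by the mask and packs them into
   the low bits of the result.  */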
34635 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
34637 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
34638 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
34639 unsigned HOST_WIDE_INT res = 0;
34640 unsigned HOST_WIDE_INT m, k = 1;
34641 for (m = 1; m; m <<= 1)
34642 if ((mask & m) != 0)
34644 if ((src & m) != 0)
34645 res |= k;
34646 k <<= 1;
34648 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34650 break;
34652 default:
34653 break;
34657 #ifdef SUBTARGET_FOLD_BUILTIN
34658 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
34659 #endif
34661 return NULL_TREE;
34664 /* Fold an MD builtin (use ix86_fold_builtin for folding into
34665 a constant) in GIMPLE. */
34667 bool
34668 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
34670 gimple *stmt = gsi_stmt (*gsi);
34671 tree fndecl = gimple_call_fndecl (stmt);
34672 gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
34673 int n_args = gimple_call_num_args (stmt);
34674 enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
34675 tree decl = NULL_TREE;
34676 tree arg0, arg1;
34678 switch (fn_code)
34680 case IX86_BUILTIN_TZCNT32:
34681 decl = builtin_decl_implicit (BUILT_IN_CTZ);
34682 goto fold_tzcnt_lzcnt;
34684 case IX86_BUILTIN_TZCNT64:
34685 decl = builtin_decl_implicit (BUILT_IN_CTZLL);
34686 goto fold_tzcnt_lzcnt;
34688 case IX86_BUILTIN_LZCNT32:
34689 decl = builtin_decl_implicit (BUILT_IN_CLZ);
34690 goto fold_tzcnt_lzcnt;
34692 case IX86_BUILTIN_LZCNT64:
34693 decl = builtin_decl_implicit (BUILT_IN_CLZLL);
34694 goto fold_tzcnt_lzcnt;
34696 fold_tzcnt_lzcnt:
34697 gcc_assert (n_args == 1);
34698 arg0 = gimple_call_arg (stmt, 0);
34699 if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
34701 int prec = TYPE_PRECISION (TREE_TYPE (arg0));
34702 /* If arg0 is provably non-zero, optimize into the generic
34703 __builtin_c[tl]z{,ll} functions, which the middle-end handles
34704 better. */
34705 if (!expr_not_equal_to (arg0, wi::zero (prec)))
34706 return false;
34708 location_t loc = gimple_location (stmt);
34709 gimple *g = gimple_build_call (decl, 1, arg0);
34710 gimple_set_location (g, loc);
34711 tree lhs = make_ssa_name (integer_type_node);
34712 gimple_call_set_lhs (g, lhs);
34713 gsi_insert_before (gsi, g, GSI_SAME_STMT);
34714 g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
34715 gimple_set_location (g, loc);
34716 gsi_replace (gsi, g, false);
34717 return true;
34719 break;
34721 case IX86_BUILTIN_BZHI32:
34722 case IX86_BUILTIN_BZHI64:
34723 gcc_assert (n_args == 2);
34724 arg1 = gimple_call_arg (stmt, 1);
34725 if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
34727 unsigned int idx = tree_to_uhwi (arg1) & 0xff;
34728 arg0 = gimple_call_arg (stmt, 0);
34729 if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
34730 break;
34731 location_t loc = gimple_location (stmt);
34732 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
34733 gimple_set_location (g, loc);
34734 gsi_replace (gsi, g, false);
34735 return true;
34737 break;
34739 case IX86_BUILTIN_PDEP32:
34740 case IX86_BUILTIN_PDEP64:
34741 case IX86_BUILTIN_PEXT32:
34742 case IX86_BUILTIN_PEXT64:
34743 gcc_assert (n_args == 2);
34744 arg1 = gimple_call_arg (stmt, 1);
34745 if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
34747 location_t loc = gimple_location (stmt);
34748 arg0 = gimple_call_arg (stmt, 0);
34749 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
34750 gimple_set_location (g, loc);
34751 gsi_replace (gsi, g, false);
34752 return true;
34754 break;
34756 default:
34757 break;
34760 return false;
34763 /* Make builtins to detect cpu type and features supported. NAME is
34764 the builtin name, CODE is the builtin code, and FTYPE is the function
34765 type of the builtin. */
34767 static void
34768 make_cpu_type_builtin (const char* name, int code,
34769 enum ix86_builtin_func_type ftype, bool is_const)
34771 tree decl;
34772 tree type;
34774 type = ix86_get_builtin_func_type (ftype);
34775 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
34776 NULL, NULL_TREE);
34777 gcc_assert (decl != NULL_TREE);
34778 ix86_builtins[(int) code] = decl;
34779 TREE_READONLY (decl) = is_const;
34782 /* Make builtins to get CPU type and features supported. The created
34783 builtins are:
34785 __builtin_cpu_init (), to detect cpu type and features,
34786 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
34787 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
34790 static void
34791 ix86_init_platform_type_builtins (void)
34793 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
34794 INT_FTYPE_VOID, false);
34795 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
34796 INT_FTYPE_PCCHAR, true);
34797 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
34798 INT_FTYPE_PCCHAR, true);
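/* Illustrative example (hypothetical user code, not part of this file):
   the builtins created above are typically used as

     __builtin_cpu_init ();
     if (__builtin_cpu_is ("haswell") && __builtin_cpu_supports ("avx2"))
       run_avx2_path ();
     else
       run_generic_path ();

   where run_avx2_path and run_generic_path are hypothetical helpers.
   Both predicates fold into checks against __cpu_model; see
   fold_builtin_cpu above.  */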
34801 /* Internal method for ix86_init_builtins. */
34803 static void
34804 ix86_init_builtins_va_builtins_abi (void)
34806 tree ms_va_ref, sysv_va_ref;
34807 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
34808 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
34809 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
34810 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
34812 if (!TARGET_64BIT)
34813 return;
34814 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
34815 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
34816 ms_va_ref = build_reference_type (ms_va_list_type_node);
34817 sysv_va_ref =
34818 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
34820 fnvoid_va_end_ms =
34821 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
34822 fnvoid_va_start_ms =
34823 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
34824 fnvoid_va_end_sysv =
34825 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
34826 fnvoid_va_start_sysv =
34827 build_varargs_function_type_list (void_type_node, sysv_va_ref,
34828 NULL_TREE);
34829 fnvoid_va_copy_ms =
34830 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
34831 NULL_TREE);
34832 fnvoid_va_copy_sysv =
34833 build_function_type_list (void_type_node, sysv_va_ref,
34834 sysv_va_ref, NULL_TREE);
34836 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
34837 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
34838 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
34839 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
34840 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
34841 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
34842 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
34843 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
34844 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
34845 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
34846 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
34847 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
34850 static void
34851 ix86_init_builtin_types (void)
34853 tree float80_type_node, const_string_type_node;
34855 /* The __float80 type. */
34856 float80_type_node = long_double_type_node;
34857 if (TYPE_MODE (float80_type_node) != XFmode)
34859 if (float64x_type_node != NULL_TREE
34860 && TYPE_MODE (float64x_type_node) == XFmode)
34861 float80_type_node = float64x_type_node;
34862 else
34864 /* The __float80 type. */
34865 float80_type_node = make_node (REAL_TYPE);
34867 TYPE_PRECISION (float80_type_node) = 80;
34868 layout_type (float80_type_node);
34871 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
34873 /* The __float128 type. The node has already been created as
34874 _Float128, so we only need to register the __float128 name for
34875 it. */
34876 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
34878 const_string_type_node
34879 = build_pointer_type (build_qualified_type
34880 (char_type_node, TYPE_QUAL_CONST));
34882 /* This macro is built by i386-builtin-types.awk. */
34883 DEFINE_BUILTIN_PRIMITIVE_TYPES;
34886 static void
34887 ix86_init_builtins (void)
34889 tree ftype, decl;
34891 ix86_init_builtin_types ();
34893 /* Builtins to get CPU type and features. */
34894 ix86_init_platform_type_builtins ();
34896 /* TFmode support builtins. */
34897 def_builtin_const (0, "__builtin_infq",
34898 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
34899 def_builtin_const (0, "__builtin_huge_valq",
34900 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
34902 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
34903 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
34904 BUILT_IN_MD, "nanq", NULL_TREE);
34905 TREE_READONLY (decl) = 1;
34906 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
34908 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
34909 BUILT_IN_MD, "nansq", NULL_TREE);
34910 TREE_READONLY (decl) = 1;
34911 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
34913 /* We will expand them to normal calls if SSE isn't available, since
34914 they are used by libgcc. */
34915 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
34916 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
34917 BUILT_IN_MD, "__fabstf2", NULL_TREE);
34918 TREE_READONLY (decl) = 1;
34919 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
34921 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
34922 decl = add_builtin_function ("__builtin_copysignq", ftype,
34923 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
34924 "__copysigntf3", NULL_TREE);
34925 TREE_READONLY (decl) = 1;
34926 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
34928 ix86_init_tm_builtins ();
34929 ix86_init_mmx_sse_builtins ();
34930 ix86_init_mpx_builtins ();
34932 if (TARGET_LP64)
34933 ix86_init_builtins_va_builtins_abi ();
34935 #ifdef SUBTARGET_INIT_BUILTINS
34936 SUBTARGET_INIT_BUILTINS;
34937 #endif
34940 /* Return the ix86 builtin for CODE. */
34942 static tree
34943 ix86_builtin_decl (unsigned code, bool)
34945 if (code >= IX86_BUILTIN_MAX)
34946 return error_mark_node;
34948 return ix86_builtins[code];
34951 /* Errors in the source file can cause expand_expr to return const0_rtx
34952 where we expect a vector. To avoid crashing, use one of the vector
34953 clear instructions. */
34954 static rtx
34955 safe_vector_operand (rtx x, machine_mode mode)
34957 if (x == const0_rtx)
34958 x = CONST0_RTX (mode);
34959 return x;
34962 /* Fix up modeless constants to fit the required mode. */
34963 static rtx
34964 fixup_modeless_constant (rtx x, machine_mode mode)
34966 if (GET_MODE (x) == VOIDmode)
34967 x = convert_to_mode (mode, x, 1);
34968 return x;
34971 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
34973 static rtx
34974 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
34976 rtx pat;
34977 tree arg0 = CALL_EXPR_ARG (exp, 0);
34978 tree arg1 = CALL_EXPR_ARG (exp, 1);
34979 rtx op0 = expand_normal (arg0);
34980 rtx op1 = expand_normal (arg1);
34981 machine_mode tmode = insn_data[icode].operand[0].mode;
34982 machine_mode mode0 = insn_data[icode].operand[1].mode;
34983 machine_mode mode1 = insn_data[icode].operand[2].mode;
34985 if (VECTOR_MODE_P (mode0))
34986 op0 = safe_vector_operand (op0, mode0);
34987 if (VECTOR_MODE_P (mode1))
34988 op1 = safe_vector_operand (op1, mode1);
34990 if (optimize || !target
34991 || GET_MODE (target) != tmode
34992 || !insn_data[icode].operand[0].predicate (target, tmode))
34993 target = gen_reg_rtx (tmode);
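/* If the second argument was expanded in SImode but the insn pattern
   wants TImode, load it through a V4SImode register and view its low
   part as TImode.  */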
34995 if (GET_MODE (op1) == SImode && mode1 == TImode)
34997 rtx x = gen_reg_rtx (V4SImode);
34998 emit_insn (gen_sse2_loadd (x, op1));
34999 op1 = gen_lowpart (TImode, x);
35002 if (!insn_data[icode].operand[1].predicate (op0, mode0))
35003 op0 = copy_to_mode_reg (mode0, op0);
35004 if (!insn_data[icode].operand[2].predicate (op1, mode1))
35005 op1 = copy_to_mode_reg (mode1, op1);
35007 pat = GEN_FCN (icode) (target, op0, op1);
35008 if (! pat)
35009 return 0;
35011 emit_insn (pat);
35013 return target;
35016 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
35018 static rtx
35019 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
35020 enum ix86_builtin_func_type m_type,
35021 enum rtx_code sub_code)
35023 rtx pat;
35024 int i;
35025 int nargs;
35026 bool comparison_p = false;
35027 bool tf_p = false;
35028 bool last_arg_constant = false;
35029 int num_memory = 0;
35030 struct {
35031 rtx op;
35032 machine_mode mode;
35033 } args[4];
35035 machine_mode tmode = insn_data[icode].operand[0].mode;
35037 switch (m_type)
35039 case MULTI_ARG_4_DF2_DI_I:
35040 case MULTI_ARG_4_DF2_DI_I1:
35041 case MULTI_ARG_4_SF2_SI_I:
35042 case MULTI_ARG_4_SF2_SI_I1:
35043 nargs = 4;
35044 last_arg_constant = true;
35045 break;
35047 case MULTI_ARG_3_SF:
35048 case MULTI_ARG_3_DF:
35049 case MULTI_ARG_3_SF2:
35050 case MULTI_ARG_3_DF2:
35051 case MULTI_ARG_3_DI:
35052 case MULTI_ARG_3_SI:
35053 case MULTI_ARG_3_SI_DI:
35054 case MULTI_ARG_3_HI:
35055 case MULTI_ARG_3_HI_SI:
35056 case MULTI_ARG_3_QI:
35057 case MULTI_ARG_3_DI2:
35058 case MULTI_ARG_3_SI2:
35059 case MULTI_ARG_3_HI2:
35060 case MULTI_ARG_3_QI2:
35061 nargs = 3;
35062 break;
35064 case MULTI_ARG_2_SF:
35065 case MULTI_ARG_2_DF:
35066 case MULTI_ARG_2_DI:
35067 case MULTI_ARG_2_SI:
35068 case MULTI_ARG_2_HI:
35069 case MULTI_ARG_2_QI:
35070 nargs = 2;
35071 break;
35073 case MULTI_ARG_2_DI_IMM:
35074 case MULTI_ARG_2_SI_IMM:
35075 case MULTI_ARG_2_HI_IMM:
35076 case MULTI_ARG_2_QI_IMM:
35077 nargs = 2;
35078 last_arg_constant = true;
35079 break;
35081 case MULTI_ARG_1_SF:
35082 case MULTI_ARG_1_DF:
35083 case MULTI_ARG_1_SF2:
35084 case MULTI_ARG_1_DF2:
35085 case MULTI_ARG_1_DI:
35086 case MULTI_ARG_1_SI:
35087 case MULTI_ARG_1_HI:
35088 case MULTI_ARG_1_QI:
35089 case MULTI_ARG_1_SI_DI:
35090 case MULTI_ARG_1_HI_DI:
35091 case MULTI_ARG_1_HI_SI:
35092 case MULTI_ARG_1_QI_DI:
35093 case MULTI_ARG_1_QI_SI:
35094 case MULTI_ARG_1_QI_HI:
35095 nargs = 1;
35096 break;
35098 case MULTI_ARG_2_DI_CMP:
35099 case MULTI_ARG_2_SI_CMP:
35100 case MULTI_ARG_2_HI_CMP:
35101 case MULTI_ARG_2_QI_CMP:
35102 nargs = 2;
35103 comparison_p = true;
35104 break;
35106 case MULTI_ARG_2_SF_TF:
35107 case MULTI_ARG_2_DF_TF:
35108 case MULTI_ARG_2_DI_TF:
35109 case MULTI_ARG_2_SI_TF:
35110 case MULTI_ARG_2_HI_TF:
35111 case MULTI_ARG_2_QI_TF:
35112 nargs = 2;
35113 tf_p = true;
35114 break;
35116 default:
35117 gcc_unreachable ();
35120 if (optimize || !target
35121 || GET_MODE (target) != tmode
35122 || !insn_data[icode].operand[0].predicate (target, tmode))
35123 target = gen_reg_rtx (tmode);
35124 else if (memory_operand (target, tmode))
35125 num_memory++;
35127 gcc_assert (nargs <= 4);
35129 for (i = 0; i < nargs; i++)
35131 tree arg = CALL_EXPR_ARG (exp, i);
35132 rtx op = expand_normal (arg);
35133 int adjust = (comparison_p) ? 1 : 0;
35134 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
35136 if (last_arg_constant && i == nargs - 1)
35138 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
35140 enum insn_code new_icode = icode;
35141 switch (icode)
35143 case CODE_FOR_xop_vpermil2v2df3:
35144 case CODE_FOR_xop_vpermil2v4sf3:
35145 case CODE_FOR_xop_vpermil2v4df3:
35146 case CODE_FOR_xop_vpermil2v8sf3:
35147 error ("the last argument must be a 2-bit immediate");
35148 return gen_reg_rtx (tmode);
35149 case CODE_FOR_xop_rotlv2di3:
35150 new_icode = CODE_FOR_rotlv2di3;
35151 goto xop_rotl;
35152 case CODE_FOR_xop_rotlv4si3:
35153 new_icode = CODE_FOR_rotlv4si3;
35154 goto xop_rotl;
35155 case CODE_FOR_xop_rotlv8hi3:
35156 new_icode = CODE_FOR_rotlv8hi3;
35157 goto xop_rotl;
35158 case CODE_FOR_xop_rotlv16qi3:
35159 new_icode = CODE_FOR_rotlv16qi3;
35160 xop_rotl:
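/* For a constant rotate count, reduce it modulo the element width so
   the standard rotate pattern's operand predicate accepts it.  */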
35161 if (CONST_INT_P (op))
35163 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
35164 op = GEN_INT (INTVAL (op) & mask);
35165 gcc_checking_assert
35166 (insn_data[icode].operand[i + 1].predicate (op, mode));
35168 else
35170 gcc_checking_assert
35171 (nargs == 2
35172 && insn_data[new_icode].operand[0].mode == tmode
35173 && insn_data[new_icode].operand[1].mode == tmode
35174 && insn_data[new_icode].operand[2].mode == mode
35175 && insn_data[new_icode].operand[0].predicate
35176 == insn_data[icode].operand[0].predicate
35177 && insn_data[new_icode].operand[1].predicate
35178 == insn_data[icode].operand[1].predicate);
35179 icode = new_icode;
35180 goto non_constant;
35182 break;
35183 default:
35184 gcc_unreachable ();
35188 else
35190 non_constant:
35191 if (VECTOR_MODE_P (mode))
35192 op = safe_vector_operand (op, mode);
35194 /* If we aren't optimizing, only allow one memory operand to be
35195 generated. */
35196 if (memory_operand (op, mode))
35197 num_memory++;
35199 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
35201 if (optimize
35202 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
35203 || num_memory > 1)
35204 op = force_reg (mode, op);
35207 args[i].op = op;
35208 args[i].mode = mode;
35211 switch (nargs)
35213 case 1:
35214 pat = GEN_FCN (icode) (target, args[0].op);
35215 break;
35217 case 2:
35218 if (tf_p)
35219 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35220 GEN_INT ((int)sub_code));
35221 else if (! comparison_p)
35222 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35223 else
35225 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
35226 args[0].op,
35227 args[1].op);
35229 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
35231 break;
35233 case 3:
35234 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
35235 break;
35237 case 4:
35238 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
35239 break;
35241 default:
35242 gcc_unreachable ();
35245 if (! pat)
35246 return 0;
35248 emit_insn (pat);
35249 return target;
35252 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
35253 insns with vec_merge. */
35255 static rtx
35256 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
35257 rtx target)
35259 rtx pat;
35260 tree arg0 = CALL_EXPR_ARG (exp, 0);
35261 rtx op1, op0 = expand_normal (arg0);
35262 machine_mode tmode = insn_data[icode].operand[0].mode;
35263 machine_mode mode0 = insn_data[icode].operand[1].mode;
35265 if (optimize || !target
35266 || GET_MODE (target) != tmode
35267 || !insn_data[icode].operand[0].predicate (target, tmode))
35268 target = gen_reg_rtx (tmode);
35270 if (VECTOR_MODE_P (mode0))
35271 op0 = safe_vector_operand (op0, mode0);
35273 if ((optimize && !register_operand (op0, mode0))
35274 || !insn_data[icode].operand[1].predicate (op0, mode0))
35275 op0 = copy_to_mode_reg (mode0, op0);
35277 op1 = op0;
35278 if (!insn_data[icode].operand[2].predicate (op1, mode0))
35279 op1 = copy_to_mode_reg (mode0, op1);
35281 pat = GEN_FCN (icode) (target, op0, op1);
35282 if (! pat)
35283 return 0;
35284 emit_insn (pat);
35285 return target;
35288 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
35290 static rtx
35291 ix86_expand_sse_compare (const struct builtin_description *d,
35292 tree exp, rtx target, bool swap)
35294 rtx pat;
35295 tree arg0 = CALL_EXPR_ARG (exp, 0);
35296 tree arg1 = CALL_EXPR_ARG (exp, 1);
35297 rtx op0 = expand_normal (arg0);
35298 rtx op1 = expand_normal (arg1);
35299 rtx op2;
35300 machine_mode tmode = insn_data[d->icode].operand[0].mode;
35301 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
35302 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
35303 enum rtx_code comparison = d->comparison;
35305 if (VECTOR_MODE_P (mode0))
35306 op0 = safe_vector_operand (op0, mode0);
35307 if (VECTOR_MODE_P (mode1))
35308 op1 = safe_vector_operand (op1, mode1);
35310 /* Swap operands if we have a comparison that isn't available in
35311 hardware. */
35312 if (swap)
35313 std::swap (op0, op1);
35315 if (optimize || !target
35316 || GET_MODE (target) != tmode
35317 || !insn_data[d->icode].operand[0].predicate (target, tmode))
35318 target = gen_reg_rtx (tmode);
35320 if ((optimize && !register_operand (op0, mode0))
35321 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
35322 op0 = copy_to_mode_reg (mode0, op0);
35323 if ((optimize && !register_operand (op1, mode1))
35324 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
35325 op1 = copy_to_mode_reg (mode1, op1);
35327 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
35328 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
35329 if (! pat)
35330 return 0;
35331 emit_insn (pat);
35332 return target;
35335 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
35337 static rtx
35338 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
35339 rtx target)
35341 rtx pat;
35342 tree arg0 = CALL_EXPR_ARG (exp, 0);
35343 tree arg1 = CALL_EXPR_ARG (exp, 1);
35344 rtx op0 = expand_normal (arg0);
35345 rtx op1 = expand_normal (arg1);
35346 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
35347 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
35348 enum rtx_code comparison = d->comparison;
35350 if (VECTOR_MODE_P (mode0))
35351 op0 = safe_vector_operand (op0, mode0);
35352 if (VECTOR_MODE_P (mode1))
35353 op1 = safe_vector_operand (op1, mode1);
35355 /* Swap operands if we have a comparison that isn't available in
35356 hardware. */
35357 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
35358 std::swap (op0, op1);
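/* Materialize the comparison result: zero an SImode pseudo, set its low
   QImode part from the flags comparison, and return the SImode pseudo so
   the value is already zero-extended.  */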
35360 target = gen_reg_rtx (SImode);
35361 emit_move_insn (target, const0_rtx);
35362 target = gen_rtx_SUBREG (QImode, target, 0);
35364 if ((optimize && !register_operand (op0, mode0))
35365 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35366 op0 = copy_to_mode_reg (mode0, op0);
35367 if ((optimize && !register_operand (op1, mode1))
35368 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
35369 op1 = copy_to_mode_reg (mode1, op1);
35371 pat = GEN_FCN (d->icode) (op0, op1);
35372 if (! pat)
35373 return 0;
35374 emit_insn (pat);
35375 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35376 gen_rtx_fmt_ee (comparison, QImode,
35377 SET_DEST (pat),
35378 const0_rtx)));
35380 return SUBREG_REG (target);
35383 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
35385 static rtx
35386 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
35387 rtx target)
35389 rtx pat;
35390 tree arg0 = CALL_EXPR_ARG (exp, 0);
35391 rtx op1, op0 = expand_normal (arg0);
35392 machine_mode tmode = insn_data[d->icode].operand[0].mode;
35393 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
35395 if (optimize || target == 0
35396 || GET_MODE (target) != tmode
35397 || !insn_data[d->icode].operand[0].predicate (target, tmode))
35398 target = gen_reg_rtx (tmode);
35400 if (VECTOR_MODE_P (mode0))
35401 op0 = safe_vector_operand (op0, mode0);
35403 if ((optimize && !register_operand (op0, mode0))
35404 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35405 op0 = copy_to_mode_reg (mode0, op0);
35407 op1 = GEN_INT (d->comparison);
35409 pat = GEN_FCN (d->icode) (target, op0, op1);
35410 if (! pat)
35411 return 0;
35412 emit_insn (pat);
35413 return target;
35416 static rtx
35417 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
35418 tree exp, rtx target)
35420 rtx pat;
35421 tree arg0 = CALL_EXPR_ARG (exp, 0);
35422 tree arg1 = CALL_EXPR_ARG (exp, 1);
35423 rtx op0 = expand_normal (arg0);
35424 rtx op1 = expand_normal (arg1);
35425 rtx op2;
35426 machine_mode tmode = insn_data[d->icode].operand[0].mode;
35427 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
35428 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
35430 if (optimize || target == 0
35431 || GET_MODE (target) != tmode
35432 || !insn_data[d->icode].operand[0].predicate (target, tmode))
35433 target = gen_reg_rtx (tmode);
35435 op0 = safe_vector_operand (op0, mode0);
35436 op1 = safe_vector_operand (op1, mode1);
35438 if ((optimize && !register_operand (op0, mode0))
35439 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35440 op0 = copy_to_mode_reg (mode0, op0);
35441 if ((optimize && !register_operand (op1, mode1))
35442 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
35443 op1 = copy_to_mode_reg (mode1, op1);
35445 op2 = GEN_INT (d->comparison);
35447 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
35448 if (! pat)
35449 return 0;
35450 emit_insn (pat);
35451 return target;
35454 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
35456 static rtx
35457 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
35458 rtx target)
35460 rtx pat;
35461 tree arg0 = CALL_EXPR_ARG (exp, 0);
35462 tree arg1 = CALL_EXPR_ARG (exp, 1);
35463 rtx op0 = expand_normal (arg0);
35464 rtx op1 = expand_normal (arg1);
35465 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
35466 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
35467 enum rtx_code comparison = d->comparison;
35469 if (VECTOR_MODE_P (mode0))
35470 op0 = safe_vector_operand (op0, mode0);
35471 if (VECTOR_MODE_P (mode1))
35472 op1 = safe_vector_operand (op1, mode1);
35474 target = gen_reg_rtx (SImode);
35475 emit_move_insn (target, const0_rtx);
35476 target = gen_rtx_SUBREG (QImode, target, 0);
35478 if ((optimize && !register_operand (op0, mode0))
35479 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35480 op0 = copy_to_mode_reg (mode0, op0);
35481 if ((optimize && !register_operand (op1, mode1))
35482 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
35483 op1 = copy_to_mode_reg (mode1, op1);
35485 pat = GEN_FCN (d->icode) (op0, op1);
35486 if (! pat)
35487 return 0;
35488 emit_insn (pat);
35489 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35490 gen_rtx_fmt_ee (comparison, QImode,
35491 SET_DEST (pat),
35492 const0_rtx)));
35494 return SUBREG_REG (target);
35497 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
35499 static rtx
35500 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
35501 tree exp, rtx target)
35503 rtx pat;
35504 tree arg0 = CALL_EXPR_ARG (exp, 0);
35505 tree arg1 = CALL_EXPR_ARG (exp, 1);
35506 tree arg2 = CALL_EXPR_ARG (exp, 2);
35507 tree arg3 = CALL_EXPR_ARG (exp, 3);
35508 tree arg4 = CALL_EXPR_ARG (exp, 4);
35509 rtx scratch0, scratch1;
35510 rtx op0 = expand_normal (arg0);
35511 rtx op1 = expand_normal (arg1);
35512 rtx op2 = expand_normal (arg2);
35513 rtx op3 = expand_normal (arg3);
35514 rtx op4 = expand_normal (arg4);
35515 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
35517 tmode0 = insn_data[d->icode].operand[0].mode;
35518 tmode1 = insn_data[d->icode].operand[1].mode;
35519 modev2 = insn_data[d->icode].operand[2].mode;
35520 modei3 = insn_data[d->icode].operand[3].mode;
35521 modev4 = insn_data[d->icode].operand[4].mode;
35522 modei5 = insn_data[d->icode].operand[5].mode;
35523 modeimm = insn_data[d->icode].operand[6].mode;
35525 if (VECTOR_MODE_P (modev2))
35526 op0 = safe_vector_operand (op0, modev2);
35527 if (VECTOR_MODE_P (modev4))
35528 op2 = safe_vector_operand (op2, modev4);
35530 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
35531 op0 = copy_to_mode_reg (modev2, op0);
35532 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
35533 op1 = copy_to_mode_reg (modei3, op1);
35534 if ((optimize && !register_operand (op2, modev4))
35535 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
35536 op2 = copy_to_mode_reg (modev4, op2);
35537 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
35538 op3 = copy_to_mode_reg (modei5, op3);
35540 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
35542 error ("the fifth argument must be an 8-bit immediate");
35543 return const0_rtx;
35546 if (d->code == IX86_BUILTIN_PCMPESTRI128)
35548 if (optimize || !target
35549 || GET_MODE (target) != tmode0
35550 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
35551 target = gen_reg_rtx (tmode0);
35553 scratch1 = gen_reg_rtx (tmode1);
35555 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
35557 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
35559 if (optimize || !target
35560 || GET_MODE (target) != tmode1
35561 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
35562 target = gen_reg_rtx (tmode1);
35564 scratch0 = gen_reg_rtx (tmode0);
35566 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
35568 else
35570 gcc_assert (d->flag);
35572 scratch0 = gen_reg_rtx (tmode0);
35573 scratch1 = gen_reg_rtx (tmode1);
35575 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
35578 if (! pat)
35579 return 0;
35581 emit_insn (pat);
35583 if (d->flag)
35585 target = gen_reg_rtx (SImode);
35586 emit_move_insn (target, const0_rtx);
35587 target = gen_rtx_SUBREG (QImode, target, 0);
35589 emit_insn
35590 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35591 gen_rtx_fmt_ee (EQ, QImode,
35592 gen_rtx_REG ((machine_mode) d->flag,
35593 FLAGS_REG),
35594 const0_rtx)));
35595 return SUBREG_REG (target);
35597 else
35598 return target;
35602 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
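/* Same scheme as the pcmpestr expander above, but for the implicit-length
   (NUL-terminated) string forms such as _mm_cmpistri, which take only the
   two vector operands and the mode immediate.  */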
35604 static rtx
35605 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
35606 tree exp, rtx target)
35608 rtx pat;
35609 tree arg0 = CALL_EXPR_ARG (exp, 0);
35610 tree arg1 = CALL_EXPR_ARG (exp, 1);
35611 tree arg2 = CALL_EXPR_ARG (exp, 2);
35612 rtx scratch0, scratch1;
35613 rtx op0 = expand_normal (arg0);
35614 rtx op1 = expand_normal (arg1);
35615 rtx op2 = expand_normal (arg2);
35616 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
35618 tmode0 = insn_data[d->icode].operand[0].mode;
35619 tmode1 = insn_data[d->icode].operand[1].mode;
35620 modev2 = insn_data[d->icode].operand[2].mode;
35621 modev3 = insn_data[d->icode].operand[3].mode;
35622 modeimm = insn_data[d->icode].operand[4].mode;
35624 if (VECTOR_MODE_P (modev2))
35625 op0 = safe_vector_operand (op0, modev2);
35626 if (VECTOR_MODE_P (modev3))
35627 op1 = safe_vector_operand (op1, modev3);
35629 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
35630 op0 = copy_to_mode_reg (modev2, op0);
35631 if ((optimize && !register_operand (op1, modev3))
35632 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
35633 op1 = copy_to_mode_reg (modev3, op1);
35635 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
35637 error ("the third argument must be an 8-bit immediate");
35638 return const0_rtx;
35641 if (d->code == IX86_BUILTIN_PCMPISTRI128)
35643 if (optimize || !target
35644 || GET_MODE (target) != tmode0
35645 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
35646 target = gen_reg_rtx (tmode0);
35648 scratch1 = gen_reg_rtx (tmode1);
35650 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
35652 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
35654 if (optimize || !target
35655 || GET_MODE (target) != tmode1
35656 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
35657 target = gen_reg_rtx (tmode1);
35659 scratch0 = gen_reg_rtx (tmode0);
35661 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
35663 else
35665 gcc_assert (d->flag);
35667 scratch0 = gen_reg_rtx (tmode0);
35668 scratch1 = gen_reg_rtx (tmode1);
35670 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
35673 if (! pat)
35674 return 0;
35676 emit_insn (pat);
35678 if (d->flag)
35680 target = gen_reg_rtx (SImode);
35681 emit_move_insn (target, const0_rtx);
35682 target = gen_rtx_SUBREG (QImode, target, 0);
35684 emit_insn
35685 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35686 gen_rtx_fmt_ee (EQ, QImode,
35687 gen_rtx_REG ((machine_mode) d->flag,
35688 FLAGS_REG),
35689 const0_rtx)));
35690 return SUBREG_REG (target);
35692 else
35693 return target;
35696 /* Subroutine of ix86_expand_builtin to take care of insns with
35697 variable number of operands. */
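/* The switch below keys on the builtin's function type (d->flag) to work
   out how many operands to expand (nargs), how many trailing operands must
   be immediates (nargs_constant), whether a mask operand shifts that
   position (mask_pos), and whether the second operand is a shift count
   (second_arg_count); anything more specialized is handed off to one of
   the helper expanders above.  */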
35699 static rtx
35700 ix86_expand_args_builtin (const struct builtin_description *d,
35701 tree exp, rtx target)
35703 rtx pat, real_target;
35704 unsigned int i, nargs;
35705 unsigned int nargs_constant = 0;
35706 unsigned int mask_pos = 0;
35707 int num_memory = 0;
35708 struct
35710 rtx op;
35711 machine_mode mode;
35712 } args[6];
35713 bool second_arg_count = false;
35714 enum insn_code icode = d->icode;
35715 const struct insn_data_d *insn_p = &insn_data[icode];
35716 machine_mode tmode = insn_p->operand[0].mode;
35717 machine_mode rmode = VOIDmode;
35718 bool swap = false;
35719 enum rtx_code comparison = d->comparison;
35721 switch ((enum ix86_builtin_func_type) d->flag)
35723 case V2DF_FTYPE_V2DF_ROUND:
35724 case V4DF_FTYPE_V4DF_ROUND:
35725 case V8DF_FTYPE_V8DF_ROUND:
35726 case V4SF_FTYPE_V4SF_ROUND:
35727 case V8SF_FTYPE_V8SF_ROUND:
35728 case V16SF_FTYPE_V16SF_ROUND:
35729 case V4SI_FTYPE_V4SF_ROUND:
35730 case V8SI_FTYPE_V8SF_ROUND:
35731 case V16SI_FTYPE_V16SF_ROUND:
35732 return ix86_expand_sse_round (d, exp, target);
35733 case V4SI_FTYPE_V2DF_V2DF_ROUND:
35734 case V8SI_FTYPE_V4DF_V4DF_ROUND:
35735 case V16SI_FTYPE_V8DF_V8DF_ROUND:
35736 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
35737 case INT_FTYPE_V8SF_V8SF_PTEST:
35738 case INT_FTYPE_V4DI_V4DI_PTEST:
35739 case INT_FTYPE_V4DF_V4DF_PTEST:
35740 case INT_FTYPE_V4SF_V4SF_PTEST:
35741 case INT_FTYPE_V2DI_V2DI_PTEST:
35742 case INT_FTYPE_V2DF_V2DF_PTEST:
35743 return ix86_expand_sse_ptest (d, exp, target);
35744 case FLOAT128_FTYPE_FLOAT128:
35745 case FLOAT_FTYPE_FLOAT:
35746 case INT_FTYPE_INT:
35747 case UINT_FTYPE_UINT:
35748 case UINT16_FTYPE_UINT16:
35749 case UINT64_FTYPE_INT:
35750 case UINT64_FTYPE_UINT64:
35751 case INT64_FTYPE_INT64:
35752 case INT64_FTYPE_V4SF:
35753 case INT64_FTYPE_V2DF:
35754 case INT_FTYPE_V16QI:
35755 case INT_FTYPE_V8QI:
35756 case INT_FTYPE_V8SF:
35757 case INT_FTYPE_V4DF:
35758 case INT_FTYPE_V4SF:
35759 case INT_FTYPE_V2DF:
35760 case INT_FTYPE_V32QI:
35761 case V16QI_FTYPE_V16QI:
35762 case V8SI_FTYPE_V8SF:
35763 case V8SI_FTYPE_V4SI:
35764 case V8HI_FTYPE_V8HI:
35765 case V8HI_FTYPE_V16QI:
35766 case V8QI_FTYPE_V8QI:
35767 case V8SF_FTYPE_V8SF:
35768 case V8SF_FTYPE_V8SI:
35769 case V8SF_FTYPE_V4SF:
35770 case V8SF_FTYPE_V8HI:
35771 case V4SI_FTYPE_V4SI:
35772 case V4SI_FTYPE_V16QI:
35773 case V4SI_FTYPE_V4SF:
35774 case V4SI_FTYPE_V8SI:
35775 case V4SI_FTYPE_V8HI:
35776 case V4SI_FTYPE_V4DF:
35777 case V4SI_FTYPE_V2DF:
35778 case V4HI_FTYPE_V4HI:
35779 case V4DF_FTYPE_V4DF:
35780 case V4DF_FTYPE_V4SI:
35781 case V4DF_FTYPE_V4SF:
35782 case V4DF_FTYPE_V2DF:
35783 case V4SF_FTYPE_V4SF:
35784 case V4SF_FTYPE_V4SI:
35785 case V4SF_FTYPE_V8SF:
35786 case V4SF_FTYPE_V4DF:
35787 case V4SF_FTYPE_V8HI:
35788 case V4SF_FTYPE_V2DF:
35789 case V2DI_FTYPE_V2DI:
35790 case V2DI_FTYPE_V16QI:
35791 case V2DI_FTYPE_V8HI:
35792 case V2DI_FTYPE_V4SI:
35793 case V2DF_FTYPE_V2DF:
35794 case V2DF_FTYPE_V4SI:
35795 case V2DF_FTYPE_V4DF:
35796 case V2DF_FTYPE_V4SF:
35797 case V2DF_FTYPE_V2SI:
35798 case V2SI_FTYPE_V2SI:
35799 case V2SI_FTYPE_V4SF:
35800 case V2SI_FTYPE_V2SF:
35801 case V2SI_FTYPE_V2DF:
35802 case V2SF_FTYPE_V2SF:
35803 case V2SF_FTYPE_V2SI:
35804 case V32QI_FTYPE_V32QI:
35805 case V32QI_FTYPE_V16QI:
35806 case V16HI_FTYPE_V16HI:
35807 case V16HI_FTYPE_V8HI:
35808 case V8SI_FTYPE_V8SI:
35809 case V16HI_FTYPE_V16QI:
35810 case V8SI_FTYPE_V16QI:
35811 case V4DI_FTYPE_V16QI:
35812 case V8SI_FTYPE_V8HI:
35813 case V4DI_FTYPE_V8HI:
35814 case V4DI_FTYPE_V4SI:
35815 case V4DI_FTYPE_V2DI:
35816 case UQI_FTYPE_UQI:
35817 case UHI_FTYPE_UHI:
35818 case USI_FTYPE_USI:
35819 case USI_FTYPE_UQI:
35820 case USI_FTYPE_UHI:
35821 case UDI_FTYPE_UDI:
35822 case UHI_FTYPE_V16QI:
35823 case USI_FTYPE_V32QI:
35824 case UDI_FTYPE_V64QI:
35825 case V16QI_FTYPE_UHI:
35826 case V32QI_FTYPE_USI:
35827 case V64QI_FTYPE_UDI:
35828 case V8HI_FTYPE_UQI:
35829 case V16HI_FTYPE_UHI:
35830 case V32HI_FTYPE_USI:
35831 case V4SI_FTYPE_UQI:
35832 case V8SI_FTYPE_UQI:
35833 case V4SI_FTYPE_UHI:
35834 case V8SI_FTYPE_UHI:
35835 case UQI_FTYPE_V8HI:
35836 case UHI_FTYPE_V16HI:
35837 case USI_FTYPE_V32HI:
35838 case UQI_FTYPE_V4SI:
35839 case UQI_FTYPE_V8SI:
35840 case UHI_FTYPE_V16SI:
35841 case UQI_FTYPE_V2DI:
35842 case UQI_FTYPE_V4DI:
35843 case UQI_FTYPE_V8DI:
35844 case V16SI_FTYPE_UHI:
35845 case V2DI_FTYPE_UQI:
35846 case V4DI_FTYPE_UQI:
35847 case V16SI_FTYPE_INT:
35848 case V16SF_FTYPE_V8SF:
35849 case V16SI_FTYPE_V8SI:
35850 case V16SF_FTYPE_V4SF:
35851 case V16SI_FTYPE_V4SI:
35852 case V16SI_FTYPE_V16SF:
35853 case V16SI_FTYPE_V16SI:
35854 case V16SF_FTYPE_V16SF:
35855 case V8DI_FTYPE_UQI:
35856 case V8DI_FTYPE_V8DI:
35857 case V8DF_FTYPE_V4DF:
35858 case V8DF_FTYPE_V2DF:
35859 case V8DF_FTYPE_V8DF:
35860 nargs = 1;
35861 break;
35862 case V4SF_FTYPE_V4SF_VEC_MERGE:
35863 case V2DF_FTYPE_V2DF_VEC_MERGE:
35864 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
35865 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
35866 case V16QI_FTYPE_V16QI_V16QI:
35867 case V16QI_FTYPE_V8HI_V8HI:
35868 case V16SF_FTYPE_V16SF_V16SF:
35869 case V8QI_FTYPE_V8QI_V8QI:
35870 case V8QI_FTYPE_V4HI_V4HI:
35871 case V8HI_FTYPE_V8HI_V8HI:
35872 case V8HI_FTYPE_V16QI_V16QI:
35873 case V8HI_FTYPE_V4SI_V4SI:
35874 case V8SF_FTYPE_V8SF_V8SF:
35875 case V8SF_FTYPE_V8SF_V8SI:
35876 case V8DF_FTYPE_V8DF_V8DF:
35877 case V4SI_FTYPE_V4SI_V4SI:
35878 case V4SI_FTYPE_V8HI_V8HI:
35879 case V4SI_FTYPE_V2DF_V2DF:
35880 case V4HI_FTYPE_V4HI_V4HI:
35881 case V4HI_FTYPE_V8QI_V8QI:
35882 case V4HI_FTYPE_V2SI_V2SI:
35883 case V4DF_FTYPE_V4DF_V4DF:
35884 case V4DF_FTYPE_V4DF_V4DI:
35885 case V4SF_FTYPE_V4SF_V4SF:
35886 case V4SF_FTYPE_V4SF_V4SI:
35887 case V4SF_FTYPE_V4SF_V2SI:
35888 case V4SF_FTYPE_V4SF_V2DF:
35889 case V4SF_FTYPE_V4SF_UINT:
35890 case V4SF_FTYPE_V4SF_DI:
35891 case V4SF_FTYPE_V4SF_SI:
35892 case V2DI_FTYPE_V2DI_V2DI:
35893 case V2DI_FTYPE_V16QI_V16QI:
35894 case V2DI_FTYPE_V4SI_V4SI:
35895 case V2DI_FTYPE_V2DI_V16QI:
35896 case V2SI_FTYPE_V2SI_V2SI:
35897 case V2SI_FTYPE_V4HI_V4HI:
35898 case V2SI_FTYPE_V2SF_V2SF:
35899 case V2DF_FTYPE_V2DF_V2DF:
35900 case V2DF_FTYPE_V2DF_V4SF:
35901 case V2DF_FTYPE_V2DF_V2DI:
35902 case V2DF_FTYPE_V2DF_DI:
35903 case V2DF_FTYPE_V2DF_SI:
35904 case V2DF_FTYPE_V2DF_UINT:
35905 case V2SF_FTYPE_V2SF_V2SF:
35906 case V1DI_FTYPE_V1DI_V1DI:
35907 case V1DI_FTYPE_V8QI_V8QI:
35908 case V1DI_FTYPE_V2SI_V2SI:
35909 case V32QI_FTYPE_V16HI_V16HI:
35910 case V16HI_FTYPE_V8SI_V8SI:
35911 case V32QI_FTYPE_V32QI_V32QI:
35912 case V16HI_FTYPE_V32QI_V32QI:
35913 case V16HI_FTYPE_V16HI_V16HI:
35914 case V8SI_FTYPE_V4DF_V4DF:
35915 case V8SI_FTYPE_V8SI_V8SI:
35916 case V8SI_FTYPE_V16HI_V16HI:
35917 case V4DI_FTYPE_V4DI_V4DI:
35918 case V4DI_FTYPE_V8SI_V8SI:
35919 case V8DI_FTYPE_V64QI_V64QI:
35920 if (comparison == UNKNOWN)
35921 return ix86_expand_binop_builtin (icode, exp, target);
35922 nargs = 2;
35923 break;
35924 case V4SF_FTYPE_V4SF_V4SF_SWAP:
35925 case V2DF_FTYPE_V2DF_V2DF_SWAP:
35926 gcc_assert (comparison != UNKNOWN);
35927 nargs = 2;
35928 swap = true;
35929 break;
35930 case V16HI_FTYPE_V16HI_V8HI_COUNT:
35931 case V16HI_FTYPE_V16HI_SI_COUNT:
35932 case V8SI_FTYPE_V8SI_V4SI_COUNT:
35933 case V8SI_FTYPE_V8SI_SI_COUNT:
35934 case V4DI_FTYPE_V4DI_V2DI_COUNT:
35935 case V4DI_FTYPE_V4DI_INT_COUNT:
35936 case V8HI_FTYPE_V8HI_V8HI_COUNT:
35937 case V8HI_FTYPE_V8HI_SI_COUNT:
35938 case V4SI_FTYPE_V4SI_V4SI_COUNT:
35939 case V4SI_FTYPE_V4SI_SI_COUNT:
35940 case V4HI_FTYPE_V4HI_V4HI_COUNT:
35941 case V4HI_FTYPE_V4HI_SI_COUNT:
35942 case V2DI_FTYPE_V2DI_V2DI_COUNT:
35943 case V2DI_FTYPE_V2DI_SI_COUNT:
35944 case V2SI_FTYPE_V2SI_V2SI_COUNT:
35945 case V2SI_FTYPE_V2SI_SI_COUNT:
35946 case V1DI_FTYPE_V1DI_V1DI_COUNT:
35947 case V1DI_FTYPE_V1DI_SI_COUNT:
35948 nargs = 2;
35949 second_arg_count = true;
35950 break;
35951 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
35952 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
35953 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
35954 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
35955 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
35956 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
35957 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
35958 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
35959 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
35960 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
35961 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
35962 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
35963 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
35964 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
35965 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
35966 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
35967 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
35968 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
35969 nargs = 4;
35970 second_arg_count = true;
35971 break;
35972 case UINT64_FTYPE_UINT64_UINT64:
35973 case UINT_FTYPE_UINT_UINT:
35974 case UINT_FTYPE_UINT_USHORT:
35975 case UINT_FTYPE_UINT_UCHAR:
35976 case UINT16_FTYPE_UINT16_INT:
35977 case UINT8_FTYPE_UINT8_INT:
35978 case UQI_FTYPE_UQI_UQI:
35979 case UHI_FTYPE_UHI_UHI:
35980 case USI_FTYPE_USI_USI:
35981 case UDI_FTYPE_UDI_UDI:
35982 case V16SI_FTYPE_V8DF_V8DF:
35983 nargs = 2;
35984 break;
35985 case V2DI_FTYPE_V2DI_INT_CONVERT:
35986 nargs = 2;
35987 rmode = V1TImode;
35988 nargs_constant = 1;
35989 break;
35990 case V4DI_FTYPE_V4DI_INT_CONVERT:
35991 nargs = 2;
35992 rmode = V2TImode;
35993 nargs_constant = 1;
35994 break;
35995 case V8DI_FTYPE_V8DI_INT_CONVERT:
35996 nargs = 2;
35997 rmode = V4TImode;
35998 nargs_constant = 1;
35999 break;
36000 case V8HI_FTYPE_V8HI_INT:
36001 case V8HI_FTYPE_V8SF_INT:
36002 case V16HI_FTYPE_V16SF_INT:
36003 case V8HI_FTYPE_V4SF_INT:
36004 case V8SF_FTYPE_V8SF_INT:
36005 case V4SF_FTYPE_V16SF_INT:
36006 case V16SF_FTYPE_V16SF_INT:
36007 case V4SI_FTYPE_V4SI_INT:
36008 case V4SI_FTYPE_V8SI_INT:
36009 case V4HI_FTYPE_V4HI_INT:
36010 case V4DF_FTYPE_V4DF_INT:
36011 case V4DF_FTYPE_V8DF_INT:
36012 case V4SF_FTYPE_V4SF_INT:
36013 case V4SF_FTYPE_V8SF_INT:
36014 case V2DI_FTYPE_V2DI_INT:
36015 case V2DF_FTYPE_V2DF_INT:
36016 case V2DF_FTYPE_V4DF_INT:
36017 case V16HI_FTYPE_V16HI_INT:
36018 case V8SI_FTYPE_V8SI_INT:
36019 case V16SI_FTYPE_V16SI_INT:
36020 case V4SI_FTYPE_V16SI_INT:
36021 case V4DI_FTYPE_V4DI_INT:
36022 case V2DI_FTYPE_V4DI_INT:
36023 case V4DI_FTYPE_V8DI_INT:
36024 case QI_FTYPE_V4SF_INT:
36025 case QI_FTYPE_V2DF_INT:
36026 case UQI_FTYPE_UQI_UQI_CONST:
36027 case UHI_FTYPE_UHI_UQI:
36028 case USI_FTYPE_USI_UQI:
36029 case UDI_FTYPE_UDI_UQI:
36030 nargs = 2;
36031 nargs_constant = 1;
36032 break;
36033 case V16QI_FTYPE_V16QI_V16QI_V16QI:
36034 case V8SF_FTYPE_V8SF_V8SF_V8SF:
36035 case V4DF_FTYPE_V4DF_V4DF_V4DF:
36036 case V4SF_FTYPE_V4SF_V4SF_V4SF:
36037 case V2DF_FTYPE_V2DF_V2DF_V2DF:
36038 case V32QI_FTYPE_V32QI_V32QI_V32QI:
36039 case UHI_FTYPE_V16SI_V16SI_UHI:
36040 case UQI_FTYPE_V8DI_V8DI_UQI:
36041 case V16HI_FTYPE_V16SI_V16HI_UHI:
36042 case V16QI_FTYPE_V16SI_V16QI_UHI:
36043 case V16QI_FTYPE_V8DI_V16QI_UQI:
36044 case V16SF_FTYPE_V16SF_V16SF_UHI:
36045 case V16SF_FTYPE_V4SF_V16SF_UHI:
36046 case V16SI_FTYPE_SI_V16SI_UHI:
36047 case V16SI_FTYPE_V16HI_V16SI_UHI:
36048 case V16SI_FTYPE_V16QI_V16SI_UHI:
36049 case V8SF_FTYPE_V4SF_V8SF_UQI:
36050 case V4DF_FTYPE_V2DF_V4DF_UQI:
36051 case V8SI_FTYPE_V4SI_V8SI_UQI:
36052 case V8SI_FTYPE_SI_V8SI_UQI:
36053 case V4SI_FTYPE_V4SI_V4SI_UQI:
36054 case V4SI_FTYPE_SI_V4SI_UQI:
36055 case V4DI_FTYPE_V2DI_V4DI_UQI:
36056 case V4DI_FTYPE_DI_V4DI_UQI:
36057 case V2DI_FTYPE_V2DI_V2DI_UQI:
36058 case V2DI_FTYPE_DI_V2DI_UQI:
36059 case V64QI_FTYPE_V64QI_V64QI_UDI:
36060 case V64QI_FTYPE_V16QI_V64QI_UDI:
36061 case V64QI_FTYPE_QI_V64QI_UDI:
36062 case V32QI_FTYPE_V32QI_V32QI_USI:
36063 case V32QI_FTYPE_V16QI_V32QI_USI:
36064 case V32QI_FTYPE_QI_V32QI_USI:
36065 case V16QI_FTYPE_V16QI_V16QI_UHI:
36066 case V16QI_FTYPE_QI_V16QI_UHI:
36067 case V32HI_FTYPE_V8HI_V32HI_USI:
36068 case V32HI_FTYPE_HI_V32HI_USI:
36069 case V16HI_FTYPE_V8HI_V16HI_UHI:
36070 case V16HI_FTYPE_HI_V16HI_UHI:
36071 case V8HI_FTYPE_V8HI_V8HI_UQI:
36072 case V8HI_FTYPE_HI_V8HI_UQI:
36073 case V8SF_FTYPE_V8HI_V8SF_UQI:
36074 case V4SF_FTYPE_V8HI_V4SF_UQI:
36075 case V8SI_FTYPE_V8SF_V8SI_UQI:
36076 case V4SI_FTYPE_V4SF_V4SI_UQI:
36077 case V4DI_FTYPE_V4SF_V4DI_UQI:
36078 case V2DI_FTYPE_V4SF_V2DI_UQI:
36079 case V4SF_FTYPE_V4DI_V4SF_UQI:
36080 case V4SF_FTYPE_V2DI_V4SF_UQI:
36081 case V4DF_FTYPE_V4DI_V4DF_UQI:
36082 case V2DF_FTYPE_V2DI_V2DF_UQI:
36083 case V16QI_FTYPE_V8HI_V16QI_UQI:
36084 case V16QI_FTYPE_V16HI_V16QI_UHI:
36085 case V16QI_FTYPE_V4SI_V16QI_UQI:
36086 case V16QI_FTYPE_V8SI_V16QI_UQI:
36087 case V8HI_FTYPE_V4SI_V8HI_UQI:
36088 case V8HI_FTYPE_V8SI_V8HI_UQI:
36089 case V16QI_FTYPE_V2DI_V16QI_UQI:
36090 case V16QI_FTYPE_V4DI_V16QI_UQI:
36091 case V8HI_FTYPE_V2DI_V8HI_UQI:
36092 case V8HI_FTYPE_V4DI_V8HI_UQI:
36093 case V4SI_FTYPE_V2DI_V4SI_UQI:
36094 case V4SI_FTYPE_V4DI_V4SI_UQI:
36095 case V32QI_FTYPE_V32HI_V32QI_USI:
36096 case UHI_FTYPE_V16QI_V16QI_UHI:
36097 case USI_FTYPE_V32QI_V32QI_USI:
36098 case UDI_FTYPE_V64QI_V64QI_UDI:
36099 case UQI_FTYPE_V8HI_V8HI_UQI:
36100 case UHI_FTYPE_V16HI_V16HI_UHI:
36101 case USI_FTYPE_V32HI_V32HI_USI:
36102 case UQI_FTYPE_V4SI_V4SI_UQI:
36103 case UQI_FTYPE_V8SI_V8SI_UQI:
36104 case UQI_FTYPE_V2DI_V2DI_UQI:
36105 case UQI_FTYPE_V4DI_V4DI_UQI:
36106 case V4SF_FTYPE_V2DF_V4SF_UQI:
36107 case V4SF_FTYPE_V4DF_V4SF_UQI:
36108 case V16SI_FTYPE_V16SI_V16SI_UHI:
36109 case V16SI_FTYPE_V4SI_V16SI_UHI:
36110 case V2DI_FTYPE_V4SI_V2DI_UQI:
36111 case V2DI_FTYPE_V8HI_V2DI_UQI:
36112 case V2DI_FTYPE_V16QI_V2DI_UQI:
36113 case V4DI_FTYPE_V4DI_V4DI_UQI:
36114 case V4DI_FTYPE_V4SI_V4DI_UQI:
36115 case V4DI_FTYPE_V8HI_V4DI_UQI:
36116 case V4DI_FTYPE_V16QI_V4DI_UQI:
36117 case V4DI_FTYPE_V4DF_V4DI_UQI:
36118 case V2DI_FTYPE_V2DF_V2DI_UQI:
36119 case V4SI_FTYPE_V4DF_V4SI_UQI:
36120 case V4SI_FTYPE_V2DF_V4SI_UQI:
36121 case V4SI_FTYPE_V8HI_V4SI_UQI:
36122 case V4SI_FTYPE_V16QI_V4SI_UQI:
36123 case V4DI_FTYPE_V4DI_V4DI_V4DI:
36124 case V8DF_FTYPE_V2DF_V8DF_UQI:
36125 case V8DF_FTYPE_V4DF_V8DF_UQI:
36126 case V8DF_FTYPE_V8DF_V8DF_UQI:
36127 case V8SF_FTYPE_V8SF_V8SF_UQI:
36128 case V8SF_FTYPE_V8SI_V8SF_UQI:
36129 case V4DF_FTYPE_V4DF_V4DF_UQI:
36130 case V4SF_FTYPE_V4SF_V4SF_UQI:
36131 case V2DF_FTYPE_V2DF_V2DF_UQI:
36132 case V2DF_FTYPE_V4SF_V2DF_UQI:
36133 case V2DF_FTYPE_V4SI_V2DF_UQI:
36134 case V4SF_FTYPE_V4SI_V4SF_UQI:
36135 case V4DF_FTYPE_V4SF_V4DF_UQI:
36136 case V4DF_FTYPE_V4SI_V4DF_UQI:
36137 case V8SI_FTYPE_V8SI_V8SI_UQI:
36138 case V8SI_FTYPE_V8HI_V8SI_UQI:
36139 case V8SI_FTYPE_V16QI_V8SI_UQI:
36140 case V8DF_FTYPE_V8SI_V8DF_UQI:
36141 case V8DI_FTYPE_DI_V8DI_UQI:
36142 case V16SF_FTYPE_V8SF_V16SF_UHI:
36143 case V16SI_FTYPE_V8SI_V16SI_UHI:
36144 case V16HI_FTYPE_V16HI_V16HI_UHI:
36145 case V8HI_FTYPE_V16QI_V8HI_UQI:
36146 case V16HI_FTYPE_V16QI_V16HI_UHI:
36147 case V32HI_FTYPE_V32HI_V32HI_USI:
36148 case V32HI_FTYPE_V32QI_V32HI_USI:
36149 case V8DI_FTYPE_V16QI_V8DI_UQI:
36150 case V8DI_FTYPE_V2DI_V8DI_UQI:
36151 case V8DI_FTYPE_V4DI_V8DI_UQI:
36152 case V8DI_FTYPE_V8DI_V8DI_UQI:
36153 case V8DI_FTYPE_V8HI_V8DI_UQI:
36154 case V8DI_FTYPE_V8SI_V8DI_UQI:
36155 case V8HI_FTYPE_V8DI_V8HI_UQI:
36156 case V8SI_FTYPE_V8DI_V8SI_UQI:
36157 case V4SI_FTYPE_V4SI_V4SI_V4SI:
36158 nargs = 3;
36159 break;
36160 case V32QI_FTYPE_V32QI_V32QI_INT:
36161 case V16HI_FTYPE_V16HI_V16HI_INT:
36162 case V16QI_FTYPE_V16QI_V16QI_INT:
36163 case V4DI_FTYPE_V4DI_V4DI_INT:
36164 case V8HI_FTYPE_V8HI_V8HI_INT:
36165 case V8SI_FTYPE_V8SI_V8SI_INT:
36166 case V8SI_FTYPE_V8SI_V4SI_INT:
36167 case V8SF_FTYPE_V8SF_V8SF_INT:
36168 case V8SF_FTYPE_V8SF_V4SF_INT:
36169 case V4SI_FTYPE_V4SI_V4SI_INT:
36170 case V4DF_FTYPE_V4DF_V4DF_INT:
36171 case V16SF_FTYPE_V16SF_V16SF_INT:
36172 case V16SF_FTYPE_V16SF_V4SF_INT:
36173 case V16SI_FTYPE_V16SI_V4SI_INT:
36174 case V4DF_FTYPE_V4DF_V2DF_INT:
36175 case V4SF_FTYPE_V4SF_V4SF_INT:
36176 case V2DI_FTYPE_V2DI_V2DI_INT:
36177 case V4DI_FTYPE_V4DI_V2DI_INT:
36178 case V2DF_FTYPE_V2DF_V2DF_INT:
36179 case UQI_FTYPE_V8DI_V8UDI_INT:
36180 case UQI_FTYPE_V8DF_V8DF_INT:
36181 case UQI_FTYPE_V2DF_V2DF_INT:
36182 case UQI_FTYPE_V4SF_V4SF_INT:
36183 case UHI_FTYPE_V16SI_V16SI_INT:
36184 case UHI_FTYPE_V16SF_V16SF_INT:
36185 nargs = 3;
36186 nargs_constant = 1;
36187 break;
36188 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
36189 nargs = 3;
36190 rmode = V4DImode;
36191 nargs_constant = 1;
36192 break;
36193 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
36194 nargs = 3;
36195 rmode = V2DImode;
36196 nargs_constant = 1;
36197 break;
36198 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
36199 nargs = 3;
36200 rmode = DImode;
36201 nargs_constant = 1;
36202 break;
36203 case V2DI_FTYPE_V2DI_UINT_UINT:
36204 nargs = 3;
36205 nargs_constant = 2;
36206 break;
36207 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
36208 nargs = 3;
36209 rmode = V8DImode;
36210 nargs_constant = 1;
36211 break;
36212 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
36213 nargs = 5;
36214 rmode = V8DImode;
36215 mask_pos = 2;
36216 nargs_constant = 1;
36217 break;
36218 case QI_FTYPE_V8DF_INT_UQI:
36219 case QI_FTYPE_V4DF_INT_UQI:
36220 case QI_FTYPE_V2DF_INT_UQI:
36221 case HI_FTYPE_V16SF_INT_UHI:
36222 case QI_FTYPE_V8SF_INT_UQI:
36223 case QI_FTYPE_V4SF_INT_UQI:
36224 nargs = 3;
36225 mask_pos = 1;
36226 nargs_constant = 1;
36227 break;
36228 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
36229 nargs = 5;
36230 rmode = V4DImode;
36231 mask_pos = 2;
36232 nargs_constant = 1;
36233 break;
36234 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
36235 nargs = 5;
36236 rmode = V2DImode;
36237 mask_pos = 2;
36238 nargs_constant = 1;
36239 break;
36240 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
36241 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
36242 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
36243 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
36244 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
36245 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
36246 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
36247 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
36248 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
36249 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
36250 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
36251 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
36252 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
36253 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
36254 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
36255 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
36256 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
36257 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
36258 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
36259 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
36260 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
36261 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
36262 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
36263 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
36264 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
36265 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
36266 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
36267 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
36268 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
36269 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
36270 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
36271 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
36272 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
36273 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
36274 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
36275 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
36276 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
36277 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
36278 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
36279 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
36280 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
36281 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
36282 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
36283 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
36284 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
36285 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
36286 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
36287 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
36288 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
36289 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
36290 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
36291 nargs = 4;
36292 break;
36293 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
36294 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
36295 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
36296 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
36297 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
36298 nargs = 4;
36299 nargs_constant = 1;
36300 break;
36301 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
36302 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
36303 case QI_FTYPE_V4DF_V4DF_INT_UQI:
36304 case QI_FTYPE_V8SF_V8SF_INT_UQI:
36305 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
36306 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
36307 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
36308 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
36309 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
36310 case USI_FTYPE_V32QI_V32QI_INT_USI:
36311 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
36312 case USI_FTYPE_V32HI_V32HI_INT_USI:
36313 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
36314 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
36315 nargs = 4;
36316 mask_pos = 1;
36317 nargs_constant = 1;
36318 break;
36319 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
36320 nargs = 4;
36321 nargs_constant = 2;
36322 break;
36323 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
36324 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
36325 nargs = 4;
36326 break;
36327 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
36328 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
36329 mask_pos = 1;
36330 nargs = 4;
36331 nargs_constant = 1;
36332 break;
36333 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
36334 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
36335 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
36336 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
36337 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
36338 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
36339 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
36340 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
36341 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
36342 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
36343 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
36344 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
36345 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
36346 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
36347 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
36348 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
36349 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
36350 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
36351 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
36352 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
36353 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
36354 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
36355 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
36356 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
36357 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
36358 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
36359 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
36360 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
36361 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
36362 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
36363 nargs = 4;
36364 mask_pos = 2;
36365 nargs_constant = 1;
36366 break;
36367 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
36368 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
36369 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
36370 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
36371 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
36372 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
36373 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
36374 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
36375 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
36376 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
36377 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
36378 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
36379 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
36380 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
36381 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
36382 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
36383 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
36384 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
36385 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
36386 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
36387 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
36388 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
36389 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
36390 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
36391 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
36392 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
36393 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
36394 nargs = 5;
36395 mask_pos = 2;
36396 nargs_constant = 1;
36397 break;
36398 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
36399 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
36400 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
36401 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
36402 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
36403 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
36404 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
36405 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
36406 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
36407 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
36408 nargs = 5;
36409 mask_pos = 1;
36410 nargs_constant = 1;
36411 break;
36413 default:
36414 gcc_unreachable ();
36417 gcc_assert (nargs <= ARRAY_SIZE (args));
36419 if (comparison != UNKNOWN)
36421 gcc_assert (nargs == 2);
36422 return ix86_expand_sse_compare (d, exp, target, swap);
36425 if (rmode == VOIDmode || rmode == tmode)
36427 if (optimize
36428 || target == 0
36429 || GET_MODE (target) != tmode
36430 || !insn_p->operand[0].predicate (target, tmode))
36431 target = gen_reg_rtx (tmode);
36432 else if (memory_operand (target, tmode))
36433 num_memory++;
36434 real_target = target;
36436 else
36438 real_target = gen_reg_rtx (tmode);
36439 target = lowpart_subreg (rmode, real_target, tmode);
36442 for (i = 0; i < nargs; i++)
36444 tree arg = CALL_EXPR_ARG (exp, i);
36445 rtx op = expand_normal (arg);
36446 machine_mode mode = insn_p->operand[i + 1].mode;
36447 bool match = insn_p->operand[i + 1].predicate (op, mode);
36449 if (second_arg_count && i == 1)
36451 /* SIMD shift insns take either an 8-bit immediate or a
36452 register as the count, but the builtin functions take an
36453 int.  If the count doesn't match the predicate, put it in
36454 a register.  The instructions use a 64-bit count; if op is
36455 only 32-bit, zero-extend it, since negative shift counts
36456 are undefined behavior and zero-extension is more
36457 efficient. */
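/* E.g. an int count passed to one of the *_COUNT builtins above is widened
   here with convert_modes (unsignedp set) rather than being left as a
   narrower subreg of the count operand.  */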
36458 if (!match)
36460 if (SCALAR_INT_MODE_P (GET_MODE (op)))
36461 op = convert_modes (mode, GET_MODE (op), op, 1);
36462 else
36463 op = lowpart_subreg (mode, op, GET_MODE (op));
36464 if (!insn_p->operand[i + 1].predicate (op, mode))
36465 op = copy_to_reg (op);
36468 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
36469 (!mask_pos && (nargs - i) <= nargs_constant))
36471 if (!match)
36472 switch (icode)
36474 case CODE_FOR_avx_vinsertf128v4di:
36475 case CODE_FOR_avx_vextractf128v4di:
36476 error ("the last argument must be an 1-bit immediate");
36477 return const0_rtx;
36479 case CODE_FOR_avx512f_cmpv8di3_mask:
36480 case CODE_FOR_avx512f_cmpv16si3_mask:
36481 case CODE_FOR_avx512f_ucmpv8di3_mask:
36482 case CODE_FOR_avx512f_ucmpv16si3_mask:
36483 case CODE_FOR_avx512vl_cmpv4di3_mask:
36484 case CODE_FOR_avx512vl_cmpv8si3_mask:
36485 case CODE_FOR_avx512vl_ucmpv4di3_mask:
36486 case CODE_FOR_avx512vl_ucmpv8si3_mask:
36487 case CODE_FOR_avx512vl_cmpv2di3_mask:
36488 case CODE_FOR_avx512vl_cmpv4si3_mask:
36489 case CODE_FOR_avx512vl_ucmpv2di3_mask:
36490 case CODE_FOR_avx512vl_ucmpv4si3_mask:
36491 error ("the last argument must be a 3-bit immediate");
36492 return const0_rtx;
36494 case CODE_FOR_sse4_1_roundsd:
36495 case CODE_FOR_sse4_1_roundss:
36497 case CODE_FOR_sse4_1_roundpd:
36498 case CODE_FOR_sse4_1_roundps:
36499 case CODE_FOR_avx_roundpd256:
36500 case CODE_FOR_avx_roundps256:
36502 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
36503 case CODE_FOR_sse4_1_roundps_sfix:
36504 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
36505 case CODE_FOR_avx_roundps_sfix256:
36507 case CODE_FOR_sse4_1_blendps:
36508 case CODE_FOR_avx_blendpd256:
36509 case CODE_FOR_avx_vpermilv4df:
36510 case CODE_FOR_avx_vpermilv4df_mask:
36511 case CODE_FOR_avx512f_getmantv8df_mask:
36512 case CODE_FOR_avx512f_getmantv16sf_mask:
36513 case CODE_FOR_avx512vl_getmantv8sf_mask:
36514 case CODE_FOR_avx512vl_getmantv4df_mask:
36515 case CODE_FOR_avx512vl_getmantv4sf_mask:
36516 case CODE_FOR_avx512vl_getmantv2df_mask:
36517 case CODE_FOR_avx512dq_rangepv8df_mask_round:
36518 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
36519 case CODE_FOR_avx512dq_rangepv4df_mask:
36520 case CODE_FOR_avx512dq_rangepv8sf_mask:
36521 case CODE_FOR_avx512dq_rangepv2df_mask:
36522 case CODE_FOR_avx512dq_rangepv4sf_mask:
36523 case CODE_FOR_avx_shufpd256_mask:
36524 error ("the last argument must be a 4-bit immediate");
36525 return const0_rtx;
36527 case CODE_FOR_sha1rnds4:
36528 case CODE_FOR_sse4_1_blendpd:
36529 case CODE_FOR_avx_vpermilv2df:
36530 case CODE_FOR_avx_vpermilv2df_mask:
36531 case CODE_FOR_xop_vpermil2v2df3:
36532 case CODE_FOR_xop_vpermil2v4sf3:
36533 case CODE_FOR_xop_vpermil2v4df3:
36534 case CODE_FOR_xop_vpermil2v8sf3:
36535 case CODE_FOR_avx512f_vinsertf32x4_mask:
36536 case CODE_FOR_avx512f_vinserti32x4_mask:
36537 case CODE_FOR_avx512f_vextractf32x4_mask:
36538 case CODE_FOR_avx512f_vextracti32x4_mask:
36539 case CODE_FOR_sse2_shufpd:
36540 case CODE_FOR_sse2_shufpd_mask:
36541 case CODE_FOR_avx512dq_shuf_f64x2_mask:
36542 case CODE_FOR_avx512dq_shuf_i64x2_mask:
36543 case CODE_FOR_avx512vl_shuf_i32x4_mask:
36544 case CODE_FOR_avx512vl_shuf_f32x4_mask:
36545 error ("the last argument must be a 2-bit immediate");
36546 return const0_rtx;
36548 case CODE_FOR_avx_vextractf128v4df:
36549 case CODE_FOR_avx_vextractf128v8sf:
36550 case CODE_FOR_avx_vextractf128v8si:
36551 case CODE_FOR_avx_vinsertf128v4df:
36552 case CODE_FOR_avx_vinsertf128v8sf:
36553 case CODE_FOR_avx_vinsertf128v8si:
36554 case CODE_FOR_avx512f_vinsertf64x4_mask:
36555 case CODE_FOR_avx512f_vinserti64x4_mask:
36556 case CODE_FOR_avx512f_vextractf64x4_mask:
36557 case CODE_FOR_avx512f_vextracti64x4_mask:
36558 case CODE_FOR_avx512dq_vinsertf32x8_mask:
36559 case CODE_FOR_avx512dq_vinserti32x8_mask:
36560 case CODE_FOR_avx512vl_vinsertv4df:
36561 case CODE_FOR_avx512vl_vinsertv4di:
36562 case CODE_FOR_avx512vl_vinsertv8sf:
36563 case CODE_FOR_avx512vl_vinsertv8si:
36564 error ("the last argument must be a 1-bit immediate");
36565 return const0_rtx;
36567 case CODE_FOR_avx_vmcmpv2df3:
36568 case CODE_FOR_avx_vmcmpv4sf3:
36569 case CODE_FOR_avx_cmpv2df3:
36570 case CODE_FOR_avx_cmpv4sf3:
36571 case CODE_FOR_avx_cmpv4df3:
36572 case CODE_FOR_avx_cmpv8sf3:
36573 case CODE_FOR_avx512f_cmpv8df3_mask:
36574 case CODE_FOR_avx512f_cmpv16sf3_mask:
36575 case CODE_FOR_avx512f_vmcmpv2df3_mask:
36576 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
36577 error ("the last argument must be a 5-bit immediate");
36578 return const0_rtx;
36580 default:
36581 switch (nargs_constant)
36583 case 2:
36584 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
36585 (!mask_pos && (nargs - i) == nargs_constant))
36587 error ("the next to last argument must be an 8-bit immediate");
36588 break;
36590 /* FALLTHRU */
36591 case 1:
36592 error ("the last argument must be an 8-bit immediate");
36593 break;
36594 default:
36595 gcc_unreachable ();
36597 return const0_rtx;
36600 else
36602 if (VECTOR_MODE_P (mode))
36603 op = safe_vector_operand (op, mode);
36605 /* If we aren't optimizing, only allow one memory operand to
36606 be generated. */
36607 if (memory_operand (op, mode))
36608 num_memory++;
36610 op = fixup_modeless_constant (op, mode);
36612 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36614 if (optimize || !match || num_memory > 1)
36615 op = copy_to_mode_reg (mode, op);
36617 else
36619 op = copy_to_reg (op);
36620 op = lowpart_subreg (mode, op, GET_MODE (op));
36624 args[i].op = op;
36625 args[i].mode = mode;
36628 switch (nargs)
36630 case 1:
36631 pat = GEN_FCN (icode) (real_target, args[0].op);
36632 break;
36633 case 2:
36634 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
36635 break;
36636 case 3:
36637 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36638 args[2].op);
36639 break;
36640 case 4:
36641 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36642 args[2].op, args[3].op);
36643 break;
36644 case 5:
36645 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36646 args[2].op, args[3].op, args[4].op);
36647 break;
36648 case 6:
36649 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36650 args[2].op, args[3].op, args[4].op,
36651 args[5].op);
36652 break;
36653 default:
36654 gcc_unreachable ();
36657 if (! pat)
36658 return 0;
36660 emit_insn (pat);
36661 return target;
36664 /* Transform a pattern of the following layout:
36665 (set A
36666 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
36668 into:
36669 (set A B) */
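/* This is used when the caller passed the "no rounding" value to an
   embedded-rounding pattern: the UNSPEC wrapper is dropped and the plain,
   non-rounding form of the SET is emitted instead.  */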
36671 static rtx
36672 ix86_erase_embedded_rounding (rtx pat)
36674 if (GET_CODE (pat) == INSN)
36675 pat = PATTERN (pat);
36677 gcc_assert (GET_CODE (pat) == SET);
36678 rtx src = SET_SRC (pat);
36679 gcc_assert (XVECLEN (src, 0) == 2);
36680 rtx p0 = XVECEXP (src, 0, 0);
36681 gcc_assert (GET_CODE (src) == UNSPEC
36682 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
36683 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
36684 return res;
36687 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
36688 with rounding. */
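/* The third argument is the comparison-predicate immediate (the values
   documented in avxintrin.h, used to index comi_comparisons below) and the
   fourth is the rounding/SAE operand.  Predicates that must not signal on
   quiet NaNs are rerouted to the ucomi patterns via need_ucomi.  */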
36689 static rtx
36690 ix86_expand_sse_comi_round (const struct builtin_description *d,
36691 tree exp, rtx target)
36693 rtx pat, set_dst;
36694 tree arg0 = CALL_EXPR_ARG (exp, 0);
36695 tree arg1 = CALL_EXPR_ARG (exp, 1);
36696 tree arg2 = CALL_EXPR_ARG (exp, 2);
36697 tree arg3 = CALL_EXPR_ARG (exp, 3);
36698 rtx op0 = expand_normal (arg0);
36699 rtx op1 = expand_normal (arg1);
36700 rtx op2 = expand_normal (arg2);
36701 rtx op3 = expand_normal (arg3);
36702 enum insn_code icode = d->icode;
36703 const struct insn_data_d *insn_p = &insn_data[icode];
36704 machine_mode mode0 = insn_p->operand[0].mode;
36705 machine_mode mode1 = insn_p->operand[1].mode;
36706 enum rtx_code comparison = UNEQ;
36707 bool need_ucomi = false;
36709 /* See avxintrin.h for values. */
36710 enum rtx_code comi_comparisons[32] =
36712 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
36713 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
36714 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
36716 bool need_ucomi_values[32] =
36718 true, false, false, true, true, false, false, true,
36719 true, false, false, true, true, false, false, true,
36720 false, true, true, false, false, true, true, false,
36721 false, true, true, false, false, true, true, false
36724 if (!CONST_INT_P (op2))
36726 error ("the third argument must be comparison constant");
36727 return const0_rtx;
36729 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
36731 error ("incorrect comparison mode");
36732 return const0_rtx;
36735 if (!insn_p->operand[2].predicate (op3, SImode))
36737 error ("incorrect rounding operand");
36738 return const0_rtx;
36741 comparison = comi_comparisons[INTVAL (op2)];
36742 need_ucomi = need_ucomi_values[INTVAL (op2)];
36744 if (VECTOR_MODE_P (mode0))
36745 op0 = safe_vector_operand (op0, mode0);
36746 if (VECTOR_MODE_P (mode1))
36747 op1 = safe_vector_operand (op1, mode1);
36749 target = gen_reg_rtx (SImode);
36750 emit_move_insn (target, const0_rtx);
36751 target = gen_rtx_SUBREG (QImode, target, 0);
36753 if ((optimize && !register_operand (op0, mode0))
36754 || !insn_p->operand[0].predicate (op0, mode0))
36755 op0 = copy_to_mode_reg (mode0, op0);
36756 if ((optimize && !register_operand (op1, mode1))
36757 || !insn_p->operand[1].predicate (op1, mode1))
36758 op1 = copy_to_mode_reg (mode1, op1);
36760 if (need_ucomi)
36761 icode = icode == CODE_FOR_sse_comi_round
36762 ? CODE_FOR_sse_ucomi_round
36763 : CODE_FOR_sse2_ucomi_round;
36765 pat = GEN_FCN (icode) (op0, op1, op3);
36766 if (! pat)
36767 return 0;
36769 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
36770 if (INTVAL (op3) == NO_ROUND)
36772 pat = ix86_erase_embedded_rounding (pat);
36773 if (! pat)
36774 return 0;
36776 set_dst = SET_DEST (pat);
36778 else
36780 gcc_assert (GET_CODE (pat) == SET);
36781 set_dst = SET_DEST (pat);
36784 emit_insn (pat);
36785 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
36786 gen_rtx_fmt_ee (comparison, QImode,
36787 set_dst,
36788 const0_rtx)));
36790 return SUBREG_REG (target);
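/* Subroutine of ix86_expand_builtin for builtins whose last operand is an
   explicit rounding mode (the function types below whose trailing INT is
   the rounding operand).  It mirrors ix86_expand_args_builtin, except that
   the rounding operand is validated and, when it is NO_ROUND, the
   embedded-rounding wrapper is stripped again before emitting the insn.  */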
36793 static rtx
36794 ix86_expand_round_builtin (const struct builtin_description *d,
36795 tree exp, rtx target)
36797 rtx pat;
36798 unsigned int i, nargs;
36799 struct
36801 rtx op;
36802 machine_mode mode;
36803 } args[6];
36804 enum insn_code icode = d->icode;
36805 const struct insn_data_d *insn_p = &insn_data[icode];
36806 machine_mode tmode = insn_p->operand[0].mode;
36807 unsigned int nargs_constant = 0;
36808 unsigned int redundant_embed_rnd = 0;
36810 switch ((enum ix86_builtin_func_type) d->flag)
36812 case UINT64_FTYPE_V2DF_INT:
36813 case UINT64_FTYPE_V4SF_INT:
36814 case UINT_FTYPE_V2DF_INT:
36815 case UINT_FTYPE_V4SF_INT:
36816 case INT64_FTYPE_V2DF_INT:
36817 case INT64_FTYPE_V4SF_INT:
36818 case INT_FTYPE_V2DF_INT:
36819 case INT_FTYPE_V4SF_INT:
36820 nargs = 2;
36821 break;
36822 case V4SF_FTYPE_V4SF_UINT_INT:
36823 case V4SF_FTYPE_V4SF_UINT64_INT:
36824 case V2DF_FTYPE_V2DF_UINT64_INT:
36825 case V4SF_FTYPE_V4SF_INT_INT:
36826 case V4SF_FTYPE_V4SF_INT64_INT:
36827 case V2DF_FTYPE_V2DF_INT64_INT:
36828 case V4SF_FTYPE_V4SF_V4SF_INT:
36829 case V2DF_FTYPE_V2DF_V2DF_INT:
36830 case V4SF_FTYPE_V4SF_V2DF_INT:
36831 case V2DF_FTYPE_V2DF_V4SF_INT:
36832 nargs = 3;
36833 break;
36834 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
36835 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
36836 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
36837 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
36838 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
36839 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
36840 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
36841 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
36842 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
36843 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
36844 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
36845 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
36846 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
36847 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
36848 nargs = 4;
36849 break;
36850 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
36851 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
36852 nargs_constant = 2;
36853 nargs = 4;
36854 break;
36855 case INT_FTYPE_V4SF_V4SF_INT_INT:
36856 case INT_FTYPE_V2DF_V2DF_INT_INT:
36857 return ix86_expand_sse_comi_round (d, exp, target);
36858 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
36859 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
36860 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
36861 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
36862 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
36863 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
36864 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
36865 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
36866 nargs = 5;
36867 break;
36868 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
36869 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
36870 nargs_constant = 4;
36871 nargs = 5;
36872 break;
36873 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
36874 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
36875 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
36876 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
36877 nargs_constant = 3;
36878 nargs = 5;
36879 break;
36880 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
36881 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
36882 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
36883 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
36884 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
36885 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
36886 nargs = 6;
36887 nargs_constant = 4;
36888 break;
36889 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
36890 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
36891 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
36892 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
36893 nargs = 6;
36894 nargs_constant = 3;
36895 break;
36896 default:
36897 gcc_unreachable ();
36899 gcc_assert (nargs <= ARRAY_SIZE (args));
36901 if (optimize
36902 || target == 0
36903 || GET_MODE (target) != tmode
36904 || !insn_p->operand[0].predicate (target, tmode))
36905 target = gen_reg_rtx (tmode);
36907 for (i = 0; i < nargs; i++)
36909 tree arg = CALL_EXPR_ARG (exp, i);
36910 rtx op = expand_normal (arg);
36911 machine_mode mode = insn_p->operand[i + 1].mode;
36912 bool match = insn_p->operand[i + 1].predicate (op, mode);
36914 if (i == nargs - nargs_constant)
36916 if (!match)
36918 switch (icode)
36920 case CODE_FOR_avx512f_getmantv8df_mask_round:
36921 case CODE_FOR_avx512f_getmantv16sf_mask_round:
36922 case CODE_FOR_avx512f_vgetmantv2df_round:
36923 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
36924 case CODE_FOR_avx512f_vgetmantv4sf_round:
36925 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
36926 error ("the immediate argument must be a 4-bit immediate");
36927 return const0_rtx;
36928 case CODE_FOR_avx512f_cmpv8df3_mask_round:
36929 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
36930 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
36931 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
36932 error ("the immediate argument must be a 5-bit immediate");
36933 return const0_rtx;
36934 default:
36935 error ("the immediate argument must be an 8-bit immediate");
36936 return const0_rtx;
36940 else if (i == nargs-1)
36942 if (!insn_p->operand[nargs].predicate (op, SImode))
36944 error ("incorrect rounding operand");
36945 return const0_rtx;
36948 /* If there is no rounding, use the normal version of the pattern. */
36949 if (INTVAL (op) == NO_ROUND)
36950 redundant_embed_rnd = 1;
36952 else
36954 if (VECTOR_MODE_P (mode))
36955 op = safe_vector_operand (op, mode);
36957 op = fixup_modeless_constant (op, mode);
36959 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36961 if (optimize || !match)
36962 op = copy_to_mode_reg (mode, op);
36964 else
36966 op = copy_to_reg (op);
36967 op = lowpart_subreg (mode, op, GET_MODE (op));
36971 args[i].op = op;
36972 args[i].mode = mode;
36975 switch (nargs)
36977 case 1:
36978 pat = GEN_FCN (icode) (target, args[0].op);
36979 break;
36980 case 2:
36981 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
36982 break;
36983 case 3:
36984 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36985 args[2].op);
36986 break;
36987 case 4:
36988 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36989 args[2].op, args[3].op);
36990 break;
36991 case 5:
36992 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36993 args[2].op, args[3].op, args[4].op);
36994 break;
36995 case 6:
36996 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36997 args[2].op, args[3].op, args[4].op,
36998 args[5].op);
36999 break;
37000 default:
37001 gcc_unreachable ();
37004 if (!pat)
37005 return 0;
37007 if (redundant_embed_rnd)
37008 pat = ix86_erase_embedded_rounding (pat);
37010 emit_insn (pat);
37011 return target;
37014 /* Subroutine of ix86_expand_builtin to take care of special insns
37015 with variable number of operands. */
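/* These are the builtins that do not fit the pure register-to-register
   scheme above: loads, stores, nontemporal moves and similar.  The switch
   classifies each function type as a load or a store (klass), records
   which expanded argument (if any) is the memory operand, and marks the
   insns that require strictly aligned memory (aligned_mem).  */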
37017 static rtx
37018 ix86_expand_special_args_builtin (const struct builtin_description *d,
37019 tree exp, rtx target)
37021 tree arg;
37022 rtx pat, op;
37023 unsigned int i, nargs, arg_adjust, memory;
37024 bool aligned_mem = false;
37025 struct
37027 rtx op;
37028 machine_mode mode;
37029 } args[3];
37030 enum insn_code icode = d->icode;
37031 bool last_arg_constant = false;
37032 const struct insn_data_d *insn_p = &insn_data[icode];
37033 machine_mode tmode = insn_p->operand[0].mode;
37034 enum { load, store } klass;
37036 switch ((enum ix86_builtin_func_type) d->flag)
37038 case VOID_FTYPE_VOID:
37039 emit_insn (GEN_FCN (icode) (target));
37040 return 0;
37041 case VOID_FTYPE_UINT64:
37042 case VOID_FTYPE_UNSIGNED:
37043 nargs = 0;
37044 klass = store;
37045 memory = 0;
37046 break;
37048 case INT_FTYPE_VOID:
37049 case USHORT_FTYPE_VOID:
37050 case UINT64_FTYPE_VOID:
37051 case UNSIGNED_FTYPE_VOID:
37052 nargs = 0;
37053 klass = load;
37054 memory = 0;
37055 break;
37056 case UINT64_FTYPE_PUNSIGNED:
37057 case V2DI_FTYPE_PV2DI:
37058 case V4DI_FTYPE_PV4DI:
37059 case V32QI_FTYPE_PCCHAR:
37060 case V16QI_FTYPE_PCCHAR:
37061 case V8SF_FTYPE_PCV4SF:
37062 case V8SF_FTYPE_PCFLOAT:
37063 case V4SF_FTYPE_PCFLOAT:
37064 case V4DF_FTYPE_PCV2DF:
37065 case V4DF_FTYPE_PCDOUBLE:
37066 case V2DF_FTYPE_PCDOUBLE:
37067 case VOID_FTYPE_PVOID:
37068 case V8DI_FTYPE_PV8DI:
37069 nargs = 1;
37070 klass = load;
37071 memory = 0;
37072 switch (icode)
37074 case CODE_FOR_sse4_1_movntdqa:
37075 case CODE_FOR_avx2_movntdqa:
37076 case CODE_FOR_avx512f_movntdqa:
37077 aligned_mem = true;
37078 break;
37079 default:
37080 break;
37082 break;
37083 case VOID_FTYPE_PV2SF_V4SF:
37084 case VOID_FTYPE_PV8DI_V8DI:
37085 case VOID_FTYPE_PV4DI_V4DI:
37086 case VOID_FTYPE_PV2DI_V2DI:
37087 case VOID_FTYPE_PCHAR_V32QI:
37088 case VOID_FTYPE_PCHAR_V16QI:
37089 case VOID_FTYPE_PFLOAT_V16SF:
37090 case VOID_FTYPE_PFLOAT_V8SF:
37091 case VOID_FTYPE_PFLOAT_V4SF:
37092 case VOID_FTYPE_PDOUBLE_V8DF:
37093 case VOID_FTYPE_PDOUBLE_V4DF:
37094 case VOID_FTYPE_PDOUBLE_V2DF:
37095 case VOID_FTYPE_PLONGLONG_LONGLONG:
37096 case VOID_FTYPE_PULONGLONG_ULONGLONG:
37097 case VOID_FTYPE_PINT_INT:
37098 nargs = 1;
37099 klass = store;
37100 /* Reserve memory operand for target. */
37101 memory = ARRAY_SIZE (args);
37102 switch (icode)
37104 /* These builtins and instructions require the memory
37105 to be properly aligned. */
37106 case CODE_FOR_avx_movntv4di:
37107 case CODE_FOR_sse2_movntv2di:
37108 case CODE_FOR_avx_movntv8sf:
37109 case CODE_FOR_sse_movntv4sf:
37110 case CODE_FOR_sse4a_vmmovntv4sf:
37111 case CODE_FOR_avx_movntv4df:
37112 case CODE_FOR_sse2_movntv2df:
37113 case CODE_FOR_sse4a_vmmovntv2df:
37114 case CODE_FOR_sse2_movntidi:
37115 case CODE_FOR_sse_movntq:
37116 case CODE_FOR_sse2_movntisi:
37117 case CODE_FOR_avx512f_movntv16sf:
37118 case CODE_FOR_avx512f_movntv8df:
37119 case CODE_FOR_avx512f_movntv8di:
37120 aligned_mem = true;
37121 break;
37122 default:
37123 break;
37125 break;
37126 case V4SF_FTYPE_V4SF_PCV2SF:
37127 case V2DF_FTYPE_V2DF_PCDOUBLE:
37128 nargs = 2;
37129 klass = load;
37130 memory = 1;
37131 break;
37132 case V8SF_FTYPE_PCV8SF_V8SI:
37133 case V4DF_FTYPE_PCV4DF_V4DI:
37134 case V4SF_FTYPE_PCV4SF_V4SI:
37135 case V2DF_FTYPE_PCV2DF_V2DI:
37136 case V8SI_FTYPE_PCV8SI_V8SI:
37137 case V4DI_FTYPE_PCV4DI_V4DI:
37138 case V4SI_FTYPE_PCV4SI_V4SI:
37139 case V2DI_FTYPE_PCV2DI_V2DI:
37140 case VOID_FTYPE_INT_INT64:
37141 nargs = 2;
37142 klass = load;
37143 memory = 0;
37144 break;
37145 case VOID_FTYPE_PV8DF_V8DF_UQI:
37146 case VOID_FTYPE_PV4DF_V4DF_UQI:
37147 case VOID_FTYPE_PV2DF_V2DF_UQI:
37148 case VOID_FTYPE_PV16SF_V16SF_UHI:
37149 case VOID_FTYPE_PV8SF_V8SF_UQI:
37150 case VOID_FTYPE_PV4SF_V4SF_UQI:
37151 case VOID_FTYPE_PV8DI_V8DI_UQI:
37152 case VOID_FTYPE_PV4DI_V4DI_UQI:
37153 case VOID_FTYPE_PV2DI_V2DI_UQI:
37154 case VOID_FTYPE_PV16SI_V16SI_UHI:
37155 case VOID_FTYPE_PV8SI_V8SI_UQI:
37156 case VOID_FTYPE_PV4SI_V4SI_UQI:
37157 switch (icode)
37159 /* These builtins and instructions require the memory
37160 to be properly aligned. */
37161 case CODE_FOR_avx512f_storev16sf_mask:
37162 case CODE_FOR_avx512f_storev16si_mask:
37163 case CODE_FOR_avx512f_storev8df_mask:
37164 case CODE_FOR_avx512f_storev8di_mask:
37165 case CODE_FOR_avx512vl_storev8sf_mask:
37166 case CODE_FOR_avx512vl_storev8si_mask:
37167 case CODE_FOR_avx512vl_storev4df_mask:
37168 case CODE_FOR_avx512vl_storev4di_mask:
37169 case CODE_FOR_avx512vl_storev4sf_mask:
37170 case CODE_FOR_avx512vl_storev4si_mask:
37171 case CODE_FOR_avx512vl_storev2df_mask:
37172 case CODE_FOR_avx512vl_storev2di_mask:
37173 aligned_mem = true;
37174 break;
37175 default:
37176 break;
37178 /* FALLTHRU */
37179 case VOID_FTYPE_PV8SF_V8SI_V8SF:
37180 case VOID_FTYPE_PV4DF_V4DI_V4DF:
37181 case VOID_FTYPE_PV4SF_V4SI_V4SF:
37182 case VOID_FTYPE_PV2DF_V2DI_V2DF:
37183 case VOID_FTYPE_PV8SI_V8SI_V8SI:
37184 case VOID_FTYPE_PV4DI_V4DI_V4DI:
37185 case VOID_FTYPE_PV4SI_V4SI_V4SI:
37186 case VOID_FTYPE_PV2DI_V2DI_V2DI:
37187 case VOID_FTYPE_PV8SI_V8DI_UQI:
37188 case VOID_FTYPE_PV8HI_V8DI_UQI:
37189 case VOID_FTYPE_PV16HI_V16SI_UHI:
37190 case VOID_FTYPE_PV16QI_V8DI_UQI:
37191 case VOID_FTYPE_PV16QI_V16SI_UHI:
37192 case VOID_FTYPE_PV4SI_V4DI_UQI:
37193 case VOID_FTYPE_PV4SI_V2DI_UQI:
37194 case VOID_FTYPE_PV8HI_V4DI_UQI:
37195 case VOID_FTYPE_PV8HI_V2DI_UQI:
37196 case VOID_FTYPE_PV8HI_V8SI_UQI:
37197 case VOID_FTYPE_PV8HI_V4SI_UQI:
37198 case VOID_FTYPE_PV16QI_V4DI_UQI:
37199 case VOID_FTYPE_PV16QI_V2DI_UQI:
37200 case VOID_FTYPE_PV16QI_V8SI_UQI:
37201 case VOID_FTYPE_PV16QI_V4SI_UQI:
37202 case VOID_FTYPE_PCHAR_V64QI_UDI:
37203 case VOID_FTYPE_PCHAR_V32QI_USI:
37204 case VOID_FTYPE_PCHAR_V16QI_UHI:
37205 case VOID_FTYPE_PSHORT_V32HI_USI:
37206 case VOID_FTYPE_PSHORT_V16HI_UHI:
37207 case VOID_FTYPE_PSHORT_V8HI_UQI:
37208 case VOID_FTYPE_PINT_V16SI_UHI:
37209 case VOID_FTYPE_PINT_V8SI_UQI:
37210 case VOID_FTYPE_PINT_V4SI_UQI:
37211 case VOID_FTYPE_PINT64_V8DI_UQI:
37212 case VOID_FTYPE_PINT64_V4DI_UQI:
37213 case VOID_FTYPE_PINT64_V2DI_UQI:
37214 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
37215 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
37216 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
37217 case VOID_FTYPE_PFLOAT_V16SF_UHI:
37218 case VOID_FTYPE_PFLOAT_V8SF_UQI:
37219 case VOID_FTYPE_PFLOAT_V4SF_UQI:
37220 case VOID_FTYPE_PV32QI_V32HI_USI:
37221 case VOID_FTYPE_PV16QI_V16HI_UHI:
37222 case VOID_FTYPE_PV8QI_V8HI_UQI:
37223 nargs = 2;
37224 klass = store;
37225 /* Reserve memory operand for target. */
37226 memory = ARRAY_SIZE (args);
37227 break;
37228 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
37229 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
37230 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
37231 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
37232 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
37233 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
37234 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
37235 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
37236 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
37237 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
37238 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
37239 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
37240 switch (icode)
37242 /* These builtins and instructions require the memory
37243 to be properly aligned. */
37244 case CODE_FOR_avx512f_loadv16sf_mask:
37245 case CODE_FOR_avx512f_loadv16si_mask:
37246 case CODE_FOR_avx512f_loadv8df_mask:
37247 case CODE_FOR_avx512f_loadv8di_mask:
37248 case CODE_FOR_avx512vl_loadv8sf_mask:
37249 case CODE_FOR_avx512vl_loadv8si_mask:
37250 case CODE_FOR_avx512vl_loadv4df_mask:
37251 case CODE_FOR_avx512vl_loadv4di_mask:
37252 case CODE_FOR_avx512vl_loadv4sf_mask:
37253 case CODE_FOR_avx512vl_loadv4si_mask:
37254 case CODE_FOR_avx512vl_loadv2df_mask:
37255 case CODE_FOR_avx512vl_loadv2di_mask:
37256 case CODE_FOR_avx512bw_loadv64qi_mask:
37257 case CODE_FOR_avx512vl_loadv32qi_mask:
37258 case CODE_FOR_avx512vl_loadv16qi_mask:
37259 case CODE_FOR_avx512bw_loadv32hi_mask:
37260 case CODE_FOR_avx512vl_loadv16hi_mask:
37261 case CODE_FOR_avx512vl_loadv8hi_mask:
37262 aligned_mem = true;
37263 break;
37264 default:
37265 break;
37267 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
37268 case V32QI_FTYPE_PCCHAR_V32QI_USI:
37269 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
37270 case V32HI_FTYPE_PCSHORT_V32HI_USI:
37271 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
37272 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
37273 case V16SI_FTYPE_PCINT_V16SI_UHI:
37274 case V8SI_FTYPE_PCINT_V8SI_UQI:
37275 case V4SI_FTYPE_PCINT_V4SI_UQI:
37276 case V8DI_FTYPE_PCINT64_V8DI_UQI:
37277 case V4DI_FTYPE_PCINT64_V4DI_UQI:
37278 case V2DI_FTYPE_PCINT64_V2DI_UQI:
37279 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
37280 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
37281 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
37282 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
37283 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
37284 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
37285 nargs = 3;
37286 klass = load;
37287 memory = 0;
37288 break;
37289 case VOID_FTYPE_UINT_UINT_UINT:
37290 case VOID_FTYPE_UINT64_UINT_UINT:
37291 case UCHAR_FTYPE_UINT_UINT_UINT:
37292 case UCHAR_FTYPE_UINT64_UINT_UINT:
37293 nargs = 3;
37294 klass = load;
37295 memory = ARRAY_SIZE (args);
37296 last_arg_constant = true;
37297 break;
37298 default:
37299 gcc_unreachable ();
37302 gcc_assert (nargs <= ARRAY_SIZE (args));
37304 if (klass == store)
37306 arg = CALL_EXPR_ARG (exp, 0);
37307 op = expand_normal (arg);
37308 gcc_assert (target == 0);
37309 if (memory)
37311 op = ix86_zero_extend_to_Pmode (op);
37312 target = gen_rtx_MEM (tmode, op);
37313 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
37314 on it. Try to improve it using get_pointer_alignment,
37315 and if the special builtin is one that requires strict
37316 mode alignment, also from its GET_MODE_ALIGNMENT.
37317 Failure to do so could lead to ix86_legitimate_combined_insn
37318 rejecting all changes to such insns. */
37319 unsigned int align = get_pointer_alignment (arg);
37320 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
37321 align = GET_MODE_ALIGNMENT (tmode);
37322 if (MEM_ALIGN (target) < align)
37323 set_mem_align (target, align);
37325 else
37326 target = force_reg (tmode, op);
37327 arg_adjust = 1;
37329 else
37331 arg_adjust = 0;
37332 if (optimize
37333 || target == 0
37334 || !register_operand (target, tmode)
37335 || GET_MODE (target) != tmode)
37336 target = gen_reg_rtx (tmode);
37339 for (i = 0; i < nargs; i++)
37341 machine_mode mode = insn_p->operand[i + 1].mode;
37342 bool match;
37344 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
37345 op = expand_normal (arg);
37346 match = insn_p->operand[i + 1].predicate (op, mode);
37348 if (last_arg_constant && (i + 1) == nargs)
37350 if (!match)
37352 if (icode == CODE_FOR_lwp_lwpvalsi3
37353 || icode == CODE_FOR_lwp_lwpinssi3
37354 || icode == CODE_FOR_lwp_lwpvaldi3
37355 || icode == CODE_FOR_lwp_lwpinsdi3)
37356 error ("the last argument must be a 32-bit immediate");
37357 else
37358 error ("the last argument must be an 8-bit immediate");
37359 return const0_rtx;
37362 else
37364 if (i == memory)
37366 /* This must be the memory operand. */
37367 op = ix86_zero_extend_to_Pmode (op);
37368 op = gen_rtx_MEM (mode, op);
37369 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
37370 on it. Try to improve it using get_pointer_alignment,
37371 and if the special builtin is one that requires strict
37372 mode alignment, also from its GET_MODE_ALIGNMENT.
37373 Failure to do so could lead to ix86_legitimate_combined_insn
37374 rejecting all changes to such insns. */
37375 unsigned int align = get_pointer_alignment (arg);
37376 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
37377 align = GET_MODE_ALIGNMENT (mode);
37378 if (MEM_ALIGN (op) < align)
37379 set_mem_align (op, align);
37381 else
37383 /* This must be a register operand. */
37384 if (VECTOR_MODE_P (mode))
37385 op = safe_vector_operand (op, mode);
37387 op = fixup_modeless_constant (op, mode);
37389 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
37390 op = copy_to_mode_reg (mode, op);
37391 else
37393 op = copy_to_reg (op);
37394 op = lowpart_subreg (mode, op, GET_MODE (op));
37399 args[i].op = op;
37400 args[i].mode = mode;
37403 switch (nargs)
37405 case 0:
37406 pat = GEN_FCN (icode) (target);
37407 break;
37408 case 1:
37409 pat = GEN_FCN (icode) (target, args[0].op);
37410 break;
37411 case 2:
37412 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
37413 break;
37414 case 3:
37415 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
37416 break;
37417 default:
37418 gcc_unreachable ();
37421 if (! pat)
37422 return 0;
37423 emit_insn (pat);
37424 return klass == store ? 0 : target;
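/* A rough usage sketch for the masked-load class handled above,
   assuming the usual avx512fintrin.h intrinsic spellings:

     #include <immintrin.h>

     __m512
     masked_load (const float *p, __m512 old, __mmask16 k)
     {
       return _mm512_mask_load_ps (old, k, p);
     }

   The aligned form reaches a case whose icode
   (CODE_FOR_avx512f_loadv16sf_mask) is listed in the aligned_mem
   switch, so the source MEM has its MEM_ALIGN raised to
   GET_MODE_ALIGNMENT (V16SFmode); the unaligned
   _mm512_mask_loadu_ps variant takes the same path but keeps
   whatever alignment get_pointer_alignment could prove.  */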
37427 /* Return the integer constant in ARG. Constrain it to be in the range
37428 of the subparts of VEC_TYPE; issue an error if not. */
37430 static int
37431 get_element_number (tree vec_type, tree arg)
37433 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
37435 if (!tree_fits_uhwi_p (arg)
37436 || (elt = tree_to_uhwi (arg), elt > max))
37438 error ("selector must be an integer constant in the range 0..%wi", max);
37439 return 0;
37442 return elt;
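/* A small illustration of the check above, assuming the V4SF
   variant of the extract builtin: TYPE_VECTOR_SUBPARTS is 4, so MAX
   is 3 and any other selector is diagnosed and folded to element 0.

     __v4sf v = { 1.0f, 2.0f, 3.0f, 4.0f };
     float ok  = __builtin_ia32_vec_ext_v4sf (v, 2);   // element 2
     float bad = __builtin_ia32_vec_ext_v4sf (v, 7);   // error; expands as element 0
*/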
37445 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
37446 ix86_expand_vector_init. We DO have language-level syntax for this, in
37447 the form of (type){ init-list }. Except that since we can't place emms
37448 instructions from inside the compiler, we can't allow the use of MMX
37449 registers unless the user explicitly asks for it. So we do *not* define
37450 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
37451 we have builtins invoked by mmintrin.h that give us license to emit
37452 these sorts of instructions. */
37454 static rtx
37455 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
37457 machine_mode tmode = TYPE_MODE (type);
37458 machine_mode inner_mode = GET_MODE_INNER (tmode);
37459 int i, n_elt = GET_MODE_NUNITS (tmode);
37460 rtvec v = rtvec_alloc (n_elt);
37462 gcc_assert (VECTOR_MODE_P (tmode));
37463 gcc_assert (call_expr_nargs (exp) == n_elt);
37465 for (i = 0; i < n_elt; ++i)
37467 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
37468 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
37471 if (!target || !register_operand (target, tmode))
37472 target = gen_reg_rtx (tmode);
37474 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
37475 return target;
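/* A minimal sketch of how these wrappers are reached, assuming the
   mmintrin.h definition of _mm_set_pi16 in terms of
   __builtin_ia32_vec_init_v4hi:

     __m64
     make_v4hi (short a, short b, short c, short d)
     {
       return _mm_set_pi16 (a, b, c, d);
     }

   Each call argument is expanded, converted to the inner HImode via
   gen_lowpart, and the resulting PARALLEL is handed to
   ix86_expand_vector_init, so mmx.md needs no vec_init pattern.  */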
37478 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
37479 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
37480 had a language-level syntax for referencing vector elements. */
37482 static rtx
37483 ix86_expand_vec_ext_builtin (tree exp, rtx target)
37485 machine_mode tmode, mode0;
37486 tree arg0, arg1;
37487 int elt;
37488 rtx op0;
37490 arg0 = CALL_EXPR_ARG (exp, 0);
37491 arg1 = CALL_EXPR_ARG (exp, 1);
37493 op0 = expand_normal (arg0);
37494 elt = get_element_number (TREE_TYPE (arg0), arg1);
37496 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
37497 mode0 = TYPE_MODE (TREE_TYPE (arg0));
37498 gcc_assert (VECTOR_MODE_P (mode0));
37500 op0 = force_reg (mode0, op0);
37502 if (optimize || !target || !register_operand (target, tmode))
37503 target = gen_reg_rtx (tmode);
37505 ix86_expand_vector_extract (true, target, op0, elt);
37507 return target;
37510 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
37511 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
37512 a language-level syntax for referencing vector elements. */
37514 static rtx
37515 ix86_expand_vec_set_builtin (tree exp)
37517 machine_mode tmode, mode1;
37518 tree arg0, arg1, arg2;
37519 int elt;
37520 rtx op0, op1, target;
37522 arg0 = CALL_EXPR_ARG (exp, 0);
37523 arg1 = CALL_EXPR_ARG (exp, 1);
37524 arg2 = CALL_EXPR_ARG (exp, 2);
37526 tmode = TYPE_MODE (TREE_TYPE (arg0));
37527 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
37528 gcc_assert (VECTOR_MODE_P (tmode));
37530 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
37531 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
37532 elt = get_element_number (TREE_TYPE (arg0), arg2);
37534 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
37535 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
37537 op0 = force_reg (tmode, op0);
37538 op1 = force_reg (mode1, op1);
37540 /* OP0 is the source of these builtin functions and shouldn't be
37541 modified. Create a copy, use it and return it as target. */
37542 target = gen_reg_rtx (tmode);
37543 emit_move_insn (target, op0);
37544 ix86_expand_vector_set (true, target, op1, elt);
37546 return target;
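/* A short usage sketch, assuming the xmmintrin.h definition of
   _mm_insert_pi16 in terms of __builtin_ia32_vec_set_v4hi:

     __m64
     set_lane (__m64 v, int x)
     {
       return _mm_insert_pi16 (v, x, 2);   // replace element 2
     }

   The original vector operand is first copied into a fresh register
   (see the emit_move_insn above), so the caller's value is never
   modified in place.  */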
37549 /* Emit conditional move of SRC to DST with condition
37550 OP1 CODE OP2. */
37551 static void
37552 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
37554 rtx t;
37556 if (TARGET_CMOVE)
37558 t = ix86_expand_compare (code, op1, op2);
37559 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
37560 src, dst)));
37562 else
37564 rtx_code_label *nomove = gen_label_rtx ();
37565 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
37566 const0_rtx, GET_MODE (op1), 1, nomove);
37567 emit_move_insn (dst, src);
37568 emit_label (nomove);
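/* Roughly, ix86_emit_cmove (dst, src, LTU, a, b) corresponds to
   (AT&T syntax, registers chosen arbitrarily):

     TARGET_CMOVE:             !TARGET_CMOVE:
       cmp   %b, %a              cmp   %b, %a
       cmovb %src, %dst          jae   .Lnomove
                                 mov   %src, %dst
                               .Lnomove:

   i.e. the fallback branches around the move on the reversed
   condition instead of predicating the move itself.  */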
37572 /* Choose the max of DST and SRC and store it in DST. */
37573 static void
37574 ix86_emit_move_max (rtx dst, rtx src)
37576 ix86_emit_cmove (dst, src, LTU, dst, src);
37579 /* Expand an expression EXP that calls a built-in function,
37580 with result going to TARGET if that's convenient
37581 (and in mode MODE if that's convenient).
37582 SUBTARGET may be used as the target for computing one of EXP's operands.
37583 IGNORE is nonzero if the value is to be ignored. */
37585 static rtx
37586 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
37587 machine_mode mode, int ignore)
37589 size_t i;
37590 enum insn_code icode;
37591 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
37592 tree arg0, arg1, arg2, arg3, arg4;
37593 rtx op0, op1, op2, op3, op4, pat, insn;
37594 machine_mode mode0, mode1, mode2, mode3, mode4;
37595 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
37597 /* For CPU builtins that can be folded, fold first and expand the fold. */
37598 switch (fcode)
37600 case IX86_BUILTIN_CPU_INIT:
37602 /* Make it call __cpu_indicator_init in libgcc. */
37603 tree call_expr, fndecl, type;
37604 type = build_function_type_list (integer_type_node, NULL_TREE);
37605 fndecl = build_fn_decl ("__cpu_indicator_init", type);
37606 call_expr = build_call_expr (fndecl, 0);
37607 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
37609 case IX86_BUILTIN_CPU_IS:
37610 case IX86_BUILTIN_CPU_SUPPORTS:
37612 tree arg0 = CALL_EXPR_ARG (exp, 0);
37613 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
37614 gcc_assert (fold_expr != NULL_TREE);
37615 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
37619 /* Determine whether the builtin function is available under the current ISA.
37620 Originally the builtin was not created if it wasn't applicable to the
37621 current ISA based on the command line switches. With function specific
37622 options, we need to check in the context of the function making the call
37623 whether it is supported. Treat AVX512VL specially. For other flags,
37624 if isa includes more than one ISA bit, treat those are requiring any
37625 of them. For AVX512VL, require both AVX512VL and the non-AVX512VL
37626 ISAs. Similarly for 64BIT, but we shouldn't be building such builtins
37627 at all, -m64 is a whole TU option. */
37628 if (((ix86_builtins_isa[fcode].isa
37629 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_64BIT))
37630 && !(ix86_builtins_isa[fcode].isa
37631 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_64BIT)
37632 & ix86_isa_flags))
37633 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_AVX512VL)
37634 && !(ix86_isa_flags & OPTION_MASK_ISA_AVX512VL))
37635 || (ix86_builtins_isa[fcode].isa2
37636 && !(ix86_builtins_isa[fcode].isa2 & ix86_isa_flags2)))
37638 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa,
37639 ix86_builtins_isa[fcode].isa2, 0, 0,
37640 NULL, NULL, (enum fpmath_unit) 0,
37641 false);
37642 if (!opts)
37643 error ("%qE needs unknown isa option", fndecl);
37644 else
37646 gcc_assert (opts != NULL);
37647 error ("%qE needs isa option %s", fndecl, opts);
37648 free (opts);
37650 return expand_call (exp, target, ignore);
37653 switch (fcode)
37655 case IX86_BUILTIN_BNDMK:
37656 if (!target
37657 || GET_MODE (target) != BNDmode
37658 || !register_operand (target, BNDmode))
37659 target = gen_reg_rtx (BNDmode);
37661 arg0 = CALL_EXPR_ARG (exp, 0);
37662 arg1 = CALL_EXPR_ARG (exp, 1);
37664 op0 = expand_normal (arg0);
37665 op1 = expand_normal (arg1);
37667 if (!register_operand (op0, Pmode))
37668 op0 = ix86_zero_extend_to_Pmode (op0);
37669 if (!register_operand (op1, Pmode))
37670 op1 = ix86_zero_extend_to_Pmode (op1);
37672 /* Builtin arg1 is the size of the block, but instruction op1 should
37673 be (size - 1). */
37674 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
37675 NULL_RTX, 1, OPTAB_DIRECT);
37677 emit_insn (BNDmode == BND64mode
37678 ? gen_bnd64_mk (target, op0, op1)
37679 : gen_bnd32_mk (target, op0, op1));
37680 return target;
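/* The (size - 1) adjustment matters because MPX bounds are
   inclusive: for a block of SIZE bytes at P the upper bound is the
   address of the last byte.  A sketch with illustrative numbers:

     p = 0x1000, size = 0x10  ->  op1 = 0xf, bounds cover [0x1000, 0x100f]

   Adding constm1_rtx (all-ones) is simply SIZE - 1 computed as a
   PLUS, which expand_simple_binop can always emit directly.  */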
37682 case IX86_BUILTIN_BNDSTX:
37683 arg0 = CALL_EXPR_ARG (exp, 0);
37684 arg1 = CALL_EXPR_ARG (exp, 1);
37685 arg2 = CALL_EXPR_ARG (exp, 2);
37687 op0 = expand_normal (arg0);
37688 op1 = expand_normal (arg1);
37689 op2 = expand_normal (arg2);
37691 if (!register_operand (op0, Pmode))
37692 op0 = ix86_zero_extend_to_Pmode (op0);
37693 if (!register_operand (op1, BNDmode))
37694 op1 = copy_to_mode_reg (BNDmode, op1);
37695 if (!register_operand (op2, Pmode))
37696 op2 = ix86_zero_extend_to_Pmode (op2);
37698 emit_insn (BNDmode == BND64mode
37699 ? gen_bnd64_stx (op2, op0, op1)
37700 : gen_bnd32_stx (op2, op0, op1));
37701 return 0;
37703 case IX86_BUILTIN_BNDLDX:
37704 if (!target
37705 || GET_MODE (target) != BNDmode
37706 || !register_operand (target, BNDmode))
37707 target = gen_reg_rtx (BNDmode);
37709 arg0 = CALL_EXPR_ARG (exp, 0);
37710 arg1 = CALL_EXPR_ARG (exp, 1);
37712 op0 = expand_normal (arg0);
37713 op1 = expand_normal (arg1);
37715 if (!register_operand (op0, Pmode))
37716 op0 = ix86_zero_extend_to_Pmode (op0);
37717 if (!register_operand (op1, Pmode))
37718 op1 = ix86_zero_extend_to_Pmode (op1);
37720 emit_insn (BNDmode == BND64mode
37721 ? gen_bnd64_ldx (target, op0, op1)
37722 : gen_bnd32_ldx (target, op0, op1));
37723 return target;
37725 case IX86_BUILTIN_BNDCL:
37726 arg0 = CALL_EXPR_ARG (exp, 0);
37727 arg1 = CALL_EXPR_ARG (exp, 1);
37729 op0 = expand_normal (arg0);
37730 op1 = expand_normal (arg1);
37732 if (!register_operand (op0, Pmode))
37733 op0 = ix86_zero_extend_to_Pmode (op0);
37734 if (!register_operand (op1, BNDmode))
37735 op1 = copy_to_mode_reg (BNDmode, op1);
37737 emit_insn (BNDmode == BND64mode
37738 ? gen_bnd64_cl (op1, op0)
37739 : gen_bnd32_cl (op1, op0));
37740 return 0;
37742 case IX86_BUILTIN_BNDCU:
37743 arg0 = CALL_EXPR_ARG (exp, 0);
37744 arg1 = CALL_EXPR_ARG (exp, 1);
37746 op0 = expand_normal (arg0);
37747 op1 = expand_normal (arg1);
37749 if (!register_operand (op0, Pmode))
37750 op0 = ix86_zero_extend_to_Pmode (op0);
37751 if (!register_operand (op1, BNDmode))
37752 op1 = copy_to_mode_reg (BNDmode, op1);
37754 emit_insn (BNDmode == BND64mode
37755 ? gen_bnd64_cu (op1, op0)
37756 : gen_bnd32_cu (op1, op0));
37757 return 0;
37759 case IX86_BUILTIN_BNDRET:
37760 arg0 = CALL_EXPR_ARG (exp, 0);
37761 target = chkp_get_rtl_bounds (arg0);
37763 /* If no bounds were specified for the returned value,
37764 then use INIT bounds. This usually happens when
37765 some built-in function is expanded. */
37766 if (!target)
37768 rtx t1 = gen_reg_rtx (Pmode);
37769 rtx t2 = gen_reg_rtx (Pmode);
37770 target = gen_reg_rtx (BNDmode);
37771 emit_move_insn (t1, const0_rtx);
37772 emit_move_insn (t2, constm1_rtx);
37773 emit_insn (BNDmode == BND64mode
37774 ? gen_bnd64_mk (target, t1, t2)
37775 : gen_bnd32_mk (target, t1, t2));
37778 gcc_assert (target && REG_P (target));
37779 return target;
37781 case IX86_BUILTIN_BNDNARROW:
37783 rtx m1, m1h1, m1h2, lb, ub, t1;
37785 /* Return value and lb. */
37786 arg0 = CALL_EXPR_ARG (exp, 0);
37787 /* Bounds. */
37788 arg1 = CALL_EXPR_ARG (exp, 1);
37789 /* Size. */
37790 arg2 = CALL_EXPR_ARG (exp, 2);
37792 lb = expand_normal (arg0);
37793 op1 = expand_normal (arg1);
37794 op2 = expand_normal (arg2);
37796 /* Size was passed but we need to use (size - 1) as for bndmk. */
37797 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
37798 NULL_RTX, 1, OPTAB_DIRECT);
37800 /* Add LB to size and invert the result to get UB. */
37801 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
37802 op2, 1, OPTAB_DIRECT);
37803 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
37805 if (!register_operand (lb, Pmode))
37806 lb = ix86_zero_extend_to_Pmode (lb);
37807 if (!register_operand (ub, Pmode))
37808 ub = ix86_zero_extend_to_Pmode (ub);
37810 /* We need to move bounds to memory before any computations. */
37811 if (MEM_P (op1))
37812 m1 = op1;
37813 else
37815 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
37816 emit_move_insn (m1, op1);
37819 /* Generate mem expression to be used for access to LB and UB. */
37820 m1h1 = adjust_address (m1, Pmode, 0);
37821 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
37823 t1 = gen_reg_rtx (Pmode);
37825 /* Compute LB. */
37826 emit_move_insn (t1, m1h1);
37827 ix86_emit_move_max (t1, lb);
37828 emit_move_insn (m1h1, t1);
37830 /* Compute UB. UB is stored in 1's complement form. Therefore
37831 we also use max here. */
37832 emit_move_insn (t1, m1h2);
37833 ix86_emit_move_max (t1, ub);
37834 emit_move_insn (m1h2, t1);
37836 op2 = gen_reg_rtx (BNDmode);
37837 emit_move_insn (op2, m1);
37839 return chkp_join_splitted_slot (lb, op2);
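/* A worked example of the narrowing above, with illustrative
   numbers only.  Suppose the incoming bounds are [0x1000, 0x1fff],
   lb = 0x1400 and size = 0x100:

     new ub     = 0x1400 + 0x100 - 1        = 0x14ff
     stored LB  = max (0x1000, 0x1400)      = 0x1400
     stored ~UB = max (~0x1fff, ~0x14ff)    = ~0x14ff

   Because the upper bound is kept in one's complement, taking the
   unsigned maximum of the complemented values selects the smaller
   upper bound, so both halves of the narrowing reduce to the same
   primitive (ix86_emit_move_max).  */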
37842 case IX86_BUILTIN_BNDINT:
37844 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
37846 if (!target
37847 || GET_MODE (target) != BNDmode
37848 || !register_operand (target, BNDmode))
37849 target = gen_reg_rtx (BNDmode);
37851 arg0 = CALL_EXPR_ARG (exp, 0);
37852 arg1 = CALL_EXPR_ARG (exp, 1);
37854 op0 = expand_normal (arg0);
37855 op1 = expand_normal (arg1);
37857 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
37858 rh1 = adjust_address (res, Pmode, 0);
37859 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
37861 /* Put the first bounds into temporaries. */
37862 lb1 = gen_reg_rtx (Pmode);
37863 ub1 = gen_reg_rtx (Pmode);
37864 if (MEM_P (op0))
37866 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
37867 emit_move_insn (ub1, adjust_address (op0, Pmode,
37868 GET_MODE_SIZE (Pmode)));
37870 else
37872 emit_move_insn (res, op0);
37873 emit_move_insn (lb1, rh1);
37874 emit_move_insn (ub1, rh2);
37877 /* Put the second bounds into temporaries. */
37878 lb2 = gen_reg_rtx (Pmode);
37879 ub2 = gen_reg_rtx (Pmode);
37880 if (MEM_P (op1))
37882 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
37883 emit_move_insn (ub2, adjust_address (op1, Pmode,
37884 GET_MODE_SIZE (Pmode)));
37886 else
37888 emit_move_insn (res, op1);
37889 emit_move_insn (lb2, rh1);
37890 emit_move_insn (ub2, rh2);
37893 /* Compute LB. */
37894 ix86_emit_move_max (lb1, lb2);
37895 emit_move_insn (rh1, lb1);
37897 /* Compute UB. UB is stored in 1's complement form. Therefore
37898 we also use max here. */
37899 ix86_emit_move_max (ub1, ub2);
37900 emit_move_insn (rh2, ub1);
37902 emit_move_insn (target, res);
37904 return target;
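/* Same trick as for BNDNARROW: the intersection of [lb1, ub1] and
   [lb2, ub2] is [max (lb1, lb2), min (ub1, ub2)], and with the
   upper bounds stored in one's complement the min becomes
   max (~ub1, ~ub2), so two ix86_emit_move_max calls compute the
   whole intersection.  */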
37907 case IX86_BUILTIN_SIZEOF:
37909 tree name;
37910 rtx symbol;
37912 if (!target
37913 || GET_MODE (target) != Pmode
37914 || !register_operand (target, Pmode))
37915 target = gen_reg_rtx (Pmode);
37917 arg0 = CALL_EXPR_ARG (exp, 0);
37918 gcc_assert (VAR_P (arg0));
37920 name = DECL_ASSEMBLER_NAME (arg0);
37921 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
37923 emit_insn (Pmode == SImode
37924 ? gen_move_size_reloc_si (target, symbol)
37925 : gen_move_size_reloc_di (target, symbol));
37927 return target;
37930 case IX86_BUILTIN_BNDLOWER:
37932 rtx mem, hmem;
37934 if (!target
37935 || GET_MODE (target) != Pmode
37936 || !register_operand (target, Pmode))
37937 target = gen_reg_rtx (Pmode);
37939 arg0 = CALL_EXPR_ARG (exp, 0);
37940 op0 = expand_normal (arg0);
37942 /* We need to move bounds to memory first. */
37943 if (MEM_P (op0))
37944 mem = op0;
37945 else
37947 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
37948 emit_move_insn (mem, op0);
37951 /* Generate mem expression to access LB and load it. */
37952 hmem = adjust_address (mem, Pmode, 0);
37953 emit_move_insn (target, hmem);
37955 return target;
37958 case IX86_BUILTIN_BNDUPPER:
37960 rtx mem, hmem, res;
37962 if (!target
37963 || GET_MODE (target) != Pmode
37964 || !register_operand (target, Pmode))
37965 target = gen_reg_rtx (Pmode);
37967 arg0 = CALL_EXPR_ARG (exp, 0);
37968 op0 = expand_normal (arg0);
37970 /* We need to move bounds to memory first. */
37971 if (MEM_P (op0))
37972 mem = op0;
37973 else
37975 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
37976 emit_move_insn (mem, op0);
37979 /* Generate mem expression to access UB. */
37980 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
37982 /* We need to invert all bits of UB. */
37983 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
37985 if (res != target)
37986 emit_move_insn (target, res);
37988 return target;
37991 case IX86_BUILTIN_MASKMOVQ:
37992 case IX86_BUILTIN_MASKMOVDQU:
37993 icode = (fcode == IX86_BUILTIN_MASKMOVQ
37994 ? CODE_FOR_mmx_maskmovq
37995 : CODE_FOR_sse2_maskmovdqu);
37996 /* Note the arg order is different from the operand order. */
37997 arg1 = CALL_EXPR_ARG (exp, 0);
37998 arg2 = CALL_EXPR_ARG (exp, 1);
37999 arg0 = CALL_EXPR_ARG (exp, 2);
38000 op0 = expand_normal (arg0);
38001 op1 = expand_normal (arg1);
38002 op2 = expand_normal (arg2);
38003 mode0 = insn_data[icode].operand[0].mode;
38004 mode1 = insn_data[icode].operand[1].mode;
38005 mode2 = insn_data[icode].operand[2].mode;
38007 op0 = ix86_zero_extend_to_Pmode (op0);
38008 op0 = gen_rtx_MEM (mode1, op0);
38010 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38011 op0 = copy_to_mode_reg (mode0, op0);
38012 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38013 op1 = copy_to_mode_reg (mode1, op1);
38014 if (!insn_data[icode].operand[2].predicate (op2, mode2))
38015 op2 = copy_to_mode_reg (mode2, op2);
38016 pat = GEN_FCN (icode) (op0, op1, op2);
38017 if (! pat)
38018 return 0;
38019 emit_insn (pat);
38020 return 0;
38022 case IX86_BUILTIN_LDMXCSR:
38023 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
38024 target = assign_386_stack_local (SImode, SLOT_TEMP);
38025 emit_move_insn (target, op0);
38026 emit_insn (gen_sse_ldmxcsr (target));
38027 return 0;
38029 case IX86_BUILTIN_STMXCSR:
38030 target = assign_386_stack_local (SImode, SLOT_TEMP);
38031 emit_insn (gen_sse_stmxcsr (target));
38032 return copy_to_mode_reg (SImode, target);
38034 case IX86_BUILTIN_CLFLUSH:
38035 arg0 = CALL_EXPR_ARG (exp, 0);
38036 op0 = expand_normal (arg0);
38037 icode = CODE_FOR_sse2_clflush;
38038 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38039 op0 = ix86_zero_extend_to_Pmode (op0);
38041 emit_insn (gen_sse2_clflush (op0));
38042 return 0;
38044 case IX86_BUILTIN_CLWB:
38045 arg0 = CALL_EXPR_ARG (exp, 0);
38046 op0 = expand_normal (arg0);
38047 icode = CODE_FOR_clwb;
38048 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38049 op0 = ix86_zero_extend_to_Pmode (op0);
38051 emit_insn (gen_clwb (op0));
38052 return 0;
38054 case IX86_BUILTIN_CLFLUSHOPT:
38055 arg0 = CALL_EXPR_ARG (exp, 0);
38056 op0 = expand_normal (arg0);
38057 icode = CODE_FOR_clflushopt;
38058 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38059 op0 = ix86_zero_extend_to_Pmode (op0);
38061 emit_insn (gen_clflushopt (op0));
38062 return 0;
38064 case IX86_BUILTIN_MONITOR:
38065 case IX86_BUILTIN_MONITORX:
38066 arg0 = CALL_EXPR_ARG (exp, 0);
38067 arg1 = CALL_EXPR_ARG (exp, 1);
38068 arg2 = CALL_EXPR_ARG (exp, 2);
38069 op0 = expand_normal (arg0);
38070 op1 = expand_normal (arg1);
38071 op2 = expand_normal (arg2);
38072 if (!REG_P (op0))
38073 op0 = ix86_zero_extend_to_Pmode (op0);
38074 if (!REG_P (op1))
38075 op1 = copy_to_mode_reg (SImode, op1);
38076 if (!REG_P (op2))
38077 op2 = copy_to_mode_reg (SImode, op2);
38079 emit_insn (fcode == IX86_BUILTIN_MONITOR
38080 ? ix86_gen_monitor (op0, op1, op2)
38081 : ix86_gen_monitorx (op0, op1, op2));
38082 return 0;
38084 case IX86_BUILTIN_MWAIT:
38085 arg0 = CALL_EXPR_ARG (exp, 0);
38086 arg1 = CALL_EXPR_ARG (exp, 1);
38087 op0 = expand_normal (arg0);
38088 op1 = expand_normal (arg1);
38089 if (!REG_P (op0))
38090 op0 = copy_to_mode_reg (SImode, op0);
38091 if (!REG_P (op1))
38092 op1 = copy_to_mode_reg (SImode, op1);
38093 emit_insn (gen_sse3_mwait (op0, op1));
38094 return 0;
38096 case IX86_BUILTIN_MWAITX:
38097 arg0 = CALL_EXPR_ARG (exp, 0);
38098 arg1 = CALL_EXPR_ARG (exp, 1);
38099 arg2 = CALL_EXPR_ARG (exp, 2);
38100 op0 = expand_normal (arg0);
38101 op1 = expand_normal (arg1);
38102 op2 = expand_normal (arg2);
38103 if (!REG_P (op0))
38104 op0 = copy_to_mode_reg (SImode, op0);
38105 if (!REG_P (op1))
38106 op1 = copy_to_mode_reg (SImode, op1);
38107 if (!REG_P (op2))
38108 op2 = copy_to_mode_reg (SImode, op2);
38109 emit_insn (gen_mwaitx (op0, op1, op2));
38110 return 0;
38112 case IX86_BUILTIN_CLZERO:
38113 arg0 = CALL_EXPR_ARG (exp, 0);
38114 op0 = expand_normal (arg0);
38115 if (!REG_P (op0))
38116 op0 = ix86_zero_extend_to_Pmode (op0);
38117 emit_insn (ix86_gen_clzero (op0));
38118 return 0;
38120 case IX86_BUILTIN_VEC_INIT_V2SI:
38121 case IX86_BUILTIN_VEC_INIT_V4HI:
38122 case IX86_BUILTIN_VEC_INIT_V8QI:
38123 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
38125 case IX86_BUILTIN_VEC_EXT_V2DF:
38126 case IX86_BUILTIN_VEC_EXT_V2DI:
38127 case IX86_BUILTIN_VEC_EXT_V4SF:
38128 case IX86_BUILTIN_VEC_EXT_V4SI:
38129 case IX86_BUILTIN_VEC_EXT_V8HI:
38130 case IX86_BUILTIN_VEC_EXT_V2SI:
38131 case IX86_BUILTIN_VEC_EXT_V4HI:
38132 case IX86_BUILTIN_VEC_EXT_V16QI:
38133 return ix86_expand_vec_ext_builtin (exp, target);
38135 case IX86_BUILTIN_VEC_SET_V2DI:
38136 case IX86_BUILTIN_VEC_SET_V4SF:
38137 case IX86_BUILTIN_VEC_SET_V4SI:
38138 case IX86_BUILTIN_VEC_SET_V8HI:
38139 case IX86_BUILTIN_VEC_SET_V4HI:
38140 case IX86_BUILTIN_VEC_SET_V16QI:
38141 return ix86_expand_vec_set_builtin (exp);
38143 case IX86_BUILTIN_NANQ:
38144 case IX86_BUILTIN_NANSQ:
38145 return expand_call (exp, target, ignore);
38147 case IX86_BUILTIN_RDPMC:
38148 case IX86_BUILTIN_RDTSC:
38149 case IX86_BUILTIN_RDTSCP:
38150 case IX86_BUILTIN_XGETBV:
38152 op0 = gen_reg_rtx (DImode);
38153 op1 = gen_reg_rtx (DImode);
38155 if (fcode == IX86_BUILTIN_RDPMC)
38157 arg0 = CALL_EXPR_ARG (exp, 0);
38158 op2 = expand_normal (arg0);
38159 if (!register_operand (op2, SImode))
38160 op2 = copy_to_mode_reg (SImode, op2);
38162 insn = (TARGET_64BIT
38163 ? gen_rdpmc_rex64 (op0, op1, op2)
38164 : gen_rdpmc (op0, op2));
38165 emit_insn (insn);
38167 else if (fcode == IX86_BUILTIN_XGETBV)
38169 arg0 = CALL_EXPR_ARG (exp, 0);
38170 op2 = expand_normal (arg0);
38171 if (!register_operand (op2, SImode))
38172 op2 = copy_to_mode_reg (SImode, op2);
38174 insn = (TARGET_64BIT
38175 ? gen_xgetbv_rex64 (op0, op1, op2)
38176 : gen_xgetbv (op0, op2));
38177 emit_insn (insn);
38179 else if (fcode == IX86_BUILTIN_RDTSC)
38181 insn = (TARGET_64BIT
38182 ? gen_rdtsc_rex64 (op0, op1)
38183 : gen_rdtsc (op0));
38184 emit_insn (insn);
38186 else
38188 op2 = gen_reg_rtx (SImode);
38190 insn = (TARGET_64BIT
38191 ? gen_rdtscp_rex64 (op0, op1, op2)
38192 : gen_rdtscp (op0, op2));
38193 emit_insn (insn);
38195 arg0 = CALL_EXPR_ARG (exp, 0);
38196 op4 = expand_normal (arg0);
38197 if (!address_operand (op4, VOIDmode))
38199 op4 = convert_memory_address (Pmode, op4);
38200 op4 = copy_addr_to_reg (op4);
38202 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
38205 if (target == 0)
38207 /* mode is VOIDmode if __builtin_rd* has been called
38208 without an lhs. */
38209 if (mode == VOIDmode)
38210 return target;
38211 target = gen_reg_rtx (mode);
38214 if (TARGET_64BIT)
38216 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
38217 op1, 1, OPTAB_DIRECT);
38218 op0 = expand_simple_binop (DImode, IOR, op0, op1,
38219 op0, 1, OPTAB_DIRECT);
38222 emit_move_insn (target, op0);
38223 return target;
38225 case IX86_BUILTIN_FXSAVE:
38226 case IX86_BUILTIN_FXRSTOR:
38227 case IX86_BUILTIN_FXSAVE64:
38228 case IX86_BUILTIN_FXRSTOR64:
38229 case IX86_BUILTIN_FNSTENV:
38230 case IX86_BUILTIN_FLDENV:
38231 mode0 = BLKmode;
38232 switch (fcode)
38234 case IX86_BUILTIN_FXSAVE:
38235 icode = CODE_FOR_fxsave;
38236 break;
38237 case IX86_BUILTIN_FXRSTOR:
38238 icode = CODE_FOR_fxrstor;
38239 break;
38240 case IX86_BUILTIN_FXSAVE64:
38241 icode = CODE_FOR_fxsave64;
38242 break;
38243 case IX86_BUILTIN_FXRSTOR64:
38244 icode = CODE_FOR_fxrstor64;
38245 break;
38246 case IX86_BUILTIN_FNSTENV:
38247 icode = CODE_FOR_fnstenv;
38248 break;
38249 case IX86_BUILTIN_FLDENV:
38250 icode = CODE_FOR_fldenv;
38251 break;
38252 default:
38253 gcc_unreachable ();
38256 arg0 = CALL_EXPR_ARG (exp, 0);
38257 op0 = expand_normal (arg0);
38259 if (!address_operand (op0, VOIDmode))
38261 op0 = convert_memory_address (Pmode, op0);
38262 op0 = copy_addr_to_reg (op0);
38264 op0 = gen_rtx_MEM (mode0, op0);
38266 pat = GEN_FCN (icode) (op0);
38267 if (pat)
38268 emit_insn (pat);
38269 return 0;
38271 case IX86_BUILTIN_XSETBV:
38272 arg0 = CALL_EXPR_ARG (exp, 0);
38273 arg1 = CALL_EXPR_ARG (exp, 1);
38274 op0 = expand_normal (arg0);
38275 op1 = expand_normal (arg1);
38277 if (!REG_P (op0))
38278 op0 = copy_to_mode_reg (SImode, op0);
38280 if (TARGET_64BIT)
38282 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
38283 NULL, 1, OPTAB_DIRECT);
38285 op2 = gen_lowpart (SImode, op2);
38286 op1 = gen_lowpart (SImode, op1);
38287 if (!REG_P (op1))
38288 op1 = copy_to_mode_reg (SImode, op1);
38289 if (!REG_P (op2))
38290 op2 = copy_to_mode_reg (SImode, op2);
38291 icode = CODE_FOR_xsetbv_rex64;
38292 pat = GEN_FCN (icode) (op0, op1, op2);
38294 else
38296 if (!REG_P (op1))
38297 op1 = copy_to_mode_reg (DImode, op1);
38298 icode = CODE_FOR_xsetbv;
38299 pat = GEN_FCN (icode) (op0, op1);
38301 if (pat)
38302 emit_insn (pat);
38303 return 0;
38305 case IX86_BUILTIN_XSAVE:
38306 case IX86_BUILTIN_XRSTOR:
38307 case IX86_BUILTIN_XSAVE64:
38308 case IX86_BUILTIN_XRSTOR64:
38309 case IX86_BUILTIN_XSAVEOPT:
38310 case IX86_BUILTIN_XSAVEOPT64:
38311 case IX86_BUILTIN_XSAVES:
38312 case IX86_BUILTIN_XRSTORS:
38313 case IX86_BUILTIN_XSAVES64:
38314 case IX86_BUILTIN_XRSTORS64:
38315 case IX86_BUILTIN_XSAVEC:
38316 case IX86_BUILTIN_XSAVEC64:
38317 arg0 = CALL_EXPR_ARG (exp, 0);
38318 arg1 = CALL_EXPR_ARG (exp, 1);
38319 op0 = expand_normal (arg0);
38320 op1 = expand_normal (arg1);
38322 if (!address_operand (op0, VOIDmode))
38324 op0 = convert_memory_address (Pmode, op0);
38325 op0 = copy_addr_to_reg (op0);
38327 op0 = gen_rtx_MEM (BLKmode, op0);
38329 op1 = force_reg (DImode, op1);
38331 if (TARGET_64BIT)
38333 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
38334 NULL, 1, OPTAB_DIRECT);
38335 switch (fcode)
38337 case IX86_BUILTIN_XSAVE:
38338 icode = CODE_FOR_xsave_rex64;
38339 break;
38340 case IX86_BUILTIN_XRSTOR:
38341 icode = CODE_FOR_xrstor_rex64;
38342 break;
38343 case IX86_BUILTIN_XSAVE64:
38344 icode = CODE_FOR_xsave64;
38345 break;
38346 case IX86_BUILTIN_XRSTOR64:
38347 icode = CODE_FOR_xrstor64;
38348 break;
38349 case IX86_BUILTIN_XSAVEOPT:
38350 icode = CODE_FOR_xsaveopt_rex64;
38351 break;
38352 case IX86_BUILTIN_XSAVEOPT64:
38353 icode = CODE_FOR_xsaveopt64;
38354 break;
38355 case IX86_BUILTIN_XSAVES:
38356 icode = CODE_FOR_xsaves_rex64;
38357 break;
38358 case IX86_BUILTIN_XRSTORS:
38359 icode = CODE_FOR_xrstors_rex64;
38360 break;
38361 case IX86_BUILTIN_XSAVES64:
38362 icode = CODE_FOR_xsaves64;
38363 break;
38364 case IX86_BUILTIN_XRSTORS64:
38365 icode = CODE_FOR_xrstors64;
38366 break;
38367 case IX86_BUILTIN_XSAVEC:
38368 icode = CODE_FOR_xsavec_rex64;
38369 break;
38370 case IX86_BUILTIN_XSAVEC64:
38371 icode = CODE_FOR_xsavec64;
38372 break;
38373 default:
38374 gcc_unreachable ();
38377 op2 = gen_lowpart (SImode, op2);
38378 op1 = gen_lowpart (SImode, op1);
38379 pat = GEN_FCN (icode) (op0, op1, op2);
38381 else
38383 switch (fcode)
38385 case IX86_BUILTIN_XSAVE:
38386 icode = CODE_FOR_xsave;
38387 break;
38388 case IX86_BUILTIN_XRSTOR:
38389 icode = CODE_FOR_xrstor;
38390 break;
38391 case IX86_BUILTIN_XSAVEOPT:
38392 icode = CODE_FOR_xsaveopt;
38393 break;
38394 case IX86_BUILTIN_XSAVES:
38395 icode = CODE_FOR_xsaves;
38396 break;
38397 case IX86_BUILTIN_XRSTORS:
38398 icode = CODE_FOR_xrstors;
38399 break;
38400 case IX86_BUILTIN_XSAVEC:
38401 icode = CODE_FOR_xsavec;
38402 break;
38403 default:
38404 gcc_unreachable ();
38406 pat = GEN_FCN (icode) (op0, op1);
38409 if (pat)
38410 emit_insn (pat);
38411 return 0;
38413 case IX86_BUILTIN_LLWPCB:
38414 arg0 = CALL_EXPR_ARG (exp, 0);
38415 op0 = expand_normal (arg0);
38416 icode = CODE_FOR_lwp_llwpcb;
38417 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38418 op0 = ix86_zero_extend_to_Pmode (op0);
38419 emit_insn (gen_lwp_llwpcb (op0));
38420 return 0;
38422 case IX86_BUILTIN_SLWPCB:
38423 icode = CODE_FOR_lwp_slwpcb;
38424 if (!target
38425 || !insn_data[icode].operand[0].predicate (target, Pmode))
38426 target = gen_reg_rtx (Pmode);
38427 emit_insn (gen_lwp_slwpcb (target));
38428 return target;
38430 case IX86_BUILTIN_BEXTRI32:
38431 case IX86_BUILTIN_BEXTRI64:
38432 arg0 = CALL_EXPR_ARG (exp, 0);
38433 arg1 = CALL_EXPR_ARG (exp, 1);
38434 op0 = expand_normal (arg0);
38435 op1 = expand_normal (arg1);
38436 icode = (fcode == IX86_BUILTIN_BEXTRI32
38437 ? CODE_FOR_tbm_bextri_si
38438 : CODE_FOR_tbm_bextri_di);
38439 if (!CONST_INT_P (op1))
38441 error ("last argument must be an immediate");
38442 return const0_rtx;
38444 else
38446 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
38447 unsigned char lsb_index = INTVAL (op1) & 0xFF;
38448 op1 = GEN_INT (length);
38449 op2 = GEN_INT (lsb_index);
38450 pat = GEN_FCN (icode) (target, op0, op1, op2);
38451 if (pat)
38452 emit_insn (pat);
38453 return target;
38456 case IX86_BUILTIN_RDRAND16_STEP:
38457 icode = CODE_FOR_rdrandhi_1;
38458 mode0 = HImode;
38459 goto rdrand_step;
38461 case IX86_BUILTIN_RDRAND32_STEP:
38462 icode = CODE_FOR_rdrandsi_1;
38463 mode0 = SImode;
38464 goto rdrand_step;
38466 case IX86_BUILTIN_RDRAND64_STEP:
38467 icode = CODE_FOR_rdranddi_1;
38468 mode0 = DImode;
38470 rdrand_step:
38471 arg0 = CALL_EXPR_ARG (exp, 0);
38472 op1 = expand_normal (arg0);
38473 if (!address_operand (op1, VOIDmode))
38475 op1 = convert_memory_address (Pmode, op1);
38476 op1 = copy_addr_to_reg (op1);
38479 op0 = gen_reg_rtx (mode0);
38480 emit_insn (GEN_FCN (icode) (op0));
38482 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
38484 op1 = gen_reg_rtx (SImode);
38485 emit_move_insn (op1, CONST1_RTX (SImode));
38487 /* Emit SImode conditional move. */
38488 if (mode0 == HImode)
38490 if (TARGET_ZERO_EXTEND_WITH_AND
38491 && optimize_function_for_speed_p (cfun))
38493 op2 = force_reg (SImode, const0_rtx);
38495 emit_insn (gen_movstricthi
38496 (gen_lowpart (HImode, op2), op0));
38498 else
38500 op2 = gen_reg_rtx (SImode);
38502 emit_insn (gen_zero_extendhisi2 (op2, op0));
38505 else if (mode0 == SImode)
38506 op2 = op0;
38507 else
38508 op2 = gen_rtx_SUBREG (SImode, op0, 0);
38510 if (target == 0
38511 || !register_operand (target, SImode))
38512 target = gen_reg_rtx (SImode);
38514 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
38515 const0_rtx);
38516 emit_insn (gen_rtx_SET (target,
38517 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
38518 return target;
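/* A short usage sketch, assuming the immintrin.h wrapper
   _rdrand32_step (compiled with -mrdrnd):

     unsigned int r;
     while (!_rdrand32_step (&r))
       ;   // retry until the DRNG returns data

   RDRAND reports success in CF and clears the destination register
   on failure, so the conditional move above materializes the
   intrinsic's 0/1 return value straight from the flags, without a
   branch.  */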
38520 case IX86_BUILTIN_RDSEED16_STEP:
38521 icode = CODE_FOR_rdseedhi_1;
38522 mode0 = HImode;
38523 goto rdseed_step;
38525 case IX86_BUILTIN_RDSEED32_STEP:
38526 icode = CODE_FOR_rdseedsi_1;
38527 mode0 = SImode;
38528 goto rdseed_step;
38530 case IX86_BUILTIN_RDSEED64_STEP:
38531 icode = CODE_FOR_rdseeddi_1;
38532 mode0 = DImode;
38534 rdseed_step:
38535 arg0 = CALL_EXPR_ARG (exp, 0);
38536 op1 = expand_normal (arg0);
38537 if (!address_operand (op1, VOIDmode))
38539 op1 = convert_memory_address (Pmode, op1);
38540 op1 = copy_addr_to_reg (op1);
38543 op0 = gen_reg_rtx (mode0);
38544 emit_insn (GEN_FCN (icode) (op0));
38546 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
38548 op2 = gen_reg_rtx (QImode);
38550 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
38551 const0_rtx);
38552 emit_insn (gen_rtx_SET (op2, pat));
38554 if (target == 0
38555 || !register_operand (target, SImode))
38556 target = gen_reg_rtx (SImode);
38558 emit_insn (gen_zero_extendqisi2 (target, op2));
38559 return target;
38561 case IX86_BUILTIN_SBB32:
38562 icode = CODE_FOR_subborrowsi;
38563 mode0 = SImode;
38564 goto handlecarry;
38566 case IX86_BUILTIN_SBB64:
38567 icode = CODE_FOR_subborrowdi;
38568 mode0 = DImode;
38569 goto handlecarry;
38571 case IX86_BUILTIN_ADDCARRYX32:
38572 icode = CODE_FOR_addcarrysi;
38573 mode0 = SImode;
38574 goto handlecarry;
38576 case IX86_BUILTIN_ADDCARRYX64:
38577 icode = CODE_FOR_addcarrydi;
38578 mode0 = DImode;
38580 handlecarry:
38581 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
38582 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
38583 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
38584 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
38586 op1 = expand_normal (arg0);
38587 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
38589 op2 = expand_normal (arg1);
38590 if (!register_operand (op2, mode0))
38591 op2 = copy_to_mode_reg (mode0, op2);
38593 op3 = expand_normal (arg2);
38594 if (!register_operand (op3, mode0))
38595 op3 = copy_to_mode_reg (mode0, op3);
38597 op4 = expand_normal (arg3);
38598 if (!address_operand (op4, VOIDmode))
38600 op4 = convert_memory_address (Pmode, op4);
38601 op4 = copy_addr_to_reg (op4);
38604 /* Generate CF from input operand. */
38605 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
38607 /* Generate instruction that consumes CF. */
38608 op0 = gen_reg_rtx (mode0);
38610 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
38611 pat = gen_rtx_LTU (mode0, op1, const0_rtx);
38612 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat));
38614 /* Return current CF value. */
38615 if (target == 0)
38616 target = gen_reg_rtx (QImode);
38618 PUT_MODE (pat, QImode);
38619 emit_insn (gen_rtx_SET (target, pat));
38621 /* Store the result. */
38622 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
38624 return target;
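/* A usage sketch for the carry chain above, assuming the
   adxintrin.h style wrappers (_addcarry_u32, _addcarryx_u32 and
   _subborrow_u32 all take this path):

     unsigned int lo, hi;
     unsigned char c;
     c = _addcarry_u32 (0, a_lo, b_lo, &lo);
     c = _addcarry_u32 (c, a_hi, b_hi, &hi);   // 64-bit add from 32-bit pieces

   gen_addqi3_cconly_overflow turns the 8-bit carry-in back into CF:
   adding it to 0xff produces a carry out exactly when the value is
   non-zero, and that CF is what the addcarry/subborrow pattern then
   consumes.  */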
38626 case IX86_BUILTIN_READ_FLAGS:
38627 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
38629 if (optimize
38630 || target == NULL_RTX
38631 || !nonimmediate_operand (target, word_mode)
38632 || GET_MODE (target) != word_mode)
38633 target = gen_reg_rtx (word_mode);
38635 emit_insn (gen_pop (target));
38636 return target;
38638 case IX86_BUILTIN_WRITE_FLAGS:
38640 arg0 = CALL_EXPR_ARG (exp, 0);
38641 op0 = expand_normal (arg0);
38642 if (!general_no_elim_operand (op0, word_mode))
38643 op0 = copy_to_mode_reg (word_mode, op0);
38645 emit_insn (gen_push (op0));
38646 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
38647 return 0;
38649 case IX86_BUILTIN_KTESTC8:
38650 icode = CODE_FOR_ktestqi;
38651 mode3 = CCCmode;
38652 goto kortest;
38654 case IX86_BUILTIN_KTESTZ8:
38655 icode = CODE_FOR_ktestqi;
38656 mode3 = CCZmode;
38657 goto kortest;
38659 case IX86_BUILTIN_KTESTC16:
38660 icode = CODE_FOR_ktesthi;
38661 mode3 = CCCmode;
38662 goto kortest;
38664 case IX86_BUILTIN_KTESTZ16:
38665 icode = CODE_FOR_ktesthi;
38666 mode3 = CCZmode;
38667 goto kortest;
38669 case IX86_BUILTIN_KTESTC32:
38670 icode = CODE_FOR_ktestsi;
38671 mode3 = CCCmode;
38672 goto kortest;
38674 case IX86_BUILTIN_KTESTZ32:
38675 icode = CODE_FOR_ktestsi;
38676 mode3 = CCZmode;
38677 goto kortest;
38679 case IX86_BUILTIN_KTESTC64:
38680 icode = CODE_FOR_ktestdi;
38681 mode3 = CCCmode;
38682 goto kortest;
38684 case IX86_BUILTIN_KTESTZ64:
38685 icode = CODE_FOR_ktestdi;
38686 mode3 = CCZmode;
38687 goto kortest;
38689 case IX86_BUILTIN_KORTESTC8:
38690 icode = CODE_FOR_kortestqi;
38691 mode3 = CCCmode;
38692 goto kortest;
38694 case IX86_BUILTIN_KORTESTZ8:
38695 icode = CODE_FOR_kortestqi;
38696 mode3 = CCZmode;
38697 goto kortest;
38699 case IX86_BUILTIN_KORTESTC16:
38700 icode = CODE_FOR_kortesthi;
38701 mode3 = CCCmode;
38702 goto kortest;
38704 case IX86_BUILTIN_KORTESTZ16:
38705 icode = CODE_FOR_kortesthi;
38706 mode3 = CCZmode;
38707 goto kortest;
38709 case IX86_BUILTIN_KORTESTC32:
38710 icode = CODE_FOR_kortestsi;
38711 mode3 = CCCmode;
38712 goto kortest;
38714 case IX86_BUILTIN_KORTESTZ32:
38715 icode = CODE_FOR_kortestsi;
38716 mode3 = CCZmode;
38717 goto kortest;
38719 case IX86_BUILTIN_KORTESTC64:
38720 icode = CODE_FOR_kortestdi;
38721 mode3 = CCCmode;
38722 goto kortest;
38724 case IX86_BUILTIN_KORTESTZ64:
38725 icode = CODE_FOR_kortestdi;
38726 mode3 = CCZmode;
38728 kortest:
38729 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
38730 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
38731 op0 = expand_normal (arg0);
38732 op1 = expand_normal (arg1);
38734 mode0 = insn_data[icode].operand[0].mode;
38735 mode1 = insn_data[icode].operand[1].mode;
38737 if (GET_MODE (op0) != VOIDmode)
38738 op0 = force_reg (GET_MODE (op0), op0);
38740 op0 = gen_lowpart (mode0, op0);
38742 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38743 op0 = copy_to_mode_reg (mode0, op0);
38745 if (GET_MODE (op1) != VOIDmode)
38746 op1 = force_reg (GET_MODE (op1), op1);
38748 op1 = gen_lowpart (mode1, op1);
38750 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38751 op1 = copy_to_mode_reg (mode1, op1);
38753 target = gen_reg_rtx (QImode);
38755 /* Emit kortest. */
38756 emit_insn (GEN_FCN (icode) (op0, op1));
38757 /* And use setcc to return result from flags. */
38758 ix86_expand_setcc (target, EQ,
38759 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
38760 return target;
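/* A minimal usage sketch, assuming the avx512fintrin.h wrapper for
   the 16-bit Z form:

     if (_mm512_kortestz (m1, m2))   // 1 iff (m1 | m2) == 0
       handle_all_clear ();

   KORTEST sets only EFLAGS (ZF when the OR of the two masks is all
   zeros, CF when it is all ones), so the expander reads the answer
   back with a setcc on the CCZmode or CCCmode flags register rather
   than through a mask register.  */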
38762 case IX86_BUILTIN_GATHERSIV2DF:
38763 icode = CODE_FOR_avx2_gathersiv2df;
38764 goto gather_gen;
38765 case IX86_BUILTIN_GATHERSIV4DF:
38766 icode = CODE_FOR_avx2_gathersiv4df;
38767 goto gather_gen;
38768 case IX86_BUILTIN_GATHERDIV2DF:
38769 icode = CODE_FOR_avx2_gatherdiv2df;
38770 goto gather_gen;
38771 case IX86_BUILTIN_GATHERDIV4DF:
38772 icode = CODE_FOR_avx2_gatherdiv4df;
38773 goto gather_gen;
38774 case IX86_BUILTIN_GATHERSIV4SF:
38775 icode = CODE_FOR_avx2_gathersiv4sf;
38776 goto gather_gen;
38777 case IX86_BUILTIN_GATHERSIV8SF:
38778 icode = CODE_FOR_avx2_gathersiv8sf;
38779 goto gather_gen;
38780 case IX86_BUILTIN_GATHERDIV4SF:
38781 icode = CODE_FOR_avx2_gatherdiv4sf;
38782 goto gather_gen;
38783 case IX86_BUILTIN_GATHERDIV8SF:
38784 icode = CODE_FOR_avx2_gatherdiv8sf;
38785 goto gather_gen;
38786 case IX86_BUILTIN_GATHERSIV2DI:
38787 icode = CODE_FOR_avx2_gathersiv2di;
38788 goto gather_gen;
38789 case IX86_BUILTIN_GATHERSIV4DI:
38790 icode = CODE_FOR_avx2_gathersiv4di;
38791 goto gather_gen;
38792 case IX86_BUILTIN_GATHERDIV2DI:
38793 icode = CODE_FOR_avx2_gatherdiv2di;
38794 goto gather_gen;
38795 case IX86_BUILTIN_GATHERDIV4DI:
38796 icode = CODE_FOR_avx2_gatherdiv4di;
38797 goto gather_gen;
38798 case IX86_BUILTIN_GATHERSIV4SI:
38799 icode = CODE_FOR_avx2_gathersiv4si;
38800 goto gather_gen;
38801 case IX86_BUILTIN_GATHERSIV8SI:
38802 icode = CODE_FOR_avx2_gathersiv8si;
38803 goto gather_gen;
38804 case IX86_BUILTIN_GATHERDIV4SI:
38805 icode = CODE_FOR_avx2_gatherdiv4si;
38806 goto gather_gen;
38807 case IX86_BUILTIN_GATHERDIV8SI:
38808 icode = CODE_FOR_avx2_gatherdiv8si;
38809 goto gather_gen;
38810 case IX86_BUILTIN_GATHERALTSIV4DF:
38811 icode = CODE_FOR_avx2_gathersiv4df;
38812 goto gather_gen;
38813 case IX86_BUILTIN_GATHERALTDIV8SF:
38814 icode = CODE_FOR_avx2_gatherdiv8sf;
38815 goto gather_gen;
38816 case IX86_BUILTIN_GATHERALTSIV4DI:
38817 icode = CODE_FOR_avx2_gathersiv4di;
38818 goto gather_gen;
38819 case IX86_BUILTIN_GATHERALTDIV8SI:
38820 icode = CODE_FOR_avx2_gatherdiv8si;
38821 goto gather_gen;
38822 case IX86_BUILTIN_GATHER3SIV16SF:
38823 icode = CODE_FOR_avx512f_gathersiv16sf;
38824 goto gather_gen;
38825 case IX86_BUILTIN_GATHER3SIV8DF:
38826 icode = CODE_FOR_avx512f_gathersiv8df;
38827 goto gather_gen;
38828 case IX86_BUILTIN_GATHER3DIV16SF:
38829 icode = CODE_FOR_avx512f_gatherdiv16sf;
38830 goto gather_gen;
38831 case IX86_BUILTIN_GATHER3DIV8DF:
38832 icode = CODE_FOR_avx512f_gatherdiv8df;
38833 goto gather_gen;
38834 case IX86_BUILTIN_GATHER3SIV16SI:
38835 icode = CODE_FOR_avx512f_gathersiv16si;
38836 goto gather_gen;
38837 case IX86_BUILTIN_GATHER3SIV8DI:
38838 icode = CODE_FOR_avx512f_gathersiv8di;
38839 goto gather_gen;
38840 case IX86_BUILTIN_GATHER3DIV16SI:
38841 icode = CODE_FOR_avx512f_gatherdiv16si;
38842 goto gather_gen;
38843 case IX86_BUILTIN_GATHER3DIV8DI:
38844 icode = CODE_FOR_avx512f_gatherdiv8di;
38845 goto gather_gen;
38846 case IX86_BUILTIN_GATHER3ALTSIV8DF:
38847 icode = CODE_FOR_avx512f_gathersiv8df;
38848 goto gather_gen;
38849 case IX86_BUILTIN_GATHER3ALTDIV16SF:
38850 icode = CODE_FOR_avx512f_gatherdiv16sf;
38851 goto gather_gen;
38852 case IX86_BUILTIN_GATHER3ALTSIV8DI:
38853 icode = CODE_FOR_avx512f_gathersiv8di;
38854 goto gather_gen;
38855 case IX86_BUILTIN_GATHER3ALTDIV16SI:
38856 icode = CODE_FOR_avx512f_gatherdiv16si;
38857 goto gather_gen;
38858 case IX86_BUILTIN_GATHER3SIV2DF:
38859 icode = CODE_FOR_avx512vl_gathersiv2df;
38860 goto gather_gen;
38861 case IX86_BUILTIN_GATHER3SIV4DF:
38862 icode = CODE_FOR_avx512vl_gathersiv4df;
38863 goto gather_gen;
38864 case IX86_BUILTIN_GATHER3DIV2DF:
38865 icode = CODE_FOR_avx512vl_gatherdiv2df;
38866 goto gather_gen;
38867 case IX86_BUILTIN_GATHER3DIV4DF:
38868 icode = CODE_FOR_avx512vl_gatherdiv4df;
38869 goto gather_gen;
38870 case IX86_BUILTIN_GATHER3SIV4SF:
38871 icode = CODE_FOR_avx512vl_gathersiv4sf;
38872 goto gather_gen;
38873 case IX86_BUILTIN_GATHER3SIV8SF:
38874 icode = CODE_FOR_avx512vl_gathersiv8sf;
38875 goto gather_gen;
38876 case IX86_BUILTIN_GATHER3DIV4SF:
38877 icode = CODE_FOR_avx512vl_gatherdiv4sf;
38878 goto gather_gen;
38879 case IX86_BUILTIN_GATHER3DIV8SF:
38880 icode = CODE_FOR_avx512vl_gatherdiv8sf;
38881 goto gather_gen;
38882 case IX86_BUILTIN_GATHER3SIV2DI:
38883 icode = CODE_FOR_avx512vl_gathersiv2di;
38884 goto gather_gen;
38885 case IX86_BUILTIN_GATHER3SIV4DI:
38886 icode = CODE_FOR_avx512vl_gathersiv4di;
38887 goto gather_gen;
38888 case IX86_BUILTIN_GATHER3DIV2DI:
38889 icode = CODE_FOR_avx512vl_gatherdiv2di;
38890 goto gather_gen;
38891 case IX86_BUILTIN_GATHER3DIV4DI:
38892 icode = CODE_FOR_avx512vl_gatherdiv4di;
38893 goto gather_gen;
38894 case IX86_BUILTIN_GATHER3SIV4SI:
38895 icode = CODE_FOR_avx512vl_gathersiv4si;
38896 goto gather_gen;
38897 case IX86_BUILTIN_GATHER3SIV8SI:
38898 icode = CODE_FOR_avx512vl_gathersiv8si;
38899 goto gather_gen;
38900 case IX86_BUILTIN_GATHER3DIV4SI:
38901 icode = CODE_FOR_avx512vl_gatherdiv4si;
38902 goto gather_gen;
38903 case IX86_BUILTIN_GATHER3DIV8SI:
38904 icode = CODE_FOR_avx512vl_gatherdiv8si;
38905 goto gather_gen;
38906 case IX86_BUILTIN_GATHER3ALTSIV4DF:
38907 icode = CODE_FOR_avx512vl_gathersiv4df;
38908 goto gather_gen;
38909 case IX86_BUILTIN_GATHER3ALTDIV8SF:
38910 icode = CODE_FOR_avx512vl_gatherdiv8sf;
38911 goto gather_gen;
38912 case IX86_BUILTIN_GATHER3ALTSIV4DI:
38913 icode = CODE_FOR_avx512vl_gathersiv4di;
38914 goto gather_gen;
38915 case IX86_BUILTIN_GATHER3ALTDIV8SI:
38916 icode = CODE_FOR_avx512vl_gatherdiv8si;
38917 goto gather_gen;
38918 case IX86_BUILTIN_SCATTERSIV16SF:
38919 icode = CODE_FOR_avx512f_scattersiv16sf;
38920 goto scatter_gen;
38921 case IX86_BUILTIN_SCATTERSIV8DF:
38922 icode = CODE_FOR_avx512f_scattersiv8df;
38923 goto scatter_gen;
38924 case IX86_BUILTIN_SCATTERDIV16SF:
38925 icode = CODE_FOR_avx512f_scatterdiv16sf;
38926 goto scatter_gen;
38927 case IX86_BUILTIN_SCATTERDIV8DF:
38928 icode = CODE_FOR_avx512f_scatterdiv8df;
38929 goto scatter_gen;
38930 case IX86_BUILTIN_SCATTERSIV16SI:
38931 icode = CODE_FOR_avx512f_scattersiv16si;
38932 goto scatter_gen;
38933 case IX86_BUILTIN_SCATTERSIV8DI:
38934 icode = CODE_FOR_avx512f_scattersiv8di;
38935 goto scatter_gen;
38936 case IX86_BUILTIN_SCATTERDIV16SI:
38937 icode = CODE_FOR_avx512f_scatterdiv16si;
38938 goto scatter_gen;
38939 case IX86_BUILTIN_SCATTERDIV8DI:
38940 icode = CODE_FOR_avx512f_scatterdiv8di;
38941 goto scatter_gen;
38942 case IX86_BUILTIN_SCATTERSIV8SF:
38943 icode = CODE_FOR_avx512vl_scattersiv8sf;
38944 goto scatter_gen;
38945 case IX86_BUILTIN_SCATTERSIV4SF:
38946 icode = CODE_FOR_avx512vl_scattersiv4sf;
38947 goto scatter_gen;
38948 case IX86_BUILTIN_SCATTERSIV4DF:
38949 icode = CODE_FOR_avx512vl_scattersiv4df;
38950 goto scatter_gen;
38951 case IX86_BUILTIN_SCATTERSIV2DF:
38952 icode = CODE_FOR_avx512vl_scattersiv2df;
38953 goto scatter_gen;
38954 case IX86_BUILTIN_SCATTERDIV8SF:
38955 icode = CODE_FOR_avx512vl_scatterdiv8sf;
38956 goto scatter_gen;
38957 case IX86_BUILTIN_SCATTERDIV4SF:
38958 icode = CODE_FOR_avx512vl_scatterdiv4sf;
38959 goto scatter_gen;
38960 case IX86_BUILTIN_SCATTERDIV4DF:
38961 icode = CODE_FOR_avx512vl_scatterdiv4df;
38962 goto scatter_gen;
38963 case IX86_BUILTIN_SCATTERDIV2DF:
38964 icode = CODE_FOR_avx512vl_scatterdiv2df;
38965 goto scatter_gen;
38966 case IX86_BUILTIN_SCATTERSIV8SI:
38967 icode = CODE_FOR_avx512vl_scattersiv8si;
38968 goto scatter_gen;
38969 case IX86_BUILTIN_SCATTERSIV4SI:
38970 icode = CODE_FOR_avx512vl_scattersiv4si;
38971 goto scatter_gen;
38972 case IX86_BUILTIN_SCATTERSIV4DI:
38973 icode = CODE_FOR_avx512vl_scattersiv4di;
38974 goto scatter_gen;
38975 case IX86_BUILTIN_SCATTERSIV2DI:
38976 icode = CODE_FOR_avx512vl_scattersiv2di;
38977 goto scatter_gen;
38978 case IX86_BUILTIN_SCATTERDIV8SI:
38979 icode = CODE_FOR_avx512vl_scatterdiv8si;
38980 goto scatter_gen;
38981 case IX86_BUILTIN_SCATTERDIV4SI:
38982 icode = CODE_FOR_avx512vl_scatterdiv4si;
38983 goto scatter_gen;
38984 case IX86_BUILTIN_SCATTERDIV4DI:
38985 icode = CODE_FOR_avx512vl_scatterdiv4di;
38986 goto scatter_gen;
38987 case IX86_BUILTIN_SCATTERDIV2DI:
38988 icode = CODE_FOR_avx512vl_scatterdiv2di;
38989 goto scatter_gen;
38990 case IX86_BUILTIN_GATHERPFDPD:
38991 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
38992 goto vec_prefetch_gen;
38993 case IX86_BUILTIN_SCATTERALTSIV8DF:
38994 icode = CODE_FOR_avx512f_scattersiv8df;
38995 goto scatter_gen;
38996 case IX86_BUILTIN_SCATTERALTDIV16SF:
38997 icode = CODE_FOR_avx512f_scatterdiv16sf;
38998 goto scatter_gen;
38999 case IX86_BUILTIN_SCATTERALTSIV8DI:
39000 icode = CODE_FOR_avx512f_scattersiv8di;
39001 goto scatter_gen;
39002 case IX86_BUILTIN_SCATTERALTDIV16SI:
39003 icode = CODE_FOR_avx512f_scatterdiv16si;
39004 goto scatter_gen;
39005 case IX86_BUILTIN_GATHERPFDPS:
39006 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
39007 goto vec_prefetch_gen;
39008 case IX86_BUILTIN_GATHERPFQPD:
39009 icode = CODE_FOR_avx512pf_gatherpfv8didf;
39010 goto vec_prefetch_gen;
39011 case IX86_BUILTIN_GATHERPFQPS:
39012 icode = CODE_FOR_avx512pf_gatherpfv8disf;
39013 goto vec_prefetch_gen;
39014 case IX86_BUILTIN_SCATTERPFDPD:
39015 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
39016 goto vec_prefetch_gen;
39017 case IX86_BUILTIN_SCATTERPFDPS:
39018 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
39019 goto vec_prefetch_gen;
39020 case IX86_BUILTIN_SCATTERPFQPD:
39021 icode = CODE_FOR_avx512pf_scatterpfv8didf;
39022 goto vec_prefetch_gen;
39023 case IX86_BUILTIN_SCATTERPFQPS:
39024 icode = CODE_FOR_avx512pf_scatterpfv8disf;
39025 goto vec_prefetch_gen;
39027 gather_gen:
39028 rtx half;
39029 rtx (*gen) (rtx, rtx);
39031 arg0 = CALL_EXPR_ARG (exp, 0);
39032 arg1 = CALL_EXPR_ARG (exp, 1);
39033 arg2 = CALL_EXPR_ARG (exp, 2);
39034 arg3 = CALL_EXPR_ARG (exp, 3);
39035 arg4 = CALL_EXPR_ARG (exp, 4);
39036 op0 = expand_normal (arg0);
39037 op1 = expand_normal (arg1);
39038 op2 = expand_normal (arg2);
39039 op3 = expand_normal (arg3);
39040 op4 = expand_normal (arg4);
39041 /* Note the arg order is different from the operand order. */
39042 mode0 = insn_data[icode].operand[1].mode;
39043 mode2 = insn_data[icode].operand[3].mode;
39044 mode3 = insn_data[icode].operand[4].mode;
39045 mode4 = insn_data[icode].operand[5].mode;
39047 if (target == NULL_RTX
39048 || GET_MODE (target) != insn_data[icode].operand[0].mode
39049 || !insn_data[icode].operand[0].predicate (target,
39050 GET_MODE (target)))
39051 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
39052 else
39053 subtarget = target;
39055 switch (fcode)
39057 case IX86_BUILTIN_GATHER3ALTSIV8DF:
39058 case IX86_BUILTIN_GATHER3ALTSIV8DI:
39059 half = gen_reg_rtx (V8SImode);
39060 if (!nonimmediate_operand (op2, V16SImode))
39061 op2 = copy_to_mode_reg (V16SImode, op2);
39062 emit_insn (gen_vec_extract_lo_v16si (half, op2));
39063 op2 = half;
39064 break;
39065 case IX86_BUILTIN_GATHER3ALTSIV4DF:
39066 case IX86_BUILTIN_GATHER3ALTSIV4DI:
39067 case IX86_BUILTIN_GATHERALTSIV4DF:
39068 case IX86_BUILTIN_GATHERALTSIV4DI:
39069 half = gen_reg_rtx (V4SImode);
39070 if (!nonimmediate_operand (op2, V8SImode))
39071 op2 = copy_to_mode_reg (V8SImode, op2);
39072 emit_insn (gen_vec_extract_lo_v8si (half, op2));
39073 op2 = half;
39074 break;
39075 case IX86_BUILTIN_GATHER3ALTDIV16SF:
39076 case IX86_BUILTIN_GATHER3ALTDIV16SI:
39077 half = gen_reg_rtx (mode0);
39078 if (mode0 == V8SFmode)
39079 gen = gen_vec_extract_lo_v16sf;
39080 else
39081 gen = gen_vec_extract_lo_v16si;
39082 if (!nonimmediate_operand (op0, GET_MODE (op0)))
39083 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
39084 emit_insn (gen (half, op0));
39085 op0 = half;
39086 if (GET_MODE (op3) != VOIDmode)
39088 if (!nonimmediate_operand (op3, GET_MODE (op3)))
39089 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
39090 emit_insn (gen (half, op3));
39091 op3 = half;
39093 break;
39094 case IX86_BUILTIN_GATHER3ALTDIV8SF:
39095 case IX86_BUILTIN_GATHER3ALTDIV8SI:
39096 case IX86_BUILTIN_GATHERALTDIV8SF:
39097 case IX86_BUILTIN_GATHERALTDIV8SI:
39098 half = gen_reg_rtx (mode0);
39099 if (mode0 == V4SFmode)
39100 gen = gen_vec_extract_lo_v8sf;
39101 else
39102 gen = gen_vec_extract_lo_v8si;
39103 if (!nonimmediate_operand (op0, GET_MODE (op0)))
39104 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
39105 emit_insn (gen (half, op0));
39106 op0 = half;
39107 if (GET_MODE (op3) != VOIDmode)
39109 if (!nonimmediate_operand (op3, GET_MODE (op3)))
39110 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
39111 emit_insn (gen (half, op3));
39112 op3 = half;
39114 break;
39115 default:
39116 break;
39119 /* Force the memory operand to use only a base register here; we
39120 don't want to do that to the memory operands of other builtin
39121 functions. */
39122 op1 = ix86_zero_extend_to_Pmode (op1);
39124 if (!insn_data[icode].operand[1].predicate (op0, mode0))
39125 op0 = copy_to_mode_reg (mode0, op0);
39126 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
39127 op1 = copy_to_mode_reg (Pmode, op1);
39128 if (!insn_data[icode].operand[3].predicate (op2, mode2))
39129 op2 = copy_to_mode_reg (mode2, op2);
39131 op3 = fixup_modeless_constant (op3, mode3);
39133 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
39135 if (!insn_data[icode].operand[4].predicate (op3, mode3))
39136 op3 = copy_to_mode_reg (mode3, op3);
39138 else
39140 op3 = copy_to_reg (op3);
39141 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
39143 if (!insn_data[icode].operand[5].predicate (op4, mode4))
39145 error ("the last argument must be scale 1, 2, 4, 8");
39146 return const0_rtx;
39149 /* Optimize. If mask is known to have all high bits set,
39150 replace op0 with pc_rtx to signal that the instruction
39151 overwrites the whole destination and doesn't use its
39152 previous contents. */
39153 if (optimize)
39155 if (TREE_CODE (arg3) == INTEGER_CST)
39157 if (integer_all_onesp (arg3))
39158 op0 = pc_rtx;
39160 else if (TREE_CODE (arg3) == VECTOR_CST)
39162 unsigned int negative = 0;
39163 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
39165 tree cst = VECTOR_CST_ELT (arg3, i);
39166 if (TREE_CODE (cst) == INTEGER_CST
39167 && tree_int_cst_sign_bit (cst))
39168 negative++;
39169 else if (TREE_CODE (cst) == REAL_CST
39170 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
39171 negative++;
39173 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
39174 op0 = pc_rtx;
39176 else if (TREE_CODE (arg3) == SSA_NAME
39177 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
39179 /* Recognize also when mask is like:
39180 __v2df src = _mm_setzero_pd ();
39181 __v2df mask = _mm_cmpeq_pd (src, src);
39183 __v8sf src = _mm256_setzero_ps ();
39184 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
39185 as that is a cheaper way to load all ones into
39186 a register than having to load a constant from
39187 memory. */
39188 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
39189 if (is_gimple_call (def_stmt))
39191 tree fndecl = gimple_call_fndecl (def_stmt);
39192 if (fndecl
39193 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
39194 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
39196 case IX86_BUILTIN_CMPPD:
39197 case IX86_BUILTIN_CMPPS:
39198 case IX86_BUILTIN_CMPPD256:
39199 case IX86_BUILTIN_CMPPS256:
39200 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
39201 break;
39202 /* FALLTHRU */
39203 case IX86_BUILTIN_CMPEQPD:
39204 case IX86_BUILTIN_CMPEQPS:
39205 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
39206 && initializer_zerop (gimple_call_arg (def_stmt,
39207 1)))
39208 op0 = pc_rtx;
39209 break;
39210 default:
39211 break;
39217 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
39218 if (! pat)
39219 return const0_rtx;
39220 emit_insn (pat);
39222 switch (fcode)
39224 case IX86_BUILTIN_GATHER3DIV16SF:
39225 if (target == NULL_RTX)
39226 target = gen_reg_rtx (V8SFmode);
39227 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
39228 break;
39229 case IX86_BUILTIN_GATHER3DIV16SI:
39230 if (target == NULL_RTX)
39231 target = gen_reg_rtx (V8SImode);
39232 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
39233 break;
39234 case IX86_BUILTIN_GATHER3DIV8SF:
39235 case IX86_BUILTIN_GATHERDIV8SF:
39236 if (target == NULL_RTX)
39237 target = gen_reg_rtx (V4SFmode);
39238 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
39239 break;
39240 case IX86_BUILTIN_GATHER3DIV8SI:
39241 case IX86_BUILTIN_GATHERDIV8SI:
39242 if (target == NULL_RTX)
39243 target = gen_reg_rtx (V4SImode);
39244 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
39245 break;
39246 default:
39247 target = subtarget;
39248 break;
39250 return target;
39252 scatter_gen:
39253 arg0 = CALL_EXPR_ARG (exp, 0);
39254 arg1 = CALL_EXPR_ARG (exp, 1);
39255 arg2 = CALL_EXPR_ARG (exp, 2);
39256 arg3 = CALL_EXPR_ARG (exp, 3);
39257 arg4 = CALL_EXPR_ARG (exp, 4);
39258 op0 = expand_normal (arg0);
39259 op1 = expand_normal (arg1);
39260 op2 = expand_normal (arg2);
39261 op3 = expand_normal (arg3);
39262 op4 = expand_normal (arg4);
39263 mode1 = insn_data[icode].operand[1].mode;
39264 mode2 = insn_data[icode].operand[2].mode;
39265 mode3 = insn_data[icode].operand[3].mode;
39266 mode4 = insn_data[icode].operand[4].mode;
39268 /* Scatter instruction stores operand op3 to memory with
39269 indices from op2 and scale from op4 under writemask op1.
39270 If index operand op2 has more elements than source operand
39271 op3, one needs to use only its low half. And vice versa. */
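/* For instance, in the SCATTERALTSIV8DF/8DI cases below eight elements are
   stored but the index argument is a V16SI, so only the low V8SI half of the
   index is used; the SCATTERALTDIV16SF/16SI cases extract the low half of
   the source operand instead. */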
39272 switch (fcode)
39274 case IX86_BUILTIN_SCATTERALTSIV8DF:
39275 case IX86_BUILTIN_SCATTERALTSIV8DI:
39276 half = gen_reg_rtx (V8SImode);
39277 if (!nonimmediate_operand (op2, V16SImode))
39278 op2 = copy_to_mode_reg (V16SImode, op2);
39279 emit_insn (gen_vec_extract_lo_v16si (half, op2));
39280 op2 = half;
39281 break;
39282 case IX86_BUILTIN_SCATTERALTDIV16SF:
39283 case IX86_BUILTIN_SCATTERALTDIV16SI:
39284 half = gen_reg_rtx (mode3);
39285 if (mode3 == V8SFmode)
39286 gen = gen_vec_extract_lo_v16sf;
39287 else
39288 gen = gen_vec_extract_lo_v16si;
39289 if (!nonimmediate_operand (op3, GET_MODE (op3)))
39290 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
39291 emit_insn (gen (half, op3));
39292 op3 = half;
39293 break;
39294 default:
39295 break;
39298 /* Force memory operand only with base register here. But we
39299 don't want to do it on memory operand for other builtin
39300 functions. */
39301 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
39303 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
39304 op0 = copy_to_mode_reg (Pmode, op0);
39306 op1 = fixup_modeless_constant (op1, mode1);
39308 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
39310 if (!insn_data[icode].operand[1].predicate (op1, mode1))
39311 op1 = copy_to_mode_reg (mode1, op1);
39313 else
39315 op1 = copy_to_reg (op1);
39316 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
39319 if (!insn_data[icode].operand[2].predicate (op2, mode2))
39320 op2 = copy_to_mode_reg (mode2, op2);
39322 if (!insn_data[icode].operand[3].predicate (op3, mode3))
39323 op3 = copy_to_mode_reg (mode3, op3);
39325 if (!insn_data[icode].operand[4].predicate (op4, mode4))
39327 error ("the last argument must be scale 1, 2, 4, 8");
39328 return const0_rtx;
39331 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
39332 if (! pat)
39333 return const0_rtx;
39335 emit_insn (pat);
39336 return 0;
39338 vec_prefetch_gen:
39339 arg0 = CALL_EXPR_ARG (exp, 0);
39340 arg1 = CALL_EXPR_ARG (exp, 1);
39341 arg2 = CALL_EXPR_ARG (exp, 2);
39342 arg3 = CALL_EXPR_ARG (exp, 3);
39343 arg4 = CALL_EXPR_ARG (exp, 4);
39344 op0 = expand_normal (arg0);
39345 op1 = expand_normal (arg1);
39346 op2 = expand_normal (arg2);
39347 op3 = expand_normal (arg3);
39348 op4 = expand_normal (arg4);
39349 mode0 = insn_data[icode].operand[0].mode;
39350 mode1 = insn_data[icode].operand[1].mode;
39351 mode3 = insn_data[icode].operand[3].mode;
39352 mode4 = insn_data[icode].operand[4].mode;
39354 op0 = fixup_modeless_constant (op0, mode0);
39356 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
39358 if (!insn_data[icode].operand[0].predicate (op0, mode0))
39359 op0 = copy_to_mode_reg (mode0, op0);
39361 else
39363 op0 = copy_to_reg (op0);
39364 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
39367 if (!insn_data[icode].operand[1].predicate (op1, mode1))
39368 op1 = copy_to_mode_reg (mode1, op1);
39370 /* Force memory operand only with base register here. But we
39371 don't want to do it on memory operand for other builtin
39372 functions. */
39373 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
39375 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
39376 op2 = copy_to_mode_reg (Pmode, op2);
39378 if (!insn_data[icode].operand[3].predicate (op3, mode3))
39380 error ("the fourth argument must be scale 1, 2, 4, 8");
39381 return const0_rtx;
39384 if (!insn_data[icode].operand[4].predicate (op4, mode4))
39386 error ("incorrect hint operand");
39387 return const0_rtx;
39390 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
39391 if (! pat)
39392 return const0_rtx;
39394 emit_insn (pat);
39396 return 0;
39398 case IX86_BUILTIN_XABORT:
39399 icode = CODE_FOR_xabort;
39400 arg0 = CALL_EXPR_ARG (exp, 0);
39401 op0 = expand_normal (arg0);
39402 mode0 = insn_data[icode].operand[0].mode;
39403 if (!insn_data[icode].operand[0].predicate (op0, mode0))
39405 error ("the xabort's argument must be an 8-bit immediate");
39406 return const0_rtx;
39408 emit_insn (gen_xabort (op0));
39409 return 0;
39411 default:
39412 break;
39415 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
39416 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
39418 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
39419 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
39420 target);
39423 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
39424 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
39426 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
39427 switch (fcode)
39429 case IX86_BUILTIN_FABSQ:
39430 case IX86_BUILTIN_COPYSIGNQ:
39431 if (!TARGET_SSE)
39432 /* Emit a normal call if SSE isn't available. */
39433 return expand_call (exp, target, ignore);
39434 /* FALLTHRU */
39435 default:
39436 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
39440 if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
39441 && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
39443 i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
39444 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
39445 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
39446 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
39447 int masked = 1;
39448 machine_mode mode, wide_mode, nar_mode;
39450 nar_mode = V4SFmode;
39451 mode = V16SFmode;
39452 wide_mode = V64SFmode;
39453 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
39454 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
39456 switch (fcode)
39458 case IX86_BUILTIN_4FMAPS:
39459 fcn = gen_avx5124fmaddps_4fmaddps;
39460 masked = 0;
39461 goto v4fma_expand;
39463 case IX86_BUILTIN_4DPWSSD:
39464 nar_mode = V4SImode;
39465 mode = V16SImode;
39466 wide_mode = V64SImode;
39467 fcn = gen_avx5124vnniw_vp4dpwssd;
39468 masked = 0;
39469 goto v4fma_expand;
39471 case IX86_BUILTIN_4DPWSSDS:
39472 nar_mode = V4SImode;
39473 mode = V16SImode;
39474 wide_mode = V64SImode;
39475 fcn = gen_avx5124vnniw_vp4dpwssds;
39476 masked = 0;
39477 goto v4fma_expand;
39479 case IX86_BUILTIN_4FNMAPS:
39480 fcn = gen_avx5124fmaddps_4fnmaddps;
39481 masked = 0;
39482 goto v4fma_expand;
39484 case IX86_BUILTIN_4FNMAPS_MASK:
39485 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
39486 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
39487 goto v4fma_expand;
39489 case IX86_BUILTIN_4DPWSSD_MASK:
39490 nar_mode = V4SImode;
39491 mode = V16SImode;
39492 wide_mode = V64SImode;
39493 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
39494 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
39495 goto v4fma_expand;
39497 case IX86_BUILTIN_4DPWSSDS_MASK:
39498 nar_mode = V4SImode;
39499 mode = V16SImode;
39500 wide_mode = V64SImode;
39501 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
39502 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
39503 goto v4fma_expand;
39505 case IX86_BUILTIN_4FMAPS_MASK:
39507 tree args[4];
39508 rtx ops[4];
39509 rtx wide_reg;
39510 rtx accum;
39511 rtx addr;
39512 rtx mem;
39514 v4fma_expand:
39515 wide_reg = gen_reg_rtx (wide_mode);
39516 for (i = 0; i < 4; i++)
39518 args[i] = CALL_EXPR_ARG (exp, i);
39519 ops[i] = expand_normal (args[i]);
39521 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
39522 ops[i]);
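/* The loop above places each 64-byte (512-bit) argument at byte offset
   i * 64, packing the four vector operands back to back into WIDE_REG. */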
39525 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
39526 accum = force_reg (mode, accum);
39528 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
39529 addr = force_reg (Pmode, addr);
39531 mem = gen_rtx_MEM (nar_mode, addr);
39533 target = gen_reg_rtx (mode);
39535 emit_move_insn (target, accum);
39537 if (! masked)
39538 emit_insn (fcn (target, accum, wide_reg, mem));
39539 else
39541 rtx merge, mask;
39542 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
39544 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
39546 if (CONST_INT_P (mask))
39547 mask = fixup_modeless_constant (mask, HImode);
39549 mask = force_reg (HImode, mask);
39551 if (GET_MODE (mask) != HImode)
39552 mask = gen_rtx_SUBREG (HImode, mask, 0);
39554 /* If merge is 0 then we're about to emit z-masked variant. */
39555 if (const0_operand (merge, mode))
39556 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
39557 /* If merge is the same as accum then emit merge-masked variant. */
39558 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
39560 merge = force_reg (mode, merge);
39561 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
39563 /* Merging with something unknown can happen if we z-mask with -O0. */
39564 else
39566 target = gen_reg_rtx (mode);
39567 emit_move_insn (target, merge);
39568 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
39571 return target;
39574 case IX86_BUILTIN_4FNMASS:
39575 fcn = gen_avx5124fmaddps_4fnmaddss;
39576 masked = 0;
39577 goto s4fma_expand;
39579 case IX86_BUILTIN_4FMASS:
39580 fcn = gen_avx5124fmaddps_4fmaddss;
39581 masked = 0;
39582 goto s4fma_expand;
39584 case IX86_BUILTIN_4FNMASS_MASK:
39585 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
39586 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
39587 goto s4fma_expand;
39589 case IX86_BUILTIN_4FMASS_MASK:
39591 tree args[4];
39592 rtx ops[4];
39593 rtx wide_reg;
39594 rtx accum;
39595 rtx addr;
39596 rtx mem;
39598 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
39599 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
39601 s4fma_expand:
39602 mode = V4SFmode;
39603 wide_reg = gen_reg_rtx (V64SFmode);
39604 for (i = 0; i < 4; i++)
39606 rtx tmp;
39607 args[i] = CALL_EXPR_ARG (exp, i);
39608 ops[i] = expand_normal (args[i]);
39610 tmp = gen_reg_rtx (SFmode);
39611 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
39613 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
39614 gen_rtx_SUBREG (V16SFmode, tmp, 0));
39617 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
39618 accum = force_reg (V4SFmode, accum);
39620 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
39621 addr = force_reg (Pmode, addr);
39623 mem = gen_rtx_MEM (V4SFmode, addr);
39625 target = gen_reg_rtx (V4SFmode);
39627 emit_move_insn (target, accum);
39629 if (! masked)
39630 emit_insn (fcn (target, accum, wide_reg, mem));
39631 else
39633 rtx merge, mask;
39634 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
39636 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
39638 if (CONST_INT_P (mask))
39639 mask = fixup_modeless_constant (mask, QImode);
39641 mask = force_reg (QImode, mask);
39643 if (GET_MODE (mask) != QImode)
39644 mask = gen_rtx_SUBREG (QImode, mask, 0);
39646 /* If merge is 0 then we're about to emit z-masked variant. */
39647 if (const0_operand (merge, mode))
39648 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
39649 /* If merge is the same as accum then emit merge-masked
39650 variant. */
39651 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
39653 merge = force_reg (mode, merge);
39654 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
39656 /* Merging with something unknown can happen if we z-mask
39657 with -O0. */
39658 else
39660 target = gen_reg_rtx (mode);
39661 emit_move_insn (target, merge);
39662 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
39665 return target;
39667 case IX86_BUILTIN_RDPID:
39668 return ix86_expand_special_args_builtin (bdesc_args2 + i, exp,
39669 target);
39670 default:
39671 return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
39675 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
39676 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
39678 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
39679 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
39682 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
39683 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
39685 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
39686 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
39689 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
39690 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
39692 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
39693 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
39696 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
39697 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
39699 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
39700 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
39703 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
39704 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
39706 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
39707 const struct builtin_description *d = bdesc_multi_arg + i;
39708 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
39709 (enum ix86_builtin_func_type)
39710 d->flag, d->comparison);
39713 gcc_unreachable ();
39716 /* This returns the target-specific builtin with code CODE if
39717 current_function_decl has visibility on this builtin, which is checked
39718 using isa flags. Returns NULL_TREE otherwise. */
39720 static tree ix86_get_builtin (enum ix86_builtins code)
39722 struct cl_target_option *opts;
39723 tree target_tree = NULL_TREE;
39725 /* Determine the isa flags of current_function_decl. */
39727 if (current_function_decl)
39728 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
39730 if (target_tree == NULL)
39731 target_tree = target_option_default_node;
39733 opts = TREE_TARGET_OPTION (target_tree);
39735 if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
39736 || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
39737 return ix86_builtin_decl (code, true);
39738 else
39739 return NULL_TREE;
39742 /* Return the function decl for the target-specific builtin
39743 corresponding to the MPX builtin passed in FCODE. */
39744 static tree
39745 ix86_builtin_mpx_function (unsigned fcode)
39747 switch (fcode)
39749 case BUILT_IN_CHKP_BNDMK:
39750 return ix86_builtins[IX86_BUILTIN_BNDMK];
39752 case BUILT_IN_CHKP_BNDSTX:
39753 return ix86_builtins[IX86_BUILTIN_BNDSTX];
39755 case BUILT_IN_CHKP_BNDLDX:
39756 return ix86_builtins[IX86_BUILTIN_BNDLDX];
39758 case BUILT_IN_CHKP_BNDCL:
39759 return ix86_builtins[IX86_BUILTIN_BNDCL];
39761 case BUILT_IN_CHKP_BNDCU:
39762 return ix86_builtins[IX86_BUILTIN_BNDCU];
39764 case BUILT_IN_CHKP_BNDRET:
39765 return ix86_builtins[IX86_BUILTIN_BNDRET];
39767 case BUILT_IN_CHKP_INTERSECT:
39768 return ix86_builtins[IX86_BUILTIN_BNDINT];
39770 case BUILT_IN_CHKP_NARROW:
39771 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
39773 case BUILT_IN_CHKP_SIZEOF:
39774 return ix86_builtins[IX86_BUILTIN_SIZEOF];
39776 case BUILT_IN_CHKP_EXTRACT_LOWER:
39777 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
39779 case BUILT_IN_CHKP_EXTRACT_UPPER:
39780 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
39782 default:
39783 return NULL_TREE;
39786 gcc_unreachable ();
39789 /* Helper function for ix86_load_bounds and ix86_store_bounds.
39791 Return an address to be used to load/store bounds for pointer
39792 passed in SLOT.
39794 SLOT_NO is an integer constant holding number of a target
39795 dependent special slot to be used in case SLOT is not a memory.
39797 SPECIAL_BASE is a pointer to be used as a base of fake address
39798 to access special slots in Bounds Table. SPECIAL_BASE[-1],
39799 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
39801 static rtx
39802 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
39804 rtx addr = NULL;
39806 /* A NULL slot means we pass bounds for a pointer that is not passed to
39807 the function at all. A register slot means we pass the pointer in a
39808 register. In both these cases bounds are passed via the Bounds
39809 Table. Since we do not have the actual pointer stored in memory,
39810 we have to use fake addresses to access the Bounds Table. We
39811 start with (special_base - sizeof (void*)) and decrease this
39812 address by the pointer size to get addresses for other slots. */
39813 if (!slot || REG_P (slot))
39815 gcc_assert (CONST_INT_P (slot_no));
39816 addr = plus_constant (Pmode, special_base,
39817 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
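/* For example, with 64-bit pointers slot 0 maps to SPECIAL_BASE - 8,
   slot 1 to SPECIAL_BASE - 16, and so on. */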
39819 /* If the pointer is passed in memory then its address is used to
39820 access the Bounds Table. */
39821 else if (MEM_P (slot))
39823 addr = XEXP (slot, 0);
39824 if (!register_operand (addr, Pmode))
39825 addr = copy_addr_to_reg (addr);
39827 else
39828 gcc_unreachable ();
39830 return addr;
39833 /* Expand pass uses this hook to load bounds for function parameter
39834 PTR passed in SLOT in case its bounds are not passed in a register.
39836 If SLOT is a memory, then bounds are loaded as for regular pointer
39837 loaded from memory. PTR may be NULL in case SLOT is a memory.
39838 In such case value of PTR (if required) may be loaded from SLOT.
39840 If SLOT is NULL or a register then SLOT_NO is an integer constant
39841 holding number of the target dependent special slot which should be
39842 used to obtain bounds.
39844 Return loaded bounds. */
39846 static rtx
39847 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
39849 rtx reg = gen_reg_rtx (BNDmode);
39850 rtx addr;
39852 /* Get address to be used to access Bounds Table. Special slots start
39853 at the location of return address of the current function. */
39854 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
39856 /* Load pointer value from a memory if we don't have it. */
39857 if (!ptr)
39859 gcc_assert (MEM_P (slot));
39860 ptr = copy_addr_to_reg (slot);
39863 if (!register_operand (ptr, Pmode))
39864 ptr = ix86_zero_extend_to_Pmode (ptr);
39866 emit_insn (BNDmode == BND64mode
39867 ? gen_bnd64_ldx (reg, addr, ptr)
39868 : gen_bnd32_ldx (reg, addr, ptr));
39870 return reg;
39873 /* Expand pass uses this hook to store BOUNDS for call argument PTR
39874 passed in SLOT in case BOUNDS are not passed in a register.
39876 If SLOT is a memory, then BOUNDS are stored as for regular pointer
39877 stored in memory. PTR may be NULL in case SLOT is a memory.
39878 In such case value of PTR (if required) may be loaded from SLOT.
39880 If SLOT is NULL or a register then SLOT_NO is an integer constant
39881 holding number of the target dependent special slot which should be
39882 used to store BOUNDS. */
39884 static void
39885 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
39887 rtx addr;
39889 /* Get address to be used to access Bounds Table. Special slots start
39890 at the location of return address of a called function. */
39891 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
39893 /* Load pointer value from a memory if we don't have it. */
39894 if (!ptr)
39896 gcc_assert (MEM_P (slot));
39897 ptr = copy_addr_to_reg (slot);
39900 if (!register_operand (ptr, Pmode))
39901 ptr = ix86_zero_extend_to_Pmode (ptr);
39903 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
39904 if (!register_operand (bounds, BNDmode))
39905 bounds = copy_to_mode_reg (BNDmode, bounds);
39907 emit_insn (BNDmode == BND64mode
39908 ? gen_bnd64_stx (addr, ptr, bounds)
39909 : gen_bnd32_stx (addr, ptr, bounds));
39912 /* Load and return bounds returned by function in SLOT. */
39914 static rtx
39915 ix86_load_returned_bounds (rtx slot)
39917 rtx res;
39919 gcc_assert (REG_P (slot));
39920 res = gen_reg_rtx (BNDmode);
39921 emit_move_insn (res, slot);
39923 return res;
39926 /* Store BOUNDS returned by function into SLOT. */
39928 static void
39929 ix86_store_returned_bounds (rtx slot, rtx bounds)
39931 gcc_assert (REG_P (slot));
39932 emit_move_insn (slot, bounds);
39935 /* Returns a function decl for a vectorized version of the combined function
39936 with combined_fn code FN and the result vector type TYPE, or NULL_TREE
39937 if it is not available. */
39939 static tree
39940 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
39941 tree type_in)
39943 machine_mode in_mode, out_mode;
39944 int in_n, out_n;
39946 if (TREE_CODE (type_out) != VECTOR_TYPE
39947 || TREE_CODE (type_in) != VECTOR_TYPE)
39948 return NULL_TREE;
39950 out_mode = TYPE_MODE (TREE_TYPE (type_out));
39951 out_n = TYPE_VECTOR_SUBPARTS (type_out);
39952 in_mode = TYPE_MODE (TREE_TYPE (type_in));
39953 in_n = TYPE_VECTOR_SUBPARTS (type_in);
39955 switch (fn)
39957 CASE_CFN_EXP2:
39958 if (out_mode == SFmode && in_mode == SFmode)
39960 if (out_n == 16 && in_n == 16)
39961 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
39963 break;
39965 CASE_CFN_IFLOOR:
39966 CASE_CFN_LFLOOR:
39967 CASE_CFN_LLFLOOR:
39968 /* The round insn does not trap on denormals. */
39969 if (flag_trapping_math || !TARGET_SSE4_1)
39970 break;
39972 if (out_mode == SImode && in_mode == DFmode)
39974 if (out_n == 4 && in_n == 2)
39975 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
39976 else if (out_n == 8 && in_n == 4)
39977 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
39978 else if (out_n == 16 && in_n == 8)
39979 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
39981 if (out_mode == SImode && in_mode == SFmode)
39983 if (out_n == 4 && in_n == 4)
39984 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
39985 else if (out_n == 8 && in_n == 8)
39986 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
39987 else if (out_n == 16 && in_n == 16)
39988 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
39990 break;
39992 CASE_CFN_ICEIL:
39993 CASE_CFN_LCEIL:
39994 CASE_CFN_LLCEIL:
39995 /* The round insn does not trap on denormals. */
39996 if (flag_trapping_math || !TARGET_SSE4_1)
39997 break;
39999 if (out_mode == SImode && in_mode == DFmode)
40001 if (out_n == 4 && in_n == 2)
40002 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
40003 else if (out_n == 8 && in_n == 4)
40004 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
40005 else if (out_n == 16 && in_n == 8)
40006 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
40008 if (out_mode == SImode && in_mode == SFmode)
40010 if (out_n == 4 && in_n == 4)
40011 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
40012 else if (out_n == 8 && in_n == 8)
40013 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
40014 else if (out_n == 16 && in_n == 16)
40015 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
40017 break;
40019 CASE_CFN_IRINT:
40020 CASE_CFN_LRINT:
40021 CASE_CFN_LLRINT:
40022 if (out_mode == SImode && in_mode == DFmode)
40024 if (out_n == 4 && in_n == 2)
40025 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
40026 else if (out_n == 8 && in_n == 4)
40027 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
40028 else if (out_n == 16 && in_n == 8)
40029 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
40031 if (out_mode == SImode && in_mode == SFmode)
40033 if (out_n == 4 && in_n == 4)
40034 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
40035 else if (out_n == 8 && in_n == 8)
40036 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
40037 else if (out_n == 16 && in_n == 16)
40038 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
40040 break;
40042 CASE_CFN_IROUND:
40043 CASE_CFN_LROUND:
40044 CASE_CFN_LLROUND:
40045 /* The round insn does not trap on denormals. */
40046 if (flag_trapping_math || !TARGET_SSE4_1)
40047 break;
40049 if (out_mode == SImode && in_mode == DFmode)
40051 if (out_n == 4 && in_n == 2)
40052 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
40053 else if (out_n == 8 && in_n == 4)
40054 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
40055 else if (out_n == 16 && in_n == 8)
40056 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
40058 if (out_mode == SImode && in_mode == SFmode)
40060 if (out_n == 4 && in_n == 4)
40061 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
40062 else if (out_n == 8 && in_n == 8)
40063 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
40064 else if (out_n == 16 && in_n == 16)
40065 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
40067 break;
40069 CASE_CFN_FLOOR:
40070 /* The round insn does not trap on denormals. */
40071 if (flag_trapping_math || !TARGET_SSE4_1)
40072 break;
40074 if (out_mode == DFmode && in_mode == DFmode)
40076 if (out_n == 2 && in_n == 2)
40077 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
40078 else if (out_n == 4 && in_n == 4)
40079 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
40080 else if (out_n == 8 && in_n == 8)
40081 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
40083 if (out_mode == SFmode && in_mode == SFmode)
40085 if (out_n == 4 && in_n == 4)
40086 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
40087 else if (out_n == 8 && in_n == 8)
40088 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
40089 else if (out_n == 16 && in_n == 16)
40090 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
40092 break;
40094 CASE_CFN_CEIL:
40095 /* The round insn does not trap on denormals. */
40096 if (flag_trapping_math || !TARGET_SSE4_1)
40097 break;
40099 if (out_mode == DFmode && in_mode == DFmode)
40101 if (out_n == 2 && in_n == 2)
40102 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
40103 else if (out_n == 4 && in_n == 4)
40104 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
40105 else if (out_n == 8 && in_n == 8)
40106 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
40108 if (out_mode == SFmode && in_mode == SFmode)
40110 if (out_n == 4 && in_n == 4)
40111 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
40112 else if (out_n == 8 && in_n == 8)
40113 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
40114 else if (out_n == 16 && in_n == 16)
40115 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
40117 break;
40119 CASE_CFN_TRUNC:
40120 /* The round insn does not trap on denormals. */
40121 if (flag_trapping_math || !TARGET_SSE4_1)
40122 break;
40124 if (out_mode == DFmode && in_mode == DFmode)
40126 if (out_n == 2 && in_n == 2)
40127 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
40128 else if (out_n == 4 && in_n == 4)
40129 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
40130 else if (out_n == 8 && in_n == 8)
40131 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
40133 if (out_mode == SFmode && in_mode == SFmode)
40135 if (out_n == 4 && in_n == 4)
40136 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
40137 else if (out_n == 8 && in_n == 8)
40138 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
40139 else if (out_n == 16 && in_n == 16)
40140 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
40142 break;
40144 CASE_CFN_RINT:
40145 /* The round insn does not trap on denormals. */
40146 if (flag_trapping_math || !TARGET_SSE4_1)
40147 break;
40149 if (out_mode == DFmode && in_mode == DFmode)
40151 if (out_n == 2 && in_n == 2)
40152 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
40153 else if (out_n == 4 && in_n == 4)
40154 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
40156 if (out_mode == SFmode && in_mode == SFmode)
40158 if (out_n == 4 && in_n == 4)
40159 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
40160 else if (out_n == 8 && in_n == 8)
40161 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
40163 break;
40165 CASE_CFN_FMA:
40166 if (out_mode == DFmode && in_mode == DFmode)
40168 if (out_n == 2 && in_n == 2)
40169 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
40170 if (out_n == 4 && in_n == 4)
40171 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
40173 if (out_mode == SFmode && in_mode == SFmode)
40175 if (out_n == 4 && in_n == 4)
40176 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
40177 if (out_n == 8 && in_n == 8)
40178 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
40180 break;
40182 default:
40183 break;
40186 /* Dispatch to a handler for a vectorization library. */
40187 if (ix86_veclib_handler)
40188 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
40190 return NULL_TREE;
40193 /* Handler for an SVML-style interface to
40194 a library with vectorized intrinsics. */
40196 static tree
40197 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
40199 char name[20];
40200 tree fntype, new_fndecl, args;
40201 unsigned arity;
40202 const char *bname;
40203 machine_mode el_mode, in_mode;
40204 int n, in_n;
40206 /* The SVML is suitable for unsafe math only. */
40207 if (!flag_unsafe_math_optimizations)
40208 return NULL_TREE;
40210 el_mode = TYPE_MODE (TREE_TYPE (type_out));
40211 n = TYPE_VECTOR_SUBPARTS (type_out);
40212 in_mode = TYPE_MODE (TREE_TYPE (type_in));
40213 in_n = TYPE_VECTOR_SUBPARTS (type_in);
40214 if (el_mode != in_mode
40215 || n != in_n)
40216 return NULL_TREE;
40218 switch (fn)
40220 CASE_CFN_EXP:
40221 CASE_CFN_LOG:
40222 CASE_CFN_LOG10:
40223 CASE_CFN_POW:
40224 CASE_CFN_TANH:
40225 CASE_CFN_TAN:
40226 CASE_CFN_ATAN:
40227 CASE_CFN_ATAN2:
40228 CASE_CFN_ATANH:
40229 CASE_CFN_CBRT:
40230 CASE_CFN_SINH:
40231 CASE_CFN_SIN:
40232 CASE_CFN_ASINH:
40233 CASE_CFN_ASIN:
40234 CASE_CFN_COSH:
40235 CASE_CFN_COS:
40236 CASE_CFN_ACOSH:
40237 CASE_CFN_ACOS:
40238 if ((el_mode != DFmode || n != 2)
40239 && (el_mode != SFmode || n != 4))
40240 return NULL_TREE;
40241 break;
40243 default:
40244 return NULL_TREE;
40247 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
40248 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
40250 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
40251 strcpy (name, "vmlsLn4");
40252 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
40253 strcpy (name, "vmldLn2");
40254 else if (n == 4)
40256 sprintf (name, "vmls%s", bname+10);
40257 name[strlen (name)-1] = '4';
40259 else
40260 sprintf (name, "vmld%s2", bname+10);
40262 /* Convert to uppercase. */
40263 name[4] &= ~0x20;
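/* The names produced here are e.g. "vmlsSin4" for 4-wide sinf and
   "vmldSin2" for 2-wide sin, matching the "vmlsLn4" / "vmldLn2" special
   cases above. */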
40265 arity = 0;
40266 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
40267 arity++;
40269 if (arity == 1)
40270 fntype = build_function_type_list (type_out, type_in, NULL);
40271 else
40272 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
40274 /* Build a function declaration for the vectorized function. */
40275 new_fndecl = build_decl (BUILTINS_LOCATION,
40276 FUNCTION_DECL, get_identifier (name), fntype);
40277 TREE_PUBLIC (new_fndecl) = 1;
40278 DECL_EXTERNAL (new_fndecl) = 1;
40279 DECL_IS_NOVOPS (new_fndecl) = 1;
40280 TREE_READONLY (new_fndecl) = 1;
40282 return new_fndecl;
40285 /* Handler for an ACML-style interface to
40286 a library with vectorized intrinsics. */
40288 static tree
40289 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
40291 char name[20] = "__vr.._";
40292 tree fntype, new_fndecl, args;
40293 unsigned arity;
40294 const char *bname;
40295 machine_mode el_mode, in_mode;
40296 int n, in_n;
40298 /* The ACML is 64-bit only and suitable for unsafe math only, as
40299 it does not correctly support parts of IEEE with the required
40300 precision, such as denormals. */
40301 if (!TARGET_64BIT
40302 || !flag_unsafe_math_optimizations)
40303 return NULL_TREE;
40305 el_mode = TYPE_MODE (TREE_TYPE (type_out));
40306 n = TYPE_VECTOR_SUBPARTS (type_out);
40307 in_mode = TYPE_MODE (TREE_TYPE (type_in));
40308 in_n = TYPE_VECTOR_SUBPARTS (type_in);
40309 if (el_mode != in_mode
40310 || n != in_n)
40311 return NULL_TREE;
40313 switch (fn)
40315 CASE_CFN_SIN:
40316 CASE_CFN_COS:
40317 CASE_CFN_EXP:
40318 CASE_CFN_LOG:
40319 CASE_CFN_LOG2:
40320 CASE_CFN_LOG10:
40321 if (el_mode == DFmode && n == 2)
40323 name[4] = 'd';
40324 name[5] = '2';
40326 else if (el_mode == SFmode && n == 4)
40328 name[4] = 's';
40329 name[5] = '4';
40331 else
40332 return NULL_TREE;
40333 break;
40335 default:
40336 return NULL_TREE;
40339 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
40340 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
40341 sprintf (name + 7, "%s", bname+10);
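/* This yields names such as "__vrd2_sin" for 2-wide double sin and
   "__vrs4_sinf" for 4-wide float sinf. */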
40343 arity = 0;
40344 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
40345 arity++;
40347 if (arity == 1)
40348 fntype = build_function_type_list (type_out, type_in, NULL);
40349 else
40350 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
40352 /* Build a function declaration for the vectorized function. */
40353 new_fndecl = build_decl (BUILTINS_LOCATION,
40354 FUNCTION_DECL, get_identifier (name), fntype);
40355 TREE_PUBLIC (new_fndecl) = 1;
40356 DECL_EXTERNAL (new_fndecl) = 1;
40357 DECL_IS_NOVOPS (new_fndecl) = 1;
40358 TREE_READONLY (new_fndecl) = 1;
40360 return new_fndecl;
40363 /* Returns a decl of a function that implements gather load with
40364 memory type MEM_VECTYPE and index type INDEX_TYPE and SCALE.
40365 Return NULL_TREE if it is not available. */
40367 static tree
40368 ix86_vectorize_builtin_gather (const_tree mem_vectype,
40369 const_tree index_type, int scale)
40371 bool si;
40372 enum ix86_builtins code;
40374 if (! TARGET_AVX2)
40375 return NULL_TREE;
40377 if ((TREE_CODE (index_type) != INTEGER_TYPE
40378 && !POINTER_TYPE_P (index_type))
40379 || (TYPE_MODE (index_type) != SImode
40380 && TYPE_MODE (index_type) != DImode))
40381 return NULL_TREE;
40383 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
40384 return NULL_TREE;
40386 /* v*gather* insn sign extends index to pointer mode. */
40387 if (TYPE_PRECISION (index_type) < POINTER_SIZE
40388 && TYPE_UNSIGNED (index_type))
40389 return NULL_TREE;
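/* Scale can be 1, 2, 4 or 8, i.e. a power of two no larger than 8. */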
40391 if (scale <= 0
40392 || scale > 8
40393 || (scale & (scale - 1)) != 0)
40394 return NULL_TREE;
40396 si = TYPE_MODE (index_type) == SImode;
40397 switch (TYPE_MODE (mem_vectype))
40399 case E_V2DFmode:
40400 if (TARGET_AVX512VL)
40401 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
40402 else
40403 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
40404 break;
40405 case E_V4DFmode:
40406 if (TARGET_AVX512VL)
40407 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
40408 else
40409 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
40410 break;
40411 case E_V2DImode:
40412 if (TARGET_AVX512VL)
40413 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
40414 else
40415 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
40416 break;
40417 case E_V4DImode:
40418 if (TARGET_AVX512VL)
40419 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
40420 else
40421 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
40422 break;
40423 case E_V4SFmode:
40424 if (TARGET_AVX512VL)
40425 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
40426 else
40427 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
40428 break;
40429 case E_V8SFmode:
40430 if (TARGET_AVX512VL)
40431 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
40432 else
40433 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
40434 break;
40435 case E_V4SImode:
40436 if (TARGET_AVX512VL)
40437 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
40438 else
40439 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
40440 break;
40441 case E_V8SImode:
40442 if (TARGET_AVX512VL)
40443 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
40444 else
40445 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
40446 break;
40447 case E_V8DFmode:
40448 if (TARGET_AVX512F)
40449 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
40450 else
40451 return NULL_TREE;
40452 break;
40453 case E_V8DImode:
40454 if (TARGET_AVX512F)
40455 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
40456 else
40457 return NULL_TREE;
40458 break;
40459 case E_V16SFmode:
40460 if (TARGET_AVX512F)
40461 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
40462 else
40463 return NULL_TREE;
40464 break;
40465 case E_V16SImode:
40466 if (TARGET_AVX512F)
40467 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
40468 else
40469 return NULL_TREE;
40470 break;
40471 default:
40472 return NULL_TREE;
40475 return ix86_get_builtin (code);
40478 /* Returns a decl of a function that implements scatter store with
40479 register type VECTYPE and index type INDEX_TYPE and SCALE.
40480 Return NULL_TREE if it is not available. */
40482 static tree
40483 ix86_vectorize_builtin_scatter (const_tree vectype,
40484 const_tree index_type, int scale)
40486 bool si;
40487 enum ix86_builtins code;
40489 if (!TARGET_AVX512F)
40490 return NULL_TREE;
40492 if ((TREE_CODE (index_type) != INTEGER_TYPE
40493 && !POINTER_TYPE_P (index_type))
40494 || (TYPE_MODE (index_type) != SImode
40495 && TYPE_MODE (index_type) != DImode))
40496 return NULL_TREE;
40498 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
40499 return NULL_TREE;
40501 /* v*scatter* insn sign extends index to pointer mode. */
40502 if (TYPE_PRECISION (index_type) < POINTER_SIZE
40503 && TYPE_UNSIGNED (index_type))
40504 return NULL_TREE;
40506 /* Scale can be 1, 2, 4 or 8. */
40507 if (scale <= 0
40508 || scale > 8
40509 || (scale & (scale - 1)) != 0)
40510 return NULL_TREE;
40512 si = TYPE_MODE (index_type) == SImode;
40513 switch (TYPE_MODE (vectype))
40515 case E_V8DFmode:
40516 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
40517 break;
40518 case E_V8DImode:
40519 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
40520 break;
40521 case E_V16SFmode:
40522 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
40523 break;
40524 case E_V16SImode:
40525 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
40526 break;
40527 default:
40528 return NULL_TREE;
40531 return ix86_builtins[code];
40534 /* Return true if it is safe to use the rsqrt optabs to optimize
40535 1.0/sqrt. */
40537 static bool
40538 use_rsqrt_p ()
40540 return (TARGET_SSE_MATH
40541 && flag_finite_math_only
40542 && !flag_trapping_math
40543 && flag_unsafe_math_optimizations);
40546 /* Returns a decl of a target-specific builtin that implements the
40547 reciprocal of the function, or NULL_TREE if not available. */
40549 static tree
40550 ix86_builtin_reciprocal (tree fndecl)
40552 switch (DECL_FUNCTION_CODE (fndecl))
40554 /* Vectorized version of sqrt to rsqrt conversion. */
40555 case IX86_BUILTIN_SQRTPS_NR:
40556 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
40558 case IX86_BUILTIN_SQRTPS_NR256:
40559 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
40561 default:
40562 return NULL_TREE;
40566 /* Helper for avx_vpermilps256_operand et al. This is also used by
40567 the expansion functions to turn the parallel back into a mask.
40568 The return value is 0 for no match and the imm8+1 for a match. */
40571 avx_vpermilp_parallel (rtx par, machine_mode mode)
40573 unsigned i, nelt = GET_MODE_NUNITS (mode);
40574 unsigned mask = 0;
40575 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
40577 if (XVECLEN (par, 0) != (int) nelt)
40578 return 0;
40580 /* Validate that all of the elements are constants, and not totally
40581 out of range. Copy the data into an integral array to make the
40582 subsequent checks easier. */
40583 for (i = 0; i < nelt; ++i)
40585 rtx er = XVECEXP (par, 0, i);
40586 unsigned HOST_WIDE_INT ei;
40588 if (!CONST_INT_P (er))
40589 return 0;
40590 ei = INTVAL (er);
40591 if (ei >= nelt)
40592 return 0;
40593 ipar[i] = ei;
40596 switch (mode)
40598 case E_V8DFmode:
40599 /* In the 512-bit DFmode case, we can only move elements within
40600 a 128-bit lane. First fill the second part of the mask,
40601 then fallthru. */
40602 for (i = 4; i < 6; ++i)
40604 if (ipar[i] < 4 || ipar[i] >= 6)
40605 return 0;
40606 mask |= (ipar[i] - 4) << i;
40608 for (i = 6; i < 8; ++i)
40610 if (ipar[i] < 6)
40611 return 0;
40612 mask |= (ipar[i] - 6) << i;
40614 /* FALLTHRU */
40616 case E_V4DFmode:
40617 /* In the 256-bit DFmode case, we can only move elements within
40618 a 128-bit lane. */
40619 for (i = 0; i < 2; ++i)
40621 if (ipar[i] >= 2)
40622 return 0;
40623 mask |= ipar[i] << i;
40625 for (i = 2; i < 4; ++i)
40627 if (ipar[i] < 2)
40628 return 0;
40629 mask |= (ipar[i] - 2) << i;
40631 break;
40633 case E_V16SFmode:
40634 /* In the 512-bit SFmode case, the permutation in the upper 256 bits
40635 must mirror the permutation in the lower 256 bits. */
40636 for (i = 0; i < 8; ++i)
40637 if (ipar[i] + 8 != ipar[i + 8])
40638 return 0;
40639 /* FALLTHRU */
40641 case E_V8SFmode:
40642 /* In the 256-bit SFmode case, we have full freedom of
40643 movement within the low 128-bit lane, but the high 128-bit
40644 lane must mirror the exact same pattern. */
40645 for (i = 0; i < 4; ++i)
40646 if (ipar[i] + 4 != ipar[i + 4])
40647 return 0;
40648 nelt = 4;
40649 /* FALLTHRU */
40651 case E_V2DFmode:
40652 case E_V4SFmode:
40653 /* In the 128-bit case, we have full freedom in the placement of
40654 the elements from the source operand. */
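/* For example, in V4SF mode the parallel (1 0 3 2) gives imm8 0xb1, so this
   function returns 0xb2 (mask + 1). */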
40655 for (i = 0; i < nelt; ++i)
40656 mask |= ipar[i] << (i * (nelt / 2));
40657 break;
40659 default:
40660 gcc_unreachable ();
40663 /* Make sure success has a non-zero value by adding one. */
40664 return mask + 1;
40667 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
40668 the expansion functions to turn the parallel back into a mask.
40669 The return value is 0 for no match and the imm8+1 for a match. */
40672 avx_vperm2f128_parallel (rtx par, machine_mode mode)
40674 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
40675 unsigned mask = 0;
40676 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
40678 if (XVECLEN (par, 0) != (int) nelt)
40679 return 0;
40681 /* Validate that all of the elements are constants, and not totally
40682 out of range. Copy the data into an integral array to make the
40683 subsequent checks easier. */
40684 for (i = 0; i < nelt; ++i)
40686 rtx er = XVECEXP (par, 0, i);
40687 unsigned HOST_WIDE_INT ei;
40689 if (!CONST_INT_P (er))
40690 return 0;
40691 ei = INTVAL (er);
40692 if (ei >= 2 * nelt)
40693 return 0;
40694 ipar[i] = ei;
40697 /* Validate that each half of the permute selects a contiguous run of elements. */
40698 for (i = 0; i < nelt2 - 1; ++i)
40699 if (ipar[i] + 1 != ipar[i + 1])
40700 return 0;
40701 for (i = nelt2; i < nelt - 1; ++i)
40702 if (ipar[i] + 1 != ipar[i + 1])
40703 return 0;
40705 /* Reconstruct the mask. */
40706 for (i = 0; i < 2; ++i)
40708 unsigned e = ipar[i * nelt2];
40709 if (e % nelt2)
40710 return 0;
40711 e /= nelt2;
40712 mask |= e << (i * 4);
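/* For example, the V4DF parallel (0 1 4 5) selects the low lane of each
   operand: the field values are 0 and 2, giving mask 0x20, and the function
   returns 0x21. */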
40715 /* Make sure success has a non-zero value by adding one. */
40716 return mask + 1;
40719 /* Return a register priority for hard reg REGNO. */
40720 static int
40721 ix86_register_priority (int hard_regno)
40723 /* ebp and r13 as the base always want a displacement, and r12 as the
40724 base always wants an index. So discourage their use in an
40725 address. */
40726 if (hard_regno == R12_REG || hard_regno == R13_REG)
40727 return 0;
40728 if (hard_regno == BP_REG)
40729 return 1;
40730 /* New x86-64 int registers result in bigger code size. Discourage
40731 them. */
40732 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
40733 return 2;
40734 /* New x86-64 SSE registers result in bigger code size. Discourage
40735 them. */
40736 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
40737 return 2;
40738 /* Usage of AX register results in smaller code. Prefer it. */
40739 if (hard_regno == AX_REG)
40740 return 4;
40741 return 3;
40744 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
40746 Put float CONST_DOUBLE in the constant pool instead of fp regs.
40747 QImode must go into class Q_REGS.
40748 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
40749 movdf to do mem-to-mem moves through integer regs. */
40751 static reg_class_t
40752 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
40754 machine_mode mode = GET_MODE (x);
40756 /* We're only allowed to return a subclass of CLASS. Many of the
40757 following checks fail for NO_REGS, so eliminate that early. */
40758 if (regclass == NO_REGS)
40759 return NO_REGS;
40761 /* All classes can load zeros. */
40762 if (x == CONST0_RTX (mode))
40763 return regclass;
40765 /* Force constants into memory if we are loading a (nonzero) constant into
40766 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
40767 instructions to load from a constant. */
40768 if (CONSTANT_P (x)
40769 && (MAYBE_MMX_CLASS_P (regclass)
40770 || MAYBE_SSE_CLASS_P (regclass)
40771 || MAYBE_MASK_CLASS_P (regclass)))
40772 return NO_REGS;
40774 /* Floating-point constants need more complex checks. */
40775 if (CONST_DOUBLE_P (x))
40777 /* General regs can load everything. */
40778 if (INTEGER_CLASS_P (regclass))
40779 return regclass;
40781 /* Floats can load 0 and 1 plus some others. Note that we eliminated
40782 zero above. We only want to wind up preferring 80387 registers if
40783 we plan on doing computation with them. */
40784 if (IS_STACK_MODE (mode)
40785 && standard_80387_constant_p (x) > 0)
40787 /* Limit class to FP regs. */
40788 if (FLOAT_CLASS_P (regclass))
40789 return FLOAT_REGS;
40790 else if (regclass == FP_TOP_SSE_REGS)
40791 return FP_TOP_REG;
40792 else if (regclass == FP_SECOND_SSE_REGS)
40793 return FP_SECOND_REG;
40796 return NO_REGS;
40799 /* Prefer SSE regs only, if we can use them for math. */
40800 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40801 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
40803 /* Generally when we see PLUS here, it's the function invariant
40804 (plus soft-fp const_int), which can only be computed into general
40805 regs. */
40806 if (GET_CODE (x) == PLUS)
40807 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
40809 /* QImode constants are easy to load, but non-constant QImode data
40810 must go into Q_REGS. */
40811 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
40813 if (Q_CLASS_P (regclass))
40814 return regclass;
40815 else if (reg_class_subset_p (Q_REGS, regclass))
40816 return Q_REGS;
40817 else
40818 return NO_REGS;
40821 return regclass;
40824 /* Discourage putting floating-point values in SSE registers unless
40825 SSE math is being used, and likewise for the 387 registers. */
40826 static reg_class_t
40827 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
40829 machine_mode mode = GET_MODE (x);
40831 /* Restrict the output reload class to the register bank that we are doing
40832 math on. If we would like not to return a subset of CLASS, reject this
40833 alternative: if reload cannot do this, it will still use its choice. */
40834 mode = GET_MODE (x);
40835 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40836 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
40838 if (IS_STACK_MODE (mode))
40840 if (regclass == FP_TOP_SSE_REGS)
40841 return FP_TOP_REG;
40842 else if (regclass == FP_SECOND_SSE_REGS)
40843 return FP_SECOND_REG;
40844 else
40845 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
40848 return regclass;
40851 static reg_class_t
40852 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
40853 machine_mode mode, secondary_reload_info *sri)
40855 /* Double-word spills from general registers to non-offsettable memory
40856 references (zero-extended addresses) require special handling. */
40857 if (TARGET_64BIT
40858 && MEM_P (x)
40859 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
40860 && INTEGER_CLASS_P (rclass)
40861 && !offsettable_memref_p (x))
40863 sri->icode = (in_p
40864 ? CODE_FOR_reload_noff_load
40865 : CODE_FOR_reload_noff_store);
40866 /* Add the cost of moving address to a temporary. */
40867 sri->extra_cost = 1;
40869 return NO_REGS;
40872 /* QImode spills from non-QI registers require
40873 an intermediate register on 32-bit targets. */
40874 if (mode == QImode
40875 && ((!TARGET_64BIT && !in_p
40876 && INTEGER_CLASS_P (rclass)
40877 && MAYBE_NON_Q_CLASS_P (rclass))
40878 || (!TARGET_AVX512DQ
40879 && MAYBE_MASK_CLASS_P (rclass))))
40881 int regno = true_regnum (x);
40883 /* Return Q_REGS if the operand is in memory. */
40884 if (regno == -1)
40885 return Q_REGS;
40887 return NO_REGS;
40890 /* This condition handles the corner case where an expression involving
40891 pointers gets vectorized. We're trying to use the address of a
40892 stack slot as a vector initializer.
40894 (set (reg:V2DI 74 [ vect_cst_.2 ])
40895 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
40897 Eventually frame gets turned into sp+offset like this:
40899 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40900 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
40901 (const_int 392 [0x188]))))
40903 That later gets turned into:
40905 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40906 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
40907 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
40909 We'll have the following reload recorded:
40911 Reload 0: reload_in (DI) =
40912 (plus:DI (reg/f:DI 7 sp)
40913 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
40914 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40915 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
40916 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
40917 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40918 reload_reg_rtx: (reg:V2DI 22 xmm1)
40920 Which isn't going to work since SSE instructions can't handle scalar
40921 additions. Returning GENERAL_REGS forces the addition into integer
40922 register and reload can handle subsequent reloads without problems. */
40924 if (in_p && GET_CODE (x) == PLUS
40925 && SSE_CLASS_P (rclass)
40926 && SCALAR_INT_MODE_P (mode))
40927 return GENERAL_REGS;
40929 return NO_REGS;
40932 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
40934 static bool
40935 ix86_class_likely_spilled_p (reg_class_t rclass)
40937 switch (rclass)
40939 case AREG:
40940 case DREG:
40941 case CREG:
40942 case BREG:
40943 case AD_REGS:
40944 case SIREG:
40945 case DIREG:
40946 case SSE_FIRST_REG:
40947 case FP_TOP_REG:
40948 case FP_SECOND_REG:
40949 case BND_REGS:
40950 return true;
40952 default:
40953 break;
40956 return false;
40959 /* If we are copying between registers from different register sets
40960 (e.g. FP and integer), we may need a memory location.
40962 The function can't work reliably when one of the CLASSES is a class
40963 containing registers from multiple sets. We avoid this by never combining
40964 different sets in a single alternative in the machine description.
40965 Ensure that this constraint holds to avoid unexpected surprises.
40967 When STRICT is false, we are being called from REGISTER_MOVE_COST,
40968 so do not enforce these sanity checks.
40970 To optimize register_move_cost performance, define an inline variant. */
40972 static inline bool
40973 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
40974 machine_mode mode, int strict)
40976 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
40977 return false;
40979 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
40980 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
40981 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
40982 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
40983 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
40984 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
40985 || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
40986 || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
40988 gcc_assert (!strict || lra_in_progress);
40989 return true;
40992 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
40993 return true;
40995 /* Between mask and general, we have moves no larger than word size. */
40996 if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
40997 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
40998 return true;
41000 /* ??? This is a lie. We do have moves between mmx/general, and between
41001 mmx/sse2. But by saying we need secondary memory we discourage the
41002 register allocator from using the mmx registers unless needed. */
41003 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
41004 return true;
41006 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
41008 /* SSE1 doesn't have any direct moves from other classes. */
41009 if (!TARGET_SSE2)
41010 return true;
41012 /* If the target says that inter-unit moves are more expensive
41013 than moving through memory, then don't generate them. */
41014 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
41015 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
41016 return true;
41018 /* Between SSE and general, we have moves no larger than word size. */
41019 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41020 return true;
41023 return false;
41026 bool
41027 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
41028 machine_mode mode, int strict)
41030 return inline_secondary_memory_needed (class1, class2, mode, strict);
41033 /* Implement the TARGET_CLASS_MAX_NREGS hook.
41035 On the 80386, this is the size of MODE in words,
41036 except in the FP regs, where a single reg is always enough. */
41038 static unsigned char
41039 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
41041 if (MAYBE_INTEGER_CLASS_P (rclass))
41043 if (mode == XFmode)
41044 return (TARGET_64BIT ? 2 : 3);
41045 else if (mode == XCmode)
41046 return (TARGET_64BIT ? 4 : 6);
41047 else
41048 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
41050 else
41052 if (COMPLEX_MODE_P (mode))
41053 return 2;
41054 else
41055 return 1;
41059 /* Return true if the registers in CLASS cannot represent the change from
41060 modes FROM to TO. */
41062 bool
41063 ix86_cannot_change_mode_class (machine_mode from, machine_mode to,
41064 enum reg_class regclass)
41066 if (from == to)
41067 return false;
41069 /* x87 registers can't do subreg at all, as all values are reformatted
41070 to extended precision. */
41071 if (MAYBE_FLOAT_CLASS_P (regclass))
41072 return true;
41074 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
41076 /* Vector registers do not support QI or HImode loads. If we don't
41077 disallow a change to these modes, reload will assume it's ok to
41078 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
41079 the vec_dupv4hi pattern. */
41080 if (GET_MODE_SIZE (from) < 4)
41081 return true;
41084 return false;
41087 /* Return the cost of moving data of mode M between a
41088 register and memory. A value of 2 is the default; this cost is
41089 relative to those in `REGISTER_MOVE_COST'.
41091 This function is used extensively by register_move_cost, which is used to
41092 build tables at startup. Make it inline in this case.
41093 When IN is 2, return maximum of in and out move cost.
41095 If moving between registers and memory is more expensive than
41096 between two registers, you should define this macro to express the
41097 relative cost.
41099 Also model the increased cost of moving QImode registers in
41100 non-Q_REGS classes. */
41102 static inline int
41103 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
41104 int in)
41106 int cost;
41107 if (FLOAT_CLASS_P (regclass))
41109 int index;
41110 switch (mode)
41112 case E_SFmode:
41113 index = 0;
41114 break;
41115 case E_DFmode:
41116 index = 1;
41117 break;
41118 case E_XFmode:
41119 index = 2;
41120 break;
41121 default:
41122 return 100;
41124 if (in == 2)
41125 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
41126 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
41128 if (SSE_CLASS_P (regclass))
41130 int index;
41131 switch (GET_MODE_SIZE (mode))
41133 case 4:
41134 index = 0;
41135 break;
41136 case 8:
41137 index = 1;
41138 break;
41139 case 16:
41140 index = 2;
41141 break;
41142 default:
41143 return 100;
41145 if (in == 2)
41146 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
41147 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
41149 if (MMX_CLASS_P (regclass))
41151 int index;
41152 switch (GET_MODE_SIZE (mode))
41154 case 4:
41155 index = 0;
41156 break;
41157 case 8:
41158 index = 1;
41159 break;
41160 default:
41161 return 100;
41163 if (in == 2)
41164 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
41165 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
41167 switch (GET_MODE_SIZE (mode))
41169 case 1:
41170 if (Q_CLASS_P (regclass) || TARGET_64BIT)
41172 if (!in)
41173 return ix86_cost->int_store[0];
41174 if (TARGET_PARTIAL_REG_DEPENDENCY
41175 && optimize_function_for_speed_p (cfun))
41176 cost = ix86_cost->movzbl_load;
41177 else
41178 cost = ix86_cost->int_load[0];
41179 if (in == 2)
41180 return MAX (cost, ix86_cost->int_store[0]);
41181 return cost;
41183 else
41185 if (in == 2)
41186 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
41187 if (in)
41188 return ix86_cost->movzbl_load;
41189 else
41190 return ix86_cost->int_store[0] + 4;
41192 break;
41193 case 2:
41194 if (in == 2)
41195 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
41196 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
41197 default:
41198 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
41199 if (mode == TFmode)
41200 mode = XFmode;
41201 if (in == 2)
41202 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
41203 else if (in)
41204 cost = ix86_cost->int_load[2];
41205 else
41206 cost = ix86_cost->int_store[2];
41207 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
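/* Editorial example (not part of the original source): the IN argument of
   inline_memory_move_cost selects the direction of the query.  For a DFmode
   value in an SSE class GET_MODE_SIZE is 8, so index == 1 and

     inline_memory_move_cost (DFmode, SSE_REGS, 0) == ix86_cost->sse_store[1]
     inline_memory_move_cost (DFmode, SSE_REGS, 1) == ix86_cost->sse_load[1]
     inline_memory_move_cost (DFmode, SSE_REGS, 2) == the MAX of the two.

   For wide integer modes the per-word cost is scaled: TImode in GENERAL_REGS
   on a 64-bit target (UNITS_PER_WORD == 8) costs int_load[2] or int_store[2]
   times CEIL (16, 8) == 2.  */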
41211 static int
41212 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
41213 bool in)
41215 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
41219 /* Return the cost of moving data from a register in class CLASS1 to
41220 one in class CLASS2.
41222 It is not required that the cost always equal 2 when FROM is the same as TO;
41223 on some machines it is expensive to move between registers if they are not
41224 general registers. */
41226 static int
41227 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
41228 reg_class_t class2_i)
41230 enum reg_class class1 = (enum reg_class) class1_i;
41231 enum reg_class class2 = (enum reg_class) class2_i;
41233 /* In case we require secondary memory, compute cost of the store followed
41234 by load. In order to avoid bad register allocation choices, we need
41235 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
41237 if (inline_secondary_memory_needed (class1, class2, mode, 0))
41239 int cost = 1;
41241 cost += inline_memory_move_cost (mode, class1, 2);
41242 cost += inline_memory_move_cost (mode, class2, 2);
41244 /* In case of copying from a general purpose register we may emit multiple
41245 stores followed by a single load, causing a memory size mismatch stall.
41246 Count this as an arbitrarily high cost of 20. */
41247 if (targetm.class_max_nregs (class1, mode)
41248 > targetm.class_max_nregs (class2, mode))
41249 cost += 20;
41251 /* In the case of FP/MMX moves, the registers actually overlap, and we
41252 have to switch modes in order to treat them differently. */
41253 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
41254 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
41255 cost += 20;
41257 return cost;
41260 /* Moves between SSE/MMX and integer unit are expensive. */
41261 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
41262 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
41264 /* ??? By keeping returned value relatively high, we limit the number
41265 of moves between integer and MMX/SSE registers for all targets.
41266 Additionally, high value prevents problem with x86_modes_tieable_p(),
41267 where integer modes in MMX/SSE registers are not tieable
41268 because of missing QImode and HImode moves to, from or between
41269 MMX/SSE registers. */
41270 return MAX (8, ix86_cost->mmxsse_to_integer);
41272 if (MAYBE_FLOAT_CLASS_P (class1))
41273 return ix86_cost->fp_move;
41274 if (MAYBE_SSE_CLASS_P (class1))
41275 return ix86_cost->sse_move;
41276 if (MAYBE_MMX_CLASS_P (class1))
41277 return ix86_cost->mmx_move;
41278 return 2;
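/* Editorial example (not part of the original source): when a secondary
   memory reload is needed (for instance an SSE <-> general register move with
   the relevant inter-unit moves disabled, or a mode wider than a word), the
   move is priced as a store plus a load:

     1 + inline_memory_move_cost (mode, class1, 2)
       + inline_memory_move_cost (mode, class2, 2)

   plus a penalty of 20 when the source class needs more hard registers for
   MODE than the destination class (store/load size mismatch), and another 20
   when an x87 <-> MMX mode switch is involved.  */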
41281 /* Return TRUE if hard register REGNO can hold a value of machine-mode
41282 MODE. */
41284 bool
41285 ix86_hard_regno_mode_ok (int regno, machine_mode mode)
41287 /* The flags register can hold only CCmode values, and only it can hold them. */
41288 if (CC_REGNO_P (regno))
41289 return GET_MODE_CLASS (mode) == MODE_CC;
41290 if (GET_MODE_CLASS (mode) == MODE_CC
41291 || GET_MODE_CLASS (mode) == MODE_RANDOM
41292 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
41293 return false;
41294 if (STACK_REGNO_P (regno))
41295 return VALID_FP_MODE_P (mode);
41296 if (MASK_REGNO_P (regno))
41297 return (VALID_MASK_REG_MODE (mode)
41298 || (TARGET_AVX512BW
41299 && VALID_MASK_AVX512BW_MODE (mode)));
41300 if (BND_REGNO_P (regno))
41301 return VALID_BND_REG_MODE (mode);
41302 if (SSE_REGNO_P (regno))
41304 /* We implement the move patterns for all vector modes into and
41305 out of SSE registers, even when no operation instructions
41306 are available. */
41308 /* For AVX-512 we allow, regardless of regno:
41309 - XI mode
41310 - any 512-bit wide vector mode
41311 - any scalar mode. */
41312 if (TARGET_AVX512F
41313 && (mode == XImode
41314 || VALID_AVX512F_REG_MODE (mode)
41315 || VALID_AVX512F_SCALAR_MODE (mode)))
41316 return true;
41318 /* For AVX-5124FMAPS allow V64SFmode for special regnos. */
41319 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
41320 && MOD4_SSE_REGNO_P (regno)
41321 && mode == V64SFmode)
41322 return true;
41324 /* For AVX-5124VNNIW allow V64SImode for special regnos. */
41325 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
41326 && MOD4_SSE_REGNO_P (regno)
41327 && mode == V64SImode)
41328 return true;
41330 /* TODO check for QI/HI scalars. */
41331 /* AVX512VL allows sse regs16+ for 128/256 bit modes. */
41332 if (TARGET_AVX512VL
41333 && (mode == OImode
41334 || mode == TImode
41335 || VALID_AVX256_REG_MODE (mode)
41336 || VALID_AVX512VL_128_REG_MODE (mode)))
41337 return true;
41339 /* xmm16-xmm31 are only available for AVX-512. */
41340 if (EXT_REX_SSE_REGNO_P (regno))
41341 return false;
41343 /* OImode and AVX modes are available only when AVX is enabled. */
41344 return ((TARGET_AVX
41345 && VALID_AVX256_REG_OR_OI_MODE (mode))
41346 || VALID_SSE_REG_MODE (mode)
41347 || VALID_SSE2_REG_MODE (mode)
41348 || VALID_MMX_REG_MODE (mode)
41349 || VALID_MMX_REG_MODE_3DNOW (mode));
41351 if (MMX_REGNO_P (regno))
41353 /* We implement the move patterns for 3DNOW modes even in MMX mode,
41354 so if the register is available at all, then we can move data of
41355 the given mode into or out of it. */
41356 return (VALID_MMX_REG_MODE (mode)
41357 || VALID_MMX_REG_MODE_3DNOW (mode));
41360 if (mode == QImode)
41362 /* Take care for QImode values - they can be in non-QI regs,
41363 but then they do cause partial register stalls. */
41364 if (ANY_QI_REGNO_P (regno))
41365 return true;
41366 if (!TARGET_PARTIAL_REG_STALL)
41367 return true;
41368 /* LRA checks if the hard register is OK for the given mode.
41369 QImode values can live in non-QI regs, so we allow all
41370 registers here. */
41371 if (lra_in_progress)
41372 return true;
41373 return !can_create_pseudo_p ();
41375 /* We handle both integer and floats in the general purpose registers. */
41376 else if (VALID_INT_MODE_P (mode))
41377 return true;
41378 else if (VALID_FP_MODE_P (mode))
41379 return true;
41380 else if (VALID_DFP_MODE_P (mode))
41381 return true;
41382 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
41383 on to use that value in smaller contexts, this can easily force a
41384 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
41385 supporting DImode, allow it. */
41386 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
41387 return true;
41389 return false;
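/* Editorial example (inferred from the checks above, not part of the
   original source): with -mavx512f but without -mavx512vl, xmm16 may hold
   V8DFmode (a 512-bit vector) or DFmode (a scalar), but not V4SFmode,
   because 128- and 256-bit modes in xmm16-xmm31 require AVX512VL and the
   EXT_REX_SSE_REGNO_P test rejects everything else.  */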
41392 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
41393 tieable integer mode. */
41395 static bool
41396 ix86_tieable_integer_mode_p (machine_mode mode)
41398 switch (mode)
41400 case E_HImode:
41401 case E_SImode:
41402 return true;
41404 case E_QImode:
41405 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
41407 case E_DImode:
41408 return TARGET_64BIT;
41410 default:
41411 return false;
41415 /* Return true if MODE1 is accessible in a register that can hold MODE2
41416 without copying. That is, all register classes that can hold MODE2
41417 can also hold MODE1. */
41419 bool
41420 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
41422 if (mode1 == mode2)
41423 return true;
41425 if (ix86_tieable_integer_mode_p (mode1)
41426 && ix86_tieable_integer_mode_p (mode2))
41427 return true;
41429 /* MODE2 being XFmode implies fp stack or general regs, which means we
41430 can tie any smaller floating point modes to it. Note that we do not
41431 tie this with TFmode. */
41432 if (mode2 == XFmode)
41433 return mode1 == SFmode || mode1 == DFmode;
41435 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
41436 that we can tie it with SFmode. */
41437 if (mode2 == DFmode)
41438 return mode1 == SFmode;
41440 /* If MODE2 is only appropriate for an SSE register, then tie with
41441 any other mode acceptable to SSE registers. */
41442 if (GET_MODE_SIZE (mode2) == 32
41443 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
41444 return (GET_MODE_SIZE (mode1) == 32
41445 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
41446 if (GET_MODE_SIZE (mode2) == 16
41447 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
41448 return (GET_MODE_SIZE (mode1) == 16
41449 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
41451 /* If MODE2 is appropriate for an MMX register, then tie
41452 with any other mode acceptable to MMX registers. */
41453 if (GET_MODE_SIZE (mode2) == 8
41454 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
41455 return (GET_MODE_SIZE (mode1) == 8
41456 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
41458 return false;
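/* Editorial example (not part of the original source): SFmode is tieable
   with DFmode, because every register class that can hold a DFmode value
   (x87 stack, general or SSE registers) can also hold an SFmode value; the
   reverse question is answered separately.  Likewise two 16-byte vector
   modes such as V4SFmode and V2DFmode tie with each other whenever both are
   valid in an SSE register.  */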
41461 /* Return the cost of moving between two registers of mode MODE. */
41463 static int
41464 ix86_set_reg_reg_cost (machine_mode mode)
41466 unsigned int units = UNITS_PER_WORD;
41468 switch (GET_MODE_CLASS (mode))
41470 default:
41471 break;
41473 case MODE_CC:
41474 units = GET_MODE_SIZE (CCmode);
41475 break;
41477 case MODE_FLOAT:
41478 if ((TARGET_SSE && mode == TFmode)
41479 || (TARGET_80387 && mode == XFmode)
41480 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
41481 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
41482 units = GET_MODE_SIZE (mode);
41483 break;
41485 case MODE_COMPLEX_FLOAT:
41486 if ((TARGET_SSE && mode == TCmode)
41487 || (TARGET_80387 && mode == XCmode)
41488 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
41489 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
41490 units = GET_MODE_SIZE (mode);
41491 break;
41493 case MODE_VECTOR_INT:
41494 case MODE_VECTOR_FLOAT:
41495 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
41496 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
41497 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
41498 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
41499 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
41500 units = GET_MODE_SIZE (mode);
41503 /* Return the cost of moving between two registers of mode MODE,
41504 assuming that the move will be in pieces of at most UNITS bytes. */
41505 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
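/* Editorial example (not part of the original source): on a 32-bit target a
   DImode register copy is split into word-sized pieces, so it costs
   COSTS_N_INSNS (CEIL (8, 4)) == COSTS_N_INSNS (2), while a V4SImode copy
   with SSE2 moves all 16 bytes at once and costs COSTS_N_INSNS (1).  */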
41508 /* Compute a (partial) cost for rtx X. Return true if the complete
41509 cost has been computed, and false if subexpressions should be
41510 scanned. In either case, *TOTAL contains the cost result. */
41512 static bool
41513 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
41514 int *total, bool speed)
41516 rtx mask;
41517 enum rtx_code code = GET_CODE (x);
41518 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
41519 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
41520 int src_cost;
41522 switch (code)
41524 case SET:
41525 if (register_operand (SET_DEST (x), VOIDmode)
41526 && reg_or_0_operand (SET_SRC (x), VOIDmode))
41528 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
41529 return true;
41532 if (register_operand (SET_SRC (x), VOIDmode))
41533 /* Avoid potentially incorrect high cost from rtx_costs
41534 for non-tieable SUBREGs. */
41535 src_cost = 0;
41536 else
41538 src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
41540 if (CONSTANT_P (SET_SRC (x)))
41541 /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
41542 a small value, possibly zero for cheap constants. */
41543 src_cost += COSTS_N_INSNS (1);
41546 *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
41547 return true;
41549 case CONST_INT:
41550 case CONST:
41551 case LABEL_REF:
41552 case SYMBOL_REF:
41553 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
41554 *total = 3;
41555 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
41556 *total = 2;
41557 else if (flag_pic && SYMBOLIC_CONST (x)
41558 && !(TARGET_64BIT
41559 && (GET_CODE (x) == LABEL_REF
41560 || (GET_CODE (x) == SYMBOL_REF
41561 && SYMBOL_REF_LOCAL_P (x))))
41562 /* Use 0 cost for CONST to improve its propagation. */
41563 && (TARGET_64BIT || GET_CODE (x) != CONST))
41564 *total = 1;
41565 else
41566 *total = 0;
41567 return true;
41569 case CONST_DOUBLE:
41570 if (IS_STACK_MODE (mode))
41571 switch (standard_80387_constant_p (x))
41573 case -1:
41574 case 0:
41575 break;
41576 case 1: /* 0.0 */
41577 *total = 1;
41578 return true;
41579 default: /* Other constants */
41580 *total = 2;
41581 return true;
41583 /* FALLTHRU */
41585 case CONST_VECTOR:
41586 switch (standard_sse_constant_p (x, mode))
41588 case 0:
41589 break;
41590 case 1: /* 0: xor eliminates false dependency */
41591 *total = 0;
41592 return true;
41593 default: /* -1: cmp contains false dependency */
41594 *total = 1;
41595 return true;
41597 /* FALLTHRU */
41599 case CONST_WIDE_INT:
41600 /* Fall back to (MEM (SYMBOL_REF)), since that's where
41601 it'll probably end up. Add a penalty for size. */
41602 *total = (COSTS_N_INSNS (1)
41603 + (!TARGET_64BIT && flag_pic)
41604 + (GET_MODE_SIZE (mode) <= 4
41605 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
41606 return true;
41608 case ZERO_EXTEND:
41609 /* The zero extension is often completely free on x86_64, so make
41610 it as cheap as possible. */
41611 if (TARGET_64BIT && mode == DImode
41612 && GET_MODE (XEXP (x, 0)) == SImode)
41613 *total = 1;
41614 else if (TARGET_ZERO_EXTEND_WITH_AND)
41615 *total = cost->add;
41616 else
41617 *total = cost->movzx;
41618 return false;
41620 case SIGN_EXTEND:
41621 *total = cost->movsx;
41622 return false;
41624 case ASHIFT:
41625 if (SCALAR_INT_MODE_P (mode)
41626 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
41627 && CONST_INT_P (XEXP (x, 1)))
41629 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
41630 if (value == 1)
41632 *total = cost->add;
41633 return false;
41635 if ((value == 2 || value == 3)
41636 && cost->lea <= cost->shift_const)
41638 *total = cost->lea;
41639 return false;
41642 /* FALLTHRU */
41644 case ROTATE:
41645 case ASHIFTRT:
41646 case LSHIFTRT:
41647 case ROTATERT:
41648 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41650 /* ??? Should be SSE vector operation cost. */
41651 /* At least for published AMD latencies, this really is the same
41652 as the latency for a simple fpu operation like fabs. */
41653 /* V*QImode is emulated with 1-11 insns. */
41654 if (mode == V16QImode || mode == V32QImode)
41656 int count = 11;
41657 if (TARGET_XOP && mode == V16QImode)
41659 /* For XOP we use vpshab, which requires a broadcast of the
41660 value to the variable shift insn. For constants this
41661 means a V16Q const in mem; even when we can perform the
41662 shift with one insn set the cost to prefer paddb. */
41663 if (CONSTANT_P (XEXP (x, 1)))
41665 *total = (cost->fabs
41666 + rtx_cost (XEXP (x, 0), mode, code, 0, speed)
41667 + (speed ? 2 : COSTS_N_BYTES (16)));
41668 return true;
41670 count = 3;
41672 else if (TARGET_SSSE3)
41673 count = 7;
41674 *total = cost->fabs * count;
41676 else
41677 *total = cost->fabs;
41679 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41681 if (CONST_INT_P (XEXP (x, 1)))
41683 if (INTVAL (XEXP (x, 1)) > 32)
41684 *total = cost->shift_const + COSTS_N_INSNS (2);
41685 else
41686 *total = cost->shift_const * 2;
41688 else
41690 if (GET_CODE (XEXP (x, 1)) == AND)
41691 *total = cost->shift_var * 2;
41692 else
41693 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
41696 else
41698 if (CONST_INT_P (XEXP (x, 1)))
41699 *total = cost->shift_const;
41700 else if (SUBREG_P (XEXP (x, 1))
41701 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
41703 /* Return the cost after shift-and truncation. */
41704 *total = cost->shift_var;
41705 return true;
41707 else
41708 *total = cost->shift_var;
41710 return false;
41712 case FMA:
41714 rtx sub;
41716 gcc_assert (FLOAT_MODE_P (mode));
41717 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
41719 /* ??? SSE scalar/vector cost should be used here. */
41720 /* ??? Bald assumption that fma has the same cost as fmul. */
41721 *total = cost->fmul;
41722 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
41724 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
41725 sub = XEXP (x, 0);
41726 if (GET_CODE (sub) == NEG)
41727 sub = XEXP (sub, 0);
41728 *total += rtx_cost (sub, mode, FMA, 0, speed);
41730 sub = XEXP (x, 2);
41731 if (GET_CODE (sub) == NEG)
41732 sub = XEXP (sub, 0);
41733 *total += rtx_cost (sub, mode, FMA, 2, speed);
41734 return true;
41737 case MULT:
41738 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41740 /* ??? SSE scalar cost should be used here. */
41741 *total = cost->fmul;
41742 return false;
41744 else if (X87_FLOAT_MODE_P (mode))
41746 *total = cost->fmul;
41747 return false;
41749 else if (FLOAT_MODE_P (mode))
41751 /* ??? SSE vector cost should be used here. */
41752 *total = cost->fmul;
41753 return false;
41755 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41757 /* V*QImode is emulated with 7-13 insns. */
41758 if (mode == V16QImode || mode == V32QImode)
41760 int extra = 11;
41761 if (TARGET_XOP && mode == V16QImode)
41762 extra = 5;
41763 else if (TARGET_SSSE3)
41764 extra = 6;
41765 *total = cost->fmul * 2 + cost->fabs * extra;
41767 /* V*DImode is emulated with 5-8 insns. */
41768 else if (mode == V2DImode || mode == V4DImode)
41770 if (TARGET_XOP && mode == V2DImode)
41771 *total = cost->fmul * 2 + cost->fabs * 3;
41772 else
41773 *total = cost->fmul * 3 + cost->fabs * 5;
41775 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
41776 insns, including two PMULUDQ. */
41777 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
41778 *total = cost->fmul * 2 + cost->fabs * 5;
41779 else
41780 *total = cost->fmul;
41781 return false;
41783 else
41785 rtx op0 = XEXP (x, 0);
41786 rtx op1 = XEXP (x, 1);
41787 int nbits;
41788 if (CONST_INT_P (XEXP (x, 1)))
41790 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
41791 for (nbits = 0; value != 0; value &= value - 1)
41792 nbits++;
41794 else
41795 /* This is arbitrary. */
41796 nbits = 7;
41798 /* Compute costs correctly for widening multiplication. */
41799 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
41800 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
41801 == GET_MODE_SIZE (mode))
41803 int is_mulwiden = 0;
41804 machine_mode inner_mode = GET_MODE (op0);
41806 if (GET_CODE (op0) == GET_CODE (op1))
41807 is_mulwiden = 1, op1 = XEXP (op1, 0);
41808 else if (CONST_INT_P (op1))
41810 if (GET_CODE (op0) == SIGN_EXTEND)
41811 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
41812 == INTVAL (op1);
41813 else
41814 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
41817 if (is_mulwiden)
41818 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
41821 *total = (cost->mult_init[MODE_INDEX (mode)]
41822 + nbits * cost->mult_bit
41823 + rtx_cost (op0, mode, outer_code, opno, speed)
41824 + rtx_cost (op1, mode, outer_code, opno, speed));
41826 return true;
41829 case DIV:
41830 case UDIV:
41831 case MOD:
41832 case UMOD:
41833 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41834 /* ??? SSE cost should be used here. */
41835 *total = cost->fdiv;
41836 else if (X87_FLOAT_MODE_P (mode))
41837 *total = cost->fdiv;
41838 else if (FLOAT_MODE_P (mode))
41839 /* ??? SSE vector cost should be used here. */
41840 *total = cost->fdiv;
41841 else
41842 *total = cost->divide[MODE_INDEX (mode)];
41843 return false;
41845 case PLUS:
41846 if (GET_MODE_CLASS (mode) == MODE_INT
41847 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
41849 if (GET_CODE (XEXP (x, 0)) == PLUS
41850 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
41851 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
41852 && CONSTANT_P (XEXP (x, 1)))
41854 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
41855 if (val == 2 || val == 4 || val == 8)
41857 *total = cost->lea;
41858 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
41859 outer_code, opno, speed);
41860 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
41861 outer_code, opno, speed);
41862 *total += rtx_cost (XEXP (x, 1), mode,
41863 outer_code, opno, speed);
41864 return true;
41867 else if (GET_CODE (XEXP (x, 0)) == MULT
41868 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
41870 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
41871 if (val == 2 || val == 4 || val == 8)
41873 *total = cost->lea;
41874 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41875 outer_code, opno, speed);
41876 *total += rtx_cost (XEXP (x, 1), mode,
41877 outer_code, opno, speed);
41878 return true;
41881 else if (GET_CODE (XEXP (x, 0)) == PLUS)
41883 /* Add with carry, ignore the cost of adding a carry flag. */
41884 if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode))
41885 *total = cost->add;
41886 else
41888 *total = cost->lea;
41889 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41890 outer_code, opno, speed);
41893 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
41894 outer_code, opno, speed);
41895 *total += rtx_cost (XEXP (x, 1), mode,
41896 outer_code, opno, speed);
41897 return true;
41900 /* FALLTHRU */
41902 case MINUS:
41903 /* Subtract with borrow, ignore the cost of subtracting a carry flag. */
41904 if (GET_MODE_CLASS (mode) == MODE_INT
41905 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD
41906 && GET_CODE (XEXP (x, 0)) == MINUS
41907 && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode))
41909 *total = cost->add;
41910 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41911 outer_code, opno, speed);
41912 *total += rtx_cost (XEXP (x, 1), mode,
41913 outer_code, opno, speed);
41914 return true;
41917 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41919 /* ??? SSE cost should be used here. */
41920 *total = cost->fadd;
41921 return false;
41923 else if (X87_FLOAT_MODE_P (mode))
41925 *total = cost->fadd;
41926 return false;
41928 else if (FLOAT_MODE_P (mode))
41930 /* ??? SSE vector cost should be used here. */
41931 *total = cost->fadd;
41932 return false;
41934 /* FALLTHRU */
41936 case AND:
41937 case IOR:
41938 case XOR:
41939 if (GET_MODE_CLASS (mode) == MODE_INT
41940 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41942 *total = (cost->add * 2
41943 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
41944 << (GET_MODE (XEXP (x, 0)) != DImode))
41945 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
41946 << (GET_MODE (XEXP (x, 1)) != DImode)));
41947 return true;
41949 /* FALLTHRU */
41951 case NEG:
41952 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41954 /* ??? SSE cost should be used here. */
41955 *total = cost->fchs;
41956 return false;
41958 else if (X87_FLOAT_MODE_P (mode))
41960 *total = cost->fchs;
41961 return false;
41963 else if (FLOAT_MODE_P (mode))
41965 /* ??? SSE vector cost should be used here. */
41966 *total = cost->fchs;
41967 return false;
41969 /* FALLTHRU */
41971 case NOT:
41972 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41974 /* ??? Should be SSE vector operation cost. */
41975 /* At least for published AMD latencies, this really is the same
41976 as the latency for a simple fpu operation like fabs. */
41977 *total = cost->fabs;
41979 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41980 *total = cost->add * 2;
41981 else
41982 *total = cost->add;
41983 return false;
41985 case COMPARE:
41986 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
41987 && XEXP (XEXP (x, 0), 1) == const1_rtx
41988 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
41989 && XEXP (x, 1) == const0_rtx)
41991 /* This kind of construct is implemented using test[bwl].
41992 Treat it as if we had an AND. */
41993 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
41994 *total = (cost->add
41995 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
41996 opno, speed)
41997 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
41998 return true;
42001 /* The embedded comparison operand is completely free. */
42002 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
42003 && XEXP (x, 1) == const0_rtx)
42004 *total = 0;
42006 return false;
42008 case FLOAT_EXTEND:
42009 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
42010 *total = 0;
42011 return false;
42013 case ABS:
42014 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
42015 /* ??? SSE cost should be used here. */
42016 *total = cost->fabs;
42017 else if (X87_FLOAT_MODE_P (mode))
42018 *total = cost->fabs;
42019 else if (FLOAT_MODE_P (mode))
42020 /* ??? SSE vector cost should be used here. */
42021 *total = cost->fabs;
42022 return false;
42024 case SQRT:
42025 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
42026 /* ??? SSE cost should be used here. */
42027 *total = cost->fsqrt;
42028 else if (X87_FLOAT_MODE_P (mode))
42029 *total = cost->fsqrt;
42030 else if (FLOAT_MODE_P (mode))
42031 /* ??? SSE vector cost should be used here. */
42032 *total = cost->fsqrt;
42033 return false;
42035 case UNSPEC:
42036 if (XINT (x, 1) == UNSPEC_TP)
42037 *total = 0;
42038 return false;
42040 case VEC_SELECT:
42041 case VEC_CONCAT:
42042 case VEC_DUPLICATE:
42043 /* ??? Assume all of these vector manipulation patterns are
42044 recognizable. In which case they all pretty much have the
42045 same cost. */
42046 *total = cost->fabs;
42047 return true;
42048 case VEC_MERGE:
42049 mask = XEXP (x, 2);
42050 /* This is a masked instruction; assume the same cost
42051 as the nonmasked variant. */
42052 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
42053 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
42054 else
42055 *total = cost->fabs;
42056 return true;
42058 default:
42059 return false;
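/* Editorial example (not part of the original source): for an integer
   multiplication by a constant, the MULT case above charges one mult_bit per
   set bit of the multiplier, so an SImode multiply by 10 (binary 1010, two
   set bits) is priced as

     cost->mult_init[MODE_INDEX (SImode)] + 2 * cost->mult_bit

   plus the rtx_cost of the two operands.  */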
42063 #if TARGET_MACHO
42065 static int current_machopic_label_num;
42067 /* Given a symbol name and its associated stub, write out the
42068 definition of the stub. */
42070 void
42071 machopic_output_stub (FILE *file, const char *symb, const char *stub)
42073 unsigned int length;
42074 char *binder_name, *symbol_name, lazy_ptr_name[32];
42075 int label = ++current_machopic_label_num;
42077 /* For 64-bit we shouldn't get here. */
42078 gcc_assert (!TARGET_64BIT);
42080 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
42081 symb = targetm.strip_name_encoding (symb);
42083 length = strlen (stub);
42084 binder_name = XALLOCAVEC (char, length + 32);
42085 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
42087 length = strlen (symb);
42088 symbol_name = XALLOCAVEC (char, length + 32);
42089 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
42091 sprintf (lazy_ptr_name, "L%d$lz", label);
42093 if (MACHOPIC_ATT_STUB)
42094 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
42095 else if (MACHOPIC_PURE)
42096 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
42097 else
42098 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
42100 fprintf (file, "%s:\n", stub);
42101 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
42103 if (MACHOPIC_ATT_STUB)
42105 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
42107 else if (MACHOPIC_PURE)
42109 /* PIC stub. */
42110 /* 25-byte PIC stub using "CALL get_pc_thunk". */
42111 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
42112 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
42113 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
42114 label, lazy_ptr_name, label);
42115 fprintf (file, "\tjmp\t*%%ecx\n");
42117 else
42118 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
42120 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
42121 it needs no stub-binding-helper. */
42122 if (MACHOPIC_ATT_STUB)
42123 return;
42125 fprintf (file, "%s:\n", binder_name);
42127 if (MACHOPIC_PURE)
42129 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
42130 fprintf (file, "\tpushl\t%%ecx\n");
42132 else
42133 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
42135 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
42137 /* N.B. Keep the correspondence of these
42138 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
42139 old-pic/new-pic/non-pic stubs; altering this will break
42140 compatibility with existing dylibs. */
42141 if (MACHOPIC_PURE)
42143 /* 25-byte PIC stub using "CALL get_pc_thunk". */
42144 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
42146 else
42147 /* 16-byte -mdynamic-no-pic stub. */
42148 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
42150 fprintf (file, "%s:\n", lazy_ptr_name);
42151 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
42152 fprintf (file, ASM_LONG "%s\n", binder_name);
42154 #endif /* TARGET_MACHO */
42156 /* Order the registers for register allocator. */
42158 void
42159 x86_order_regs_for_local_alloc (void)
42161 int pos = 0;
42162 int i;
42164 /* First allocate the local general purpose registers. */
42165 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
42166 if (GENERAL_REGNO_P (i) && call_used_regs[i])
42167 reg_alloc_order [pos++] = i;
42169 /* Global general purpose registers. */
42170 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
42171 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
42172 reg_alloc_order [pos++] = i;
42174 /* x87 registers come first in case we are doing FP math
42175 using them. */
42176 if (!TARGET_SSE_MATH)
42177 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
42178 reg_alloc_order [pos++] = i;
42180 /* SSE registers. */
42181 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
42182 reg_alloc_order [pos++] = i;
42183 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
42184 reg_alloc_order [pos++] = i;
42186 /* Extended REX SSE registers. */
42187 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
42188 reg_alloc_order [pos++] = i;
42190 /* Mask register. */
42191 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
42192 reg_alloc_order [pos++] = i;
42194 /* MPX bound registers. */
42195 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
42196 reg_alloc_order [pos++] = i;
42198 /* x87 registers. */
42199 if (TARGET_SSE_MATH)
42200 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
42201 reg_alloc_order [pos++] = i;
42203 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
42204 reg_alloc_order [pos++] = i;
42206 /* Initialize the rest of the array, as we do not allocate some registers
42207 at all. */
42208 while (pos < FIRST_PSEUDO_REGISTER)
42209 reg_alloc_order [pos++] = 0;
42212 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
42213 in struct attribute_spec.handler. */
42214 static tree
42215 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
42216 tree args,
42217 int,
42218 bool *no_add_attrs)
42220 if (TREE_CODE (*node) != FUNCTION_TYPE
42221 && TREE_CODE (*node) != METHOD_TYPE
42222 && TREE_CODE (*node) != FIELD_DECL
42223 && TREE_CODE (*node) != TYPE_DECL)
42225 warning (OPT_Wattributes, "%qE attribute only applies to functions",
42226 name);
42227 *no_add_attrs = true;
42228 return NULL_TREE;
42230 if (TARGET_64BIT)
42232 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
42233 name);
42234 *no_add_attrs = true;
42235 return NULL_TREE;
42237 if (is_attribute_p ("callee_pop_aggregate_return", name))
42239 tree cst;
42241 cst = TREE_VALUE (args);
42242 if (TREE_CODE (cst) != INTEGER_CST)
42244 warning (OPT_Wattributes,
42245 "%qE attribute requires an integer constant argument",
42246 name);
42247 *no_add_attrs = true;
42249 else if (compare_tree_int (cst, 0) != 0
42250 && compare_tree_int (cst, 1) != 0)
42252 warning (OPT_Wattributes,
42253 "argument to %qE attribute is neither zero, nor one",
42254 name);
42255 *no_add_attrs = true;
42258 return NULL_TREE;
42261 return NULL_TREE;
42264 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
42265 struct attribute_spec.handler. */
42266 static tree
42267 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
42268 bool *no_add_attrs)
42270 if (TREE_CODE (*node) != FUNCTION_TYPE
42271 && TREE_CODE (*node) != METHOD_TYPE
42272 && TREE_CODE (*node) != FIELD_DECL
42273 && TREE_CODE (*node) != TYPE_DECL)
42275 warning (OPT_Wattributes, "%qE attribute only applies to functions",
42276 name);
42277 *no_add_attrs = true;
42278 return NULL_TREE;
42281 /* Can combine regparm with all attributes but fastcall. */
42282 if (is_attribute_p ("ms_abi", name))
42284 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
42286 error ("ms_abi and sysv_abi attributes are not compatible");
42289 return NULL_TREE;
42291 else if (is_attribute_p ("sysv_abi", name))
42293 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
42295 error ("ms_abi and sysv_abi attributes are not compatible");
42298 return NULL_TREE;
42301 return NULL_TREE;
42304 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
42305 struct attribute_spec.handler. */
42306 static tree
42307 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
42308 bool *no_add_attrs)
42310 tree *type = NULL;
42311 if (DECL_P (*node))
42313 if (TREE_CODE (*node) == TYPE_DECL)
42314 type = &TREE_TYPE (*node);
42316 else
42317 type = node;
42319 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
42321 warning (OPT_Wattributes, "%qE attribute ignored",
42322 name);
42323 *no_add_attrs = true;
42326 else if ((is_attribute_p ("ms_struct", name)
42327 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
42328 || ((is_attribute_p ("gcc_struct", name)
42329 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
42331 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
42332 name);
42333 *no_add_attrs = true;
42336 return NULL_TREE;
42339 static tree
42340 ix86_handle_fndecl_attribute (tree *node, tree name, tree, int,
42341 bool *no_add_attrs)
42343 if (TREE_CODE (*node) != FUNCTION_DECL)
42345 warning (OPT_Wattributes, "%qE attribute only applies to functions",
42346 name);
42347 *no_add_attrs = true;
42349 return NULL_TREE;
42352 static tree
42353 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
42354 int, bool *)
42356 return NULL_TREE;
42359 static tree
42360 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
42362 /* DECL_RESULT and DECL_ARGUMENTS do not exist there yet,
42363 but the function type contains args and return type data. */
42364 tree func_type = *node;
42365 tree return_type = TREE_TYPE (func_type);
42367 int nargs = 0;
42368 tree current_arg_type = TYPE_ARG_TYPES (func_type);
42369 while (current_arg_type
42370 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
42372 if (nargs == 0)
42374 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
42375 error ("interrupt service routine should have a pointer "
42376 "as the first argument");
42378 else if (nargs == 1)
42380 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
42381 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
42382 error ("interrupt service routine should have unsigned %s"
42383 "int as the second argument",
42384 TARGET_64BIT
42385 ? (TARGET_X32 ? "long long " : "long ")
42386 : "");
42388 nargs++;
42389 current_arg_type = TREE_CHAIN (current_arg_type);
42391 if (!nargs || nargs > 2)
42392 error ("interrupt service routine can only have a pointer argument "
42393 "and an optional integer argument");
42394 if (! VOID_TYPE_P (return_type))
42395 error ("interrupt service routine can't have non-void return value");
42397 return NULL_TREE;
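/* Editorial example (not part of the original source; the names are
   illustrative): the checks above accept interrupt handlers of the form

     void handler (void *frame);
     void handler (void *frame, unsigned long error_code);    (64-bit)
     void handler (void *frame, unsigned int error_code);     (32-bit)

   i.e. a pointer as the first argument, an optional word-sized unsigned
   integer as the second, and a void return type.  */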
42400 static bool
42401 ix86_ms_bitfield_layout_p (const_tree record_type)
42403 return ((TARGET_MS_BITFIELD_LAYOUT
42404 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
42405 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
42408 /* Returns an expression indicating where the this parameter is
42409 located on entry to the FUNCTION. */
42411 static rtx
42412 x86_this_parameter (tree function)
42414 tree type = TREE_TYPE (function);
42415 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
42416 int nregs;
42418 if (TARGET_64BIT)
42420 const int *parm_regs;
42422 if (ix86_function_type_abi (type) == MS_ABI)
42423 parm_regs = x86_64_ms_abi_int_parameter_registers;
42424 else
42425 parm_regs = x86_64_int_parameter_registers;
42426 return gen_rtx_REG (Pmode, parm_regs[aggr]);
42429 nregs = ix86_function_regparm (type, function);
42431 if (nregs > 0 && !stdarg_p (type))
42433 int regno;
42434 unsigned int ccvt = ix86_get_callcvt (type);
42436 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
42437 regno = aggr ? DX_REG : CX_REG;
42438 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
42440 regno = CX_REG;
42441 if (aggr)
42442 return gen_rtx_MEM (SImode,
42443 plus_constant (Pmode, stack_pointer_rtx, 4));
42445 else
42447 regno = AX_REG;
42448 if (aggr)
42450 regno = DX_REG;
42451 if (nregs == 1)
42452 return gen_rtx_MEM (SImode,
42453 plus_constant (Pmode,
42454 stack_pointer_rtx, 4));
42457 return gen_rtx_REG (SImode, regno);
42460 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
42461 aggr ? 8 : 4));
42464 /* Determine whether x86_output_mi_thunk can succeed. */
42466 static bool
42467 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
42468 const_tree function)
42470 /* 64-bit can handle anything. */
42471 if (TARGET_64BIT)
42472 return true;
42474 /* For 32-bit, everything's fine if we have one free register. */
42475 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
42476 return true;
42478 /* Need a free register for vcall_offset. */
42479 if (vcall_offset)
42480 return false;
42482 /* Need a free register for GOT references. */
42483 if (flag_pic && !targetm.binds_local_p (function))
42484 return false;
42486 /* Otherwise ok. */
42487 return true;
42490 /* Output the assembler code for a thunk function. THUNK_DECL is the
42491 declaration for the thunk function itself, FUNCTION is the decl for
42492 the target function. DELTA is an immediate constant offset to be
42493 added to THIS. If VCALL_OFFSET is nonzero, the word at
42494 *(*this + vcall_offset) should be added to THIS. */
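/* Editorial sketch (not part of the original source), restating the comment
   above as C-like pseudocode; the casts are illustrative only:

     this = (char *) this + delta;
     if (vcall_offset)
       this = (char *) this
              + *(ptrdiff_t *) (*(char **) this + vcall_offset);
     tail-call FUNCTION with the adjusted THIS.  */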
42496 static void
42497 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
42498 HOST_WIDE_INT vcall_offset, tree function)
42500 rtx this_param = x86_this_parameter (function);
42501 rtx this_reg, tmp, fnaddr;
42502 unsigned int tmp_regno;
42503 rtx_insn *insn;
42505 if (TARGET_64BIT)
42506 tmp_regno = R10_REG;
42507 else
42509 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
42510 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
42511 tmp_regno = AX_REG;
42512 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
42513 tmp_regno = DX_REG;
42514 else
42515 tmp_regno = CX_REG;
42518 emit_note (NOTE_INSN_PROLOGUE_END);
42520 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
42521 pull it in now and let DELTA benefit. */
42522 if (REG_P (this_param))
42523 this_reg = this_param;
42524 else if (vcall_offset)
42526 /* Put the this parameter into %eax. */
42527 this_reg = gen_rtx_REG (Pmode, AX_REG);
42528 emit_move_insn (this_reg, this_param);
42530 else
42531 this_reg = NULL_RTX;
42533 /* Adjust the this parameter by a fixed constant. */
42534 if (delta)
42536 rtx delta_rtx = GEN_INT (delta);
42537 rtx delta_dst = this_reg ? this_reg : this_param;
42539 if (TARGET_64BIT)
42541 if (!x86_64_general_operand (delta_rtx, Pmode))
42543 tmp = gen_rtx_REG (Pmode, tmp_regno);
42544 emit_move_insn (tmp, delta_rtx);
42545 delta_rtx = tmp;
42549 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
42552 /* Adjust the this parameter by a value stored in the vtable. */
42553 if (vcall_offset)
42555 rtx vcall_addr, vcall_mem, this_mem;
42557 tmp = gen_rtx_REG (Pmode, tmp_regno);
42559 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
42560 if (Pmode != ptr_mode)
42561 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
42562 emit_move_insn (tmp, this_mem);
42564 /* Adjust the this parameter. */
42565 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
42566 if (TARGET_64BIT
42567 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
42569 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
42570 emit_move_insn (tmp2, GEN_INT (vcall_offset));
42571 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
42574 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
42575 if (Pmode != ptr_mode)
42576 emit_insn (gen_addsi_1_zext (this_reg,
42577 gen_rtx_REG (ptr_mode,
42578 REGNO (this_reg)),
42579 vcall_mem));
42580 else
42581 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
42584 /* If necessary, drop THIS back to its stack slot. */
42585 if (this_reg && this_reg != this_param)
42586 emit_move_insn (this_param, this_reg);
42588 fnaddr = XEXP (DECL_RTL (function), 0);
42589 if (TARGET_64BIT)
42591 if (!flag_pic || targetm.binds_local_p (function)
42592 || TARGET_PECOFF)
42594 else
42596 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
42597 tmp = gen_rtx_CONST (Pmode, tmp);
42598 fnaddr = gen_const_mem (Pmode, tmp);
42601 else
42603 if (!flag_pic || targetm.binds_local_p (function))
42605 #if TARGET_MACHO
42606 else if (TARGET_MACHO)
42608 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
42609 fnaddr = XEXP (fnaddr, 0);
42611 #endif /* TARGET_MACHO */
42612 else
42614 tmp = gen_rtx_REG (Pmode, CX_REG);
42615 output_set_got (tmp, NULL_RTX);
42617 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
42618 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
42619 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
42620 fnaddr = gen_const_mem (Pmode, fnaddr);
42624 /* Our sibling call patterns do not allow memories, because we have no
42625 predicate that can distinguish between frame and non-frame memory.
42626 For our purposes here, we can get away with (ab)using a jump pattern,
42627 because we're going to do no optimization. */
42628 if (MEM_P (fnaddr))
42630 if (sibcall_insn_operand (fnaddr, word_mode))
42632 fnaddr = XEXP (DECL_RTL (function), 0);
42633 tmp = gen_rtx_MEM (QImode, fnaddr);
42634 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
42635 tmp = emit_call_insn (tmp);
42636 SIBLING_CALL_P (tmp) = 1;
42638 else
42639 emit_jump_insn (gen_indirect_jump (fnaddr));
42641 else
42643 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
42645 // CM_LARGE_PIC always uses pseudo PIC register which is
42646 // uninitialized. Since FUNCTION is local and calling it
42647 // doesn't go through PLT, we use scratch register %r11 as
42648 // PIC register and initialize it here.
42649 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
42650 ix86_init_large_pic_reg (tmp_regno);
42651 fnaddr = legitimize_pic_address (fnaddr,
42652 gen_rtx_REG (Pmode, tmp_regno));
42655 if (!sibcall_insn_operand (fnaddr, word_mode))
42657 tmp = gen_rtx_REG (word_mode, tmp_regno);
42658 if (GET_MODE (fnaddr) != word_mode)
42659 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
42660 emit_move_insn (tmp, fnaddr);
42661 fnaddr = tmp;
42664 tmp = gen_rtx_MEM (QImode, fnaddr);
42665 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
42666 tmp = emit_call_insn (tmp);
42667 SIBLING_CALL_P (tmp) = 1;
42669 emit_barrier ();
42671 /* Emit just enough of rest_of_compilation to get the insns emitted.
42672 Note that use_thunk calls assemble_start_function et al. */
42673 insn = get_insns ();
42674 shorten_branches (insn);
42675 final_start_function (insn, file, 1);
42676 final (insn, file, 1);
42677 final_end_function ();
42680 static void
42681 x86_file_start (void)
42683 default_file_start ();
42684 if (TARGET_16BIT)
42685 fputs ("\t.code16gcc\n", asm_out_file);
42686 #if TARGET_MACHO
42687 darwin_file_start ();
42688 #endif
42689 if (X86_FILE_START_VERSION_DIRECTIVE)
42690 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
42691 if (X86_FILE_START_FLTUSED)
42692 fputs ("\t.global\t__fltused\n", asm_out_file);
42693 if (ix86_asm_dialect == ASM_INTEL)
42694 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
42698 x86_field_alignment (tree type, int computed)
42700 machine_mode mode;
42702 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
42703 return computed;
42704 if (TARGET_IAMCU)
42705 return iamcu_alignment (type, computed);
42706 mode = TYPE_MODE (strip_array_types (type));
42707 if (mode == DFmode || mode == DCmode
42708 || GET_MODE_CLASS (mode) == MODE_INT
42709 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
42710 return MIN (32, computed);
42711 return computed;
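/* Editorial example (not part of the original source): on a 32-bit target
   without -malign-double, a struct field of type double or long long is
   capped at 32-bit alignment here even when the computed alignment is
   64 bits; 64-bit and IAMCU targets take the earlier returns instead.  */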
42714 /* Print call to TARGET to FILE. */
42716 static void
42717 x86_print_call_or_nop (FILE *file, const char *target)
42719 if (flag_nop_mcount)
42720 fprintf (file, "1:\tnopl 0x00(%%eax,%%eax,1)\n"); /* 5 byte nop. */
42721 else
42722 fprintf (file, "1:\tcall\t%s\n", target);
42725 /* Output assembler code to FILE to increment profiler label # LABELNO
42726 for profiling a function entry. */
42727 void
42728 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
42730 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
42731 : MCOUNT_NAME);
42732 if (TARGET_64BIT)
42734 #ifndef NO_PROFILE_COUNTERS
42735 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
42736 #endif
42738 if (!TARGET_PECOFF && flag_pic)
42739 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
42740 else
42741 x86_print_call_or_nop (file, mcount_name);
42743 else if (flag_pic)
42745 #ifndef NO_PROFILE_COUNTERS
42746 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
42747 LPREFIX, labelno);
42748 #endif
42749 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
42751 else
42753 #ifndef NO_PROFILE_COUNTERS
42754 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
42755 LPREFIX, labelno);
42756 #endif
42757 x86_print_call_or_nop (file, mcount_name);
42760 if (flag_record_mcount)
42762 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
42763 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
42764 fprintf (file, "\t.previous\n");
42768 /* We don't have exact information about the insn sizes, but we may assume
42769 quite safely that we are informed about all 1 byte insns and memory
42770 address sizes. This is enough to eliminate unnecessary padding in
42771 99% of cases. */
42773 static int
42774 min_insn_size (rtx_insn *insn)
42776 int l = 0, len;
42778 if (!INSN_P (insn) || !active_insn_p (insn))
42779 return 0;
42781 /* Discard alignments we've emitted, and jump instructions. */
42782 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
42783 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
42784 return 0;
42786 /* Important case - calls are always 5 bytes.
42787 It is common to have many calls in a row.
42788 if (CALL_P (insn)
42789 && symbolic_reference_mentioned_p (PATTERN (insn))
42790 && !SIBLING_CALL_P (insn))
42791 return 5;
42792 len = get_attr_length (insn);
42793 if (len <= 1)
42794 return 1;
42796 /* For normal instructions we rely on get_attr_length being exact,
42797 with a few exceptions. */
42798 if (!JUMP_P (insn))
42800 enum attr_type type = get_attr_type (insn);
42802 switch (type)
42804 case TYPE_MULTI:
42805 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
42806 || asm_noperands (PATTERN (insn)) >= 0)
42807 return 0;
42808 break;
42809 case TYPE_OTHER:
42810 case TYPE_FCMP:
42811 break;
42812 default:
42813 /* Otherwise trust get_attr_length. */
42814 return len;
42817 l = get_attr_length_address (insn);
42818 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
42819 l = 4;
42821 if (l)
42822 return 1+l;
42823 else
42824 return 2;
42827 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
42829 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in a 16-byte
42830 window. */
42832 static void
42833 ix86_avoid_jump_mispredicts (void)
42835 rtx_insn *insn, *start = get_insns ();
42836 int nbytes = 0, njumps = 0;
42837 bool isjump = false;
42839 /* Look for all minimal intervals of instructions containing 4 jumps.
42840 The intervals are bounded by START and INSN. NBYTES is the total
42841 size of instructions in the interval including INSN and not including
42842 START. When NBYTES is smaller than 16 bytes, it is possible
42843 that the end of START and INSN end up in the same 16-byte page.
42845 The smallest offset in the page at which INSN can start is the case where
42846 START ends at offset 0. The offset of INSN is then NBYTES - sizeof (INSN).
42847 We add a p2align to the 16-byte window with maxskip 15 - NBYTES + sizeof (INSN).
42849 Don't consider an asm goto as a jump: while it can contain a jump, it doesn't
42850 have to, control transfer to its label(s) can be performed through other
42851 means, and we also estimate the minimum length of all asm stmts as 0. */
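/* Editorial example (not part of the original source): plugging numbers into
   the formula above, if NBYTES == 12 and INSN is 2 bytes long, the pad
   emitted before INSN is 15 - 12 + 2 = 5 bytes, which is enough to keep INSN
   from sharing a 16-byte window with the preceding jumps.  */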
42852 for (insn = start; insn; insn = NEXT_INSN (insn))
42854 int min_size;
42856 if (LABEL_P (insn))
42858 int align = label_to_alignment (insn);
42859 int max_skip = label_to_max_skip (insn);
42861 if (max_skip > 15)
42862 max_skip = 15;
42863 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
42864 already in the current 16 byte page, because otherwise
42865 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
42866 bytes to reach 16 byte boundary. */
42867 if (align <= 0
42868 || (align <= 3 && max_skip != (1 << align) - 1))
42869 max_skip = 0;
42870 if (dump_file)
42871 fprintf (dump_file, "Label %i with max_skip %i\n",
42872 INSN_UID (insn), max_skip);
42873 if (max_skip)
42875 while (nbytes + max_skip >= 16)
42877 start = NEXT_INSN (start);
42878 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
42879 || CALL_P (start))
42880 njumps--, isjump = true;
42881 else
42882 isjump = false;
42883 nbytes -= min_insn_size (start);
42886 continue;
42889 min_size = min_insn_size (insn);
42890 nbytes += min_size;
42891 if (dump_file)
42892 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
42893 INSN_UID (insn), min_size);
42894 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
42895 || CALL_P (insn))
42896 njumps++;
42897 else
42898 continue;
42900 while (njumps > 3)
42902 start = NEXT_INSN (start);
42903 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
42904 || CALL_P (start))
42905 njumps--, isjump = true;
42906 else
42907 isjump = false;
42908 nbytes -= min_insn_size (start);
42910 gcc_assert (njumps >= 0);
42911 if (dump_file)
42912 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
42913 INSN_UID (start), INSN_UID (insn), nbytes);
42915 if (njumps == 3 && isjump && nbytes < 16)
42917 int padsize = 15 - nbytes + min_insn_size (insn);
42919 if (dump_file)
42920 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
42921 INSN_UID (insn), padsize);
42922 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
42926 #endif
42928 /* AMD Athlon works faster
42929 when RET is not the destination of a conditional jump or directly preceded
42930 by another jump instruction. We avoid the penalty by inserting a NOP just
42931 before the RET instruction in such cases. */
42932 static void
42933 ix86_pad_returns (void)
42935 edge e;
42936 edge_iterator ei;
42938 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42940 basic_block bb = e->src;
42941 rtx_insn *ret = BB_END (bb);
42942 rtx_insn *prev;
42943 bool replace = false;
42945 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
42946 || optimize_bb_for_size_p (bb))
42947 continue;
42948 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
42949 if (active_insn_p (prev) || LABEL_P (prev))
42950 break;
42951 if (prev && LABEL_P (prev))
42953 edge e;
42954 edge_iterator ei;
42956 FOR_EACH_EDGE (e, ei, bb->preds)
42957 if (EDGE_FREQUENCY (e) && e->src->index >= 0
42958 && !(e->flags & EDGE_FALLTHRU))
42960 replace = true;
42961 break;
42964 if (!replace)
42966 prev = prev_active_insn (ret);
42967 if (prev
42968 && ((JUMP_P (prev) && any_condjump_p (prev))
42969 || CALL_P (prev)))
42970 replace = true;
42971 /* Empty functions get branch mispredict even when
42972 the jump destination is not visible to us. */
42973 if (!prev && !optimize_function_for_size_p (cfun))
42974 replace = true;
42976 if (replace)
42978 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
42979 delete_insn (ret);
42984 /* Count the minimum number of instructions in BB. Return 4 if the
42985 number of instructions >= 4. */
42987 static int
42988 ix86_count_insn_bb (basic_block bb)
42990 rtx_insn *insn;
42991 int insn_count = 0;
42993 /* Count number of instructions in this block. Return 4 if the number
42994 of instructions >= 4. */
42995 FOR_BB_INSNS (bb, insn)
42997 /* This only happens in exit blocks. */
42998 if (JUMP_P (insn)
42999 && ANY_RETURN_P (PATTERN (insn)))
43000 break;
43002 if (NONDEBUG_INSN_P (insn)
43003 && GET_CODE (PATTERN (insn)) != USE
43004 && GET_CODE (PATTERN (insn)) != CLOBBER)
43006 insn_count++;
43007 if (insn_count >= 4)
43008 return insn_count;
43012 return insn_count;
43016 /* Count the minimum number of instructions in code path in BB.
43017 Return 4 if the number of instructions >= 4. */
43019 static int
43020 ix86_count_insn (basic_block bb)
43022 edge e;
43023 edge_iterator ei;
43024 int min_prev_count;
43026 /* Only bother counting instructions along paths with no
43027 more than 2 basic blocks between entry and exit. Given
43028 that BB has an edge to exit, determine if a predecessor
43029 of BB has an edge from entry. If so, compute the number
43030 of instructions in the predecessor block. If there
43031 happen to be multiple such blocks, compute the minimum. */
43032 min_prev_count = 4;
43033 FOR_EACH_EDGE (e, ei, bb->preds)
43035 edge prev_e;
43036 edge_iterator prev_ei;
43038 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
43040 min_prev_count = 0;
43041 break;
43043 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
43045 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
43047 int count = ix86_count_insn_bb (e->src);
43048 if (count < min_prev_count)
43049 min_prev_count = count;
43050 break;
43055 if (min_prev_count < 4)
43056 min_prev_count += ix86_count_insn_bb (bb);
43058 return min_prev_count;
43061 /* Pad short function to 4 instructions. */
43063 static void
43064 ix86_pad_short_function (void)
43066 edge e;
43067 edge_iterator ei;
43069 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
43071 rtx_insn *ret = BB_END (e->src);
43072 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
43074 int insn_count = ix86_count_insn (e->src);
43076 /* Pad short function. */
43077 if (insn_count < 4)
43079 rtx_insn *insn = ret;
43081 /* Find epilogue. */
43082 while (insn
43083 && (!NOTE_P (insn)
43084 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
43085 insn = PREV_INSN (insn);
43087 if (!insn)
43088 insn = ret;
43090 /* Two NOPs count as one instruction. */
43091 insn_count = 2 * (4 - insn_count);
43092 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
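/* Editorial example (not part of the original source): a function whose
   shortest path from entry to this return contains only one instruction gets
   2 * (4 - 1) == 6 NOPs emitted before its epilogue; since two NOPs are
   counted as one instruction, the padded path reaches the required four.  */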
43098 /* Fix up a Windows system unwinder issue. If an EH region falls through into
43099 the epilogue, the Windows system unwinder will apply epilogue logic and
43100 produce incorrect offsets. This can be avoided by adding a nop between
43101 the last insn that can throw and the first insn of the epilogue. */
43103 static void
43104 ix86_seh_fixup_eh_fallthru (void)
43106 edge e;
43107 edge_iterator ei;
43109 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
43111 rtx_insn *insn, *next;
43113 /* Find the beginning of the epilogue. */
43114 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
43115 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
43116 break;
43117 if (insn == NULL)
43118 continue;
43120 /* We only care about preceding insns that can throw. */
43121 insn = prev_active_insn (insn);
43122 if (insn == NULL || !can_throw_internal (insn))
43123 continue;
43125 /* Do not separate calls from their debug information. */
43126 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
43127 if (NOTE_P (next)
43128 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
43129 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
43130 insn = next;
43131 else
43132 break;
43134 emit_insn_after (gen_nops (const1_rtx), insn);
43138 /* Given a register number BASE, the lowest of a group of registers, update
43139 regsets IN and OUT with the registers that should be avoided in input
43140 and output operands respectively when trying to avoid generating a modr/m
43141 byte for -fmitigate-rop. */
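/* The encodings to avoid are presumably those where a mod==11 modr/m byte
   would equal 0xc2, 0xc3, 0xca or 0xcb -- the ret/retf opcodes that ROP
   gadgets reuse.  Registers whose low encoding bits are 0 or 1 therefore
   appear risky in the reg field (outputs) and those encoding as 2 or 3 in
   the r/m field (inputs); see ix86_rop_should_change_byte_p.  */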
43143 static void
43144 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
43146 SET_HARD_REG_BIT (out, base);
43147 SET_HARD_REG_BIT (out, base + 1);
43148 SET_HARD_REG_BIT (in, base + 2);
43149 SET_HARD_REG_BIT (in, base + 3);
43152 /* Called if -fmitigate-rop is in effect. Try to rewrite instructions so
43153 that certain encodings of modr/m bytes do not occur. */
43154 static void
43155 ix86_mitigate_rop (void)
43157 HARD_REG_SET input_risky;
43158 HARD_REG_SET output_risky;
43159 HARD_REG_SET inout_risky;
43161 CLEAR_HARD_REG_SET (output_risky);
43162 CLEAR_HARD_REG_SET (input_risky);
43163 SET_HARD_REG_BIT (output_risky, AX_REG);
43164 SET_HARD_REG_BIT (output_risky, CX_REG);
43165 SET_HARD_REG_BIT (input_risky, BX_REG);
43166 SET_HARD_REG_BIT (input_risky, DX_REG);
43167 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
43168 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
43169 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
43170 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
43171 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
43172 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
43173 COPY_HARD_REG_SET (inout_risky, input_risky);
43174 IOR_HARD_REG_SET (inout_risky, output_risky);
43176 df_note_add_problem ();
43177 /* Fix up what stack-regs did. */
43178 df_insn_rescan_all ();
43179 df_analyze ();
43181 regrename_init (true);
43182 regrename_analyze (NULL);
43184 auto_vec<du_head_p> cands;
43186 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
43188 if (!NONDEBUG_INSN_P (insn))
43189 continue;
43191 if (GET_CODE (PATTERN (insn)) == USE
43192 || GET_CODE (PATTERN (insn)) == CLOBBER)
43193 continue;
43195 extract_insn (insn);
43197 int opno0, opno1;
43198 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
43199 recog_data.n_operands, &opno0,
43200 &opno1);
43202 if (!ix86_rop_should_change_byte_p (modrm))
43203 continue;
43205 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
43207 /* This happens when regrename has to fail a block. */
43208 if (!info->op_info)
43209 continue;
43211 if (info->op_info[opno0].n_chains != 0)
43213 gcc_assert (info->op_info[opno0].n_chains == 1);
43214 du_head_p op0c;
43215 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
43216 if (op0c->target_data_1 + op0c->target_data_2 == 0
43217 && !op0c->cannot_rename)
43218 cands.safe_push (op0c);
43220 op0c->target_data_1++;
43222 if (info->op_info[opno1].n_chains != 0)
43224 gcc_assert (info->op_info[opno1].n_chains == 1);
43225 du_head_p op1c;
43226 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
43227 if (op1c->target_data_1 + op1c->target_data_2 == 0
43228 && !op1c->cannot_rename)
43229 cands.safe_push (op1c);
43231 op1c->target_data_2++;
43235 int i;
43236 du_head_p head;
43237 FOR_EACH_VEC_ELT (cands, i, head)
43239 int old_reg, best_reg;
43240 HARD_REG_SET unavailable;
43242 CLEAR_HARD_REG_SET (unavailable);
43243 if (head->target_data_1)
43244 IOR_HARD_REG_SET (unavailable, output_risky);
43245 if (head->target_data_2)
43246 IOR_HARD_REG_SET (unavailable, input_risky);
43248 int n_uses;
43249 reg_class superclass = regrename_find_superclass (head, &n_uses,
43250 &unavailable);
43251 old_reg = head->regno;
43252 best_reg = find_rename_reg (head, superclass, &unavailable,
43253 old_reg, false);
43254 bool ok = regrename_do_replace (head, best_reg);
43255 gcc_assert (ok);
43256 if (dump_file)
43257 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
43258 reg_names[best_reg], reg_class_names[superclass]);
43262 regrename_finish ();
43264 df_analyze ();
43266 basic_block bb;
43267 regset_head live;
43269 INIT_REG_SET (&live);
43271 FOR_EACH_BB_FN (bb, cfun)
43273 rtx_insn *insn;
43275 COPY_REG_SET (&live, DF_LR_OUT (bb));
43276 df_simulate_initialize_backwards (bb, &live);
43278 FOR_BB_INSNS_REVERSE (bb, insn)
43280 if (!NONDEBUG_INSN_P (insn))
43281 continue;
43283 df_simulate_one_insn_backwards (bb, insn, &live);
43285 if (GET_CODE (PATTERN (insn)) == USE
43286 || GET_CODE (PATTERN (insn)) == CLOBBER)
43287 continue;
43289 extract_insn (insn);
43290 constrain_operands_cached (insn, reload_completed);
43291 int opno0, opno1;
43292 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
43293 recog_data.n_operands, &opno0,
43294 &opno1);
43295 if (modrm < 0
43296 || !ix86_rop_should_change_byte_p (modrm)
43297 || opno0 == opno1)
43298 continue;
43300 rtx oldreg = recog_data.operand[opno1];
43301 preprocess_constraints (insn);
43302 const operand_alternative *alt = which_op_alt ();
43304 int i;
43305 for (i = 0; i < recog_data.n_operands; i++)
43306 if (i != opno1
43307 && alt[i].earlyclobber
43308 && reg_overlap_mentioned_p (recog_data.operand[i],
43309 oldreg))
43310 break;
43312 if (i < recog_data.n_operands)
43313 continue;
43315 if (dump_file)
43316 fprintf (dump_file,
43317 "attempting to fix modrm byte in insn %d:"
43318 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
43319 reg_class_names[alt[opno1].cl]);
43321 HARD_REG_SET unavailable;
43322 REG_SET_TO_HARD_REG_SET (unavailable, &live);
43323 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
43324 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
43325 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
43326 IOR_HARD_REG_SET (unavailable, output_risky);
43327 IOR_COMPL_HARD_REG_SET (unavailable,
43328 reg_class_contents[alt[opno1].cl]);
43330 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
43331 if (!TEST_HARD_REG_BIT (unavailable, i))
43332 break;
43333 if (i == FIRST_PSEUDO_REGISTER)
43335 if (dump_file)
43336 fprintf (dump_file, ", none available\n");
43337 continue;
43339 if (dump_file)
43340 fprintf (dump_file, " -> %d\n", i);
43341 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
43342 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
43343 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
43348 /* Implement machine specific optimizations. We implement padding of returns
43349 for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */
43350 static void
43351 ix86_reorg (void)
43353 /* We are freeing block_for_insn in the toplev to keep compatibility
43354 with old MDEP_REORGS that are not CFG based. Recompute it now. */
43355 compute_bb_for_insn ();
43357 if (flag_mitigate_rop)
43358 ix86_mitigate_rop ();
43360 if (TARGET_SEH && current_function_has_exception_handlers ())
43361 ix86_seh_fixup_eh_fallthru ();
43363 if (optimize && optimize_function_for_speed_p (cfun))
43365 if (TARGET_PAD_SHORT_FUNCTION)
43366 ix86_pad_short_function ();
43367 else if (TARGET_PAD_RETURNS)
43368 ix86_pad_returns ();
43369 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
43370 if (TARGET_FOUR_JUMP_LIMIT)
43371 ix86_avoid_jump_mispredicts ();
43372 #endif
43376 /* Return nonzero when a QImode register that must be represented via a REX prefix
43377 is used. */
43378 bool
43379 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
43381 int i;
43382 extract_insn_cached (insn);
43383 for (i = 0; i < recog_data.n_operands; i++)
43384 if (GENERAL_REG_P (recog_data.operand[i])
43385 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
43386 return true;
43387 return false;
43390 /* Return true when INSN mentions a register that must be encoded using a REX
43391 prefix. */
43392 bool
43393 x86_extended_reg_mentioned_p (rtx insn)
43395 subrtx_iterator::array_type array;
43396 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
43398 const_rtx x = *iter;
43399 if (REG_P (x)
43400 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
43401 return true;
43403 return false;
43406 /* If profitable, negate (without causing overflow) integer constant
43407 of mode MODE at location LOC. Return true in this case. */
43408 bool
43409 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
43411 HOST_WIDE_INT val;
43413 if (!CONST_INT_P (*loc))
43414 return false;
43416 switch (mode)
43418 case E_DImode:
43419 /* DImode x86_64 constants must fit in 32 bits. */
43420 gcc_assert (x86_64_immediate_operand (*loc, mode));
43422 mode = SImode;
43423 break;
43425 case E_SImode:
43426 case E_HImode:
43427 case E_QImode:
43428 break;
43430 default:
43431 gcc_unreachable ();
43434 /* Avoid overflows. */
43435 if (mode_signbit_p (mode, *loc))
43436 return false;
43438 val = INTVAL (*loc);
43440 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
43441 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
43442 if ((val < 0 && val != -128)
43443 || val == 128)
43445 *loc = GEN_INT (-val);
43446 return true;
43449 return false;
43452 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
43453 optabs would emit if we didn't have TFmode patterns. */
43455 void
43456 x86_emit_floatuns (rtx operands[2])
43458 rtx_code_label *neglab, *donelab;
43459 rtx i0, i1, f0, in, out;
43460 machine_mode mode, inmode;
43462 inmode = GET_MODE (operands[1]);
43463 gcc_assert (inmode == SImode || inmode == DImode);
43465 out = operands[0];
43466 in = force_reg (inmode, operands[1]);
43467 mode = GET_MODE (out);
43468 neglab = gen_label_rtx ();
43469 donelab = gen_label_rtx ();
43470 f0 = gen_reg_rtx (mode);
43472 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
43474 expand_float (out, in, 0);
43476 emit_jump_insn (gen_jump (donelab));
43477 emit_barrier ();
43479 emit_label (neglab);
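/* The value has its sign bit set, so it cannot be converted directly as a
   signed number.  Halve it with a logical shift right, OR the discarded
   low bit back in (round-to-odd, so the final rounding is unchanged),
   convert, and then double the result.  */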
43481 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
43482 1, OPTAB_DIRECT);
43483 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
43484 1, OPTAB_DIRECT);
43485 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
43487 expand_float (f0, i0, 0);
43489 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
43491 emit_label (donelab);
43494 static bool canonicalize_perm (struct expand_vec_perm_d *d);
43495 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
43496 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
43497 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
43499 /* Get a vector mode of the same size as the original but with elements
43500 twice as wide. This is only guaranteed to apply to integral vectors. */
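/* For example, V16QImode yields V8HImode and V8HImode yields V4SImode.  */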
43502 static inline machine_mode
43503 get_mode_wider_vector (machine_mode o)
43505 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
43506 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
43507 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
43508 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
43509 return n;
43512 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
43513 fill target with val via vec_duplicate. */
43515 static bool
43516 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
43518 bool ok;
43519 rtx_insn *insn;
43520 rtx dup;
43522 /* First attempt to recognize VAL as-is. */
43523 dup = gen_rtx_VEC_DUPLICATE (mode, val);
43524 insn = emit_insn (gen_rtx_SET (target, dup));
43525 if (recog_memoized (insn) < 0)
43527 rtx_insn *seq;
43528 machine_mode innermode = GET_MODE_INNER (mode);
43529 rtx reg;
43531 /* If that fails, force VAL into a register. */
43533 start_sequence ();
43534 reg = force_reg (innermode, val);
43535 if (GET_MODE (reg) != innermode)
43536 reg = gen_lowpart (innermode, reg);
43537 XEXP (dup, 0) = reg;
43538 seq = get_insns ();
43539 end_sequence ();
43540 if (seq)
43541 emit_insn_before (seq, insn);
43543 ok = recog_memoized (insn) >= 0;
43544 gcc_assert (ok);
43546 return true;
43549 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
43550 with all elements equal to VAL. Return true if successful. */
43552 static bool
43553 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
43554 rtx target, rtx val)
43556 bool ok;
43558 switch (mode)
43560 case E_V2SImode:
43561 case E_V2SFmode:
43562 if (!mmx_ok)
43563 return false;
43564 /* FALLTHRU */
43566 case E_V4DFmode:
43567 case E_V4DImode:
43568 case E_V8SFmode:
43569 case E_V8SImode:
43570 case E_V2DFmode:
43571 case E_V2DImode:
43572 case E_V4SFmode:
43573 case E_V4SImode:
43574 case E_V16SImode:
43575 case E_V8DImode:
43576 case E_V16SFmode:
43577 case E_V8DFmode:
43578 return ix86_vector_duplicate_value (mode, target, val);
43580 case E_V4HImode:
43581 if (!mmx_ok)
43582 return false;
43583 if (TARGET_SSE || TARGET_3DNOW_A)
43585 rtx x;
43587 val = gen_lowpart (SImode, val);
43588 x = gen_rtx_TRUNCATE (HImode, val);
43589 x = gen_rtx_VEC_DUPLICATE (mode, x);
43590 emit_insn (gen_rtx_SET (target, x));
43591 return true;
43593 goto widen;
43595 case E_V8QImode:
43596 if (!mmx_ok)
43597 return false;
43598 goto widen;
43600 case E_V8HImode:
43601 if (TARGET_AVX2)
43602 return ix86_vector_duplicate_value (mode, target, val);
43604 if (TARGET_SSE2)
43606 struct expand_vec_perm_d dperm;
43607 rtx tmp1, tmp2;
43609 permute:
43610 memset (&dperm, 0, sizeof (dperm));
43611 dperm.target = target;
43612 dperm.vmode = mode;
43613 dperm.nelt = GET_MODE_NUNITS (mode);
43614 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
43615 dperm.one_operand_p = true;
43617 /* Extend to SImode using a paradoxical SUBREG. */
43618 tmp1 = gen_reg_rtx (SImode);
43619 emit_move_insn (tmp1, gen_lowpart (SImode, val));
43621 /* Insert the SImode value as low element of a V4SImode vector. */
43622 tmp2 = gen_reg_rtx (V4SImode);
43623 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
43624 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
43626 ok = (expand_vec_perm_1 (&dperm)
43627 || expand_vec_perm_broadcast_1 (&dperm));
43628 gcc_assert (ok);
43629 return ok;
43631 goto widen;
43633 case E_V16QImode:
43634 if (TARGET_AVX2)
43635 return ix86_vector_duplicate_value (mode, target, val);
43637 if (TARGET_SSE2)
43638 goto permute;
43639 goto widen;
43641 widen:
43642 /* Replicate the value once into the next wider mode and recurse. */
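/* E.g. a QImode value 0xab becomes the HImode value 0xabab, so a
   V16QImode broadcast can be done as a V8HImode broadcast.  */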
43644 machine_mode smode, wsmode, wvmode;
43645 rtx x;
43647 smode = GET_MODE_INNER (mode);
43648 wvmode = get_mode_wider_vector (mode);
43649 wsmode = GET_MODE_INNER (wvmode);
43651 val = convert_modes (wsmode, smode, val, true);
43652 x = expand_simple_binop (wsmode, ASHIFT, val,
43653 GEN_INT (GET_MODE_BITSIZE (smode)),
43654 NULL_RTX, 1, OPTAB_LIB_WIDEN);
43655 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
43657 x = gen_reg_rtx (wvmode);
43658 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
43659 gcc_assert (ok);
43660 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
43661 return ok;
43664 case E_V16HImode:
43665 case E_V32QImode:
43666 if (TARGET_AVX2)
43667 return ix86_vector_duplicate_value (mode, target, val);
43668 else
43670 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
43671 rtx x = gen_reg_rtx (hvmode);
43673 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
43674 gcc_assert (ok);
43676 x = gen_rtx_VEC_CONCAT (mode, x, x);
43677 emit_insn (gen_rtx_SET (target, x));
43679 return true;
43681 case E_V64QImode:
43682 case E_V32HImode:
43683 if (TARGET_AVX512BW)
43684 return ix86_vector_duplicate_value (mode, target, val);
43685 else
43687 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
43688 rtx x = gen_reg_rtx (hvmode);
43690 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
43691 gcc_assert (ok);
43693 x = gen_rtx_VEC_CONCAT (mode, x, x);
43694 emit_insn (gen_rtx_SET (target, x));
43696 return true;
43698 default:
43699 return false;
43703 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
43704 whose ONE_VAR element is VAR, and other elements are zero. Return true
43705 if successful. */
43707 static bool
43708 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
43709 rtx target, rtx var, int one_var)
43711 machine_mode vsimode;
43712 rtx new_target;
43713 rtx x, tmp;
43714 bool use_vector_set = false;
43716 switch (mode)
43718 case E_V2DImode:
43719 /* For SSE4.1, we normally use vector set. But if the second
43720 element is zero and inter-unit moves are OK, we use movq
43721 instead. */
43722 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
43723 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
43724 && one_var == 0));
43725 break;
43726 case E_V16QImode:
43727 case E_V4SImode:
43728 case E_V4SFmode:
43729 use_vector_set = TARGET_SSE4_1;
43730 break;
43731 case E_V8HImode:
43732 use_vector_set = TARGET_SSE2;
43733 break;
43734 case E_V4HImode:
43735 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
43736 break;
43737 case E_V32QImode:
43738 case E_V16HImode:
43739 case E_V8SImode:
43740 case E_V8SFmode:
43741 case E_V4DFmode:
43742 use_vector_set = TARGET_AVX;
43743 break;
43744 case E_V4DImode:
43745 /* Use ix86_expand_vector_set in 64bit mode only. */
43746 use_vector_set = TARGET_AVX && TARGET_64BIT;
43747 break;
43748 default:
43749 break;
43752 if (use_vector_set)
43754 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
43755 var = force_reg (GET_MODE_INNER (mode), var);
43756 ix86_expand_vector_set (mmx_ok, target, var, one_var);
43757 return true;
43760 switch (mode)
43762 case E_V2SFmode:
43763 case E_V2SImode:
43764 if (!mmx_ok)
43765 return false;
43766 /* FALLTHRU */
43768 case E_V2DFmode:
43769 case E_V2DImode:
43770 if (one_var != 0)
43771 return false;
43772 var = force_reg (GET_MODE_INNER (mode), var);
43773 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
43774 emit_insn (gen_rtx_SET (target, x));
43775 return true;
43777 case E_V4SFmode:
43778 case E_V4SImode:
43779 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
43780 new_target = gen_reg_rtx (mode);
43781 else
43782 new_target = target;
43783 var = force_reg (GET_MODE_INNER (mode), var);
43784 x = gen_rtx_VEC_DUPLICATE (mode, var);
43785 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
43786 emit_insn (gen_rtx_SET (new_target, x));
43787 if (one_var != 0)
43789 /* We need to shuffle the value to the correct position, so
43790 create a new pseudo to store the intermediate result. */
43792 /* With SSE2, we can use the integer shuffle insns. */
43793 if (mode != V4SFmode && TARGET_SSE2)
43795 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
43796 const1_rtx,
43797 GEN_INT (one_var == 1 ? 0 : 1),
43798 GEN_INT (one_var == 2 ? 0 : 1),
43799 GEN_INT (one_var == 3 ? 0 : 1)));
43800 if (target != new_target)
43801 emit_move_insn (target, new_target);
43802 return true;
43805 /* Otherwise convert the intermediate result to V4SFmode and
43806 use the SSE1 shuffle instructions. */
43807 if (mode != V4SFmode)
43809 tmp = gen_reg_rtx (V4SFmode);
43810 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
43812 else
43813 tmp = new_target;
43815 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
43816 const1_rtx,
43817 GEN_INT (one_var == 1 ? 0 : 1),
43818 GEN_INT (one_var == 2 ? 0+4 : 1+4),
43819 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
43821 if (mode != V4SFmode)
43822 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
43823 else if (tmp != target)
43824 emit_move_insn (target, tmp);
43826 else if (target != new_target)
43827 emit_move_insn (target, new_target);
43828 return true;
43830 case E_V8HImode:
43831 case E_V16QImode:
43832 vsimode = V4SImode;
43833 goto widen;
43834 case E_V4HImode:
43835 case E_V8QImode:
43836 if (!mmx_ok)
43837 return false;
43838 vsimode = V2SImode;
43839 goto widen;
43840 widen:
43841 if (one_var != 0)
43842 return false;
43844 /* Zero extend the variable element to SImode and recurse. */
43845 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
43847 x = gen_reg_rtx (vsimode);
43848 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
43849 var, one_var))
43850 gcc_unreachable ();
43852 emit_move_insn (target, gen_lowpart (mode, x));
43853 return true;
43855 default:
43856 return false;
43860 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
43861 consisting of the values in VALS. It is known that all elements
43862 except ONE_VAR are constants. Return true if successful. */
43864 static bool
43865 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
43866 rtx target, rtx vals, int one_var)
43868 rtx var = XVECEXP (vals, 0, one_var);
43869 machine_mode wmode;
43870 rtx const_vec, x;
43872 const_vec = copy_rtx (vals);
43873 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
43874 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
43876 switch (mode)
43878 case E_V2DFmode:
43879 case E_V2DImode:
43880 case E_V2SFmode:
43881 case E_V2SImode:
43882 /* For the two element vectors, it's just as easy to use
43883 the general case. */
43884 return false;
43886 case E_V4DImode:
43887 /* Use ix86_expand_vector_set in 64bit mode only. */
43888 if (!TARGET_64BIT)
43889 return false;
43890 /* FALLTHRU */
43891 case E_V4DFmode:
43892 case E_V8SFmode:
43893 case E_V8SImode:
43894 case E_V16HImode:
43895 case E_V32QImode:
43896 case E_V4SFmode:
43897 case E_V4SImode:
43898 case E_V8HImode:
43899 case E_V4HImode:
43900 break;
43902 case E_V16QImode:
43903 if (TARGET_SSE4_1)
43904 break;
43905 wmode = V8HImode;
43906 goto widen;
43907 case E_V8QImode:
43908 wmode = V4HImode;
43909 goto widen;
43910 widen:
43911 /* There's no way to set one QImode entry easily. Combine
43912 the variable value with its adjacent constant value, and
43913 promote to an HImode set. */
43914 x = XVECEXP (vals, 0, one_var ^ 1);
43915 if (one_var & 1)
43917 var = convert_modes (HImode, QImode, var, true);
43918 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
43919 NULL_RTX, 1, OPTAB_LIB_WIDEN);
43920 x = GEN_INT (INTVAL (x) & 0xff);
43922 else
43924 var = convert_modes (HImode, QImode, var, true);
43925 x = gen_int_mode (INTVAL (x) << 8, HImode);
43927 if (x != const0_rtx)
43928 var = expand_simple_binop (HImode, IOR, var, x, var,
43929 1, OPTAB_LIB_WIDEN);
43931 x = gen_reg_rtx (wmode);
43932 emit_move_insn (x, gen_lowpart (wmode, const_vec));
43933 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
43935 emit_move_insn (target, gen_lowpart (mode, x));
43936 return true;
43938 default:
43939 return false;
43942 emit_move_insn (target, const_vec);
43943 ix86_expand_vector_set (mmx_ok, target, var, one_var);
43944 return true;
43947 /* A subroutine of ix86_expand_vector_init_general. Use vector
43948 concatenate to handle the most general case: all values variable,
43949 and none identical. */
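/* For example, a V8SFmode target built from eight scalar operands is
   assembled pairwise: SFmode pairs into V2SFmode, V2SFmode pairs into
   V4SFmode halves, and finally the two halves into the V8SFmode target.  */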
43951 static void
43952 ix86_expand_vector_init_concat (machine_mode mode,
43953 rtx target, rtx *ops, int n)
43955 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
43956 rtx first[16], second[8], third[4];
43957 rtvec v;
43958 int i, j;
43960 switch (n)
43962 case 2:
43963 switch (mode)
43965 case E_V16SImode:
43966 cmode = V8SImode;
43967 break;
43968 case E_V16SFmode:
43969 cmode = V8SFmode;
43970 break;
43971 case E_V8DImode:
43972 cmode = V4DImode;
43973 break;
43974 case E_V8DFmode:
43975 cmode = V4DFmode;
43976 break;
43977 case E_V8SImode:
43978 cmode = V4SImode;
43979 break;
43980 case E_V8SFmode:
43981 cmode = V4SFmode;
43982 break;
43983 case E_V4DImode:
43984 cmode = V2DImode;
43985 break;
43986 case E_V4DFmode:
43987 cmode = V2DFmode;
43988 break;
43989 case E_V4SImode:
43990 cmode = V2SImode;
43991 break;
43992 case E_V4SFmode:
43993 cmode = V2SFmode;
43994 break;
43995 case E_V2DImode:
43996 cmode = DImode;
43997 break;
43998 case E_V2SImode:
43999 cmode = SImode;
44000 break;
44001 case E_V2DFmode:
44002 cmode = DFmode;
44003 break;
44004 case E_V2SFmode:
44005 cmode = SFmode;
44006 break;
44007 default:
44008 gcc_unreachable ();
44011 if (!register_operand (ops[1], cmode))
44012 ops[1] = force_reg (cmode, ops[1]);
44013 if (!register_operand (ops[0], cmode))
44014 ops[0] = force_reg (cmode, ops[0]);
44015 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
44016 ops[1])));
44017 break;
44019 case 4:
44020 switch (mode)
44022 case E_V4DImode:
44023 cmode = V2DImode;
44024 break;
44025 case E_V4DFmode:
44026 cmode = V2DFmode;
44027 break;
44028 case E_V4SImode:
44029 cmode = V2SImode;
44030 break;
44031 case E_V4SFmode:
44032 cmode = V2SFmode;
44033 break;
44034 default:
44035 gcc_unreachable ();
44037 goto half;
44039 case 8:
44040 switch (mode)
44042 case E_V8DImode:
44043 cmode = V2DImode;
44044 hmode = V4DImode;
44045 break;
44046 case E_V8DFmode:
44047 cmode = V2DFmode;
44048 hmode = V4DFmode;
44049 break;
44050 case E_V8SImode:
44051 cmode = V2SImode;
44052 hmode = V4SImode;
44053 break;
44054 case E_V8SFmode:
44055 cmode = V2SFmode;
44056 hmode = V4SFmode;
44057 break;
44058 default:
44059 gcc_unreachable ();
44061 goto half;
44063 case 16:
44064 switch (mode)
44066 case E_V16SImode:
44067 cmode = V2SImode;
44068 hmode = V4SImode;
44069 gmode = V8SImode;
44070 break;
44071 case E_V16SFmode:
44072 cmode = V2SFmode;
44073 hmode = V4SFmode;
44074 gmode = V8SFmode;
44075 break;
44076 default:
44077 gcc_unreachable ();
44079 goto half;
44081 half:
44082 /* FIXME: We process inputs backward to help RA. PR 36222. */
44083 i = n - 1;
44084 j = (n >> 1) - 1;
44085 for (; i > 0; i -= 2, j--)
44087 first[j] = gen_reg_rtx (cmode);
44088 v = gen_rtvec (2, ops[i - 1], ops[i]);
44089 ix86_expand_vector_init (false, first[j],
44090 gen_rtx_PARALLEL (cmode, v));
44093 n >>= 1;
44094 if (n > 4)
44096 gcc_assert (hmode != VOIDmode);
44097 gcc_assert (gmode != VOIDmode);
44098 for (i = j = 0; i < n; i += 2, j++)
44100 second[j] = gen_reg_rtx (hmode);
44101 ix86_expand_vector_init_concat (hmode, second [j],
44102 &first [i], 2);
44104 n >>= 1;
44105 for (i = j = 0; i < n; i += 2, j++)
44107 third[j] = gen_reg_rtx (gmode);
44108 ix86_expand_vector_init_concat (gmode, third[j],
44109 &second[i], 2);
44111 n >>= 1;
44112 ix86_expand_vector_init_concat (mode, target, third, n);
44114 else if (n > 2)
44116 gcc_assert (hmode != VOIDmode);
44117 for (i = j = 0; i < n; i += 2, j++)
44119 second[j] = gen_reg_rtx (hmode);
44120 ix86_expand_vector_init_concat (hmode, second [j],
44121 &first [i], 2);
44123 n >>= 1;
44124 ix86_expand_vector_init_concat (mode, target, second, n);
44126 else
44127 ix86_expand_vector_init_concat (mode, target, first, n);
44128 break;
44130 default:
44131 gcc_unreachable ();
44135 /* A subroutine of ix86_expand_vector_init_general. Use vector
44136 interleave to handle the most general case: all values variable,
44137 and none identical. */
44139 static void
44140 ix86_expand_vector_init_interleave (machine_mode mode,
44141 rtx target, rtx *ops, int n)
44143 machine_mode first_imode, second_imode, third_imode, inner_mode;
44144 int i, j;
44145 rtx op0, op1;
44146 rtx (*gen_load_even) (rtx, rtx, rtx);
44147 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
44148 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
44150 switch (mode)
44152 case E_V8HImode:
44153 gen_load_even = gen_vec_setv8hi;
44154 gen_interleave_first_low = gen_vec_interleave_lowv4si;
44155 gen_interleave_second_low = gen_vec_interleave_lowv2di;
44156 inner_mode = HImode;
44157 first_imode = V4SImode;
44158 second_imode = V2DImode;
44159 third_imode = VOIDmode;
44160 break;
44161 case E_V16QImode:
44162 gen_load_even = gen_vec_setv16qi;
44163 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
44164 gen_interleave_second_low = gen_vec_interleave_lowv4si;
44165 inner_mode = QImode;
44166 first_imode = V8HImode;
44167 second_imode = V4SImode;
44168 third_imode = V2DImode;
44169 break;
44170 default:
44171 gcc_unreachable ();
44174 for (i = 0; i < n; i++)
44176 /* Extend the odd element to SImode using a paradoxical SUBREG. */
44177 op0 = gen_reg_rtx (SImode);
44178 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
44180 /* Insert the SImode value as low element of V4SImode vector. */
44181 op1 = gen_reg_rtx (V4SImode);
44182 op0 = gen_rtx_VEC_MERGE (V4SImode,
44183 gen_rtx_VEC_DUPLICATE (V4SImode,
44184 op0),
44185 CONST0_RTX (V4SImode),
44186 const1_rtx);
44187 emit_insn (gen_rtx_SET (op1, op0));
44189 /* Cast the V4SImode vector back to a vector in the original mode. */
44190 op0 = gen_reg_rtx (mode);
44191 emit_move_insn (op0, gen_lowpart (mode, op1));
44193 /* Load even elements into the second position. */
44194 emit_insn (gen_load_even (op0,
44195 force_reg (inner_mode,
44196 ops [i + i + 1]),
44197 const1_rtx));
44199 /* Cast vector to FIRST_IMODE vector. */
44200 ops[i] = gen_reg_rtx (first_imode);
44201 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
44204 /* Interleave low FIRST_IMODE vectors. */
44205 for (i = j = 0; i < n; i += 2, j++)
44207 op0 = gen_reg_rtx (first_imode);
44208 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
44210 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
44211 ops[j] = gen_reg_rtx (second_imode);
44212 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
44215 /* Interleave low SECOND_IMODE vectors. */
44216 switch (second_imode)
44218 case E_V4SImode:
44219 for (i = j = 0; i < n / 2; i += 2, j++)
44221 op0 = gen_reg_rtx (second_imode);
44222 emit_insn (gen_interleave_second_low (op0, ops[i],
44223 ops[i + 1]));
44225 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
44226 vector. */
44227 ops[j] = gen_reg_rtx (third_imode);
44228 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
44230 second_imode = V2DImode;
44231 gen_interleave_second_low = gen_vec_interleave_lowv2di;
44232 /* FALLTHRU */
44234 case E_V2DImode:
44235 op0 = gen_reg_rtx (second_imode);
44236 emit_insn (gen_interleave_second_low (op0, ops[0],
44237 ops[1]));
44239 /* Cast the SECOND_IMODE vector back to a vector in the original
44240 mode. */
44241 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
44242 break;
44244 default:
44245 gcc_unreachable ();
44249 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
44250 all values variable, and none identical. */
44252 static void
44253 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
44254 rtx target, rtx vals)
44256 rtx ops[64], op0, op1, op2, op3, op4, op5;
44257 machine_mode half_mode = VOIDmode;
44258 machine_mode quarter_mode = VOIDmode;
44259 int n, i;
44261 switch (mode)
44263 case E_V2SFmode:
44264 case E_V2SImode:
44265 if (!mmx_ok && !TARGET_SSE)
44266 break;
44267 /* FALLTHRU */
44269 case E_V16SImode:
44270 case E_V16SFmode:
44271 case E_V8DFmode:
44272 case E_V8DImode:
44273 case E_V8SFmode:
44274 case E_V8SImode:
44275 case E_V4DFmode:
44276 case E_V4DImode:
44277 case E_V4SFmode:
44278 case E_V4SImode:
44279 case E_V2DFmode:
44280 case E_V2DImode:
44281 n = GET_MODE_NUNITS (mode);
44282 for (i = 0; i < n; i++)
44283 ops[i] = XVECEXP (vals, 0, i);
44284 ix86_expand_vector_init_concat (mode, target, ops, n);
44285 return;
44287 case E_V2TImode:
44288 for (i = 0; i < 2; i++)
44289 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
44290 op0 = gen_reg_rtx (V4DImode);
44291 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
44292 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
44293 return;
44295 case E_V4TImode:
44296 for (i = 0; i < 4; i++)
44297 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
44298 ops[4] = gen_reg_rtx (V4DImode);
44299 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
44300 ops[5] = gen_reg_rtx (V4DImode);
44301 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
44302 op0 = gen_reg_rtx (V8DImode);
44303 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
44304 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
44305 return;
44307 case E_V32QImode:
44308 half_mode = V16QImode;
44309 goto half;
44311 case E_V16HImode:
44312 half_mode = V8HImode;
44313 goto half;
44315 half:
44316 n = GET_MODE_NUNITS (mode);
44317 for (i = 0; i < n; i++)
44318 ops[i] = XVECEXP (vals, 0, i);
44319 op0 = gen_reg_rtx (half_mode);
44320 op1 = gen_reg_rtx (half_mode);
44321 ix86_expand_vector_init_interleave (half_mode, op0, ops,
44322 n >> 2);
44323 ix86_expand_vector_init_interleave (half_mode, op1,
44324 &ops [n >> 1], n >> 2);
44325 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
44326 return;
44328 case E_V64QImode:
44329 quarter_mode = V16QImode;
44330 half_mode = V32QImode;
44331 goto quarter;
44333 case E_V32HImode:
44334 quarter_mode = V8HImode;
44335 half_mode = V16HImode;
44336 goto quarter;
44338 quarter:
44339 n = GET_MODE_NUNITS (mode);
44340 for (i = 0; i < n; i++)
44341 ops[i] = XVECEXP (vals, 0, i);
44342 op0 = gen_reg_rtx (quarter_mode);
44343 op1 = gen_reg_rtx (quarter_mode);
44344 op2 = gen_reg_rtx (quarter_mode);
44345 op3 = gen_reg_rtx (quarter_mode);
44346 op4 = gen_reg_rtx (half_mode);
44347 op5 = gen_reg_rtx (half_mode);
44348 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
44349 n >> 3);
44350 ix86_expand_vector_init_interleave (quarter_mode, op1,
44351 &ops [n >> 2], n >> 3);
44352 ix86_expand_vector_init_interleave (quarter_mode, op2,
44353 &ops [n >> 1], n >> 3);
44354 ix86_expand_vector_init_interleave (quarter_mode, op3,
44355 &ops [(n >> 1) | (n >> 2)], n >> 3);
44356 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
44357 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
44358 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
44359 return;
44361 case E_V16QImode:
44362 if (!TARGET_SSE4_1)
44363 break;
44364 /* FALLTHRU */
44366 case E_V8HImode:
44367 if (!TARGET_SSE2)
44368 break;
44370 /* Don't use ix86_expand_vector_init_interleave if we can't
44371 move from GPR to SSE register directly. */
44372 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
44373 break;
44375 n = GET_MODE_NUNITS (mode);
44376 for (i = 0; i < n; i++)
44377 ops[i] = XVECEXP (vals, 0, i);
44378 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
44379 return;
44381 case E_V4HImode:
44382 case E_V8QImode:
44383 break;
44385 default:
44386 gcc_unreachable ();
44390 int i, j, n_elts, n_words, n_elt_per_word;
44391 machine_mode inner_mode;
44392 rtx words[4], shift;
44394 inner_mode = GET_MODE_INNER (mode);
44395 n_elts = GET_MODE_NUNITS (mode);
44396 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
44397 n_elt_per_word = n_elts / n_words;
44398 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
44400 for (i = 0; i < n_words; ++i)
44402 rtx word = NULL_RTX;
44404 for (j = 0; j < n_elt_per_word; ++j)
44406 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
44407 elt = convert_modes (word_mode, inner_mode, elt, true);
44409 if (j == 0)
44410 word = elt;
44411 else
44413 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
44414 word, 1, OPTAB_LIB_WIDEN);
44415 word = expand_simple_binop (word_mode, IOR, word, elt,
44416 word, 1, OPTAB_LIB_WIDEN);
44420 words[i] = word;
44423 if (n_words == 1)
44424 emit_move_insn (target, gen_lowpart (mode, words[0]));
44425 else if (n_words == 2)
44427 rtx tmp = gen_reg_rtx (mode);
44428 emit_clobber (tmp);
44429 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
44430 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
44431 emit_move_insn (target, tmp);
44433 else if (n_words == 4)
44435 rtx tmp = gen_reg_rtx (V4SImode);
44436 gcc_assert (word_mode == SImode);
44437 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
44438 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
44439 emit_move_insn (target, gen_lowpart (mode, tmp));
44441 else
44442 gcc_unreachable ();
44446 /* Initialize vector TARGET via VALS. Suppress the use of MMX
44447 instructions unless MMX_OK is true. */
44449 void
44450 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
44452 machine_mode mode = GET_MODE (target);
44453 machine_mode inner_mode = GET_MODE_INNER (mode);
44454 int n_elts = GET_MODE_NUNITS (mode);
44455 int n_var = 0, one_var = -1;
44456 bool all_same = true, all_const_zero = true;
44457 int i;
44458 rtx x;
44460 /* Handle initialization from vector elements (sub-vectors) first. */
44461 if (n_elts != XVECLEN (vals, 0))
44463 rtx subtarget = target;
44464 x = XVECEXP (vals, 0, 0);
44465 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
44466 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
44468 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
44469 if (inner_mode == QImode || inner_mode == HImode)
44471 mode = mode_for_vector (SImode,
44472 n_elts * GET_MODE_SIZE (inner_mode) / 4);
44473 inner_mode
44474 = mode_for_vector (SImode,
44475 n_elts * GET_MODE_SIZE (inner_mode) / 8);
44476 ops[0] = gen_lowpart (inner_mode, ops[0]);
44477 ops[1] = gen_lowpart (inner_mode, ops[1]);
44478 subtarget = gen_reg_rtx (mode);
44480 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
44481 if (subtarget != target)
44482 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
44483 return;
44485 gcc_unreachable ();
44488 for (i = 0; i < n_elts; ++i)
44490 x = XVECEXP (vals, 0, i);
44491 if (!(CONST_SCALAR_INT_P (x)
44492 || CONST_DOUBLE_P (x)
44493 || CONST_FIXED_P (x)))
44494 n_var++, one_var = i;
44495 else if (x != CONST0_RTX (inner_mode))
44496 all_const_zero = false;
44497 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
44498 all_same = false;
44501 /* Constants are best loaded from the constant pool. */
44502 if (n_var == 0)
44504 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
44505 return;
44508 /* If all values are identical, broadcast the value. */
44509 if (all_same
44510 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
44511 XVECEXP (vals, 0, 0)))
44512 return;
44514 /* Values where only one field is non-constant are best loaded from
44515 the pool and overwritten via move later. */
44516 if (n_var == 1)
44518 if (all_const_zero
44519 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
44520 XVECEXP (vals, 0, one_var),
44521 one_var))
44522 return;
44524 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
44525 return;
44528 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
44531 void
44532 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
44534 machine_mode mode = GET_MODE (target);
44535 machine_mode inner_mode = GET_MODE_INNER (mode);
44536 machine_mode half_mode;
44537 bool use_vec_merge = false;
44538 rtx tmp;
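/* The tables below are indexed first by 256-bit mode (j == 0..5 for
   V32QI, V16HI, V8SI, V4DI, V8SF and V4DF, matching the cases further
   down) and then by which 128-bit half (0 == low, 1 == high) is being
   extracted or reinserted.  */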
44539 static rtx (*gen_extract[6][2]) (rtx, rtx)
44541 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
44542 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
44543 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
44544 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
44545 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
44546 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
44548 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
44550 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
44551 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
44552 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
44553 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
44554 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
44555 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
44557 int i, j, n;
44558 machine_mode mmode = VOIDmode;
44559 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
44561 switch (mode)
44563 case E_V2SFmode:
44564 case E_V2SImode:
44565 if (mmx_ok)
44567 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
44568 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
44569 if (elt == 0)
44570 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
44571 else
44572 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
44573 emit_insn (gen_rtx_SET (target, tmp));
44574 return;
44576 break;
44578 case E_V2DImode:
44579 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
44580 if (use_vec_merge)
44581 break;
44583 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
44584 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
44585 if (elt == 0)
44586 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
44587 else
44588 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
44589 emit_insn (gen_rtx_SET (target, tmp));
44590 return;
44592 case E_V2DFmode:
44594 rtx op0, op1;
44596 /* For the two element vectors, we implement a VEC_CONCAT with
44597 the extraction of the other element. */
44599 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
44600 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
44602 if (elt == 0)
44603 op0 = val, op1 = tmp;
44604 else
44605 op0 = tmp, op1 = val;
44607 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
44608 emit_insn (gen_rtx_SET (target, tmp));
44610 return;
44612 case E_V4SFmode:
44613 use_vec_merge = TARGET_SSE4_1;
44614 if (use_vec_merge)
44615 break;
44617 switch (elt)
44619 case 0:
44620 use_vec_merge = true;
44621 break;
44623 case 1:
44624 /* tmp = target = A B C D */
44625 tmp = copy_to_reg (target);
44626 /* target = A A B B */
44627 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
44628 /* target = X A B B */
44629 ix86_expand_vector_set (false, target, val, 0);
44630 /* target = A X C D */
44631 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
44632 const1_rtx, const0_rtx,
44633 GEN_INT (2+4), GEN_INT (3+4)));
44634 return;
44636 case 2:
44637 /* tmp = target = A B C D */
44638 tmp = copy_to_reg (target);
44639 /* tmp = X B C D */
44640 ix86_expand_vector_set (false, tmp, val, 0);
44641 /* target = A B X D */
44642 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
44643 const0_rtx, const1_rtx,
44644 GEN_INT (0+4), GEN_INT (3+4)));
44645 return;
44647 case 3:
44648 /* tmp = target = A B C D */
44649 tmp = copy_to_reg (target);
44650 /* tmp = X B C D */
44651 ix86_expand_vector_set (false, tmp, val, 0);
44652 /* target = A B C X */
44653 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
44654 const0_rtx, const1_rtx,
44655 GEN_INT (2+4), GEN_INT (0+4)));
44656 return;
44658 default:
44659 gcc_unreachable ();
44661 break;
44663 case E_V4SImode:
44664 use_vec_merge = TARGET_SSE4_1;
44665 if (use_vec_merge)
44666 break;
44668 /* Element 0 handled by vec_merge below. */
44669 if (elt == 0)
44671 use_vec_merge = true;
44672 break;
44675 if (TARGET_SSE2)
44677 /* With SSE2, use integer shuffles to swap element 0 and ELT,
44678 store into element 0, then shuffle them back. */
44680 rtx order[4];
44682 order[0] = GEN_INT (elt);
44683 order[1] = const1_rtx;
44684 order[2] = const2_rtx;
44685 order[3] = GEN_INT (3);
44686 order[elt] = const0_rtx;
44688 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
44689 order[1], order[2], order[3]));
44691 ix86_expand_vector_set (false, target, val, 0);
44693 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
44694 order[1], order[2], order[3]));
44696 else
44698 /* For SSE1, we have to reuse the V4SF code. */
44699 rtx t = gen_reg_rtx (V4SFmode);
44700 emit_move_insn (t, gen_lowpart (V4SFmode, target));
44701 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
44702 emit_move_insn (target, gen_lowpart (mode, t));
44704 return;
44706 case E_V8HImode:
44707 use_vec_merge = TARGET_SSE2;
44708 break;
44709 case E_V4HImode:
44710 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
44711 break;
44713 case E_V16QImode:
44714 use_vec_merge = TARGET_SSE4_1;
44715 break;
44717 case E_V8QImode:
44718 break;
44720 case E_V32QImode:
44721 half_mode = V16QImode;
44722 j = 0;
44723 n = 16;
44724 goto half;
44726 case E_V16HImode:
44727 half_mode = V8HImode;
44728 j = 1;
44729 n = 8;
44730 goto half;
44732 case E_V8SImode:
44733 half_mode = V4SImode;
44734 j = 2;
44735 n = 4;
44736 goto half;
44738 case E_V4DImode:
44739 half_mode = V2DImode;
44740 j = 3;
44741 n = 2;
44742 goto half;
44744 case E_V8SFmode:
44745 half_mode = V4SFmode;
44746 j = 4;
44747 n = 4;
44748 goto half;
44750 case E_V4DFmode:
44751 half_mode = V2DFmode;
44752 j = 5;
44753 n = 2;
44754 goto half;
44756 half:
44757 /* Compute offset. */
44758 i = elt / n;
44759 elt %= n;
44761 gcc_assert (i <= 1);
44763 /* Extract the half. */
44764 tmp = gen_reg_rtx (half_mode);
44765 emit_insn (gen_extract[j][i] (tmp, target));
44767 /* Put val in tmp at elt. */
44768 ix86_expand_vector_set (false, tmp, val, elt);
44770 /* Put it back. */
44771 emit_insn (gen_insert[j][i] (target, target, tmp));
44772 return;
44774 case E_V8DFmode:
44775 if (TARGET_AVX512F)
44777 mmode = QImode;
44778 gen_blendm = gen_avx512f_blendmv8df;
44780 break;
44782 case E_V8DImode:
44783 if (TARGET_AVX512F)
44785 mmode = QImode;
44786 gen_blendm = gen_avx512f_blendmv8di;
44788 break;
44790 case E_V16SFmode:
44791 if (TARGET_AVX512F)
44793 mmode = HImode;
44794 gen_blendm = gen_avx512f_blendmv16sf;
44796 break;
44798 case E_V16SImode:
44799 if (TARGET_AVX512F)
44801 mmode = HImode;
44802 gen_blendm = gen_avx512f_blendmv16si;
44804 break;
44806 case E_V32HImode:
44807 if (TARGET_AVX512F && TARGET_AVX512BW)
44809 mmode = SImode;
44810 gen_blendm = gen_avx512bw_blendmv32hi;
44812 break;
44814 case E_V64QImode:
44815 if (TARGET_AVX512F && TARGET_AVX512BW)
44817 mmode = DImode;
44818 gen_blendm = gen_avx512bw_blendmv64qi;
44820 break;
44822 default:
44823 break;
44826 if (mmode != VOIDmode)
44828 tmp = gen_reg_rtx (mode);
44829 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
44830 /* The avx512*_blendm<mode> expanders have different operand order
44831 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
44832 elements where the mask is set and the second input operand otherwise,
44833 whereas in {sse,avx}*_*blend* the first input operand is used for elements
44834 where the mask is clear and second input operand otherwise. */
44835 emit_insn (gen_blendm (target, target, tmp,
44836 force_reg (mmode,
44837 gen_int_mode (1 << elt, mmode))));
44839 else if (use_vec_merge)
44841 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
44842 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
44843 emit_insn (gen_rtx_SET (target, tmp));
44845 else
44847 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
44849 emit_move_insn (mem, target);
44851 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
44852 emit_move_insn (tmp, val);
44854 emit_move_insn (target, mem);
44858 void
44859 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
44861 machine_mode mode = GET_MODE (vec);
44862 machine_mode inner_mode = GET_MODE_INNER (mode);
44863 bool use_vec_extr = false;
44864 rtx tmp;
44866 switch (mode)
44868 case E_V2SImode:
44869 case E_V2SFmode:
44870 if (!mmx_ok)
44871 break;
44872 /* FALLTHRU */
44874 case E_V2DFmode:
44875 case E_V2DImode:
44876 case E_V2TImode:
44877 case E_V4TImode:
44878 use_vec_extr = true;
44879 break;
44881 case E_V4SFmode:
44882 use_vec_extr = TARGET_SSE4_1;
44883 if (use_vec_extr)
44884 break;
44886 switch (elt)
44888 case 0:
44889 tmp = vec;
44890 break;
44892 case 1:
44893 case 3:
44894 tmp = gen_reg_rtx (mode);
44895 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
44896 GEN_INT (elt), GEN_INT (elt),
44897 GEN_INT (elt+4), GEN_INT (elt+4)));
44898 break;
44900 case 2:
44901 tmp = gen_reg_rtx (mode);
44902 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
44903 break;
44905 default:
44906 gcc_unreachable ();
44908 vec = tmp;
44909 use_vec_extr = true;
44910 elt = 0;
44911 break;
44913 case E_V4SImode:
44914 use_vec_extr = TARGET_SSE4_1;
44915 if (use_vec_extr)
44916 break;
44918 if (TARGET_SSE2)
44920 switch (elt)
44922 case 0:
44923 tmp = vec;
44924 break;
44926 case 1:
44927 case 3:
44928 tmp = gen_reg_rtx (mode);
44929 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
44930 GEN_INT (elt), GEN_INT (elt),
44931 GEN_INT (elt), GEN_INT (elt)));
44932 break;
44934 case 2:
44935 tmp = gen_reg_rtx (mode);
44936 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
44937 break;
44939 default:
44940 gcc_unreachable ();
44942 vec = tmp;
44943 use_vec_extr = true;
44944 elt = 0;
44946 else
44948 /* For SSE1, we have to reuse the V4SF code. */
44949 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
44950 gen_lowpart (V4SFmode, vec), elt);
44951 return;
44953 break;
44955 case E_V8HImode:
44956 use_vec_extr = TARGET_SSE2;
44957 break;
44958 case E_V4HImode:
44959 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
44960 break;
44962 case E_V16QImode:
44963 use_vec_extr = TARGET_SSE4_1;
44964 break;
44966 case E_V8SFmode:
44967 if (TARGET_AVX)
44969 tmp = gen_reg_rtx (V4SFmode);
44970 if (elt < 4)
44971 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
44972 else
44973 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
44974 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44975 return;
44977 break;
44979 case E_V4DFmode:
44980 if (TARGET_AVX)
44982 tmp = gen_reg_rtx (V2DFmode);
44983 if (elt < 2)
44984 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
44985 else
44986 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
44987 ix86_expand_vector_extract (false, target, tmp, elt & 1);
44988 return;
44990 break;
44992 case E_V32QImode:
44993 if (TARGET_AVX)
44995 tmp = gen_reg_rtx (V16QImode);
44996 if (elt < 16)
44997 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
44998 else
44999 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
45000 ix86_expand_vector_extract (false, target, tmp, elt & 15);
45001 return;
45003 break;
45005 case E_V16HImode:
45006 if (TARGET_AVX)
45008 tmp = gen_reg_rtx (V8HImode);
45009 if (elt < 8)
45010 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
45011 else
45012 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
45013 ix86_expand_vector_extract (false, target, tmp, elt & 7);
45014 return;
45016 break;
45018 case E_V8SImode:
45019 if (TARGET_AVX)
45021 tmp = gen_reg_rtx (V4SImode);
45022 if (elt < 4)
45023 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
45024 else
45025 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
45026 ix86_expand_vector_extract (false, target, tmp, elt & 3);
45027 return;
45029 break;
45031 case E_V4DImode:
45032 if (TARGET_AVX)
45034 tmp = gen_reg_rtx (V2DImode);
45035 if (elt < 2)
45036 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
45037 else
45038 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
45039 ix86_expand_vector_extract (false, target, tmp, elt & 1);
45040 return;
45042 break;
45044 case E_V32HImode:
45045 if (TARGET_AVX512BW)
45047 tmp = gen_reg_rtx (V16HImode);
45048 if (elt < 16)
45049 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
45050 else
45051 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
45052 ix86_expand_vector_extract (false, target, tmp, elt & 15);
45053 return;
45055 break;
45057 case E_V64QImode:
45058 if (TARGET_AVX512BW)
45060 tmp = gen_reg_rtx (V32QImode);
45061 if (elt < 32)
45062 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
45063 else
45064 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
45065 ix86_expand_vector_extract (false, target, tmp, elt & 31);
45066 return;
45068 break;
45070 case E_V16SFmode:
45071 tmp = gen_reg_rtx (V8SFmode);
45072 if (elt < 8)
45073 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
45074 else
45075 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
45076 ix86_expand_vector_extract (false, target, tmp, elt & 7);
45077 return;
45079 case E_V8DFmode:
45080 tmp = gen_reg_rtx (V4DFmode);
45081 if (elt < 4)
45082 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
45083 else
45084 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
45085 ix86_expand_vector_extract (false, target, tmp, elt & 3);
45086 return;
45088 case E_V16SImode:
45089 tmp = gen_reg_rtx (V8SImode);
45090 if (elt < 8)
45091 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
45092 else
45093 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
45094 ix86_expand_vector_extract (false, target, tmp, elt & 7);
45095 return;
45097 case E_V8DImode:
45098 tmp = gen_reg_rtx (V4DImode);
45099 if (elt < 4)
45100 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
45101 else
45102 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
45103 ix86_expand_vector_extract (false, target, tmp, elt & 3);
45104 return;
45106 case E_V8QImode:
45107 /* ??? Could extract the appropriate HImode element and shift. */
45108 default:
45109 break;
45112 if (use_vec_extr)
45114 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
45115 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
45117 /* Let the rtl optimizers know about the zero extension performed. */
45118 if (inner_mode == QImode || inner_mode == HImode)
45120 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
45121 target = gen_lowpart (SImode, target);
45124 emit_insn (gen_rtx_SET (target, tmp));
45126 else
45128 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
45130 emit_move_insn (mem, vec);
45132 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
45133 emit_move_insn (target, tmp);
45137 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
45138 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
45139 The upper bits of DEST are undefined, though they shouldn't cause
45140 exceptions (some bits from src or all zeros are ok). */
45142 static void
45143 emit_reduc_half (rtx dest, rtx src, int i)
45145 rtx tem, d = dest;
45146 switch (GET_MODE (src))
45148 case E_V4SFmode:
45149 if (i == 128)
45150 tem = gen_sse_movhlps (dest, src, src);
45151 else
45152 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
45153 GEN_INT (1 + 4), GEN_INT (1 + 4));
45154 break;
45155 case E_V2DFmode:
45156 tem = gen_vec_interleave_highv2df (dest, src, src);
45157 break;
45158 case E_V16QImode:
45159 case E_V8HImode:
45160 case E_V4SImode:
45161 case E_V2DImode:
45162 d = gen_reg_rtx (V1TImode);
45163 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
45164 GEN_INT (i / 2));
45165 break;
45166 case E_V8SFmode:
45167 if (i == 256)
45168 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
45169 else
45170 tem = gen_avx_shufps256 (dest, src, src,
45171 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
45172 break;
45173 case E_V4DFmode:
45174 if (i == 256)
45175 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
45176 else
45177 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
45178 break;
45179 case E_V32QImode:
45180 case E_V16HImode:
45181 case E_V8SImode:
45182 case E_V4DImode:
45183 if (i == 256)
45185 if (GET_MODE (dest) != V4DImode)
45186 d = gen_reg_rtx (V4DImode);
45187 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
45188 gen_lowpart (V4DImode, src),
45189 const1_rtx);
45191 else
45193 d = gen_reg_rtx (V2TImode);
45194 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
45195 GEN_INT (i / 2));
45197 break;
45198 case E_V64QImode:
45199 case E_V32HImode:
45200 case E_V16SImode:
45201 case E_V16SFmode:
45202 case E_V8DImode:
45203 case E_V8DFmode:
45204 if (i > 128)
45205 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
45206 gen_lowpart (V16SImode, src),
45207 gen_lowpart (V16SImode, src),
45208 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
45209 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
45210 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
45211 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
45212 GEN_INT (0xC), GEN_INT (0xD),
45213 GEN_INT (0xE), GEN_INT (0xF),
45214 GEN_INT (0x10), GEN_INT (0x11),
45215 GEN_INT (0x12), GEN_INT (0x13),
45216 GEN_INT (0x14), GEN_INT (0x15),
45217 GEN_INT (0x16), GEN_INT (0x17));
45218 else
45219 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
45220 gen_lowpart (V16SImode, src),
45221 GEN_INT (i == 128 ? 0x2 : 0x1),
45222 GEN_INT (0x3),
45223 GEN_INT (0x3),
45224 GEN_INT (0x3),
45225 GEN_INT (i == 128 ? 0x6 : 0x5),
45226 GEN_INT (0x7),
45227 GEN_INT (0x7),
45228 GEN_INT (0x7),
45229 GEN_INT (i == 128 ? 0xA : 0x9),
45230 GEN_INT (0xB),
45231 GEN_INT (0xB),
45232 GEN_INT (0xB),
45233 GEN_INT (i == 128 ? 0xE : 0xD),
45234 GEN_INT (0xF),
45235 GEN_INT (0xF),
45236 GEN_INT (0xF));
45237 break;
45238 default:
45239 gcc_unreachable ();
45241 emit_insn (tem);
45242 if (d != dest)
45243 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
45246 /* Expand a vector reduction. FN is the binary pattern to reduce;
45247 DEST is the destination; IN is the input vector. */
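/* For a V4SImode input, for instance, the loop below first folds the upper
   64 bits onto the lower 64 and applies FN (i == 128), then folds the upper
   32 bits of that result onto the lower 32 (i == 64), leaving the reduction
   in element 0 of DEST.  */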
45249 void
45250 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
45252 rtx half, dst, vec = in;
45253 machine_mode mode = GET_MODE (in);
45254 int i;
45256 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
45257 if (TARGET_SSE4_1
45258 && mode == V8HImode
45259 && fn == gen_uminv8hi3)
45261 emit_insn (gen_sse4_1_phminposuw (dest, in));
45262 return;
45265 for (i = GET_MODE_BITSIZE (mode);
45266 i > GET_MODE_UNIT_BITSIZE (mode);
45267 i >>= 1)
45269 half = gen_reg_rtx (mode);
45270 emit_reduc_half (half, vec, i);
45271 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
45272 dst = dest;
45273 else
45274 dst = gen_reg_rtx (mode);
45275 emit_insn (fn (dst, half, vec));
45276 vec = dst;
45280 /* Target hook for scalar_mode_supported_p. */
45281 static bool
45282 ix86_scalar_mode_supported_p (machine_mode mode)
45284 if (DECIMAL_FLOAT_MODE_P (mode))
45285 return default_decimal_float_supported_p ();
45286 else if (mode == TFmode)
45287 return true;
45288 else
45289 return default_scalar_mode_supported_p (mode);
45292 /* Implements target hook vector_mode_supported_p. */
45293 static bool
45294 ix86_vector_mode_supported_p (machine_mode mode)
45296 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
45297 return true;
45298 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
45299 return true;
45300 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
45301 return true;
45302 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
45303 return true;
45304 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
45305 return true;
45306 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
45307 return true;
45308 return false;
45311 /* Target hook for c_mode_for_suffix. */
45312 static machine_mode
45313 ix86_c_mode_for_suffix (char suffix)
45315 if (suffix == 'q')
45316 return TFmode;
45317 if (suffix == 'w')
45318 return XFmode;
45320 return VOIDmode;
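/* Illustrative usage, not part of this file: the suffix mapping above is what
   makes the GCC floating-type extensions below parse on x86 targets ('q'
   constants get TFmode/__float128, 'w' constants get XFmode/__float80).
   The variable names are hypothetical.  */

__float128 sketch_pi_q = 3.141592653589793238462643383279502884q;
__float80  sketch_pi_w = 3.141592653589793238462643383279502884w;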
45323 /* Worker function for TARGET_MD_ASM_ADJUST.
45325 We implement asm flag outputs, and maintain source compatibility
45326 with the old cc0-based compiler. */
45328 static rtx_insn *
45329 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
45330 vec<const char *> &constraints,
45331 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
45333 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
45334 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
45336 bool saw_asm_flag = false;
45338 start_sequence ();
45339 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
45341 const char *con = constraints[i];
45342 if (strncmp (con, "=@cc", 4) != 0)
45343 continue;
45344 con += 4;
45345 if (strchr (con, ',') != NULL)
45347 error ("alternatives not allowed in asm flag output");
45348 continue;
45351 bool invert = false;
45352 if (con[0] == 'n')
45353 invert = true, con++;
45355 machine_mode mode = CCmode;
45356 rtx_code code = UNKNOWN;
45358 switch (con[0])
45360 case 'a':
45361 if (con[1] == 0)
45362 mode = CCAmode, code = EQ;
45363 else if (con[1] == 'e' && con[2] == 0)
45364 mode = CCCmode, code = NE;
45365 break;
45366 case 'b':
45367 if (con[1] == 0)
45368 mode = CCCmode, code = EQ;
45369 else if (con[1] == 'e' && con[2] == 0)
45370 mode = CCAmode, code = NE;
45371 break;
45372 case 'c':
45373 if (con[1] == 0)
45374 mode = CCCmode, code = EQ;
45375 break;
45376 case 'e':
45377 if (con[1] == 0)
45378 mode = CCZmode, code = EQ;
45379 break;
45380 case 'g':
45381 if (con[1] == 0)
45382 mode = CCGCmode, code = GT;
45383 else if (con[1] == 'e' && con[2] == 0)
45384 mode = CCGCmode, code = GE;
45385 break;
45386 case 'l':
45387 if (con[1] == 0)
45388 mode = CCGCmode, code = LT;
45389 else if (con[1] == 'e' && con[2] == 0)
45390 mode = CCGCmode, code = LE;
45391 break;
45392 case 'o':
45393 if (con[1] == 0)
45394 mode = CCOmode, code = EQ;
45395 break;
45396 case 'p':
45397 if (con[1] == 0)
45398 mode = CCPmode, code = EQ;
45399 break;
45400 case 's':
45401 if (con[1] == 0)
45402 mode = CCSmode, code = EQ;
45403 break;
45404 case 'z':
45405 if (con[1] == 0)
45406 mode = CCZmode, code = EQ;
45407 break;
45409 if (code == UNKNOWN)
45411 error ("unknown asm flag output %qs", constraints[i]);
45412 continue;
45414 if (invert)
45415 code = reverse_condition (code);
45417 rtx dest = outputs[i];
45418 if (!saw_asm_flag)
45420 /* This is the first asm flag output. Here we put the flags
45421 register in as the real output and adjust the condition to
45422 allow it. */
45423 constraints[i] = "=Bf";
45424 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
45425 saw_asm_flag = true;
45427 else
45429 /* We don't need the flags register as output twice. */
45430 constraints[i] = "=X";
45431 outputs[i] = gen_rtx_SCRATCH (SImode);
45434 rtx x = gen_rtx_REG (mode, FLAGS_REG);
45435 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
45437 machine_mode dest_mode = GET_MODE (dest);
45438 if (!SCALAR_INT_MODE_P (dest_mode))
45440 error ("invalid type for asm flag output");
45441 continue;
45444 if (dest_mode == DImode && !TARGET_64BIT)
45445 dest_mode = SImode;
45447 if (dest_mode != QImode)
45449 rtx destqi = gen_reg_rtx (QImode);
45450 emit_insn (gen_rtx_SET (destqi, x));
45452 if (TARGET_ZERO_EXTEND_WITH_AND
45453 && optimize_function_for_speed_p (cfun))
45455 x = force_reg (dest_mode, const0_rtx);
45457 emit_insn (gen_movstrictqi
45458 (gen_lowpart (QImode, x), destqi));
45460 else
45461 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
45464 if (dest_mode != GET_MODE (dest))
45466 rtx tmp = gen_reg_rtx (SImode);
45468 emit_insn (gen_rtx_SET (tmp, x));
45469 emit_insn (gen_zero_extendsidi2 (dest, tmp));
45471 else
45472 emit_insn (gen_rtx_SET (dest, x));
45474 rtx_insn *seq = get_insns ();
45475 end_sequence ();
45477 if (saw_asm_flag)
45478 return seq;
45479 else
45481 /* If we had no asm flag outputs, clobber the flags. */
45482 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
45483 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
45484 return NULL;
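/* Illustrative user-level example, not part of this file: the "=@cc<cond>"
   constraints parsed above let extended asm return a condition flag
   directly.  The helper below is a sketch assuming an x86-64 target;
   "=@ccc" requests the carry flag, which BT sets to the selected bit.  */

static int
sketch_bit_test (unsigned long word, unsigned long bit)
{
  int carry;
  __asm__ ("btq %2, %1" : "=@ccc" (carry) : "r" (word), "r" (bit));
  return carry;		/* 1 if the bit was set, 0 otherwise.  */
}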
45488 /* Implement the targetm.asm.encode_section_info target hook.  */
45490 static void ATTRIBUTE_UNUSED
45491 ix86_encode_section_info (tree decl, rtx rtl, int first)
45493 default_encode_section_info (decl, rtl, first);
45495 if (ix86_in_large_data_p (decl))
45496 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
45499 /* Worker function for REVERSE_CONDITION. */
45501 enum rtx_code
45502 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
45504 return (mode != CCFPmode && mode != CCFPUmode
45505 ? reverse_condition (code)
45506 : reverse_condition_maybe_unordered (code));
45509 /* Output code to perform an x87 FP register move, from OPERANDS[1]
45510 to OPERANDS[0]. */
45512 const char *
45513 output_387_reg_move (rtx_insn *insn, rtx *operands)
45515 if (REG_P (operands[0]))
45517 if (REG_P (operands[1])
45518 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
45520 if (REGNO (operands[0]) == FIRST_STACK_REG)
45521 return output_387_ffreep (operands, 0);
45522 return "fstp\t%y0";
45524 if (STACK_TOP_P (operands[0]))
45525 return "fld%Z1\t%y1";
45526 return "fst\t%y0";
45528 else if (MEM_P (operands[0]))
45530 gcc_assert (REG_P (operands[1]));
45531 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
45532 return "fstp%Z0\t%y0";
45533 else
45535 /* There is no non-popping store to memory for XFmode.
45536 So if we need one, follow the store with a load. */
45537 if (GET_MODE (operands[0]) == XFmode)
45538 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
45539 else
45540 return "fst%Z0\t%y0";
45543 else
45544 gcc_unreachable();
45547 /* Output code to perform a conditional jump to LABEL if the C2 flag in
45548    the FP status register is set.  */
45550 void
45551 ix86_emit_fp_unordered_jump (rtx label)
45553 rtx reg = gen_reg_rtx (HImode);
45554 rtx temp;
45556 emit_insn (gen_x86_fnstsw_1 (reg));
45558 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
45560 emit_insn (gen_x86_sahf_1 (reg));
45562 temp = gen_rtx_REG (CCmode, FLAGS_REG);
45563 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
45565 else
45567 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
45569 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
45570 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
45573 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
45574 gen_rtx_LABEL_REF (VOIDmode, label),
45575 pc_rtx);
45576 temp = gen_rtx_SET (pc_rtx, temp);
45578 emit_jump_insn (temp);
45579 predict_jump (REG_BR_PROB_BASE * 10 / 100);
45582 /* Output code to perform a log1p XFmode calculation. */
45584 void ix86_emit_i387_log1p (rtx op0, rtx op1)
45586 rtx_code_label *label1 = gen_label_rtx ();
45587 rtx_code_label *label2 = gen_label_rtx ();
45589 rtx tmp = gen_reg_rtx (XFmode);
45590 rtx tmp2 = gen_reg_rtx (XFmode);
45591 rtx test;
45593 emit_insn (gen_absxf2 (tmp, op1));
45594 test = gen_rtx_GE (VOIDmode, tmp,
45595 const_double_from_real_value (
45596 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
45597 XFmode));
45598 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
45600 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
45601 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
45602 emit_jump (label2);
45604 emit_label (label1);
45605 emit_move_insn (tmp, CONST1_RTX (XFmode));
45606 emit_insn (gen_addxf3 (tmp, op1, tmp));
45607 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
45608 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
45610 emit_label (label2);
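/* Illustrative scalar model, not part of this file, of the split above:
   fyl2xp1 computes y * log2 (x + 1) without forming 1 + x explicitly, but is
   only specified for small |x|, so larger inputs go through fadd + fyl2x.
   The helper name is hypothetical; <math.h> log2 stands in for the i387
   instructions.  */

#include <math.h>

static double
sketch_log1p (double x)
{
  const double ln2 = 0.69314718055994530942;	/* fldln2 */
  if (fabs (x) >= 0.29289321881345247561810596348408353)
    {
      double tmp = 1.0 + x;		/* fld1; fadd */
      return ln2 * log2 (tmp);		/* fyl2x with y = ln (2) */
    }
  return ln2 * log2 (1.0 + x);		/* models fyl2xp1 with y = ln (2) */
}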
45613 /* Output code to compute round (OP1), i.e. OP1 rounded to the nearest
   integer with halfway cases away from zero, and store the result in OP0.  */
45614 void ix86_emit_i387_round (rtx op0, rtx op1)
45616 machine_mode inmode = GET_MODE (op1);
45617 machine_mode outmode = GET_MODE (op0);
45618 rtx e1, e2, res, tmp, tmp1, half;
45619 rtx scratch = gen_reg_rtx (HImode);
45620 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
45621 rtx_code_label *jump_label = gen_label_rtx ();
45622 rtx insn;
45623 rtx (*gen_abs) (rtx, rtx);
45624 rtx (*gen_neg) (rtx, rtx);
45626 switch (inmode)
45628 case E_SFmode:
45629 gen_abs = gen_abssf2;
45630 break;
45631 case E_DFmode:
45632 gen_abs = gen_absdf2;
45633 break;
45634 case E_XFmode:
45635 gen_abs = gen_absxf2;
45636 break;
45637 default:
45638 gcc_unreachable ();
45641 switch (outmode)
45643 case E_SFmode:
45644 gen_neg = gen_negsf2;
45645 break;
45646 case E_DFmode:
45647 gen_neg = gen_negdf2;
45648 break;
45649 case E_XFmode:
45650 gen_neg = gen_negxf2;
45651 break;
45652 case E_HImode:
45653 gen_neg = gen_neghi2;
45654 break;
45655 case E_SImode:
45656 gen_neg = gen_negsi2;
45657 break;
45658 case E_DImode:
45659 gen_neg = gen_negdi2;
45660 break;
45661 default:
45662 gcc_unreachable ();
45665 e1 = gen_reg_rtx (inmode);
45666 e2 = gen_reg_rtx (inmode);
45667 res = gen_reg_rtx (outmode);
45669 half = const_double_from_real_value (dconsthalf, inmode);
45671 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
45673 /* scratch = fxam(op1) */
45674 emit_insn (gen_rtx_SET (scratch,
45675 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
45676 UNSPEC_FXAM)));
45677 /* e1 = fabs(op1) */
45678 emit_insn (gen_abs (e1, op1));
45680 /* e2 = e1 + 0.5 */
45681 half = force_reg (inmode, half);
45682 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
45684 /* res = floor(e2) */
45685 if (inmode != XFmode)
45687 tmp1 = gen_reg_rtx (XFmode);
45689 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
45691 else
45692 tmp1 = e2;
45694 switch (outmode)
45696 case E_SFmode:
45697 case E_DFmode:
45699 rtx tmp0 = gen_reg_rtx (XFmode);
45701 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
45703 emit_insn (gen_rtx_SET (res,
45704 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
45705 UNSPEC_TRUNC_NOOP)));
45707 break;
45708 case E_XFmode:
45709 emit_insn (gen_frndintxf2_floor (res, tmp1));
45710 break;
45711 case E_HImode:
45712 emit_insn (gen_lfloorxfhi2 (res, tmp1));
45713 break;
45714 case E_SImode:
45715 emit_insn (gen_lfloorxfsi2 (res, tmp1));
45716 break;
45717 case E_DImode:
45718 emit_insn (gen_lfloorxfdi2 (res, tmp1));
45719 break;
45720 default:
45721 gcc_unreachable ();
45724 /* flags = signbit(a) */
45725 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
45727 /* if (flags) then res = -res */
45728 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
45729 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
45730 gen_rtx_LABEL_REF (VOIDmode, jump_label),
45731 pc_rtx);
45732 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
45733 predict_jump (REG_BR_PROB_BASE * 50 / 100);
45734 JUMP_LABEL (insn) = jump_label;
45736 emit_insn (gen_neg (res, res));
45738 emit_label (jump_label);
45739 LABEL_NUSES (jump_label) = 1;
45741 emit_move_insn (op0, res);
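/* Illustrative scalar model, not part of this file, of the sequence above:
   round to nearest with halfway cases away from zero, via the sign of the
   input (fxam above) and floor (|a| + 0.5) (frndint with round-down).  The
   helper name is hypothetical; floor/fabs/signbit come from <math.h>.  */

#include <math.h>

static double
sketch_i387_round (double a)
{
  double r = floor (fabs (a) + 0.5);
  return signbit (a) ? -r : r;
}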
45744 /* Output code to perform a Newton-Raphson approximation of a single precision
45745 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
45747 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
45749 rtx x0, x1, e0, e1;
45751 x0 = gen_reg_rtx (mode);
45752 e0 = gen_reg_rtx (mode);
45753 e1 = gen_reg_rtx (mode);
45754 x1 = gen_reg_rtx (mode);
45756 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
45758 b = force_reg (mode, b);
45760 /* x0 = rcp(b) estimate */
45761 if (mode == V16SFmode || mode == V8DFmode)
45763 if (TARGET_AVX512ER)
45765 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45766 UNSPEC_RCP28)));
45767 /* res = a * x0 */
45768 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
45769 return;
45771 else
45772 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45773 UNSPEC_RCP14)));
45775 else
45776 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45777 UNSPEC_RCP)));
45779 /* e0 = x0 * b */
45780 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
45782 /* e0 = x0 * e0 */
45783 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
45785 /* e1 = x0 + x0 */
45786 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
45788 /* x1 = e1 - e0 */
45789 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
45791 /* res = a * x1 */
45792 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
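/* Illustrative scalar model, not part of this file, of the refinement above.
   Both helper names are hypothetical; sketch_rcp_estimate stands in for the
   low-precision rcpps/vrcp14 estimate and is modelled here by an exact
   divide.  */

static float
sketch_rcp_estimate (float b)
{
  return 1.0f / b;		/* hardware returns only an approximation */
}

static float
sketch_swdiv (float a, float b)
{
  float x0 = sketch_rcp_estimate (b);
  float e0 = x0 * b;		/* e0 = x0 * b */
  e0 = x0 * e0;			/* e0 = x0 * x0 * b */
  float e1 = x0 + x0;		/* e1 = 2 * x0 */
  float x1 = e1 - e0;		/* one Newton-Raphson step: x1 ~= 1/b */
  return a * x1;		/* res = a * x1 */
}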
45795 /* Output code to perform a Newton-Raphson approximation of a
45796 single precision floating point [reciprocal] square root. */
45798 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
45800 rtx x0, e0, e1, e2, e3, mthree, mhalf;
45801 REAL_VALUE_TYPE r;
45802 int unspec;
45804 x0 = gen_reg_rtx (mode);
45805 e0 = gen_reg_rtx (mode);
45806 e1 = gen_reg_rtx (mode);
45807 e2 = gen_reg_rtx (mode);
45808 e3 = gen_reg_rtx (mode);
45810 if (TARGET_AVX512ER && mode == V16SFmode)
45812 if (recip)
45813 /* res = rsqrt28(a) estimate */
45814 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45815 UNSPEC_RSQRT28)));
45816 else
45818 /* x0 = rsqrt28(a) estimate */
45819 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45820 UNSPEC_RSQRT28)));
45821 /* res = rcp28(x0) estimate */
45822 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
45823 UNSPEC_RCP28)));
45825 return;
45828 real_from_integer (&r, VOIDmode, -3, SIGNED);
45829 mthree = const_double_from_real_value (r, SFmode);
45831 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
45832 mhalf = const_double_from_real_value (r, SFmode);
45833 unspec = UNSPEC_RSQRT;
45835 if (VECTOR_MODE_P (mode))
45837 mthree = ix86_build_const_vector (mode, true, mthree);
45838 mhalf = ix86_build_const_vector (mode, true, mhalf);
45839 /* There is no 512-bit rsqrt. There is however rsqrt14. */
45840 if (GET_MODE_SIZE (mode) == 64)
45841 unspec = UNSPEC_RSQRT14;
45844 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
45845 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
45847 a = force_reg (mode, a);
45849 /* x0 = rsqrt(a) estimate */
45850 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45851 unspec)));
45853   /* If a == 0.0, zero out the infinite rsqrt estimate so that the 0 * inf
     in the steps below does not produce a NaN for sqrt (0.0).  */
45854 if (!recip)
45856 rtx zero = force_reg (mode, CONST0_RTX(mode));
45857 rtx mask;
45859 /* Handle masked compare. */
45860 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
45862 mask = gen_reg_rtx (HImode);
45863 /* Imm value 0x4 corresponds to not-equal comparison. */
45864 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
45865 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
45867 else
45869 mask = gen_reg_rtx (mode);
45870 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
45871 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
45875 /* e0 = x0 * a */
45876 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
45877 /* e1 = e0 * x0 */
45878 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
45880 /* e2 = e1 - 3. */
45881 mthree = force_reg (mode, mthree);
45882 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
45884 mhalf = force_reg (mode, mhalf);
45885 if (recip)
45886 /* e3 = -.5 * x0 */
45887 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
45888 else
45889 /* e3 = -.5 * e0 */
45890 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
45891 /* ret = e2 * e3 */
45892 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
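/* Illustrative scalar model, not part of this file, of the refinement above.
   The helper names are hypothetical; sketch_rsqrt_estimate stands in for the
   rsqrtps/vrsqrt14 estimate and is modelled with <math.h> sqrtf.  */

#include <math.h>

static float
sketch_rsqrt_estimate (float a)
{
  return 1.0f / sqrtf (a);	/* hardware returns only an approximation */
}

static float
sketch_swsqrt (float a, int recip)
{
  float x0 = sketch_rsqrt_estimate (a);
  float e0 = x0 * a;			/* e0 = a * x0 */
  float e1 = e0 * x0;			/* e1 = a * x0 * x0 */
  float e2 = e1 + -3.0f;		/* e2 = a * x0 * x0 - 3 */
  float e3 = (recip ? x0 : e0) * -0.5f;	/* -.5 * x0  or  -.5 * a * x0 */
  return e2 * e3;			/* rsqrt (a) or sqrt (a) */
}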
45895 #ifdef TARGET_SOLARIS
45896 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
45898 static void
45899 i386_solaris_elf_named_section (const char *name, unsigned int flags,
45900 tree decl)
45902 /* With Binutils 2.15, the "@unwind" marker must be specified on
45903 every occurrence of the ".eh_frame" section, not just the first
45904 one. */
45905 if (TARGET_64BIT
45906 && strcmp (name, ".eh_frame") == 0)
45908 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
45909 flags & SECTION_WRITE ? "aw" : "a");
45910 return;
45913 #ifndef USE_GAS
45914 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
45916 solaris_elf_asm_comdat_section (name, flags, decl);
45917 return;
45919 #endif
45921 default_elf_asm_named_section (name, flags, decl);
45923 #endif /* TARGET_SOLARIS */
45925 /* Return the mangling of TYPE if it is an extended fundamental type. */
45927 static const char *
45928 ix86_mangle_type (const_tree type)
45930 type = TYPE_MAIN_VARIANT (type);
45932 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
45933 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
45934 return NULL;
45936 switch (TYPE_MODE (type))
45938 case E_TFmode:
45939 /* __float128 is "g". */
45940 return "g";
45941 case E_XFmode:
45942 /* "long double" or __float80 is "e". */
45943 return "e";
45944 default:
45945 return NULL;
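/* Illustrative example, not part of this file: under the Itanium C++ ABI the
   manglings returned above give, for instance,
     void f (__float128)   ->  _Z1fg
     void f (long double)  ->  _Z1fe  */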
45949 static GTY(()) tree ix86_tls_stack_chk_guard_decl;
45951 static tree
45952 ix86_stack_protect_guard (void)
45954 if (TARGET_SSP_TLS_GUARD)
45956 tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1);
45957 int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg);
45958 tree type = build_qualified_type (type_node, qual);
45959 tree t;
45961 if (global_options_set.x_ix86_stack_protector_guard_symbol_str)
45963 t = ix86_tls_stack_chk_guard_decl;
45965 if (t == NULL)
45967 rtx x;
45969 t = build_decl
45970 (UNKNOWN_LOCATION, VAR_DECL,
45971 get_identifier (ix86_stack_protector_guard_symbol_str),
45972 type);
45973 TREE_STATIC (t) = 1;
45974 TREE_PUBLIC (t) = 1;
45975 DECL_EXTERNAL (t) = 1;
45976 TREE_USED (t) = 1;
45977 TREE_THIS_VOLATILE (t) = 1;
45978 DECL_ARTIFICIAL (t) = 1;
45979 DECL_IGNORED_P (t) = 1;
45981 /* Do not share RTL as the declaration is visible outside of
45982 	     the current function.  */
45983 x = DECL_RTL (t);
45984 RTX_FLAG (x, used) = 1;
45986 ix86_tls_stack_chk_guard_decl = t;
45989 else
45991 tree asptrtype = build_pointer_type (type);
45993 t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset);
45994 t = build2 (MEM_REF, asptrtype, t,
45995 build_int_cst (asptrtype, 0));
45998 return t;
46001 return default_stack_protect_guard ();
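/* Illustrative example, not part of this file: with the TLS guard chosen
   above, -fstack-protector on x86-64 GNU/Linux typically loads the canary
   through the thread pointer, e.g. "movq %fs:40, %rax" in the prologue.
   The -mstack-protector-guard-reg=, -mstack-protector-guard-offset= and
   -mstack-protector-guard-symbol= options supply the register, offset and
   symbol that this function checks.  */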
46004 /* For 32-bit code we can save PIC register setup by using the hidden
46005    __stack_chk_fail_local function instead of calling __stack_chk_fail
46006    directly.  64-bit code doesn't need to set up any PIC register, so it
46007    is better to call __stack_chk_fail directly.  */
46009 static tree ATTRIBUTE_UNUSED
46010 ix86_stack_protect_fail (void)
46012 return TARGET_64BIT
46013 ? default_external_stack_protect_fail ()
46014 : default_hidden_stack_protect_fail ();
46017 /* Select a format to encode pointers in exception handling data. CODE
46018 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
46019 true if the symbol may be affected by dynamic relocations.
46021 ??? All x86 object file formats are capable of representing this.
46022 After all, the relocation needed is the same as for the call insn.
46023 Whether or not a particular assembler allows us to enter such, I
46024 guess we'll have to see. */
46025 int
46026 asm_preferred_eh_data_format (int code, int global)
46028 if (flag_pic)
46030 int type = DW_EH_PE_sdata8;
46031 if (!TARGET_64BIT
46032 || ix86_cmodel == CM_SMALL_PIC
46033 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
46034 type = DW_EH_PE_sdata4;
46035 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
46037 if (ix86_cmodel == CM_SMALL
46038 || (ix86_cmodel == CM_MEDIUM && code))
46039 return DW_EH_PE_udata4;
46040 return DW_EH_PE_absptr;
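/* Illustrative examples, not part of this file, of what the logic above
   selects:
     -fpic, x86-64 small PIC model:  DW_EH_PE_pcrel | DW_EH_PE_sdata4
                                     (plus DW_EH_PE_indirect for GLOBAL)
     non-PIC, small code model:      DW_EH_PE_udata4
     any other non-PIC case:         DW_EH_PE_absptr  */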
46043 /* Expand copysign from SIGN to the positive value ABS_VALUE
46044 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
46045 the sign-bit. */
46046 static void
46047 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
46049 machine_mode mode = GET_MODE (sign);
46050 rtx sgn = gen_reg_rtx (mode);
46051 if (mask == NULL_RTX)
46053 machine_mode vmode;
46055 if (mode == SFmode)
46056 vmode = V4SFmode;
46057 else if (mode == DFmode)
46058 vmode = V2DFmode;
46059 else
46060 vmode = mode;
46062 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
46063 if (!VECTOR_MODE_P (mode))
46065 /* We need to generate a scalar mode mask in this case. */
46066 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
46067 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
46068 mask = gen_reg_rtx (mode);
46069 emit_insn (gen_rtx_SET (mask, tmp));
46072 else
46073 mask = gen_rtx_NOT (mode, mask);
46074 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
46075 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
46078 /* Expand fabs (OP0) and return a new rtx that holds the result. The
46079 mask for masking out the sign-bit is stored in *SMASK, if that is
46080 non-null. */
46081 static rtx
46082 ix86_expand_sse_fabs (rtx op0, rtx *smask)
46084 machine_mode vmode, mode = GET_MODE (op0);
46085 rtx xa, mask;
46087 xa = gen_reg_rtx (mode);
46088 if (mode == SFmode)
46089 vmode = V4SFmode;
46090 else if (mode == DFmode)
46091 vmode = V2DFmode;
46092 else
46093 vmode = mode;
46094 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
46095 if (!VECTOR_MODE_P (mode))
46097 /* We need to generate a scalar mode mask in this case. */
46098 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
46099 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
46100 mask = gen_reg_rtx (mode);
46101 emit_insn (gen_rtx_SET (mask, tmp));
46103 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
46105 if (smask)
46106 *smask = mask;
46108 return xa;
46111 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
46112 swapping the operands if SWAP_OPERANDS is true. The expanded
46113 code is a forward jump to a newly created label in case the
46114 comparison is true. The generated label rtx is returned. */
46115 static rtx_code_label *
46116 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
46117 bool swap_operands)
46119 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
46120 rtx_code_label *label;
46121 rtx tmp;
46123 if (swap_operands)
46124 std::swap (op0, op1);
46126 label = gen_label_rtx ();
46127 tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
46128 emit_insn (gen_rtx_SET (tmp, gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
46129 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
46130 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
46131 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
46132 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
46133 JUMP_LABEL (tmp) = label;
46135 return label;
46138 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
46139 using comparison code CODE. Operands are swapped for the comparison if
46140 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
46141 static rtx
46142 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
46143 bool swap_operands)
46145 rtx (*insn)(rtx, rtx, rtx, rtx);
46146 machine_mode mode = GET_MODE (op0);
46147 rtx mask = gen_reg_rtx (mode);
46149 if (swap_operands)
46150 std::swap (op0, op1);
46152 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
46154 emit_insn (insn (mask, op0, op1,
46155 gen_rtx_fmt_ee (code, mode, op0, op1)));
46156 return mask;
46159 /* Generate and return a rtx of mode MODE for 2**n where n is the number
46160 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
46161 static rtx
46162 ix86_gen_TWO52 (machine_mode mode)
46164 REAL_VALUE_TYPE TWO52r;
46165 rtx TWO52;
46167 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
46168 TWO52 = const_double_from_real_value (TWO52r, mode);
46169 TWO52 = force_reg (mode, TWO52);
46171 return TWO52;
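/* Illustrative scalar model, not part of this file, of the 2**52 trick the
   expanders below rely on: for 0 <= x < 2**52 the sum x + 2**52 has no
   fraction bits left in double precision, so the addition itself rounds x
   to an integer (in the current rounding mode) and subtracting 2**52 gives
   that integer back.  The helper name is hypothetical; volatile stands in
   for the temporaries that keep the two operations from being folded.  */

static double
sketch_two52_round (double x)
{
  /* Assumes 0 <= x < 2**52, as guaranteed by the isless checks below.  */
  volatile double t = x + 4503599627370496.0;	/* 2**52 */
  return t - 4503599627370496.0;
}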
46174 /* Expand SSE sequence for computing lround from OP1 storing
46175 into OP0. */
46176 void
46177 ix86_expand_lround (rtx op0, rtx op1)
46179 /* C code for the stuff we're doing below:
46180 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
46181 return (long)tmp;
46183 machine_mode mode = GET_MODE (op1);
46184 const struct real_format *fmt;
46185 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
46186 rtx adj;
46188 /* load nextafter (0.5, 0.0) */
46189 fmt = REAL_MODE_FORMAT (mode);
46190 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
46191 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
46193 /* adj = copysign (0.5, op1) */
46194 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
46195 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
46197 /* adj = op1 + adj */
46198 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
46200 /* op0 = (imode)adj */
46201 expand_fix (op0, adj, 0);
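/* Illustrative scalar model, not part of this file, of the sequence above.
   nextafter (0.5, 0.0) is used instead of 0.5 so that an input just below
   0.5 (or just below any x.5 boundary) is not pushed over the boundary by
   rounding of the addition itself.  The helper name is hypothetical.  */

#include <math.h>

static long
sketch_lround (double x)
{
  double adj = copysign (nextafter (0.5, 0.0), x);
  return (long) (x + adj);
}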
46204 /* Expand SSE2 sequence for computing lfloor or lceil from OP1 storing
46205    into OP0.  */
46206 void
46207 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
46209 /* C code for the stuff we're doing below (for do_floor):
46210 xi = (long)op1;
46211 xi -= (double)xi > op1 ? 1 : 0;
46212 return xi;
46214 machine_mode fmode = GET_MODE (op1);
46215 machine_mode imode = GET_MODE (op0);
46216 rtx ireg, freg, tmp;
46217 rtx_code_label *label;
46219 /* reg = (long)op1 */
46220 ireg = gen_reg_rtx (imode);
46221 expand_fix (ireg, op1, 0);
46223 /* freg = (double)reg */
46224 freg = gen_reg_rtx (fmode);
46225 expand_float (freg, ireg, 0);
46227 /* ireg = (freg > op1) ? ireg - 1 : ireg */
46228 label = ix86_expand_sse_compare_and_jump (UNLE,
46229 freg, op1, !do_floor);
46230 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
46231 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
46232 emit_move_insn (ireg, tmp);
46234 emit_label (label);
46235 LABEL_NUSES (label) = 1;
46237 emit_move_insn (op0, ireg);
46240 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
46241 result in OPERAND0. */
46242 void
46243 ix86_expand_rint (rtx operand0, rtx operand1)
46245 /* C code for the stuff we're doing below:
46246 xa = fabs (operand1);
46247 if (!isless (xa, 2**52))
46248 return operand1;
46249 xa = xa + 2**52 - 2**52;
46250 return copysign (xa, operand1);
46252 machine_mode mode = GET_MODE (operand0);
46253 rtx res, xa, TWO52, mask;
46254 rtx_code_label *label;
46256 res = gen_reg_rtx (mode);
46257 emit_move_insn (res, operand1);
46259 /* xa = abs (operand1) */
46260 xa = ix86_expand_sse_fabs (res, &mask);
46262 /* if (!isless (xa, TWO52)) goto label; */
46263 TWO52 = ix86_gen_TWO52 (mode);
46264 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46266 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46267 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
46269 ix86_sse_copysign_to_positive (res, xa, res, mask);
46271 emit_label (label);
46272 LABEL_NUSES (label) = 1;
46274 emit_move_insn (operand0, res);
46277 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
46278 into OPERAND0. */
46279 void
46280 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
46282 /* C code for the stuff we expand below.
46283 double xa = fabs (x), x2;
46284 if (!isless (xa, TWO52))
46285 return x;
46286 xa = xa + TWO52 - TWO52;
46287 x2 = copysign (xa, x);
46288 Compensate. Floor:
46289 if (x2 > x)
46290 x2 -= 1;
46291 Compensate. Ceil:
46292 if (x2 < x)
46293 x2 -= -1;
46294 return x2;
46296 machine_mode mode = GET_MODE (operand0);
46297 rtx xa, TWO52, tmp, one, res, mask;
46298 rtx_code_label *label;
46300 TWO52 = ix86_gen_TWO52 (mode);
46302 /* Temporary for holding the result, initialized to the input
46303 operand to ease control flow. */
46304 res = gen_reg_rtx (mode);
46305 emit_move_insn (res, operand1);
46307 /* xa = abs (operand1) */
46308 xa = ix86_expand_sse_fabs (res, &mask);
46310 /* if (!isless (xa, TWO52)) goto label; */
46311 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46313 /* xa = xa + TWO52 - TWO52; */
46314 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46315 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
46317 /* xa = copysign (xa, operand1) */
46318 ix86_sse_copysign_to_positive (xa, xa, res, mask);
46320 /* generate 1.0 or -1.0 */
46321 one = force_reg (mode,
46322 const_double_from_real_value (do_floor
46323 ? dconst1 : dconstm1, mode));
46325 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
46326 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
46327 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46328 /* We always need to subtract here to preserve signed zero. */
46329 tmp = expand_simple_binop (mode, MINUS,
46330 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46331 emit_move_insn (res, tmp);
46333 emit_label (label);
46334 LABEL_NUSES (label) = 1;
46336 emit_move_insn (operand0, res);
46339 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
46340 into OPERAND0. */
46341 void
46342 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
46344 /* C code for the stuff we expand below.
46345 double xa = fabs (x), x2;
46346 if (!isless (xa, TWO52))
46347 return x;
46348 x2 = (double)(long)x;
46349 Compensate. Floor:
46350 if (x2 > x)
46351 x2 -= 1;
46352 Compensate. Ceil:
46353 if (x2 < x)
46354 x2 += 1;
46355 if (HONOR_SIGNED_ZEROS (mode))
46356 return copysign (x2, x);
46357 return x2;
46359 machine_mode mode = GET_MODE (operand0);
46360 rtx xa, xi, TWO52, tmp, one, res, mask;
46361 rtx_code_label *label;
46363 TWO52 = ix86_gen_TWO52 (mode);
46365 /* Temporary for holding the result, initialized to the input
46366 operand to ease control flow. */
46367 res = gen_reg_rtx (mode);
46368 emit_move_insn (res, operand1);
46370 /* xa = abs (operand1) */
46371 xa = ix86_expand_sse_fabs (res, &mask);
46373 /* if (!isless (xa, TWO52)) goto label; */
46374 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46376 /* xa = (double)(long)x */
46377 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
46378 expand_fix (xi, res, 0);
46379 expand_float (xa, xi, 0);
46381 /* generate 1.0 */
46382 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
46384 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
46385 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
46386 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46387 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
46388 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46389 emit_move_insn (res, tmp);
46391 if (HONOR_SIGNED_ZEROS (mode))
46392 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
46394 emit_label (label);
46395 LABEL_NUSES (label) = 1;
46397 emit_move_insn (operand0, res);
46400 /* Expand SSE sequence for computing round from OPERAND1 storing
46401    into OPERAND0.  This sequence works without relying on DImode truncation
46402    via cvttsd2siq, which is only available on 64-bit targets.  */
46403 void
46404 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
46406 /* C code for the stuff we expand below.
46407 double xa = fabs (x), xa2, x2;
46408 if (!isless (xa, TWO52))
46409 return x;
46410 Using the absolute value and copying back sign makes
46411 -0.0 -> -0.0 correct.
46412 xa2 = xa + TWO52 - TWO52;
46413 Compensate.
46414 dxa = xa2 - xa;
46415 if (dxa <= -0.5)
46416 xa2 += 1;
46417 else if (dxa > 0.5)
46418 xa2 -= 1;
46419 x2 = copysign (xa2, x);
46420 return x2;
46422 machine_mode mode = GET_MODE (operand0);
46423 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
46424 rtx_code_label *label;
46426 TWO52 = ix86_gen_TWO52 (mode);
46428 /* Temporary for holding the result, initialized to the input
46429 operand to ease control flow. */
46430 res = gen_reg_rtx (mode);
46431 emit_move_insn (res, operand1);
46433 /* xa = abs (operand1) */
46434 xa = ix86_expand_sse_fabs (res, &mask);
46436 /* if (!isless (xa, TWO52)) goto label; */
46437 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46439 /* xa2 = xa + TWO52 - TWO52; */
46440 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46441 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
46443 /* dxa = xa2 - xa; */
46444 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
46446 /* generate 0.5, 1.0 and -0.5 */
46447 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
46448 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
46449 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
46450 0, OPTAB_DIRECT);
46452 /* Compensate. */
46453 tmp = gen_reg_rtx (mode);
46454 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
46455 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
46456 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46457 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46458 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
46459 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
46460 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46461 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46463 /* res = copysign (xa2, operand1) */
46464 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
46466 emit_label (label);
46467 LABEL_NUSES (label) = 1;
46469 emit_move_insn (operand0, res);
46472 /* Expand SSE sequence for computing trunc from OPERAND1 storing
46473 into OPERAND0. */
46474 void
46475 ix86_expand_trunc (rtx operand0, rtx operand1)
46477 /* C code for SSE variant we expand below.
46478 double xa = fabs (x), x2;
46479 if (!isless (xa, TWO52))
46480 return x;
46481 x2 = (double)(long)x;
46482 if (HONOR_SIGNED_ZEROS (mode))
46483 return copysign (x2, x);
46484 return x2;
46486 machine_mode mode = GET_MODE (operand0);
46487 rtx xa, xi, TWO52, res, mask;
46488 rtx_code_label *label;
46490 TWO52 = ix86_gen_TWO52 (mode);
46492 /* Temporary for holding the result, initialized to the input
46493 operand to ease control flow. */
46494 res = gen_reg_rtx (mode);
46495 emit_move_insn (res, operand1);
46497 /* xa = abs (operand1) */
46498 xa = ix86_expand_sse_fabs (res, &mask);
46500 /* if (!isless (xa, TWO52)) goto label; */
46501 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46503 /* x = (double)(long)x */
46504 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
46505 expand_fix (xi, res, 0);
46506 expand_float (res, xi, 0);
46508 if (HONOR_SIGNED_ZEROS (mode))
46509 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
46511 emit_label (label);
46512 LABEL_NUSES (label) = 1;
46514 emit_move_insn (operand0, res);
46517 /* Expand SSE sequence for computing trunc from OPERAND1 storing
46518 into OPERAND0. */
46519 void
46520 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
46522 machine_mode mode = GET_MODE (operand0);
46523 rtx xa, mask, TWO52, one, res, smask, tmp;
46524 rtx_code_label *label;
46526 /* C code for SSE variant we expand below.
46527 double xa = fabs (x), x2;
46528 if (!isless (xa, TWO52))
46529 return x;
46530 xa2 = xa + TWO52 - TWO52;
46531 Compensate:
46532 if (xa2 > xa)
46533 xa2 -= 1.0;
46534 x2 = copysign (xa2, x);
46535 return x2;
46538 TWO52 = ix86_gen_TWO52 (mode);
46540 /* Temporary for holding the result, initialized to the input
46541 operand to ease control flow. */
46542 res = gen_reg_rtx (mode);
46543 emit_move_insn (res, operand1);
46545 /* xa = abs (operand1) */
46546 xa = ix86_expand_sse_fabs (res, &smask);
46548 /* if (!isless (xa, TWO52)) goto label; */
46549 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46551 /* res = xa + TWO52 - TWO52; */
46552 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46553 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
46554 emit_move_insn (res, tmp);
46556 /* generate 1.0 */
46557 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
46559 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
46560 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
46561 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
46562 tmp = expand_simple_binop (mode, MINUS,
46563 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
46564 emit_move_insn (res, tmp);
46566 /* res = copysign (res, operand1) */
46567 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
46569 emit_label (label);
46570 LABEL_NUSES (label) = 1;
46572 emit_move_insn (operand0, res);
46575 /* Expand SSE sequence for computing round from OPERAND1 storing
46576 into OPERAND0. */
46577 void
46578 ix86_expand_round (rtx operand0, rtx operand1)
46580 /* C code for the stuff we're doing below:
46581 double xa = fabs (x);
46582 if (!isless (xa, TWO52))
46583 return x;
46584 xa = (double)(long)(xa + nextafter (0.5, 0.0));
46585 return copysign (xa, x);
46587 machine_mode mode = GET_MODE (operand0);
46588 rtx res, TWO52, xa, xi, half, mask;
46589 rtx_code_label *label;
46590 const struct real_format *fmt;
46591 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
46593 /* Temporary for holding the result, initialized to the input
46594 operand to ease control flow. */
46595 res = gen_reg_rtx (mode);
46596 emit_move_insn (res, operand1);
46598 TWO52 = ix86_gen_TWO52 (mode);
46599 xa = ix86_expand_sse_fabs (res, &mask);
46600 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46602 /* load nextafter (0.5, 0.0) */
46603 fmt = REAL_MODE_FORMAT (mode);
46604 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
46605 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
46607 /* xa = xa + 0.5 */
46608 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
46609 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
46611 /* xa = (double)(int64_t)xa */
46612 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
46613 expand_fix (xi, xa, 0);
46614 expand_float (xa, xi, 0);
46616 /* res = copysign (xa, operand1) */
46617 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
46619 emit_label (label);
46620 LABEL_NUSES (label) = 1;
46622 emit_move_insn (operand0, res);
46625 /* Expand SSE sequence for computing round
46626 from OP1 storing into OP0 using sse4 round insn. */
46627 void
46628 ix86_expand_round_sse4 (rtx op0, rtx op1)
46630 machine_mode mode = GET_MODE (op0);
46631 rtx e1, e2, res, half;
46632 const struct real_format *fmt;
46633 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
46634 rtx (*gen_copysign) (rtx, rtx, rtx);
46635 rtx (*gen_round) (rtx, rtx, rtx);
46637 switch (mode)
46639 case E_SFmode:
46640 gen_copysign = gen_copysignsf3;
46641 gen_round = gen_sse4_1_roundsf2;
46642 break;
46643 case E_DFmode:
46644 gen_copysign = gen_copysigndf3;
46645 gen_round = gen_sse4_1_rounddf2;
46646 break;
46647 default:
46648 gcc_unreachable ();
46651 /* round (a) = trunc (a + copysign (0.5, a)) */
46653 /* load nextafter (0.5, 0.0) */
46654 fmt = REAL_MODE_FORMAT (mode);
46655 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
46656 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
46657 half = const_double_from_real_value (pred_half, mode);
46659 /* e1 = copysign (0.5, op1) */
46660 e1 = gen_reg_rtx (mode);
46661 emit_insn (gen_copysign (e1, half, op1));
46663 /* e2 = op1 + e1 */
46664 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
46666 /* res = trunc (e2) */
46667 res = gen_reg_rtx (mode);
46668 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
46670 emit_move_insn (op0, res);
46674 /* Table of valid machine attributes. */
46675 static const struct attribute_spec ix86_attribute_table[] =
46677 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
46678 affects_type_identity } */
46679 /* Stdcall attribute says callee is responsible for popping arguments
46680 if they are not variable. */
46681 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46682 true },
46683 /* Fastcall attribute says callee is responsible for popping arguments
46684 if they are not variable. */
46685 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46686 true },
46687 /* Thiscall attribute says callee is responsible for popping arguments
46688 if they are not variable. */
46689 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46690 true },
46691 /* Cdecl attribute says the callee is a normal C declaration */
46692 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46693 true },
46694 /* Regparm attribute specifies how many integer arguments are to be
46695 passed in registers. */
46696 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
46697 true },
46698 /* Sseregparm attribute says we are using x86_64 calling conventions
46699 for FP arguments. */
46700 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46701 true },
46702 /* The transactional memory builtins are implicitly regparm or fastcall
46703 depending on the ABI. Override the generic do-nothing attribute that
46704 these builtins were declared with. */
46705 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
46706 true },
46707 /* force_align_arg_pointer says this function realigns the stack at entry. */
46708 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
46709 false, true, true, ix86_handle_force_align_arg_pointer_attribute, false },
46710 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
46711 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
46712 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
46713 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
46714 false },
46715 #endif
46716 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
46717 false },
46718 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
46719 false },
46720 #ifdef SUBTARGET_ATTRIBUTE_TABLE
46721 SUBTARGET_ATTRIBUTE_TABLE,
46722 #endif
46723 /* ms_abi and sysv_abi calling convention function attributes. */
46724 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
46725 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
46726 { "ms_abi va_list", 0, 0, false, false, false, NULL, false },
46727 { "sysv_abi va_list", 0, 0, false, false, false, NULL, false },
46728 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
46729 false },
46730 { "callee_pop_aggregate_return", 1, 1, false, true, true,
46731 ix86_handle_callee_pop_aggregate_return, true },
46732 { "interrupt", 0, 0, false, true, true,
46733 ix86_handle_interrupt_attribute, false },
46734 { "no_caller_saved_registers", 0, 0, false, true, true,
46735 ix86_handle_no_caller_saved_registers_attribute, false },
46736 { "naked", 0, 0, true, false, false,
46737 ix86_handle_fndecl_attribute, false },
46739 /* End element. */
46740 { NULL, 0, 0, false, false, false, NULL, false }
46743 /* Implement targetm.vectorize.builtin_vectorization_cost. */
46744 static int
46745 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
46746 tree vectype, int)
46748 switch (type_of_cost)
46750 case scalar_stmt:
46751 return ix86_cost->scalar_stmt_cost;
46753 case scalar_load:
46754 return ix86_cost->scalar_load_cost;
46756 case scalar_store:
46757 return ix86_cost->scalar_store_cost;
46759 case vector_stmt:
46760 return ix86_cost->vec_stmt_cost;
46762 case vector_load:
46763 return ix86_cost->vec_align_load_cost;
46765 case vector_store:
46766 return ix86_cost->vec_store_cost;
46768 case vec_to_scalar:
46769 return ix86_cost->vec_to_scalar_cost;
46771 case scalar_to_vec:
46772 return ix86_cost->scalar_to_vec_cost;
46774 case unaligned_load:
46775 case unaligned_store:
46776 return ix86_cost->vec_unalign_load_cost;
46778 case cond_branch_taken:
46779 return ix86_cost->cond_taken_branch_cost;
46781 case cond_branch_not_taken:
46782 return ix86_cost->cond_not_taken_branch_cost;
46784 case vec_perm:
46785 case vec_promote_demote:
46786 return ix86_cost->vec_stmt_cost;
46788 case vec_construct:
46789 return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
46791 default:
46792 gcc_unreachable ();
46796 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
46797 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
46798 insn every time. */
46800 static GTY(()) rtx_insn *vselect_insn;
46802 /* Initialize vselect_insn. */
46804 static void
46805 init_vselect_insn (void)
46807 unsigned i;
46808 rtx x;
46810 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
46811 for (i = 0; i < MAX_VECT_LEN; ++i)
46812 XVECEXP (x, 0, i) = const0_rtx;
46813 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
46814 const0_rtx), x);
46815 x = gen_rtx_SET (const0_rtx, x);
46816 start_sequence ();
46817 vselect_insn = emit_insn (x);
46818 end_sequence ();
46821 /* Construct (set target (vec_select op0 (parallel perm))) and
46822 return true if that's a valid instruction in the active ISA. */
46824 static bool
46825 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
46826 unsigned nelt, bool testing_p)
46828 unsigned int i;
46829 rtx x, save_vconcat;
46830 int icode;
46832 if (vselect_insn == NULL_RTX)
46833 init_vselect_insn ();
46835 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
46836 PUT_NUM_ELEM (XVEC (x, 0), nelt);
46837 for (i = 0; i < nelt; ++i)
46838 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
46839 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
46840 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
46841 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
46842 SET_DEST (PATTERN (vselect_insn)) = target;
46843 icode = recog_memoized (vselect_insn);
46845 if (icode >= 0 && !testing_p)
46846 emit_insn (copy_rtx (PATTERN (vselect_insn)));
46848 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
46849 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
46850 INSN_CODE (vselect_insn) = -1;
46852 return icode >= 0;
46855 /* Similar, but generate a vec_concat from op0 and op1 as well. */
46857 static bool
46858 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
46859 const unsigned char *perm, unsigned nelt,
46860 bool testing_p)
46862 machine_mode v2mode;
46863 rtx x;
46864 bool ok;
46866 if (vselect_insn == NULL_RTX)
46867 init_vselect_insn ();
46869 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
46870 return false;
46871 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
46872 PUT_MODE (x, v2mode);
46873 XEXP (x, 0) = op0;
46874 XEXP (x, 1) = op1;
46875 ok = expand_vselect (target, x, perm, nelt, testing_p);
46876 XEXP (x, 0) = const0_rtx;
46877 XEXP (x, 1) = const0_rtx;
46878 return ok;
46881 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46882 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
46884 static bool
46885 expand_vec_perm_blend (struct expand_vec_perm_d *d)
46887 machine_mode mmode, vmode = d->vmode;
46888 unsigned i, mask, nelt = d->nelt;
46889 rtx target, op0, op1, maskop, x;
46890 rtx rperm[32], vperm;
46892 if (d->one_operand_p)
46893 return false;
46894 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
46895 && (TARGET_AVX512BW
46896 || GET_MODE_UNIT_SIZE (vmode) >= 4))
46898 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
46900 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
46902 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
46904 else
46905 return false;
46907 /* This is a blend, not a permute. Elements must stay in their
46908 respective lanes. */
46909 for (i = 0; i < nelt; ++i)
46911 unsigned e = d->perm[i];
46912 if (!(e == i || e == i + nelt))
46913 return false;
46916 if (d->testing_p)
46917 return true;
46919 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
46920 decision should be extracted elsewhere, so that we only try that
46921 sequence once all budget==3 options have been tried. */
46922 target = d->target;
46923 op0 = d->op0;
46924 op1 = d->op1;
46925 mask = 0;
46927 switch (vmode)
46929 case E_V8DFmode:
46930 case E_V16SFmode:
46931 case E_V4DFmode:
46932 case E_V8SFmode:
46933 case E_V2DFmode:
46934 case E_V4SFmode:
46935 case E_V8HImode:
46936 case E_V8SImode:
46937 case E_V32HImode:
46938 case E_V64QImode:
46939 case E_V16SImode:
46940 case E_V8DImode:
46941 for (i = 0; i < nelt; ++i)
46942 mask |= (d->perm[i] >= nelt) << i;
46943 break;
46945 case E_V2DImode:
46946 for (i = 0; i < 2; ++i)
46947 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
46948 vmode = V8HImode;
46949 goto do_subreg;
46951 case E_V4SImode:
46952 for (i = 0; i < 4; ++i)
46953 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
46954 vmode = V8HImode;
46955 goto do_subreg;
46957 case E_V16QImode:
46958 /* See if bytes move in pairs so we can use pblendw with
46959 an immediate argument, rather than pblendvb with a vector
46960 argument. */
46961 for (i = 0; i < 16; i += 2)
46962 if (d->perm[i] + 1 != d->perm[i + 1])
46964 use_pblendvb:
46965 for (i = 0; i < nelt; ++i)
46966 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
46968 finish_pblendvb:
46969 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
46970 vperm = force_reg (vmode, vperm);
46972 if (GET_MODE_SIZE (vmode) == 16)
46973 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
46974 else
46975 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
46976 if (target != d->target)
46977 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46978 return true;
46981 for (i = 0; i < 8; ++i)
46982 mask |= (d->perm[i * 2] >= 16) << i;
46983 vmode = V8HImode;
46984 /* FALLTHRU */
46986 do_subreg:
46987 target = gen_reg_rtx (vmode);
46988 op0 = gen_lowpart (vmode, op0);
46989 op1 = gen_lowpart (vmode, op1);
46990 break;
46992 case E_V32QImode:
46993 /* See if bytes move in pairs. If not, vpblendvb must be used. */
46994 for (i = 0; i < 32; i += 2)
46995 if (d->perm[i] + 1 != d->perm[i + 1])
46996 goto use_pblendvb;
46997 /* See if bytes move in quadruplets. If yes, vpblendd
46998 with immediate can be used. */
46999 for (i = 0; i < 32; i += 4)
47000 if (d->perm[i] + 2 != d->perm[i + 2])
47001 break;
47002 if (i < 32)
47004 /* See if bytes move the same in both lanes. If yes,
47005 vpblendw with immediate can be used. */
47006 for (i = 0; i < 16; i += 2)
47007 if (d->perm[i] + 16 != d->perm[i + 16])
47008 goto use_pblendvb;
47010 /* Use vpblendw. */
47011 for (i = 0; i < 16; ++i)
47012 mask |= (d->perm[i * 2] >= 32) << i;
47013 vmode = V16HImode;
47014 goto do_subreg;
47017 /* Use vpblendd. */
47018 for (i = 0; i < 8; ++i)
47019 mask |= (d->perm[i * 4] >= 32) << i;
47020 vmode = V8SImode;
47021 goto do_subreg;
47023 case E_V16HImode:
47024 /* See if words move in pairs. If yes, vpblendd can be used. */
47025 for (i = 0; i < 16; i += 2)
47026 if (d->perm[i] + 1 != d->perm[i + 1])
47027 break;
47028 if (i < 16)
47030 /* See if words move the same in both lanes. If not,
47031 vpblendvb must be used. */
47032 for (i = 0; i < 8; i++)
47033 if (d->perm[i] + 8 != d->perm[i + 8])
47035 /* Use vpblendvb. */
47036 for (i = 0; i < 32; ++i)
47037 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
47039 vmode = V32QImode;
47040 nelt = 32;
47041 target = gen_reg_rtx (vmode);
47042 op0 = gen_lowpart (vmode, op0);
47043 op1 = gen_lowpart (vmode, op1);
47044 goto finish_pblendvb;
47047 /* Use vpblendw. */
47048 for (i = 0; i < 16; ++i)
47049 mask |= (d->perm[i] >= 16) << i;
47050 break;
47053 /* Use vpblendd. */
47054 for (i = 0; i < 8; ++i)
47055 mask |= (d->perm[i * 2] >= 16) << i;
47056 vmode = V8SImode;
47057 goto do_subreg;
47059 case E_V4DImode:
47060 /* Use vpblendd. */
47061 for (i = 0; i < 4; ++i)
47062 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
47063 vmode = V8SImode;
47064 goto do_subreg;
47066 default:
47067 gcc_unreachable ();
47070 switch (vmode)
47072 case E_V8DFmode:
47073 case E_V8DImode:
47074 mmode = QImode;
47075 break;
47076 case E_V16SFmode:
47077 case E_V16SImode:
47078 mmode = HImode;
47079 break;
47080 case E_V32HImode:
47081 mmode = SImode;
47082 break;
47083 case E_V64QImode:
47084 mmode = DImode;
47085 break;
47086 default:
47087 mmode = VOIDmode;
47090 if (mmode != VOIDmode)
47091 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
47092 else
47093 maskop = GEN_INT (mask);
47095 /* This matches five different patterns with the different modes. */
47096 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
47097 x = gen_rtx_SET (target, x);
47098 emit_insn (x);
47099 if (target != d->target)
47100 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
47102 return true;
47105 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
47106 in terms of the variable form of vpermilps.
47108 Note that we will have already failed the immediate input vpermilps,
47109 which requires that the high and low part shuffle be identical; the
47110 variable form doesn't require that. */
47112 static bool
47113 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
47115 rtx rperm[8], vperm;
47116 unsigned i;
47118 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
47119 return false;
47121 /* We can only permute within the 128-bit lane. */
47122 for (i = 0; i < 8; ++i)
47124 unsigned e = d->perm[i];
47125 if (i < 4 ? e >= 4 : e < 4)
47126 return false;
47129 if (d->testing_p)
47130 return true;
47132 for (i = 0; i < 8; ++i)
47134 unsigned e = d->perm[i];
47136 /* Within each 128-bit lane, the elements of op0 are numbered
47137 from 0 and the elements of op1 are numbered from 4. */
47138 if (e >= 8 + 4)
47139 e -= 8;
47140 else if (e >= 4)
47141 e -= 4;
47143 rperm[i] = GEN_INT (e);
47146 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
47147 vperm = force_reg (V8SImode, vperm);
47148 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
47150 return true;
47153 /* Return true if permutation D can be performed as VMODE permutation
47154 instead. */
47156 static bool
47157 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
47159 unsigned int i, j, chunk;
47161 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
47162 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
47163 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
47164 return false;
47166 if (GET_MODE_NUNITS (vmode) >= d->nelt)
47167 return true;
47169 chunk = d->nelt / GET_MODE_NUNITS (vmode);
47170 for (i = 0; i < d->nelt; i += chunk)
47171 if (d->perm[i] & (chunk - 1))
47172 return false;
47173 else
47174 for (j = 1; j < chunk; ++j)
47175 if (d->perm[i] + j != d->perm[i + j])
47176 return false;
47178 return true;
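/* Illustrative example, not part of this file: a V16QImode permutation that
   only moves aligned 4-byte chunks, e.g.
     { 4, 5, 6, 7,  0, 1, 2, 3,  12, 13, 14, 15,  8, 9, 10, 11 }
   passes the check above and is equivalent to the V4SImode permutation
   { 1, 0, 3, 2 }, so it can be expanded with a wider-element instruction.  */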
47181 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
47182 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
47184 static bool
47185 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
47187 unsigned i, nelt, eltsz, mask;
47188 unsigned char perm[64];
47189 machine_mode vmode = V16QImode;
47190 rtx rperm[64], vperm, target, op0, op1;
47192 nelt = d->nelt;
47194 if (!d->one_operand_p)
47196 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
47198 if (TARGET_AVX2
47199 && valid_perm_using_mode_p (V2TImode, d))
47201 if (d->testing_p)
47202 return true;
47204 /* Use vperm2i128 insn. The pattern uses
47205 V4DImode instead of V2TImode. */
47206 target = d->target;
47207 if (d->vmode != V4DImode)
47208 target = gen_reg_rtx (V4DImode);
47209 op0 = gen_lowpart (V4DImode, d->op0);
47210 op1 = gen_lowpart (V4DImode, d->op1);
47211 rperm[0]
47212 = GEN_INT ((d->perm[0] / (nelt / 2))
47213 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
47214 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
47215 if (target != d->target)
47216 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
47217 return true;
47219 return false;
47222 else
47224 if (GET_MODE_SIZE (d->vmode) == 16)
47226 if (!TARGET_SSSE3)
47227 return false;
47229 else if (GET_MODE_SIZE (d->vmode) == 32)
47231 if (!TARGET_AVX2)
47232 return false;
47234       /* V4DImode should already be handled through
47235 	 expand_vselect by the vpermq instruction.  */
47236 gcc_assert (d->vmode != V4DImode);
47238 vmode = V32QImode;
47239 if (d->vmode == V8SImode
47240 || d->vmode == V16HImode
47241 || d->vmode == V32QImode)
47243 /* First see if vpermq can be used for
47244 V8SImode/V16HImode/V32QImode. */
47245 if (valid_perm_using_mode_p (V4DImode, d))
47247 for (i = 0; i < 4; i++)
47248 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
47249 if (d->testing_p)
47250 return true;
47251 target = gen_reg_rtx (V4DImode);
47252 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
47253 perm, 4, false))
47255 emit_move_insn (d->target,
47256 gen_lowpart (d->vmode, target));
47257 return true;
47259 return false;
47262 /* Next see if vpermd can be used. */
47263 if (valid_perm_using_mode_p (V8SImode, d))
47264 vmode = V8SImode;
47266 /* Or if vpermps can be used. */
47267 else if (d->vmode == V8SFmode)
47268 vmode = V8SImode;
47270 if (vmode == V32QImode)
47272 	  /* vpshufb only works within a lane; it is not
47273 	     possible to shuffle bytes between the lanes.  */
47274 for (i = 0; i < nelt; ++i)
47275 if ((d->perm[i] ^ i) & (nelt / 2))
47276 return false;
47279 else if (GET_MODE_SIZE (d->vmode) == 64)
47281 if (!TARGET_AVX512BW)
47282 return false;
47284 /* If vpermq didn't work, vpshufb won't work either. */
47285 if (d->vmode == V8DFmode || d->vmode == V8DImode)
47286 return false;
47288 vmode = V64QImode;
47289 if (d->vmode == V16SImode
47290 || d->vmode == V32HImode
47291 || d->vmode == V64QImode)
47293 /* First see if vpermq can be used for
47294 V16SImode/V32HImode/V64QImode. */
47295 if (valid_perm_using_mode_p (V8DImode, d))
47297 for (i = 0; i < 8; i++)
47298 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
47299 if (d->testing_p)
47300 return true;
47301 target = gen_reg_rtx (V8DImode);
47302 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
47303 perm, 8, false))
47305 emit_move_insn (d->target,
47306 gen_lowpart (d->vmode, target));
47307 return true;
47309 return false;
47312 /* Next see if vpermd can be used. */
47313 if (valid_perm_using_mode_p (V16SImode, d))
47314 vmode = V16SImode;
47316 /* Or if vpermps can be used. */
47317 else if (d->vmode == V16SFmode)
47318 vmode = V16SImode;
47319 if (vmode == V64QImode)
47322 	  /* vpshufb only works within a lane; it is not
47323 	     possible to shuffle bytes between the lanes.  */
47323 for (i = 0; i < nelt; ++i)
47324 if ((d->perm[i] ^ i) & (nelt / 4))
47325 return false;
47328 else
47329 return false;
47332 if (d->testing_p)
47333 return true;
47335 if (vmode == V8SImode)
47336 for (i = 0; i < 8; ++i)
47337 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
47338 else if (vmode == V16SImode)
47339 for (i = 0; i < 16; ++i)
47340 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
47341 else
47343 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47344 if (!d->one_operand_p)
47345 mask = 2 * nelt - 1;
47346 else if (vmode == V16QImode)
47347 mask = nelt - 1;
47348 else if (vmode == V64QImode)
47349 mask = nelt / 4 - 1;
47350 else
47351 mask = nelt / 2 - 1;
47353 for (i = 0; i < nelt; ++i)
47355 unsigned j, e = d->perm[i] & mask;
47356 for (j = 0; j < eltsz; ++j)
47357 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
47361 vperm = gen_rtx_CONST_VECTOR (vmode,
47362 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
47363 vperm = force_reg (vmode, vperm);
47365 target = d->target;
47366 if (d->vmode != vmode)
47367 target = gen_reg_rtx (vmode);
47368 op0 = gen_lowpart (vmode, d->op0);
47369 if (d->one_operand_p)
47371 if (vmode == V16QImode)
47372 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
47373 else if (vmode == V32QImode)
47374 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
47375 else if (vmode == V64QImode)
47376 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
47377 else if (vmode == V8SFmode)
47378 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
47379 else if (vmode == V8SImode)
47380 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
47381 else if (vmode == V16SFmode)
47382 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
47383 else if (vmode == V16SImode)
47384 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
47385 else
47386 gcc_unreachable ();
47388 else
47390 op1 = gen_lowpart (vmode, d->op1);
47391 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
47393 if (target != d->target)
47394 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
47396 return true;
47399 /* For V*[QHS]Imode permutations, check whether the same permutation
47400    can be performed in a 2x, 4x or 8x wider inner mode.  */
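/* E.g. the V16QImode permutation { 2, 3, 0, 1, ... } becomes the V8HImode
   permutation { 1, 0, ... }; the recursive call below then widens further
   whenever the halved permutation still keeps even/odd pairs together.  */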
47402 static bool
47403 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
47404 struct expand_vec_perm_d *nd)
47406 int i;
47407 machine_mode mode = VOIDmode;
47409 switch (d->vmode)
47411 case E_V16QImode: mode = V8HImode; break;
47412 case E_V32QImode: mode = V16HImode; break;
47413 case E_V64QImode: mode = V32HImode; break;
47414 case E_V8HImode: mode = V4SImode; break;
47415 case E_V16HImode: mode = V8SImode; break;
47416 case E_V32HImode: mode = V16SImode; break;
47417 case E_V4SImode: mode = V2DImode; break;
47418 case E_V8SImode: mode = V4DImode; break;
47419 case E_V16SImode: mode = V8DImode; break;
47420 default: return false;
47422 for (i = 0; i < d->nelt; i += 2)
47423 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
47424 return false;
47425 nd->vmode = mode;
47426 nd->nelt = d->nelt / 2;
47427 for (i = 0; i < nd->nelt; i++)
47428 nd->perm[i] = d->perm[2 * i] / 2;
47429 if (GET_MODE_INNER (mode) != DImode)
47430 canonicalize_vector_int_perm (nd, nd);
47431 if (nd != d)
47433 nd->one_operand_p = d->one_operand_p;
47434 nd->testing_p = d->testing_p;
47435 if (d->op0 == d->op1)
47436 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
47437 else
47439 nd->op0 = gen_lowpart (nd->vmode, d->op0);
47440 nd->op1 = gen_lowpart (nd->vmode, d->op1);
47442 if (d->testing_p)
47443 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
47444 else
47445 nd->target = gen_reg_rtx (nd->vmode);
47447 return true;
47450 /* Try to expand one-operand permutation with constant mask. */
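/* E.g. a V16SImode permutation becomes a single vpermd whose index operand
   is the permutation itself, materialized as a constant V16SImode vector
   and forced into a register.  */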
47452 static bool
47453 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
47455 machine_mode mode = GET_MODE (d->op0);
47456 machine_mode maskmode = mode;
47457 rtx (*gen) (rtx, rtx, rtx) = NULL;
47458 rtx target, op0, mask;
47459 rtx vec[64];
47461 if (!rtx_equal_p (d->op0, d->op1))
47462 return false;
47464 if (!TARGET_AVX512F)
47465 return false;
47467 switch (mode)
47469 case E_V16SImode:
47470 gen = gen_avx512f_permvarv16si;
47471 break;
47472 case E_V16SFmode:
47473 gen = gen_avx512f_permvarv16sf;
47474 maskmode = V16SImode;
47475 break;
47476 case E_V8DImode:
47477 gen = gen_avx512f_permvarv8di;
47478 break;
47479 case E_V8DFmode:
47480 gen = gen_avx512f_permvarv8df;
47481 maskmode = V8DImode;
47482 break;
47483 default:
47484 return false;
47487 target = d->target;
47488 op0 = d->op0;
47489 for (int i = 0; i < d->nelt; ++i)
47490 vec[i] = GEN_INT (d->perm[i]);
47491 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
47492 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
47493 return true;
47496 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
47497 in a single instruction. */
47499 static bool
47500 expand_vec_perm_1 (struct expand_vec_perm_d *d)
47502 unsigned i, nelt = d->nelt;
47503 struct expand_vec_perm_d nd;
47505 /* Check plain VEC_SELECT first, because AVX has instructions that could
47506 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
47507 input where SEL+CONCAT may not. */
47508 if (d->one_operand_p)
47510 int mask = nelt - 1;
47511 bool identity_perm = true;
47512 bool broadcast_perm = true;
47514 for (i = 0; i < nelt; i++)
47516 nd.perm[i] = d->perm[i] & mask;
47517 if (nd.perm[i] != i)
47518 identity_perm = false;
47519 if (nd.perm[i])
47520 broadcast_perm = false;
47523 if (identity_perm)
47525 if (!d->testing_p)
47526 emit_move_insn (d->target, d->op0);
47527 return true;
47529 else if (broadcast_perm && TARGET_AVX2)
47531 /* Use vpbroadcast{b,w,d}. */
47532 rtx (*gen) (rtx, rtx) = NULL;
47533 switch (d->vmode)
47535 case E_V64QImode:
47536 if (TARGET_AVX512BW)
47537 gen = gen_avx512bw_vec_dupv64qi_1;
47538 break;
47539 case E_V32QImode:
47540 gen = gen_avx2_pbroadcastv32qi_1;
47541 break;
47542 case E_V32HImode:
47543 if (TARGET_AVX512BW)
47544 gen = gen_avx512bw_vec_dupv32hi_1;
47545 break;
47546 case E_V16HImode:
47547 gen = gen_avx2_pbroadcastv16hi_1;
47548 break;
47549 case E_V16SImode:
47550 if (TARGET_AVX512F)
47551 gen = gen_avx512f_vec_dupv16si_1;
47552 break;
47553 case E_V8SImode:
47554 gen = gen_avx2_pbroadcastv8si_1;
47555 break;
47556 case E_V16QImode:
47557 gen = gen_avx2_pbroadcastv16qi;
47558 break;
47559 case E_V8HImode:
47560 gen = gen_avx2_pbroadcastv8hi;
47561 break;
47562 case E_V16SFmode:
47563 if (TARGET_AVX512F)
47564 gen = gen_avx512f_vec_dupv16sf_1;
47565 break;
47566 case E_V8SFmode:
47567 gen = gen_avx2_vec_dupv8sf_1;
47568 break;
47569 case E_V8DFmode:
47570 if (TARGET_AVX512F)
47571 gen = gen_avx512f_vec_dupv8df_1;
47572 break;
47573 case E_V8DImode:
47574 if (TARGET_AVX512F)
47575 gen = gen_avx512f_vec_dupv8di_1;
47576 break;
47577 	/* For other modes, prefer the other shuffles this function creates.  */
47578 default: break;
47580 if (gen != NULL)
47582 if (!d->testing_p)
47583 emit_insn (gen (d->target, d->op0));
47584 return true;
47588 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
47589 return true;
47591 /* There are plenty of patterns in sse.md that are written for
47592 SEL+CONCAT and are not replicated for a single op. Perhaps
47593 that should be changed, to avoid the nastiness here. */
47595 /* Recognize interleave style patterns, which means incrementing
47596 every other permutation operand. */
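      /* E.g. for nelt == 4 the single-operand request { 0, 0, 1, 1 } is
	 rewritten as { 0, 4, 1, 5 }, i.e. an interleave-low of op0 with
	 itself.  */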
47597 for (i = 0; i < nelt; i += 2)
47599 nd.perm[i] = d->perm[i] & mask;
47600 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
47602 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
47603 d->testing_p))
47604 return true;
47606 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
47607 if (nelt >= 4)
47609 for (i = 0; i < nelt; i += 4)
47611 nd.perm[i + 0] = d->perm[i + 0] & mask;
47612 nd.perm[i + 1] = d->perm[i + 1] & mask;
47613 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
47614 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
47617 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
47618 d->testing_p))
47619 return true;
47623 /* Finally, try the fully general two operand permute. */
47624 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
47625 d->testing_p))
47626 return true;
47628 /* Recognize interleave style patterns with reversed operands. */
47629 if (!d->one_operand_p)
47631 for (i = 0; i < nelt; ++i)
47633 unsigned e = d->perm[i];
47634 if (e >= nelt)
47635 e -= nelt;
47636 else
47637 e += nelt;
47638 nd.perm[i] = e;
47641 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
47642 d->testing_p))
47643 return true;
47646 /* Try the SSE4.1 blend variable merge instructions. */
47647 if (expand_vec_perm_blend (d))
47648 return true;
47650 /* Try one of the AVX vpermil variable permutations. */
47651 if (expand_vec_perm_vpermil (d))
47652 return true;
47654 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
47655 vpshufb, vpermd, vpermps or vpermq variable permutation. */
47656 if (expand_vec_perm_pshufb (d))
47657 return true;
47659 /* Try the AVX2 vpalignr instruction. */
47660 if (expand_vec_perm_palignr (d, true))
47661 return true;
47663 /* Try the AVX512F vperm{s,d} instructions. */
47664 if (ix86_expand_vec_one_operand_perm_avx512 (d))
47665 return true;
47667 /* Try the AVX512F vpermi2 instructions. */
47668 if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
47669 return true;
47671 /* See if we can get the same permutation in different vector integer
47672 mode. */
47673 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
47675 if (!d->testing_p)
47676 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
47677 return true;
47679 return false;
47682 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
47683 in terms of a pair of pshuflw + pshufhw instructions. */
47685 static bool
47686 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
47688 unsigned char perm2[MAX_VECT_LEN];
47689 unsigned i;
47690 bool ok;
47692 if (d->vmode != V8HImode || !d->one_operand_p)
47693 return false;
47695 /* The two permutations only operate in 64-bit lanes. */
47696 for (i = 0; i < 4; ++i)
47697 if (d->perm[i] >= 4)
47698 return false;
47699 for (i = 4; i < 8; ++i)
47700 if (d->perm[i] < 4)
47701 return false;
47703 if (d->testing_p)
47704 return true;
47706 /* Emit the pshuflw. */
47707 memcpy (perm2, d->perm, 4);
47708 for (i = 4; i < 8; ++i)
47709 perm2[i] = i;
47710 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
47711 gcc_assert (ok);
47713 /* Emit the pshufhw. */
47714 memcpy (perm2 + 4, d->perm + 4, 4);
47715 for (i = 0; i < 4; ++i)
47716 perm2[i] = i;
47717 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
47718 gcc_assert (ok);
47720 return true;
47723 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47724 the permutation using the SSSE3 palignr instruction. This succeeds
47725 when all of the elements in PERM fit within one vector and we merely
47726 need to shift them down so that a single vector permutation has a
47727    chance to succeed.  If SINGLE_INSN_ONLY_P, succeed only if
47728 the vpalignr instruction itself can perform the requested permutation. */
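/* E.g. the two-operand V16QImode permutation { 1, 2, ..., 16 } selects 16
   consecutive bytes of the (op1:op0) concatenation; a palignr by one byte
   produces them directly, leaving an identity residual permutation.  */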
47730 static bool
47731 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
47733 unsigned i, nelt = d->nelt;
47734 unsigned min, max, minswap, maxswap;
47735 bool in_order, ok, swap = false;
47736 rtx shift, target;
47737 struct expand_vec_perm_d dcopy;
47739   /* Even with AVX, palignr only operates on 128-bit vectors;
47740      with AVX2, palignr operates on both 128-bit lanes.  */
47741 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
47742 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
47743 return false;
47745 min = 2 * nelt;
47746 max = 0;
47747 minswap = 2 * nelt;
47748 maxswap = 0;
47749 for (i = 0; i < nelt; ++i)
47751 unsigned e = d->perm[i];
47752 unsigned eswap = d->perm[i] ^ nelt;
47753 if (GET_MODE_SIZE (d->vmode) == 32)
47755 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
47756 eswap = e ^ (nelt / 2);
47758 if (e < min)
47759 min = e;
47760 if (e > max)
47761 max = e;
47762 if (eswap < minswap)
47763 minswap = eswap;
47764 if (eswap > maxswap)
47765 maxswap = eswap;
47767 if (min == 0
47768 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
47770 if (d->one_operand_p
47771 || minswap == 0
47772 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
47773 ? nelt / 2 : nelt))
47774 return false;
47775 swap = true;
47776 min = minswap;
47777 max = maxswap;
47780 /* Given that we have SSSE3, we know we'll be able to implement the
47781 single operand permutation after the palignr with pshufb for
47782 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
47783 first. */
47784 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
47785 return true;
47787 dcopy = *d;
47788 if (swap)
47790 dcopy.op0 = d->op1;
47791 dcopy.op1 = d->op0;
47792 for (i = 0; i < nelt; ++i)
47793 dcopy.perm[i] ^= nelt;
47796 in_order = true;
47797 for (i = 0; i < nelt; ++i)
47799 unsigned e = dcopy.perm[i];
47800 if (GET_MODE_SIZE (d->vmode) == 32
47801 && e >= nelt
47802 && (e & (nelt / 2 - 1)) < min)
47803 e = e - min - (nelt / 2);
47804 else
47805 e = e - min;
47806 if (e != i)
47807 in_order = false;
47808 dcopy.perm[i] = e;
47810 dcopy.one_operand_p = true;
47812 if (single_insn_only_p && !in_order)
47813 return false;
47815 /* For AVX2, test whether we can permute the result in one instruction. */
47816 if (d->testing_p)
47818 if (in_order)
47819 return true;
47820 dcopy.op1 = dcopy.op0;
47821 return expand_vec_perm_1 (&dcopy);
47824 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
47825 if (GET_MODE_SIZE (d->vmode) == 16)
47827 target = gen_reg_rtx (TImode);
47828 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
47829 gen_lowpart (TImode, dcopy.op0), shift));
47831 else
47833 target = gen_reg_rtx (V2TImode);
47834 emit_insn (gen_avx2_palignrv2ti (target,
47835 gen_lowpart (V2TImode, dcopy.op1),
47836 gen_lowpart (V2TImode, dcopy.op0),
47837 shift));
47840 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
47842 /* Test for the degenerate case where the alignment by itself
47843 produces the desired permutation. */
47844 if (in_order)
47846 emit_move_insn (d->target, dcopy.op0);
47847 return true;
47850 ok = expand_vec_perm_1 (&dcopy);
47851 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
47853 return ok;
47856 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
47857 the permutation using the SSE4_1 pblendv instruction. Potentially
47858    reduces the permutation from two pshufb insns and an ior to one pshufb and a pblendv.  */
47860 static bool
47861 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
47863 unsigned i, which, nelt = d->nelt;
47864 struct expand_vec_perm_d dcopy, dcopy1;
47865 machine_mode vmode = d->vmode;
47866 bool ok;
47868 /* Use the same checks as in expand_vec_perm_blend. */
47869 if (d->one_operand_p)
47870 return false;
47871 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
47873 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
47875 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
47877 else
47878 return false;
47880   /* Figure out which permutation elements do not stay in their
47881      respective lanes.  */
47882 for (i = 0, which = 0; i < nelt; ++i)
47884 unsigned e = d->perm[i];
47885 if (e != i)
47886 which |= (e < nelt ? 1 : 2);
47888   /* We can pblend the elements that do not stay in their
47889      respective lanes only when they all come from the same
47890      half of the permutation.
47891      {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
47892      lanes, but both are >= 8.
47893      {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
47894      respective lanes, and 8 is >= 8 but 2 is not.  */
47895 if (which != 1 && which != 2)
47896 return false;
47897 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
47898 return true;
47900   /* First apply a one-operand permutation to the elements that do
47901      not stay in their respective lanes.  */
47902 dcopy = *d;
47903 if (which == 2)
47904 dcopy.op0 = dcopy.op1 = d->op1;
47905 else
47906 dcopy.op0 = dcopy.op1 = d->op0;
47907 if (!d->testing_p)
47908 dcopy.target = gen_reg_rtx (vmode);
47909 dcopy.one_operand_p = true;
47911 for (i = 0; i < nelt; ++i)
47912 dcopy.perm[i] = d->perm[i] & (nelt - 1);
47914 ok = expand_vec_perm_1 (&dcopy);
47915 if (GET_MODE_SIZE (vmode) != 16 && !ok)
47916 return false;
47917 else
47918 gcc_assert (ok);
47919 if (d->testing_p)
47920 return true;
47922 /* Next we put permuted elements into their positions. */
47923 dcopy1 = *d;
47924 if (which == 2)
47925 dcopy1.op1 = dcopy.target;
47926 else
47927 dcopy1.op0 = dcopy.target;
47929 for (i = 0; i < nelt; ++i)
47930 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
47932 ok = expand_vec_perm_blend (&dcopy1);
47933 gcc_assert (ok);
47935 return true;
47938 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
47940 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47941 a two vector permutation into a single vector permutation by using
47942 an interleave operation to merge the vectors. */
47944 static bool
47945 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
47947 struct expand_vec_perm_d dremap, dfinal;
47948 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
47949 unsigned HOST_WIDE_INT contents;
47950 unsigned char remap[2 * MAX_VECT_LEN];
47951 rtx_insn *seq;
47952 bool ok, same_halves = false;
47954 if (GET_MODE_SIZE (d->vmode) == 16)
47956 if (d->one_operand_p)
47957 return false;
47959 else if (GET_MODE_SIZE (d->vmode) == 32)
47961 if (!TARGET_AVX)
47962 return false;
47963       /* For 32-byte modes, allow this even when d->one_operand_p.
47964 The lack of cross-lane shuffling in some instructions
47965 might prevent a single insn shuffle. */
47966 dfinal = *d;
47967 dfinal.testing_p = true;
47968       /* If expand_vec_perm_interleave3 can expand this into
47969 	 a 3-insn sequence, give up and let it be expanded that
47970 	 way.  While that is one insn longer,
47971 	 it doesn't need a memory operand, and in the common
47972 	 case where both the interleave-low and interleave-high
47973 	 permutations with the same operands are adjacent, only 4
47974 	 insns are needed for both after CSE.  */
47975 if (expand_vec_perm_interleave3 (&dfinal))
47976 return false;
47978 else
47979 return false;
47981 /* Examine from whence the elements come. */
47982 contents = 0;
47983 for (i = 0; i < nelt; ++i)
47984 contents |= HOST_WIDE_INT_1U << d->perm[i];
47986 memset (remap, 0xff, sizeof (remap));
47987 dremap = *d;
47989 if (GET_MODE_SIZE (d->vmode) == 16)
47991 unsigned HOST_WIDE_INT h1, h2, h3, h4;
47993 /* Split the two input vectors into 4 halves. */
47994 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
47995 h2 = h1 << nelt2;
47996 h3 = h2 << nelt2;
47997 h4 = h3 << nelt2;
47999       /* If the elements all come from the low halves, use interleave low, and
48000 	 similarly for interleave high.  If the elements are from mis-matched
48001 	 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle.  */
48002 if ((contents & (h1 | h3)) == contents)
48004 /* punpckl* */
48005 for (i = 0; i < nelt2; ++i)
48007 remap[i] = i * 2;
48008 remap[i + nelt] = i * 2 + 1;
48009 dremap.perm[i * 2] = i;
48010 dremap.perm[i * 2 + 1] = i + nelt;
48012 if (!TARGET_SSE2 && d->vmode == V4SImode)
48013 dremap.vmode = V4SFmode;
48015 else if ((contents & (h2 | h4)) == contents)
48017 /* punpckh* */
48018 for (i = 0; i < nelt2; ++i)
48020 remap[i + nelt2] = i * 2;
48021 remap[i + nelt + nelt2] = i * 2 + 1;
48022 dremap.perm[i * 2] = i + nelt2;
48023 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
48025 if (!TARGET_SSE2 && d->vmode == V4SImode)
48026 dremap.vmode = V4SFmode;
48028 else if ((contents & (h1 | h4)) == contents)
48030 /* shufps */
48031 for (i = 0; i < nelt2; ++i)
48033 remap[i] = i;
48034 remap[i + nelt + nelt2] = i + nelt2;
48035 dremap.perm[i] = i;
48036 dremap.perm[i + nelt2] = i + nelt + nelt2;
48038 if (nelt != 4)
48040 /* shufpd */
48041 dremap.vmode = V2DImode;
48042 dremap.nelt = 2;
48043 dremap.perm[0] = 0;
48044 dremap.perm[1] = 3;
48047 else if ((contents & (h2 | h3)) == contents)
48049 /* shufps */
48050 for (i = 0; i < nelt2; ++i)
48052 remap[i + nelt2] = i;
48053 remap[i + nelt] = i + nelt2;
48054 dremap.perm[i] = i + nelt2;
48055 dremap.perm[i + nelt2] = i + nelt;
48057 if (nelt != 4)
48059 /* shufpd */
48060 dremap.vmode = V2DImode;
48061 dremap.nelt = 2;
48062 dremap.perm[0] = 1;
48063 dremap.perm[1] = 2;
48066 else
48067 return false;
48069 else
48071 unsigned int nelt4 = nelt / 4, nzcnt = 0;
48072 unsigned HOST_WIDE_INT q[8];
48073 unsigned int nonzero_halves[4];
48075 /* Split the two input vectors into 8 quarters. */
48076 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
48077 for (i = 1; i < 8; ++i)
48078 q[i] = q[0] << (nelt4 * i);
48079 for (i = 0; i < 4; ++i)
48080 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
48082 nonzero_halves[nzcnt] = i;
48083 ++nzcnt;
48086 if (nzcnt == 1)
48088 gcc_assert (d->one_operand_p);
48089 nonzero_halves[1] = nonzero_halves[0];
48090 same_halves = true;
48092 else if (d->one_operand_p)
48094 gcc_assert (nonzero_halves[0] == 0);
48095 gcc_assert (nonzero_halves[1] == 1);
48098 if (nzcnt <= 2)
48100 if (d->perm[0] / nelt2 == nonzero_halves[1])
48102 	      /* Attempt to increase the likelihood that the dfinal
48103 		 shuffle will be intra-lane.  */
48104 std::swap (nonzero_halves[0], nonzero_halves[1]);
48107 /* vperm2f128 or vperm2i128. */
48108 for (i = 0; i < nelt2; ++i)
48110 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
48111 remap[i + nonzero_halves[0] * nelt2] = i;
48112 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
48113 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
48116 if (d->vmode != V8SFmode
48117 && d->vmode != V4DFmode
48118 && d->vmode != V8SImode)
48120 dremap.vmode = V8SImode;
48121 dremap.nelt = 8;
48122 for (i = 0; i < 4; ++i)
48124 dremap.perm[i] = i + nonzero_halves[0] * 4;
48125 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
48129 else if (d->one_operand_p)
48130 return false;
48131 else if (TARGET_AVX2
48132 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
48134 /* vpunpckl* */
48135 for (i = 0; i < nelt4; ++i)
48137 remap[i] = i * 2;
48138 remap[i + nelt] = i * 2 + 1;
48139 remap[i + nelt2] = i * 2 + nelt2;
48140 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
48141 dremap.perm[i * 2] = i;
48142 dremap.perm[i * 2 + 1] = i + nelt;
48143 dremap.perm[i * 2 + nelt2] = i + nelt2;
48144 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
48147 else if (TARGET_AVX2
48148 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
48150 /* vpunpckh* */
48151 for (i = 0; i < nelt4; ++i)
48153 remap[i + nelt4] = i * 2;
48154 remap[i + nelt + nelt4] = i * 2 + 1;
48155 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
48156 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
48157 dremap.perm[i * 2] = i + nelt4;
48158 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
48159 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
48160 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
48163 else
48164 return false;
48167 /* Use the remapping array set up above to move the elements from their
48168 swizzled locations into their final destinations. */
48169 dfinal = *d;
48170 for (i = 0; i < nelt; ++i)
48172 unsigned e = remap[d->perm[i]];
48173 gcc_assert (e < nelt);
48174 /* If same_halves is true, both halves of the remapped vector are the
48175 same. Avoid cross-lane accesses if possible. */
48176 if (same_halves && i >= nelt2)
48178 gcc_assert (e < nelt2);
48179 dfinal.perm[i] = e + nelt2;
48181 else
48182 dfinal.perm[i] = e;
48184 if (!d->testing_p)
48186 dremap.target = gen_reg_rtx (dremap.vmode);
48187 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
48189 dfinal.op1 = dfinal.op0;
48190 dfinal.one_operand_p = true;
48192 /* Test if the final remap can be done with a single insn. For V4SFmode or
48193 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
48194 start_sequence ();
48195 ok = expand_vec_perm_1 (&dfinal);
48196 seq = get_insns ();
48197 end_sequence ();
48199 if (!ok)
48200 return false;
48202 if (d->testing_p)
48203 return true;
48205 if (dremap.vmode != dfinal.vmode)
48207 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
48208 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
48211 ok = expand_vec_perm_1 (&dremap);
48212 gcc_assert (ok);
48214 emit_insn (seq);
48215 return true;
48218 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
48219 a single vector cross-lane permutation into vpermq followed
48220 by any of the single insn permutations. */
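/* E.g. if every selected byte comes from 64-bit quarters 0 and 3 of the
   operand, the vpermq places those two quarters into each 128-bit lane and
   the remaining shuffle becomes purely intra-lane.  */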
48222 static bool
48223 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
48225 struct expand_vec_perm_d dremap, dfinal;
48226 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
48227 unsigned contents[2];
48228 bool ok;
48230 if (!(TARGET_AVX2
48231 && (d->vmode == V32QImode || d->vmode == V16HImode)
48232 && d->one_operand_p))
48233 return false;
48235 contents[0] = 0;
48236 contents[1] = 0;
48237 for (i = 0; i < nelt2; ++i)
48239 contents[0] |= 1u << (d->perm[i] / nelt4);
48240 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
48243 for (i = 0; i < 2; ++i)
48245 unsigned int cnt = 0;
48246 for (j = 0; j < 4; ++j)
48247 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
48248 return false;
48251 if (d->testing_p)
48252 return true;
48254 dremap = *d;
48255 dremap.vmode = V4DImode;
48256 dremap.nelt = 4;
48257 dremap.target = gen_reg_rtx (V4DImode);
48258 dremap.op0 = gen_lowpart (V4DImode, d->op0);
48259 dremap.op1 = dremap.op0;
48260 dremap.one_operand_p = true;
48261 for (i = 0; i < 2; ++i)
48263 unsigned int cnt = 0;
48264 for (j = 0; j < 4; ++j)
48265 if ((contents[i] & (1u << j)) != 0)
48266 dremap.perm[2 * i + cnt++] = j;
48267 for (; cnt < 2; ++cnt)
48268 dremap.perm[2 * i + cnt] = 0;
48271 dfinal = *d;
48272 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
48273 dfinal.op1 = dfinal.op0;
48274 dfinal.one_operand_p = true;
48275 for (i = 0, j = 0; i < nelt; ++i)
48277 if (i == nelt2)
48278 j = 2;
48279 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
48280 if ((d->perm[i] / nelt4) == dremap.perm[j])
48282 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
48283 dfinal.perm[i] |= nelt4;
48284 else
48285 gcc_unreachable ();
48288 ok = expand_vec_perm_1 (&dremap);
48289 gcc_assert (ok);
48291 ok = expand_vec_perm_1 (&dfinal);
48292 gcc_assert (ok);
48294 return true;
48297 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
48298    a vector permutation using two instructions: vperm2f128 (or
48299    vperm2i128) followed by any single in-lane permutation.  */
48301 static bool
48302 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
48304 struct expand_vec_perm_d dfirst, dsecond;
48305 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
48306 bool ok;
48308 if (!TARGET_AVX
48309 || GET_MODE_SIZE (d->vmode) != 32
48310 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
48311 return false;
48313 dsecond = *d;
48314 dsecond.one_operand_p = false;
48315 dsecond.testing_p = true;
48317 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
48318 immediate. For perm < 16 the second permutation uses
48319      d->op0 as its first operand; for perm >= 16 it uses d->op1
48320 as first operand. The second operand is the result of
48321 vperm2[fi]128. */
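  /* The loop below simply tries all 32 candidates, skipping those that
     move nothing across lanes, and keeps the first one whose residual
     single-insn shuffle expand_vec_perm_1 accepts.  */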
48322 for (perm = 0; perm < 32; perm++)
48324 /* Ignore permutations which do not move anything cross-lane. */
48325 if (perm < 16)
48327 /* The second shuffle for e.g. V4DFmode has
48328 0123 and ABCD operands.
48329 Ignore AB23, as 23 is already in the second lane
48330 of the first operand. */
48331 if ((perm & 0xc) == (1 << 2)) continue;
48332 /* And 01CD, as 01 is in the first lane of the first
48333 operand. */
48334 if ((perm & 3) == 0) continue;
48335 /* And 4567, as then the vperm2[fi]128 doesn't change
48336 anything on the original 4567 second operand. */
48337 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
48339 else
48341 /* The second shuffle for e.g. V4DFmode has
48342 4567 and ABCD operands.
48343 Ignore AB67, as 67 is already in the second lane
48344 of the first operand. */
48345 if ((perm & 0xc) == (3 << 2)) continue;
48346 /* And 45CD, as 45 is in the first lane of the first
48347 operand. */
48348 if ((perm & 3) == 2) continue;
48349 /* And 0123, as then the vperm2[fi]128 doesn't change
48350 anything on the original 0123 first operand. */
48351 if ((perm & 0xf) == (1 << 2)) continue;
48354 for (i = 0; i < nelt; i++)
48356 j = d->perm[i] / nelt2;
48357 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
48358 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
48359 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
48360 dsecond.perm[i] = d->perm[i] & (nelt - 1);
48361 else
48362 break;
48365 if (i == nelt)
48367 start_sequence ();
48368 ok = expand_vec_perm_1 (&dsecond);
48369 end_sequence ();
48371 else
48372 ok = false;
48374 if (ok)
48376 if (d->testing_p)
48377 return true;
48379 /* Found a usable second shuffle. dfirst will be
48380 vperm2f128 on d->op0 and d->op1. */
48381 dsecond.testing_p = false;
48382 dfirst = *d;
48383 dfirst.target = gen_reg_rtx (d->vmode);
48384 for (i = 0; i < nelt; i++)
48385 dfirst.perm[i] = (i & (nelt2 - 1))
48386 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
48388 canonicalize_perm (&dfirst);
48389 ok = expand_vec_perm_1 (&dfirst);
48390 gcc_assert (ok);
48392 /* And dsecond is some single insn shuffle, taking
48393 d->op0 and result of vperm2f128 (if perm < 16) or
48394 d->op1 and result of vperm2f128 (otherwise). */
48395 if (perm >= 16)
48396 dsecond.op0 = dsecond.op1;
48397 dsecond.op1 = dfirst.target;
48399 ok = expand_vec_perm_1 (&dsecond);
48400 gcc_assert (ok);
48402 return true;
48405 /* For one operand, the only useful vperm2f128 permutation is 0x01
48406 aka lanes swap. */
48407 if (d->one_operand_p)
48408 return false;
48411 return false;
48414 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
48415 a two vector permutation using 2 intra-lane interleave insns
48416 and cross-lane shuffle for 32-byte vectors. */
48418 static bool
48419 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
48421 unsigned i, nelt;
48422 rtx (*gen) (rtx, rtx, rtx);
48424 if (d->one_operand_p)
48425 return false;
48426 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
48428 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
48430 else
48431 return false;
48433 nelt = d->nelt;
48434 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
48435 return false;
48436 for (i = 0; i < nelt; i += 2)
48437 if (d->perm[i] != d->perm[0] + i / 2
48438 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
48439 return false;
48441 if (d->testing_p)
48442 return true;
48444 switch (d->vmode)
48446 case E_V32QImode:
48447 if (d->perm[0])
48448 gen = gen_vec_interleave_highv32qi;
48449 else
48450 gen = gen_vec_interleave_lowv32qi;
48451 break;
48452 case E_V16HImode:
48453 if (d->perm[0])
48454 gen = gen_vec_interleave_highv16hi;
48455 else
48456 gen = gen_vec_interleave_lowv16hi;
48457 break;
48458 case E_V8SImode:
48459 if (d->perm[0])
48460 gen = gen_vec_interleave_highv8si;
48461 else
48462 gen = gen_vec_interleave_lowv8si;
48463 break;
48464 case E_V4DImode:
48465 if (d->perm[0])
48466 gen = gen_vec_interleave_highv4di;
48467 else
48468 gen = gen_vec_interleave_lowv4di;
48469 break;
48470 case E_V8SFmode:
48471 if (d->perm[0])
48472 gen = gen_vec_interleave_highv8sf;
48473 else
48474 gen = gen_vec_interleave_lowv8sf;
48475 break;
48476 case E_V4DFmode:
48477 if (d->perm[0])
48478 gen = gen_vec_interleave_highv4df;
48479 else
48480 gen = gen_vec_interleave_lowv4df;
48481 break;
48482 default:
48483 gcc_unreachable ();
48486 emit_insn (gen (d->target, d->op0, d->op1));
48487 return true;
48490 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
48491 a single vector permutation using a single intra-lane vector
48492    permutation, a vperm2f128 swapping the lanes, and a vblend* insn blending
48493 the non-swapped and swapped vectors together. */
48495 static bool
48496 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
48498 struct expand_vec_perm_d dfirst, dsecond;
48499 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
48500 rtx_insn *seq;
48501 bool ok;
48502 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
48504 if (!TARGET_AVX
48505 || TARGET_AVX2
48506 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
48507 || !d->one_operand_p)
48508 return false;
48510 dfirst = *d;
48511 for (i = 0; i < nelt; i++)
48512 dfirst.perm[i] = 0xff;
48513 for (i = 0, msk = 0; i < nelt; i++)
48515 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
48516 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
48517 return false;
48518 dfirst.perm[j] = d->perm[i];
48519 if (j != i)
48520 msk |= (1 << i);
48522 for (i = 0; i < nelt; i++)
48523 if (dfirst.perm[i] == 0xff)
48524 dfirst.perm[i] = i;
48526 if (!d->testing_p)
48527 dfirst.target = gen_reg_rtx (dfirst.vmode);
48529 start_sequence ();
48530 ok = expand_vec_perm_1 (&dfirst);
48531 seq = get_insns ();
48532 end_sequence ();
48534 if (!ok)
48535 return false;
48537 if (d->testing_p)
48538 return true;
48540 emit_insn (seq);
48542 dsecond = *d;
48543 dsecond.op0 = dfirst.target;
48544 dsecond.op1 = dfirst.target;
48545 dsecond.one_operand_p = true;
48546 dsecond.target = gen_reg_rtx (dsecond.vmode);
48547 for (i = 0; i < nelt; i++)
48548 dsecond.perm[i] = i ^ nelt2;
48550 ok = expand_vec_perm_1 (&dsecond);
48551 gcc_assert (ok);
48553 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
48554 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
48555 return true;
48558 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
48559 permutation using two vperm2f128, followed by a vshufpd insn blending
48560 the two vectors together. */
48562 static bool
48563 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
48565 struct expand_vec_perm_d dfirst, dsecond, dthird;
48566 bool ok;
48568 if (!TARGET_AVX || (d->vmode != V4DFmode))
48569 return false;
48571 if (d->testing_p)
48572 return true;
48574 dfirst = *d;
48575 dsecond = *d;
48576 dthird = *d;
48578 dfirst.perm[0] = (d->perm[0] & ~1);
48579 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
48580 dfirst.perm[2] = (d->perm[2] & ~1);
48581 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
48582 dsecond.perm[0] = (d->perm[1] & ~1);
48583 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
48584 dsecond.perm[2] = (d->perm[3] & ~1);
48585 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
48586 dthird.perm[0] = (d->perm[0] % 2);
48587 dthird.perm[1] = (d->perm[1] % 2) + 4;
48588 dthird.perm[2] = (d->perm[2] % 2) + 2;
48589 dthird.perm[3] = (d->perm[3] % 2) + 6;
48591 dfirst.target = gen_reg_rtx (dfirst.vmode);
48592 dsecond.target = gen_reg_rtx (dsecond.vmode);
48593 dthird.op0 = dfirst.target;
48594 dthird.op1 = dsecond.target;
48595 dthird.one_operand_p = false;
48597 canonicalize_perm (&dfirst);
48598 canonicalize_perm (&dsecond);
48600 ok = expand_vec_perm_1 (&dfirst)
48601 && expand_vec_perm_1 (&dsecond)
48602 && expand_vec_perm_1 (&dthird);
48604 gcc_assert (ok);
48606 return true;
48609 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
48610 permutation with two pshufb insns and an ior. We should have already
48611 failed all two instruction sequences. */
48613 static bool
48614 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
48616 rtx rperm[2][16], vperm, l, h, op, m128;
48617 unsigned int i, nelt, eltsz;
48619 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
48620 return false;
48621 gcc_assert (!d->one_operand_p);
48623 if (d->testing_p)
48624 return true;
48626 nelt = d->nelt;
48627 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48629 /* Generate two permutation masks. If the required element is within
48630 the given vector it is shuffled into the proper lane. If the required
48631 element is in the other vector, force a zero into the lane by setting
48632 bit 7 in the permutation mask. */
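  /* For instance, an element request of 0, 16, 1, 17, ... would produce
     the masks { 0, -128, 1, -128, ... } and { -128, 0, -128, 1, ... };
     the two pshufb results are then combined with the ior below.  */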
48633 m128 = GEN_INT (-128);
48634 for (i = 0; i < nelt; ++i)
48636 unsigned j, e = d->perm[i];
48637 unsigned which = (e >= nelt);
48638 if (e >= nelt)
48639 e -= nelt;
48641 for (j = 0; j < eltsz; ++j)
48643 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
48644 rperm[1-which][i*eltsz + j] = m128;
48648 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
48649 vperm = force_reg (V16QImode, vperm);
48651 l = gen_reg_rtx (V16QImode);
48652 op = gen_lowpart (V16QImode, d->op0);
48653 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
48655 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
48656 vperm = force_reg (V16QImode, vperm);
48658 h = gen_reg_rtx (V16QImode);
48659 op = gen_lowpart (V16QImode, d->op1);
48660 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
48662 op = d->target;
48663 if (d->vmode != V16QImode)
48664 op = gen_reg_rtx (V16QImode);
48665 emit_insn (gen_iorv16qi3 (op, l, h));
48666 if (op != d->target)
48667 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48669 return true;
48672 /* Implement an arbitrary permutation of one V32QImode or V16HImode operand
48673    with two vpshufb insns, a vpermq and a vpor.  We should have already failed
48674    all two or three instruction sequences.  */
48676 static bool
48677 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
48679 rtx rperm[2][32], vperm, l, h, hp, op, m128;
48680 unsigned int i, nelt, eltsz;
48682 if (!TARGET_AVX2
48683 || !d->one_operand_p
48684 || (d->vmode != V32QImode && d->vmode != V16HImode))
48685 return false;
48687 if (d->testing_p)
48688 return true;
48690 nelt = d->nelt;
48691 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48693 /* Generate two permutation masks. If the required element is within
48694      the same lane, it is shuffled in.  If the required element is from the
48695      other lane, force a zero by setting bit 7 in the permutation mask.
48696      The other mask has a non-negative element whenever an element is
48697      requested from the other lane; that element is also moved to the other lane,
48698      so that the result of vpshufb can have its two V2TImode halves
48699 swapped. */
48700 m128 = GEN_INT (-128);
48701 for (i = 0; i < nelt; ++i)
48703 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48704 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
48706 for (j = 0; j < eltsz; ++j)
48708 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
48709 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
48713 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
48714 vperm = force_reg (V32QImode, vperm);
48716 h = gen_reg_rtx (V32QImode);
48717 op = gen_lowpart (V32QImode, d->op0);
48718 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
48720   /* Swap the 128-bit lanes of h into hp.  */
48721 hp = gen_reg_rtx (V4DImode);
48722 op = gen_lowpart (V4DImode, h);
48723 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
48724 const1_rtx));
48726 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
48727 vperm = force_reg (V32QImode, vperm);
48729 l = gen_reg_rtx (V32QImode);
48730 op = gen_lowpart (V32QImode, d->op0);
48731 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
48733 op = d->target;
48734 if (d->vmode != V32QImode)
48735 op = gen_reg_rtx (V32QImode);
48736 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
48737 if (op != d->target)
48738 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48740 return true;
48743 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48744    and extract-odd permutations of two V32QImode or V16HImode operands
48745 with two vpshufb insns, vpor and vpermq. We should have already
48746 failed all two or three instruction sequences. */
48748 static bool
48749 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
48751 rtx rperm[2][32], vperm, l, h, ior, op, m128;
48752 unsigned int i, nelt, eltsz;
48754 if (!TARGET_AVX2
48755 || d->one_operand_p
48756 || (d->vmode != V32QImode && d->vmode != V16HImode))
48757 return false;
48759 for (i = 0; i < d->nelt; ++i)
48760 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
48761 return false;
48763 if (d->testing_p)
48764 return true;
48766 nelt = d->nelt;
48767 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48769 /* Generate two permutation masks. In the first permutation mask
48770 the first quarter will contain indexes for the first half
48771      of op0, the second quarter will have bit 7 set, the third quarter
48772      will contain indexes for the second half of op0, and the
48773      last quarter will have bit 7 set.  In the second permutation mask
48774      the first quarter will have bit 7 set, the second quarter
48775      indexes for the first half of op1, the third quarter bit 7 set,
48776      and the last quarter indexes for the second half of op1.
48777 I.e. the first mask e.g. for V32QImode extract even will be:
48778 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
48779 (all values masked with 0xf except for -128) and second mask
48780 for extract even will be
48781 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
48782 m128 = GEN_INT (-128);
48783 for (i = 0; i < nelt; ++i)
48785 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48786 unsigned which = d->perm[i] >= nelt;
48787 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
48789 for (j = 0; j < eltsz; ++j)
48791 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
48792 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
48796 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
48797 vperm = force_reg (V32QImode, vperm);
48799 l = gen_reg_rtx (V32QImode);
48800 op = gen_lowpart (V32QImode, d->op0);
48801 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
48803 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
48804 vperm = force_reg (V32QImode, vperm);
48806 h = gen_reg_rtx (V32QImode);
48807 op = gen_lowpart (V32QImode, d->op1);
48808 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
48810 ior = gen_reg_rtx (V32QImode);
48811 emit_insn (gen_iorv32qi3 (ior, l, h));
48813 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
48814 op = gen_reg_rtx (V4DImode);
48815 ior = gen_lowpart (V4DImode, ior);
48816 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
48817 const1_rtx, GEN_INT (3)));
48818 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48820 return true;
48823 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48824 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
48825 with two "and" and "pack" or two "shift" and "pack" insns. We should
48826 have already failed all two instruction sequences. */
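/* E.g. extracting the even elements of two V8HImode vectors masks both
   operands with 0xffff in V4SImode and packusdw's the results, while the
   odd elements are obtained by shifting each 32-bit word right by 16
   before the pack.  */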
48828 static bool
48829 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
48831 rtx op, dop0, dop1, t, rperm[16];
48832 unsigned i, odd, c, s, nelt = d->nelt;
48833 bool end_perm = false;
48834 machine_mode half_mode;
48835 rtx (*gen_and) (rtx, rtx, rtx);
48836 rtx (*gen_pack) (rtx, rtx, rtx);
48837 rtx (*gen_shift) (rtx, rtx, rtx);
48839 if (d->one_operand_p)
48840 return false;
48842 switch (d->vmode)
48844 case E_V8HImode:
48845 /* Required for "pack". */
48846 if (!TARGET_SSE4_1)
48847 return false;
48848 c = 0xffff;
48849 s = 16;
48850 half_mode = V4SImode;
48851 gen_and = gen_andv4si3;
48852 gen_pack = gen_sse4_1_packusdw;
48853 gen_shift = gen_lshrv4si3;
48854 break;
48855 case E_V16QImode:
48856 /* No check as all instructions are SSE2. */
48857 c = 0xff;
48858 s = 8;
48859 half_mode = V8HImode;
48860 gen_and = gen_andv8hi3;
48861 gen_pack = gen_sse2_packuswb;
48862 gen_shift = gen_lshrv8hi3;
48863 break;
48864 case E_V16HImode:
48865 if (!TARGET_AVX2)
48866 return false;
48867 c = 0xffff;
48868 s = 16;
48869 half_mode = V8SImode;
48870 gen_and = gen_andv8si3;
48871 gen_pack = gen_avx2_packusdw;
48872 gen_shift = gen_lshrv8si3;
48873 end_perm = true;
48874 break;
48875 case E_V32QImode:
48876 if (!TARGET_AVX2)
48877 return false;
48878 c = 0xff;
48879 s = 8;
48880 half_mode = V16HImode;
48881 gen_and = gen_andv16hi3;
48882 gen_pack = gen_avx2_packuswb;
48883 gen_shift = gen_lshrv16hi3;
48884 end_perm = true;
48885 break;
48886 default:
48887 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
48888 general shuffles. */
48889 return false;
48892 /* Check that permutation is even or odd. */
48893 odd = d->perm[0];
48894 if (odd > 1)
48895 return false;
48897 for (i = 1; i < nelt; ++i)
48898 if (d->perm[i] != 2 * i + odd)
48899 return false;
48901 if (d->testing_p)
48902 return true;
48904 dop0 = gen_reg_rtx (half_mode);
48905 dop1 = gen_reg_rtx (half_mode);
48906 if (odd == 0)
48908 for (i = 0; i < nelt / 2; i++)
48909 rperm[i] = GEN_INT (c);
48910 t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
48911 t = force_reg (half_mode, t);
48912 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
48913 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
48915 else
48917 emit_insn (gen_shift (dop0,
48918 gen_lowpart (half_mode, d->op0),
48919 GEN_INT (s)));
48920 emit_insn (gen_shift (dop1,
48921 gen_lowpart (half_mode, d->op1),
48922 GEN_INT (s)));
48924   /* With AVX2, for the 256-bit case we need to permute the pack result.  */
48925 if (TARGET_AVX2 && end_perm)
48927 op = gen_reg_rtx (d->vmode);
48928 t = gen_reg_rtx (V4DImode);
48929 emit_insn (gen_pack (op, dop0, dop1));
48930 emit_insn (gen_avx2_permv4di_1 (t,
48931 gen_lowpart (V4DImode, op),
48932 const0_rtx,
48933 const2_rtx,
48934 const1_rtx,
48935 GEN_INT (3)));
48936 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
48938 else
48939 emit_insn (gen_pack (d->target, dop0, dop1));
48941 return true;
48944 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48945    and extract-odd permutations of two V64QImode operands
48946    with two "shift", two "trunc" and one "concat" insn for "odd"
48947    and two "trunc" and one "concat" insn for "even".
48948    We should have already failed all two-instruction sequences.  */
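/* E.g. the even bytes of each V64QImode operand are the low bytes of its
   16-bit elements, so truncating the V32HImode view with vpmovwb extracts
   them; for the odd bytes each 16-bit element is first shifted right by 8.
   The two V32QImode halves are then concatenated.  */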
48950 static bool
48951 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
48953 rtx t1, t2, t3, t4;
48954 unsigned i, odd, nelt = d->nelt;
48956 if (!TARGET_AVX512BW
48957 || d->one_operand_p
48958 || d->vmode != V64QImode)
48959 return false;
48961 /* Check that permutation is even or odd. */
48962 odd = d->perm[0];
48963 if (odd > 1)
48964 return false;
48966 for (i = 1; i < nelt; ++i)
48967 if (d->perm[i] != 2 * i + odd)
48968 return false;
48970 if (d->testing_p)
48971 return true;
48974 if (odd)
48976 t1 = gen_reg_rtx (V32HImode);
48977 t2 = gen_reg_rtx (V32HImode);
48978 emit_insn (gen_lshrv32hi3 (t1,
48979 gen_lowpart (V32HImode, d->op0),
48980 GEN_INT (8)));
48981 emit_insn (gen_lshrv32hi3 (t2,
48982 gen_lowpart (V32HImode, d->op1),
48983 GEN_INT (8)));
48985 else
48987 t1 = gen_lowpart (V32HImode, d->op0);
48988 t2 = gen_lowpart (V32HImode, d->op1);
48991 t3 = gen_reg_rtx (V32QImode);
48992 t4 = gen_reg_rtx (V32QImode);
48993 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
48994 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
48995 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
48997 return true;
49000 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
49001 and extract-odd permutations. */
49003 static bool
49004 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
49006 rtx t1, t2, t3, t4, t5;
49008 switch (d->vmode)
49010 case E_V4DFmode:
49011 if (d->testing_p)
49012 break;
49013 t1 = gen_reg_rtx (V4DFmode);
49014 t2 = gen_reg_rtx (V4DFmode);
49016 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
49017 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
49018 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
49020 /* Now an unpck[lh]pd will produce the result required. */
49021 if (odd)
49022 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
49023 else
49024 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
49025 emit_insn (t3);
49026 break;
49028 case E_V8SFmode:
49030 int mask = odd ? 0xdd : 0x88;
49032 if (d->testing_p)
49033 break;
49034 t1 = gen_reg_rtx (V8SFmode);
49035 t2 = gen_reg_rtx (V8SFmode);
49036 t3 = gen_reg_rtx (V8SFmode);
49038 /* Shuffle within the 128-bit lanes to produce:
49039 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
49040 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
49041 GEN_INT (mask)));
49043 /* Shuffle the lanes around to produce:
49044 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
49045 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
49046 GEN_INT (0x3)));
49048 /* Shuffle within the 128-bit lanes to produce:
49049 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
49050 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
49052 /* Shuffle within the 128-bit lanes to produce:
49053 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
49054 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
49056 /* Shuffle the lanes around to produce:
49057 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
49058 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
49059 GEN_INT (0x20)));
49061 break;
49063 case E_V2DFmode:
49064 case E_V4SFmode:
49065 case E_V2DImode:
49066 case E_V4SImode:
49067 /* These are always directly implementable by expand_vec_perm_1. */
49068 gcc_unreachable ();
49070 case E_V8HImode:
49071 if (TARGET_SSE4_1)
49072 return expand_vec_perm_even_odd_pack (d);
49073 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
49074 return expand_vec_perm_pshufb2 (d);
49075 else
49077 if (d->testing_p)
49078 break;
49079 /* We need 2*log2(N)-1 operations to achieve odd/even
49080 with interleave. */
49081 t1 = gen_reg_rtx (V8HImode);
49082 t2 = gen_reg_rtx (V8HImode);
49083 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
49084 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
49085 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
49086 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
49087 if (odd)
49088 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
49089 else
49090 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
49091 emit_insn (t3);
49093 break;
49095 case E_V16QImode:
49096 return expand_vec_perm_even_odd_pack (d);
49098 case E_V16HImode:
49099 case E_V32QImode:
49100 return expand_vec_perm_even_odd_pack (d);
49102 case E_V64QImode:
49103 return expand_vec_perm_even_odd_trunc (d);
49105 case E_V4DImode:
49106 if (!TARGET_AVX2)
49108 struct expand_vec_perm_d d_copy = *d;
49109 d_copy.vmode = V4DFmode;
49110 if (d->testing_p)
49111 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
49112 else
49113 d_copy.target = gen_reg_rtx (V4DFmode);
49114 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
49115 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
49116 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
49118 if (!d->testing_p)
49119 emit_move_insn (d->target,
49120 gen_lowpart (V4DImode, d_copy.target));
49121 return true;
49123 return false;
49126 if (d->testing_p)
49127 break;
49129 t1 = gen_reg_rtx (V4DImode);
49130 t2 = gen_reg_rtx (V4DImode);
49132 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
49133 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
49134 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
49136       /* Now a vpunpck[lh]qdq will produce the required result.  */
49137 if (odd)
49138 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
49139 else
49140 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
49141 emit_insn (t3);
49142 break;
49144 case E_V8SImode:
49145 if (!TARGET_AVX2)
49147 struct expand_vec_perm_d d_copy = *d;
49148 d_copy.vmode = V8SFmode;
49149 if (d->testing_p)
49150 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
49151 else
49152 d_copy.target = gen_reg_rtx (V8SFmode);
49153 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
49154 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
49155 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
49157 if (!d->testing_p)
49158 emit_move_insn (d->target,
49159 gen_lowpart (V8SImode, d_copy.target));
49160 return true;
49162 return false;
49165 if (d->testing_p)
49166 break;
49168 t1 = gen_reg_rtx (V8SImode);
49169 t2 = gen_reg_rtx (V8SImode);
49170 t3 = gen_reg_rtx (V4DImode);
49171 t4 = gen_reg_rtx (V4DImode);
49172 t5 = gen_reg_rtx (V4DImode);
49174 /* Shuffle the lanes around into
49175 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
49176 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
49177 gen_lowpart (V4DImode, d->op1),
49178 GEN_INT (0x20)));
49179 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
49180 gen_lowpart (V4DImode, d->op1),
49181 GEN_INT (0x31)));
49183 /* Swap the 2nd and 3rd position in each lane into
49184 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
49185 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
49186 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
49187 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
49188 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
49190       /* Now a vpunpck[lh]qdq will produce
49191 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
49192 if (odd)
49193 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
49194 gen_lowpart (V4DImode, t2));
49195 else
49196 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
49197 gen_lowpart (V4DImode, t2));
49198 emit_insn (t3);
49199 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
49200 break;
49202 default:
49203 gcc_unreachable ();
49206 return true;
49209 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
49210 extract-even and extract-odd permutations. */
49212 static bool
49213 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
49215 unsigned i, odd, nelt = d->nelt;
49217 odd = d->perm[0];
49218 if (odd != 0 && odd != 1)
49219 return false;
49221 for (i = 1; i < nelt; ++i)
49222 if (d->perm[i] != 2 * i + odd)
49223 return false;
49225 return expand_vec_perm_even_odd_1 (d, odd);
49228 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
49229 permutations. We assume that expand_vec_perm_1 has already failed. */
49231 static bool
49232 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
49234 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
49235 machine_mode vmode = d->vmode;
49236 unsigned char perm2[4];
49237 rtx op0 = d->op0, dest;
49238 bool ok;
49240 switch (vmode)
49242 case E_V4DFmode:
49243 case E_V8SFmode:
49244 /* These are special-cased in sse.md so that we can optionally
49245 use the vbroadcast instruction. They expand to two insns
49246 if the input happens to be in a register. */
49247 gcc_unreachable ();
49249 case E_V2DFmode:
49250 case E_V2DImode:
49251 case E_V4SFmode:
49252 case E_V4SImode:
49253 /* These are always implementable using standard shuffle patterns. */
49254 gcc_unreachable ();
49256 case E_V8HImode:
49257 case E_V16QImode:
49258 /* These can be implemented via interleave. We save one insn by
49259 stopping once we have promoted to V4SImode and then use pshufd. */
49260 if (d->testing_p)
49261 return true;
49264 rtx dest;
49265 rtx (*gen) (rtx, rtx, rtx)
49266 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
49267 : gen_vec_interleave_lowv8hi;
49269 if (elt >= nelt2)
49271 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
49272 : gen_vec_interleave_highv8hi;
49273 elt -= nelt2;
49275 nelt2 /= 2;
49277 dest = gen_reg_rtx (vmode);
49278 emit_insn (gen (dest, op0, op0));
49279 vmode = get_mode_wider_vector (vmode);
49280 op0 = gen_lowpart (vmode, dest);
49282 while (vmode != V4SImode);
49284 memset (perm2, elt, 4);
49285 dest = gen_reg_rtx (V4SImode);
49286 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
49287 gcc_assert (ok);
49288 if (!d->testing_p)
49289 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
49290 return true;
49292 case E_V64QImode:
49293 case E_V32QImode:
49294 case E_V16HImode:
49295 case E_V8SImode:
49296 case E_V4DImode:
49297 /* For AVX2 broadcasts of the first element vpbroadcast* or
49298 vpermq should be used by expand_vec_perm_1. */
49299 gcc_assert (!TARGET_AVX2 || d->perm[0]);
49300 return false;
49302 default:
49303 gcc_unreachable ();
49307 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
49308 broadcast permutations. */
49310 static bool
49311 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
49313 unsigned i, elt, nelt = d->nelt;
49315 if (!d->one_operand_p)
49316 return false;
49318 elt = d->perm[0];
49319 for (i = 1; i < nelt; ++i)
49320 if (d->perm[i] != elt)
49321 return false;
49323 return expand_vec_perm_broadcast_1 (d);
49326 /* Implement arbitrary permutations of two V64QImode operands
49327 with 2 vpermi2w, 2 vpshufb and one vpor instruction. */
49328 static bool
49329 expand_vec_perm_vpermi2_vpshub2 (struct expand_vec_perm_d *d)
49331 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
49332 return false;
49334 if (d->testing_p)
49335 return true;
49337 struct expand_vec_perm_d ds[2];
49338 rtx rperm[128], vperm, target0, target1;
49339 unsigned int i, nelt;
49340 machine_mode vmode;
49342 nelt = d->nelt;
49343 vmode = V64QImode;
49345 for (i = 0; i < 2; i++)
49347 ds[i] = *d;
49348 ds[i].vmode = V32HImode;
49349 ds[i].nelt = 32;
49350 ds[i].target = gen_reg_rtx (V32HImode);
49351 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
49352 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
49355 /* Prepare permutations such that the first one takes care of
49356 putting the even bytes into the right positions or one position
49357 higher (ds[0]) and the second one takes care of
49358 putting the odd bytes into the right positions or one position
49359 lower (ds[1]). */
49361 for (i = 0; i < nelt; i++)
49363 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
49364 if (i & 1)
49366 rperm[i] = constm1_rtx;
49367 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
49369 else
49371 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
49372 rperm[i + 64] = constm1_rtx;
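/* rperm[0..63] is the vpshufb mask for the even destination bytes and
   rperm[64..127] the one for the odd bytes: within each 16-byte lane,
   (i & 14) addresses the word that the V32HImode permutation put at
   destination word i/2, (d->perm[i] & 1) picks its low or high byte, and
   -1 (bit 7 set) makes vpshufb emit zero so the final vpor can merge.  */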
49376 bool ok = expand_vec_perm_1 (&ds[0]);
49377 gcc_assert (ok);
49378 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
49380 ok = expand_vec_perm_1 (&ds[1]);
49381 gcc_assert (ok);
49382 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
49384 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
49385 vperm = force_reg (vmode, vperm);
49386 target0 = gen_reg_rtx (V64QImode);
49387 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
49389 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
49390 vperm = force_reg (vmode, vperm);
49391 target1 = gen_reg_rtx (V64QImode);
49392 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
49394 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
49395 return true;
49398 /* Implement arbitrary permutation of two V32QImode and V16HImode operands
49399 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
49400 all the shorter instruction sequences. */
49402 static bool
49403 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
49405 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
49406 unsigned int i, nelt, eltsz;
49407 bool used[4];
49409 if (!TARGET_AVX2
49410 || d->one_operand_p
49411 || (d->vmode != V32QImode && d->vmode != V16HImode))
49412 return false;
49414 if (d->testing_p)
49415 return true;
49417 nelt = d->nelt;
49418 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
49420 /* Generate 4 permutation masks. If the required element is within
49421 the same lane, it is shuffled in. If the required element is from the
49422 other lane, force a zero by setting bit 7 in the permutation mask.
49423 The other mask has non-negative elements if the element
49424 is requested from the other lane, but is also moved to the other lane,
49425 so that the result of vpshufb can have the two V2TImode halves
49426 swapped. */
49427 m128 = GEN_INT (-128);
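/* -128 is 0x80; a set bit 7 in a pshufb control byte makes the
   corresponding result byte zero.  */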
49428 for (i = 0; i < 32; ++i)
49430 rperm[0][i] = m128;
49431 rperm[1][i] = m128;
49432 rperm[2][i] = m128;
49433 rperm[3][i] = m128;
49435 used[0] = false;
49436 used[1] = false;
49437 used[2] = false;
49438 used[3] = false;
49439 for (i = 0; i < nelt; ++i)
49441 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
49442 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
49443 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
49445 for (j = 0; j < eltsz; ++j)
49446 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
49447 used[which] = true;
49450 for (i = 0; i < 2; ++i)
49452 if (!used[2 * i + 1])
49454 h[i] = NULL_RTX;
49455 continue;
49457 vperm = gen_rtx_CONST_VECTOR (V32QImode,
49458 gen_rtvec_v (32, rperm[2 * i + 1]));
49459 vperm = force_reg (V32QImode, vperm);
49460 h[i] = gen_reg_rtx (V32QImode);
49461 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
49462 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
49465 /* Swap the 128-bit lanes of h[X]. */
49466 for (i = 0; i < 2; ++i)
49468 if (h[i] == NULL_RTX)
49469 continue;
49470 op = gen_reg_rtx (V4DImode);
49471 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
49472 const2_rtx, GEN_INT (3), const0_rtx,
49473 const1_rtx));
49474 h[i] = gen_lowpart (V32QImode, op);
49477 for (i = 0; i < 2; ++i)
49479 if (!used[2 * i])
49481 l[i] = NULL_RTX;
49482 continue;
49484 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
49485 vperm = force_reg (V32QImode, vperm);
49486 l[i] = gen_reg_rtx (V32QImode);
49487 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
49488 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
49491 for (i = 0; i < 2; ++i)
49493 if (h[i] && l[i])
49495 op = gen_reg_rtx (V32QImode);
49496 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
49497 l[i] = op;
49499 else if (h[i])
49500 l[i] = h[i];
49503 gcc_assert (l[0] && l[1]);
49504 op = d->target;
49505 if (d->vmode != V32QImode)
49506 op = gen_reg_rtx (V32QImode);
49507 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
49508 if (op != d->target)
49509 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
49510 return true;
49513 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
49514 With all of the interface bits taken care of, perform the expansion
49515 in D and return true on success. */
49517 static bool
49518 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
49520 /* Try a single instruction expansion. */
49521 if (expand_vec_perm_1 (d))
49522 return true;
49524 /* Try sequences of two instructions. */
49526 if (expand_vec_perm_pshuflw_pshufhw (d))
49527 return true;
49529 if (expand_vec_perm_palignr (d, false))
49530 return true;
49532 if (expand_vec_perm_interleave2 (d))
49533 return true;
49535 if (expand_vec_perm_broadcast (d))
49536 return true;
49538 if (expand_vec_perm_vpermq_perm_1 (d))
49539 return true;
49541 if (expand_vec_perm_vperm2f128 (d))
49542 return true;
49544 if (expand_vec_perm_pblendv (d))
49545 return true;
49547 /* Try sequences of three instructions. */
49549 if (expand_vec_perm_even_odd_pack (d))
49550 return true;
49552 if (expand_vec_perm_2vperm2f128_vshuf (d))
49553 return true;
49555 if (expand_vec_perm_pshufb2 (d))
49556 return true;
49558 if (expand_vec_perm_interleave3 (d))
49559 return true;
49561 if (expand_vec_perm_vperm2f128_vblend (d))
49562 return true;
49564 /* Try sequences of four instructions. */
49566 if (expand_vec_perm_even_odd_trunc (d))
49567 return true;
49568 if (expand_vec_perm_vpshufb2_vpermq (d))
49569 return true;
49571 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
49572 return true;
49574 if (expand_vec_perm_vpermi2_vpshub2 (d))
49575 return true;
49577 /* ??? Look for narrow permutations whose element orderings would
49578 allow the promotion to a wider mode. */
49580 /* ??? Look for sequences of interleave or a wider permute that place
49581 the data into the correct lanes for a half-vector shuffle like
49582 pshuf[lh]w or vpermilps. */
49584 /* ??? Look for sequences of interleave that produce the desired results.
49585 The combinatorics of punpck[lh] get pretty ugly... */
49587 if (expand_vec_perm_even_odd (d))
49588 return true;
49590 /* Even longer sequences. */
49591 if (expand_vec_perm_vpshufb4_vpermq2 (d))
49592 return true;
49594 /* See if we can get the same permutation in a different vector integer
49595 mode. */
49596 struct expand_vec_perm_d nd;
49597 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
49599 if (!d->testing_p)
49600 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
49601 return true;
49604 return false;
49607 /* If a permutation only uses one operand, make it clear. Returns true
49608 if the permutation references both operands. */
49610 static bool
49611 canonicalize_perm (struct expand_vec_perm_d *d)
49613 int i, which, nelt = d->nelt;
49615 for (i = which = 0; i < nelt; ++i)
49616 which |= (d->perm[i] < nelt ? 1 : 2);
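/* WHICH now has bit 0 set if any element comes from op0 and bit 1 set
   if any element comes from op1.  */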
49618 d->one_operand_p = true;
49619 switch (which)
49621 default:
49622 gcc_unreachable();
49624 case 3:
49625 if (!rtx_equal_p (d->op0, d->op1))
49627 d->one_operand_p = false;
49628 break;
49630 /* The elements of PERM do not suggest that only the first operand
49631 is used, but both operands are identical. Allow easier matching
49632 of the permutation by folding the permutation into the single
49633 input vector. */
49634 /* FALLTHRU */
49636 case 2:
49637 for (i = 0; i < nelt; ++i)
49638 d->perm[i] &= nelt - 1;
49639 d->op0 = d->op1;
49640 break;
49642 case 1:
49643 d->op1 = d->op0;
49644 break;
49647 return (which == 3);
49650 bool
49651 ix86_expand_vec_perm_const (rtx operands[4])
49653 struct expand_vec_perm_d d;
49654 unsigned char perm[MAX_VECT_LEN];
49655 int i, nelt;
49656 bool two_args;
49657 rtx sel;
49659 d.target = operands[0];
49660 d.op0 = operands[1];
49661 d.op1 = operands[2];
49662 sel = operands[3];
49664 d.vmode = GET_MODE (d.target);
49665 gcc_assert (VECTOR_MODE_P (d.vmode));
49666 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49667 d.testing_p = false;
49669 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
49670 gcc_assert (XVECLEN (sel, 0) == nelt);
49671 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
49673 for (i = 0; i < nelt; ++i)
49675 rtx e = XVECEXP (sel, 0, i);
49676 int ei = INTVAL (e) & (2 * nelt - 1);
49677 d.perm[i] = ei;
49678 perm[i] = ei;
49681 two_args = canonicalize_perm (&d);
49683 if (ix86_expand_vec_perm_const_1 (&d))
49684 return true;
49686 /* If the selector says both arguments are needed, but the operands are the
49687 same, the above tried to expand with one_operand_p set and a flattened selector.
49688 If that didn't work, retry without one_operand_p; we succeeded with that
49689 during testing. */
49690 if (two_args && d.one_operand_p)
49692 d.one_operand_p = false;
49693 memcpy (d.perm, perm, sizeof (perm));
49694 return ix86_expand_vec_perm_const_1 (&d);
49697 return false;
49700 /* Implement targetm.vectorize.vec_perm_const_ok. */
49702 static bool
49703 ix86_vectorize_vec_perm_const_ok (machine_mode vmode,
49704 const unsigned char *sel)
49706 struct expand_vec_perm_d d;
49707 unsigned int i, nelt, which;
49708 bool ret;
49710 d.vmode = vmode;
49711 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49712 d.testing_p = true;
49714 /* Given sufficient ISA support we can just return true here
49715 for selected vector modes. */
49716 switch (d.vmode)
49718 case E_V16SFmode:
49719 case E_V16SImode:
49720 case E_V8DImode:
49721 case E_V8DFmode:
49722 if (TARGET_AVX512F)
49723 /* All implementable with a single vpermi2 insn. */
49724 return true;
49725 break;
49726 case E_V32HImode:
49727 if (TARGET_AVX512BW)
49728 /* All implementable with a single vpermi2 insn. */
49729 return true;
49730 break;
49731 case E_V64QImode:
49732 if (TARGET_AVX512BW)
49733 /* Implementable with 2 vpermi2, 2 vpshufb and 1 or insn. */
49734 return true;
49735 break;
49736 case E_V8SImode:
49737 case E_V8SFmode:
49738 case E_V4DFmode:
49739 case E_V4DImode:
49740 if (TARGET_AVX512VL)
49741 /* All implementable with a single vpermi2 insn. */
49742 return true;
49743 break;
49744 case E_V16HImode:
49745 if (TARGET_AVX2)
49746 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
49747 return true;
49748 break;
49749 case E_V32QImode:
49750 if (TARGET_AVX2)
49751 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
49752 return true;
49753 break;
49754 case E_V4SImode:
49755 case E_V4SFmode:
49756 case E_V8HImode:
49757 case E_V16QImode:
49758 /* All implementable with a single vpperm insn. */
49759 if (TARGET_XOP)
49760 return true;
49761 /* All implementable with 2 pshufb + 1 ior. */
49762 if (TARGET_SSSE3)
49763 return true;
49764 break;
49765 case E_V2DImode:
49766 case E_V2DFmode:
49767 /* All implementable with shufpd or unpck[lh]pd. */
49768 return true;
49769 default:
49770 return false;
49773 /* Extract the values from the vector CST into the permutation
49774 array in D. */
49775 memcpy (d.perm, sel, nelt);
49776 for (i = which = 0; i < nelt; ++i)
49778 unsigned char e = d.perm[i];
49779 gcc_assert (e < 2 * nelt);
49780 which |= (e < nelt ? 1 : 2);
49783 /* For all elements from second vector, fold the elements to first. */
49784 if (which == 2)
49785 for (i = 0; i < nelt; ++i)
49786 d.perm[i] -= nelt;
49788 /* Check whether the mask can be applied to the vector type. */
49789 d.one_operand_p = (which != 3);
49791 /* Implementable with shufps or pshufd. */
49792 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
49793 return true;
49795 /* Otherwise we have to go through the motions and see if we can
49796 figure out how to generate the requested permutation. */
49797 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
49798 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
49799 if (!d.one_operand_p)
49800 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
49802 start_sequence ();
49803 ret = ix86_expand_vec_perm_const_1 (&d);
49804 end_sequence ();
49806 return ret;
49809 void
49810 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
49812 struct expand_vec_perm_d d;
49813 unsigned i, nelt;
49815 d.target = targ;
49816 d.op0 = op0;
49817 d.op1 = op1;
49818 d.vmode = GET_MODE (targ);
49819 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49820 d.one_operand_p = false;
49821 d.testing_p = false;
49823 for (i = 0; i < nelt; ++i)
49824 d.perm[i] = i * 2 + odd;
49826 /* We'll either be able to implement the permutation directly... */
49827 if (expand_vec_perm_1 (&d))
49828 return;
49830 /* ... or we use the special-case patterns. */
49831 expand_vec_perm_even_odd_1 (&d, odd);
49834 static void
49835 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
49837 struct expand_vec_perm_d d;
49838 unsigned i, nelt, base;
49839 bool ok;
49841 d.target = targ;
49842 d.op0 = op0;
49843 d.op1 = op1;
49844 d.vmode = GET_MODE (targ);
49845 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49846 d.one_operand_p = false;
49847 d.testing_p = false;
49849 base = high_p ? nelt / 2 : 0;
49850 for (i = 0; i < nelt / 2; ++i)
49852 d.perm[i * 2] = i + base;
49853 d.perm[i * 2 + 1] = i + base + nelt;
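/* E.g. for nelt == 4 and high_p == false this builds the
   interleave permutation { 0, 4, 1, 5 }.  */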
49856 /* Note that for AVX this isn't one instruction. */
49857 ok = ix86_expand_vec_perm_const_1 (&d);
49858 gcc_assert (ok);
49862 /* Expand a vector operation CODE for a V*QImode in terms of the
49863 same operation on V*HImode. */
49865 void
49866 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
49868 machine_mode qimode = GET_MODE (dest);
49869 machine_mode himode;
49870 rtx (*gen_il) (rtx, rtx, rtx);
49871 rtx (*gen_ih) (rtx, rtx, rtx);
49872 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
49873 struct expand_vec_perm_d d;
49874 bool ok, full_interleave;
49875 bool uns_p = false;
49876 int i;
49878 switch (qimode)
49880 case E_V16QImode:
49881 himode = V8HImode;
49882 gen_il = gen_vec_interleave_lowv16qi;
49883 gen_ih = gen_vec_interleave_highv16qi;
49884 break;
49885 case E_V32QImode:
49886 himode = V16HImode;
49887 gen_il = gen_avx2_interleave_lowv32qi;
49888 gen_ih = gen_avx2_interleave_highv32qi;
49889 break;
49890 case E_V64QImode:
49891 himode = V32HImode;
49892 gen_il = gen_avx512bw_interleave_lowv64qi;
49893 gen_ih = gen_avx512bw_interleave_highv64qi;
49894 break;
49895 default:
49896 gcc_unreachable ();
49899 op2_l = op2_h = op2;
49900 switch (code)
49902 case MULT:
49903 /* Unpack data such that we've got a source byte in each low byte of
49904 each word. We don't care what goes into the high byte of each word.
49905 Rather than trying to get zero in there, most convenient is to let
49906 it be a copy of the low byte. */
49907 op2_l = gen_reg_rtx (qimode);
49908 op2_h = gen_reg_rtx (qimode);
49909 emit_insn (gen_il (op2_l, op2, op2));
49910 emit_insn (gen_ih (op2_h, op2, op2));
49911 /* FALLTHRU */
49913 op1_l = gen_reg_rtx (qimode);
49914 op1_h = gen_reg_rtx (qimode);
49915 emit_insn (gen_il (op1_l, op1, op1));
49916 emit_insn (gen_ih (op1_h, op1, op1));
49917 full_interleave = qimode == V16QImode;
49918 break;
49920 case ASHIFT:
49921 case LSHIFTRT:
49922 uns_p = true;
49923 /* FALLTHRU */
49924 case ASHIFTRT:
49925 op1_l = gen_reg_rtx (himode);
49926 op1_h = gen_reg_rtx (himode);
49927 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
49928 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
49929 full_interleave = true;
49930 break;
49931 default:
49932 gcc_unreachable ();
49935 /* Perform the operation. */
49936 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
49937 1, OPTAB_DIRECT);
49938 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
49939 1, OPTAB_DIRECT);
49940 gcc_assert (res_l && res_h);
49942 /* Merge the data back into the right place. */
49943 d.target = dest;
49944 d.op0 = gen_lowpart (qimode, res_l);
49945 d.op1 = gen_lowpart (qimode, res_h);
49946 d.vmode = qimode;
49947 d.nelt = GET_MODE_NUNITS (qimode);
49948 d.one_operand_p = false;
49949 d.testing_p = false;
49951 if (full_interleave)
49953 /* For SSE2, we used a full interleave, so the desired
49954 results are in the even elements. */
49955 for (i = 0; i < d.nelt; ++i)
49956 d.perm[i] = i * 2;
49958 else
49960 /* For AVX, the interleave used above was not cross-lane. So the
49961 extraction is evens but with the second and third quarter swapped.
49962 Happily, that is even one insn shorter than even extraction.
49963 For AVX512BW we have 4 lanes. We extract evens from within a lane,
49964 always first from the first and then from the second source operand;
49965 the index bits above the low 4 bits remain the same.
49966 Thus, for d.nelt == 32 we want permutation
49967 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
49968 and for d.nelt == 64 we want permutation
49969 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
49970 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
49971 for (i = 0; i < d.nelt; ++i)
49972 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
49975 ok = ix86_expand_vec_perm_const_1 (&d);
49976 gcc_assert (ok);
49978 set_unique_reg_note (get_last_insn (), REG_EQUAL,
49979 gen_rtx_fmt_ee (code, qimode, op1, op2));
49982 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
49983 if op is CONST_VECTOR with all odd elements equal to their
49984 preceding element. */
49986 static bool
49987 const_vector_equal_evenodd_p (rtx op)
49989 machine_mode mode = GET_MODE (op);
49990 int i, nunits = GET_MODE_NUNITS (mode);
49991 if (GET_CODE (op) != CONST_VECTOR
49992 || nunits != CONST_VECTOR_NUNITS (op))
49993 return false;
49994 for (i = 0; i < nunits; i += 2)
49995 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
49996 return false;
49997 return true;
50000 void
50001 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
50002 bool uns_p, bool odd_p)
50004 machine_mode mode = GET_MODE (op1);
50005 machine_mode wmode = GET_MODE (dest);
50006 rtx x;
50007 rtx orig_op1 = op1, orig_op2 = op2;
50009 if (!nonimmediate_operand (op1, mode))
50010 op1 = force_reg (mode, op1);
50011 if (!nonimmediate_operand (op2, mode))
50012 op2 = force_reg (mode, op2);
50014 /* We only play even/odd games with vectors of SImode. */
50015 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
50017 /* If we're looking for the odd results, shift those members down to
50018 the even slots. For some cpus this is faster than a PSHUFD. */
50019 if (odd_p)
50021 /* For XOP use vpmacsdqh, but only for smult, as it is only
50022 signed. */
50023 if (TARGET_XOP && mode == V4SImode && !uns_p)
50025 x = force_reg (wmode, CONST0_RTX (wmode));
50026 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
50027 return;
50030 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
50031 if (!const_vector_equal_evenodd_p (orig_op1))
50032 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
50033 x, NULL, 1, OPTAB_DIRECT);
50034 if (!const_vector_equal_evenodd_p (orig_op2))
50035 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
50036 x, NULL, 1, OPTAB_DIRECT);
50037 op1 = gen_lowpart (mode, op1);
50038 op2 = gen_lowpart (mode, op2);
50041 if (mode == V16SImode)
50043 if (uns_p)
50044 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
50045 else
50046 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
50048 else if (mode == V8SImode)
50050 if (uns_p)
50051 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
50052 else
50053 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
50055 else if (uns_p)
50056 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
50057 else if (TARGET_SSE4_1)
50058 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
50059 else
50061 rtx s1, s2, t0, t1, t2;
50063 /* The easiest way to implement this without PMULDQ is to go through
50064 the motions as if we are performing a full 64-bit multiply, except
50065 that we need to do less shuffling of the elements. */
50067 /* Compute the sign-extension, aka highparts, of the two operands. */
50068 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
50069 op1, pc_rtx, pc_rtx);
50070 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
50071 op2, pc_rtx, pc_rtx);
50073 /* Multiply LO(A) * HI(B), and vice-versa. */
50074 t1 = gen_reg_rtx (wmode);
50075 t2 = gen_reg_rtx (wmode);
50076 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
50077 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
50079 /* Multiply LO(A) * LO(B). */
50080 t0 = gen_reg_rtx (wmode);
50081 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
50083 /* Combine and shift the highparts into place. */
50084 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
50085 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
50086 1, OPTAB_DIRECT);
50088 /* Combine high and low parts. */
50089 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
50090 return;
50092 emit_insn (x);
50095 void
50096 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
50097 bool uns_p, bool high_p)
50099 machine_mode wmode = GET_MODE (dest);
50100 machine_mode mode = GET_MODE (op1);
50101 rtx t1, t2, t3, t4, mask;
50103 switch (mode)
50105 case E_V4SImode:
50106 t1 = gen_reg_rtx (mode);
50107 t2 = gen_reg_rtx (mode);
50108 if (TARGET_XOP && !uns_p)
50110 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
50111 shuffle the elements once so that all elements are in the right
50112 place for immediate use: { A C B D }. */
50113 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
50114 const1_rtx, GEN_INT (3)));
50115 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
50116 const1_rtx, GEN_INT (3)));
50118 else
50120 /* Put the elements into place for the multiply. */
50121 ix86_expand_vec_interleave (t1, op1, op1, high_p);
50122 ix86_expand_vec_interleave (t2, op2, op2, high_p);
50123 high_p = false;
50125 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
50126 break;
50128 case E_V8SImode:
50129 /* Shuffle the elements between the lanes. After this we
50130 have { A B E F | C D G H } for each operand. */
50131 t1 = gen_reg_rtx (V4DImode);
50132 t2 = gen_reg_rtx (V4DImode);
50133 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
50134 const0_rtx, const2_rtx,
50135 const1_rtx, GEN_INT (3)));
50136 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
50137 const0_rtx, const2_rtx,
50138 const1_rtx, GEN_INT (3)));
50140 /* Shuffle the elements within the lanes. After this we
50141 have { A A B B | C C D D } or { E E F F | G G H H }. */
50142 t3 = gen_reg_rtx (V8SImode);
50143 t4 = gen_reg_rtx (V8SImode);
50144 mask = GEN_INT (high_p
50145 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
50146 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
50147 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
50148 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
50150 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
50151 break;
50153 case E_V8HImode:
50154 case E_V16HImode:
50155 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
50156 uns_p, OPTAB_DIRECT);
50157 t2 = expand_binop (mode,
50158 uns_p ? umul_highpart_optab : smul_highpart_optab,
50159 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
50160 gcc_assert (t1 && t2);
50162 t3 = gen_reg_rtx (mode);
50163 ix86_expand_vec_interleave (t3, t1, t2, high_p);
50164 emit_move_insn (dest, gen_lowpart (wmode, t3));
50165 break;
50167 case E_V16QImode:
50168 case E_V32QImode:
50169 case E_V32HImode:
50170 case E_V16SImode:
50171 case E_V64QImode:
50172 t1 = gen_reg_rtx (wmode);
50173 t2 = gen_reg_rtx (wmode);
50174 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
50175 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
50177 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
50178 break;
50180 default:
50181 gcc_unreachable ();
50185 void
50186 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
50188 rtx res_1, res_2, res_3, res_4;
50190 res_1 = gen_reg_rtx (V4SImode);
50191 res_2 = gen_reg_rtx (V4SImode);
50192 res_3 = gen_reg_rtx (V2DImode);
50193 res_4 = gen_reg_rtx (V2DImode);
50194 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
50195 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
50197 /* Move the results in element 2 down to element 1; we don't care
50198 what goes in elements 2 and 3. Then we can merge the parts
50199 back together with an interleave.
50201 Note that two other sequences were tried:
50202 (1) Use interleaves at the start instead of psrldq, which allows
50203 us to use a single shufps to merge things back at the end.
50204 (2) Use shufps here to combine the two vectors, then pshufd to
50205 put the elements in the correct order.
50206 In both cases the cost of the reformatting stall was too high
50207 and the overall sequence slower. */
50209 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
50210 const0_rtx, const2_rtx,
50211 const0_rtx, const0_rtx));
50212 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
50213 const0_rtx, const2_rtx,
50214 const0_rtx, const0_rtx));
50215 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
50217 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
50220 void
50221 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
50223 machine_mode mode = GET_MODE (op0);
50224 rtx t1, t2, t3, t4, t5, t6;
50226 if (TARGET_AVX512DQ && mode == V8DImode)
50227 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
50228 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
50229 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
50230 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
50231 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
50232 else if (TARGET_XOP && mode == V2DImode)
50234 /* op1: A,B,C,D, op2: E,F,G,H */
50235 op1 = gen_lowpart (V4SImode, op1);
50236 op2 = gen_lowpart (V4SImode, op2);
50238 t1 = gen_reg_rtx (V4SImode);
50239 t2 = gen_reg_rtx (V4SImode);
50240 t3 = gen_reg_rtx (V2DImode);
50241 t4 = gen_reg_rtx (V2DImode);
50243 /* t1: B,A,D,C */
50244 emit_insn (gen_sse2_pshufd_1 (t1, op1,
50245 GEN_INT (1),
50246 GEN_INT (0),
50247 GEN_INT (3),
50248 GEN_INT (2)));
50250 /* t2: (B*E),(A*F),(D*G),(C*H) */
50251 emit_insn (gen_mulv4si3 (t2, t1, op2));
50253 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
50254 emit_insn (gen_xop_phadddq (t3, t2));
50256 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
50257 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
50259 /* Multiply lower parts and add all */
50260 t5 = gen_reg_rtx (V2DImode);
50261 emit_insn (gen_vec_widen_umult_even_v4si (t5,
50262 gen_lowpart (V4SImode, op1),
50263 gen_lowpart (V4SImode, op2)));
50264 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
50267 else
50269 machine_mode nmode;
50270 rtx (*umul) (rtx, rtx, rtx);
50272 if (mode == V2DImode)
50274 umul = gen_vec_widen_umult_even_v4si;
50275 nmode = V4SImode;
50277 else if (mode == V4DImode)
50279 umul = gen_vec_widen_umult_even_v8si;
50280 nmode = V8SImode;
50282 else if (mode == V8DImode)
50284 umul = gen_vec_widen_umult_even_v16si;
50285 nmode = V16SImode;
50287 else
50288 gcc_unreachable ();
50291 /* Multiply low parts. */
50292 t1 = gen_reg_rtx (mode);
50293 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
50295 /* Shift input vectors right 32 bits so we can multiply high parts. */
50296 t6 = GEN_INT (32);
50297 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
50298 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
50300 /* Multiply high parts by low parts. */
50301 t4 = gen_reg_rtx (mode);
50302 t5 = gen_reg_rtx (mode);
50303 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
50304 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
50306 /* Combine and shift the highparts back. */
50307 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
50308 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
50310 /* Combine high and low parts. */
50311 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
50314 set_unique_reg_note (get_last_insn (), REG_EQUAL,
50315 gen_rtx_MULT (mode, op1, op2));
50318 /* Return 1 if control transfer instruction INSN
50319 should be encoded with bnd prefix.
50320 If insn is NULL then return 1 when control
50321 transfer instructions should be prefixed with
50322 bnd by default for current function. */
50324 bool
50325 ix86_bnd_prefixed_insn_p (rtx insn)
50327 /* For call insns check special flag. */
50328 if (insn && CALL_P (insn))
50330 rtx call = get_call_rtx_from (insn);
50331 if (call)
50332 return CALL_EXPR_WITH_BOUNDS_P (call);
50335 /* All other insns are prefixed only if function is instrumented. */
50336 return chkp_function_instrumented_p (current_function_decl);
50339 /* Calculate integer abs() using only SSE2 instructions. */
50341 void
50342 ix86_expand_sse2_abs (rtx target, rtx input)
50344 machine_mode mode = GET_MODE (target);
50345 rtx tmp0, tmp1, x;
50347 switch (mode)
50349 /* For 32-bit signed integer X, the best way to calculate the absolute
50350 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
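/* E.g. for X = -5: X >> 31 == -1, -1 ^ -5 == 4, and 4 - (-1) == 5.  */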
50351 case E_V4SImode:
50352 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
50353 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
50354 NULL, 0, OPTAB_DIRECT);
50355 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
50356 NULL, 0, OPTAB_DIRECT);
50357 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
50358 target, 0, OPTAB_DIRECT);
50359 break;
50361 /* For 16-bit signed integer X, the best way to calculate the absolute
50362 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
50363 case E_V8HImode:
50364 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
50366 x = expand_simple_binop (mode, SMAX, tmp0, input,
50367 target, 0, OPTAB_DIRECT);
50368 break;
50370 /* For 8-bit signed integer X, the best way to calculate the absolute
50371 value of X is min ((unsigned char) X, (unsigned char) (-X)),
50372 as SSE2 provides the PMINUB insn. */
50373 case E_V16QImode:
50374 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
50376 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
50377 target, 0, OPTAB_DIRECT);
50378 break;
50380 default:
50381 gcc_unreachable ();
50384 if (x != target)
50385 emit_move_insn (target, x);
50388 /* Expand an extract from a vector register through pextr insn.
50389 Return true if successful. */
50391 bool
50392 ix86_expand_pextr (rtx *operands)
50394 rtx dst = operands[0];
50395 rtx src = operands[1];
50397 unsigned int size = INTVAL (operands[2]);
50398 unsigned int pos = INTVAL (operands[3]);
50400 if (SUBREG_P (dst))
50402 /* Reject non-lowpart subregs. */
50403 if (SUBREG_BYTE (dst) > 0)
50404 return false;
50405 dst = SUBREG_REG (dst);
50408 if (SUBREG_P (src))
50410 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
50411 src = SUBREG_REG (src);
50414 switch (GET_MODE (src))
50416 case E_V16QImode:
50417 case E_V8HImode:
50418 case E_V4SImode:
50419 case E_V2DImode:
50420 case E_V1TImode:
50421 case E_TImode:
50423 machine_mode srcmode, dstmode;
50424 rtx d, pat;
50426 dstmode = mode_for_size (size, MODE_INT, 0);
50428 switch (dstmode)
50430 case E_QImode:
50431 if (!TARGET_SSE4_1)
50432 return false;
50433 srcmode = V16QImode;
50434 break;
50436 case E_HImode:
50437 if (!TARGET_SSE2)
50438 return false;
50439 srcmode = V8HImode;
50440 break;
50442 case E_SImode:
50443 if (!TARGET_SSE4_1)
50444 return false;
50445 srcmode = V4SImode;
50446 break;
50448 case E_DImode:
50449 gcc_assert (TARGET_64BIT);
50450 if (!TARGET_SSE4_1)
50451 return false;
50452 srcmode = V2DImode;
50453 break;
50455 default:
50456 return false;
50459 /* Reject extractions from misaligned positions. */
50460 if (pos & (size-1))
50461 return false;
50463 if (GET_MODE (dst) == dstmode)
50464 d = dst;
50465 else
50466 d = gen_reg_rtx (dstmode);
50468 /* Construct insn pattern. */
50469 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
50470 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
50472 /* Let the rtl optimizers know about the zero extension performed. */
50473 if (dstmode == QImode || dstmode == HImode)
50475 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
50476 d = gen_lowpart (SImode, d);
50479 emit_insn (gen_rtx_SET (d, pat));
50481 if (d != dst)
50482 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
50483 return true;
50486 default:
50487 return false;
50491 /* Expand an insert into a vector register through pinsr insn.
50492 Return true if successful. */
50494 bool
50495 ix86_expand_pinsr (rtx *operands)
50497 rtx dst = operands[0];
50498 rtx src = operands[3];
50500 unsigned int size = INTVAL (operands[1]);
50501 unsigned int pos = INTVAL (operands[2]);
50503 if (SUBREG_P (dst))
50505 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
50506 dst = SUBREG_REG (dst);
50509 switch (GET_MODE (dst))
50511 case E_V16QImode:
50512 case E_V8HImode:
50513 case E_V4SImode:
50514 case E_V2DImode:
50515 case E_V1TImode:
50516 case E_TImode:
50518 machine_mode srcmode, dstmode;
50519 rtx (*pinsr)(rtx, rtx, rtx, rtx);
50520 rtx d;
50522 srcmode = mode_for_size (size, MODE_INT, 0);
50524 switch (srcmode)
50526 case E_QImode:
50527 if (!TARGET_SSE4_1)
50528 return false;
50529 dstmode = V16QImode;
50530 pinsr = gen_sse4_1_pinsrb;
50531 break;
50533 case E_HImode:
50534 if (!TARGET_SSE2)
50535 return false;
50536 dstmode = V8HImode;
50537 pinsr = gen_sse2_pinsrw;
50538 break;
50540 case E_SImode:
50541 if (!TARGET_SSE4_1)
50542 return false;
50543 dstmode = V4SImode;
50544 pinsr = gen_sse4_1_pinsrd;
50545 break;
50547 case E_DImode:
50548 gcc_assert (TARGET_64BIT);
50549 if (!TARGET_SSE4_1)
50550 return false;
50551 dstmode = V2DImode;
50552 pinsr = gen_sse4_1_pinsrq;
50553 break;
50555 default:
50556 return false;
50559 /* Reject insertions to misaligned positions. */
50560 if (pos & (size-1))
50561 return false;
50563 if (SUBREG_P (src))
50565 unsigned int srcpos = SUBREG_BYTE (src);
50567 if (srcpos > 0)
50569 rtx extr_ops[4];
50571 extr_ops[0] = gen_reg_rtx (srcmode);
50572 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
50573 extr_ops[2] = GEN_INT (size);
50574 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
50576 if (!ix86_expand_pextr (extr_ops))
50577 return false;
50579 src = extr_ops[0];
50581 else
50582 src = gen_lowpart (srcmode, SUBREG_REG (src));
50585 if (GET_MODE (dst) == dstmode)
50586 d = dst;
50587 else
50588 d = gen_reg_rtx (dstmode);
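/* The pinsr patterns encode the insert position as a one-element
   vec_merge mask, hence the 1 << (pos / size) immediate below.  */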
50590 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
50591 gen_lowpart (srcmode, src),
50592 GEN_INT (1 << (pos / size))));
50593 if (d != dst)
50594 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
50595 return true;
50598 default:
50599 return false;
50603 /* This function returns the calling-ABI-specific va_list type node.
50604 It returns the FNDECL specific va_list type. */
50606 static tree
50607 ix86_fn_abi_va_list (tree fndecl)
50609 if (!TARGET_64BIT)
50610 return va_list_type_node;
50611 gcc_assert (fndecl != NULL_TREE);
50613 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
50614 return ms_va_list_type_node;
50615 else
50616 return sysv_va_list_type_node;
50619 /* Returns the canonical va_list type specified by TYPE. If there
50620 is no valid TYPE provided, it returns NULL_TREE. */
50622 static tree
50623 ix86_canonical_va_list_type (tree type)
50625 if (TARGET_64BIT)
50627 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
50628 return ms_va_list_type_node;
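/* The 64-bit SysV va_list is an array of one element, so
   array_type_nelts (the highest valid index) is zero for it; the
   POINTER_TYPE_P test catches the decayed pointer form.  */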
50630 if ((TREE_CODE (type) == ARRAY_TYPE
50631 && integer_zerop (array_type_nelts (type)))
50632 || POINTER_TYPE_P (type))
50634 tree elem_type = TREE_TYPE (type);
50635 if (TREE_CODE (elem_type) == RECORD_TYPE
50636 && lookup_attribute ("sysv_abi va_list",
50637 TYPE_ATTRIBUTES (elem_type)))
50638 return sysv_va_list_type_node;
50641 return NULL_TREE;
50644 return std_canonical_va_list_type (type);
50647 /* Iterate through the target-specific builtin types for va_list.
50648 IDX denotes the iterator, *PTREE is set to the result type of
50649 the va_list builtin, and *PNAME to its internal type.
50650 Returns zero if there is no element for this index, otherwise
50651 IDX should be increased upon the next call.
50652 Note, do not iterate a base builtin's name like __builtin_va_list.
50653 Used from c_common_nodes_and_builtins. */
50655 static int
50656 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
50658 if (TARGET_64BIT)
50660 switch (idx)
50662 default:
50663 break;
50665 case 0:
50666 *ptree = ms_va_list_type_node;
50667 *pname = "__builtin_ms_va_list";
50668 return 1;
50670 case 1:
50671 *ptree = sysv_va_list_type_node;
50672 *pname = "__builtin_sysv_va_list";
50673 return 1;
50677 return 0;
50680 #undef TARGET_SCHED_DISPATCH
50681 #define TARGET_SCHED_DISPATCH has_dispatch
50682 #undef TARGET_SCHED_DISPATCH_DO
50683 #define TARGET_SCHED_DISPATCH_DO do_dispatch
50684 #undef TARGET_SCHED_REASSOCIATION_WIDTH
50685 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
50686 #undef TARGET_SCHED_REORDER
50687 #define TARGET_SCHED_REORDER ix86_sched_reorder
50688 #undef TARGET_SCHED_ADJUST_PRIORITY
50689 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
50690 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
50691 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
50692 ix86_dependencies_evaluation_hook
50694 /* The size of the dispatch window is the total number of bytes of
50695 object code allowed in a window. */
50696 #define DISPATCH_WINDOW_SIZE 16
50698 /* Number of dispatch windows considered for scheduling. */
50699 #define MAX_DISPATCH_WINDOWS 3
50701 /* Maximum number of instructions in a window. */
50702 #define MAX_INSN 4
50704 /* Maximum number of immediate operands in a window. */
50705 #define MAX_IMM 4
50707 /* Maximum number of immediate bits allowed in a window. */
50708 #define MAX_IMM_SIZE 128
50710 /* Maximum number of 32 bit immediates allowed in a window. */
50711 #define MAX_IMM_32 4
50713 /* Maximum number of 64 bit immediates allowed in a window. */
50714 #define MAX_IMM_64 2
50716 /* Maximum total of loads or prefetches allowed in a window. */
50717 #define MAX_LOAD 2
50719 /* Maximum total of stores allowed in a window. */
50720 #define MAX_STORE 1
50722 #undef BIG
50723 #define BIG 100
50726 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
50727 enum dispatch_group {
50728 disp_no_group = 0,
50729 disp_load,
50730 disp_store,
50731 disp_load_store,
50732 disp_prefetch,
50733 disp_imm,
50734 disp_imm_32,
50735 disp_imm_64,
50736 disp_branch,
50737 disp_cmp,
50738 disp_jcc,
50739 disp_last
50742 /* Number of allowable groups in a dispatch window. It is an array
50743 indexed by dispatch_group enum. 100 is used as a big number,
50744 because the number of these kinds of operations does not have any
50745 effect in a dispatch window, but we need them for other reasons in
50746 the table. */
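/* The entries below follow the order of enum dispatch_group:
   no_group 0, load 2, store 1, load_store 1, prefetch 2, imm 4,
   imm_32 4, imm_64 2, branch 1, cmp BIG, jcc BIG.  */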
50747 static unsigned int num_allowable_groups[disp_last] = {
50748 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
50751 char group_name[disp_last + 1][16] = {
50752 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
50753 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
50754 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
50757 /* Instruction path. */
50758 enum insn_path {
50759 no_path = 0,
50760 path_single, /* Single micro op. */
50761 path_double, /* Double micro op. */
50762 path_multi, /* Instructions with more than 2 micro ops. */
50763 last_path
50766 /* sched_insn_info defines a window to the instructions scheduled in
50767 the basic block. It contains a pointer to the insn_info table and
50768 the instruction scheduled.
50770 Windows are allocated for each basic block and are linked
50771 together. */
50772 typedef struct sched_insn_info_s {
50773 rtx insn;
50774 enum dispatch_group group;
50775 enum insn_path path;
50776 int byte_len;
50777 int imm_bytes;
50778 } sched_insn_info;
50780 /* Linked list of dispatch windows. This is a two way list of
50781 dispatch windows of a basic block. It contains information about
50782 the number of uops in the window and the total number of
50783 instructions and of bytes in the object code for this dispatch
50784 window. */
50785 typedef struct dispatch_windows_s {
50786 int num_insn; /* Number of insns in the window. */
50787 int num_uops; /* Number of uops in the window. */
50788 int window_size; /* Number of bytes in the window. */
50789 int window_num; /* Window number, either 0 or 1. */
50790 int num_imm; /* Number of immediates in an insn. */
50791 int num_imm_32; /* Number of 32 bit immediates in an insn. */
50792 int num_imm_64; /* Number of 64 bit immediates in an insn. */
50793 int imm_size; /* Total immediates in the window. */
50794 int num_loads; /* Total memory loads in the window. */
50795 int num_stores; /* Total memory stores in the window. */
50796 int violation; /* Violation exists in window. */
50797 sched_insn_info *window; /* Pointer to the window. */
50798 struct dispatch_windows_s *next;
50799 struct dispatch_windows_s *prev;
50800 } dispatch_windows;
50802 /* Immediate values used in an insn. */
50803 typedef struct imm_info_s
50805 int imm;
50806 int imm32;
50807 int imm64;
50808 } imm_info;
50810 static dispatch_windows *dispatch_window_list;
50811 static dispatch_windows *dispatch_window_list1;
50813 /* Get dispatch group of insn. */
50815 static enum dispatch_group
50816 get_mem_group (rtx_insn *insn)
50818 enum attr_memory memory;
50820 if (INSN_CODE (insn) < 0)
50821 return disp_no_group;
50822 memory = get_attr_memory (insn);
50823 if (memory == MEMORY_STORE)
50824 return disp_store;
50826 if (memory == MEMORY_LOAD)
50827 return disp_load;
50829 if (memory == MEMORY_BOTH)
50830 return disp_load_store;
50832 return disp_no_group;
50835 /* Return true if insn is a compare instruction. */
50837 static bool
50838 is_cmp (rtx_insn *insn)
50840 enum attr_type type;
50842 type = get_attr_type (insn);
50843 return (type == TYPE_TEST
50844 || type == TYPE_ICMP
50845 || type == TYPE_FCMP
50846 || GET_CODE (PATTERN (insn)) == COMPARE);
50849 /* Return true if a dispatch violation was encountered. */
50851 static bool
50852 dispatch_violation (void)
50854 if (dispatch_window_list->next)
50855 return dispatch_window_list->next->violation;
50856 return dispatch_window_list->violation;
50859 /* Return true if insn is a branch instruction. */
50861 static bool
50862 is_branch (rtx_insn *insn)
50864 return (CALL_P (insn) || JUMP_P (insn));
50867 /* Return true if insn is a prefetch instruction. */
50869 static bool
50870 is_prefetch (rtx_insn *insn)
50872 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
50875 /* This function initializes a dispatch window and the list container holding a
50876 pointer to the window. */
50878 static void
50879 init_window (int window_num)
50881 int i;
50882 dispatch_windows *new_list;
50884 if (window_num == 0)
50885 new_list = dispatch_window_list;
50886 else
50887 new_list = dispatch_window_list1;
50889 new_list->num_insn = 0;
50890 new_list->num_uops = 0;
50891 new_list->window_size = 0;
50892 new_list->next = NULL;
50893 new_list->prev = NULL;
50894 new_list->window_num = window_num;
50895 new_list->num_imm = 0;
50896 new_list->num_imm_32 = 0;
50897 new_list->num_imm_64 = 0;
50898 new_list->imm_size = 0;
50899 new_list->num_loads = 0;
50900 new_list->num_stores = 0;
50901 new_list->violation = false;
50903 for (i = 0; i < MAX_INSN; i++)
50905 new_list->window[i].insn = NULL;
50906 new_list->window[i].group = disp_no_group;
50907 new_list->window[i].path = no_path;
50908 new_list->window[i].byte_len = 0;
50909 new_list->window[i].imm_bytes = 0;
50911 return;
50914 /* This function allocates and initializes a dispatch window and the
50915 list container holding a pointer to the window. */
50917 static dispatch_windows *
50918 allocate_window (void)
50920 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
50921 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
50923 return new_list;
50926 /* This routine initializes the dispatch scheduling information. It
50927 initiates building dispatch scheduler tables and constructs the
50928 first dispatch window. */
50930 static void
50931 init_dispatch_sched (void)
50933 /* Allocate a dispatch list and a window. */
50934 dispatch_window_list = allocate_window ();
50935 dispatch_window_list1 = allocate_window ();
50936 init_window (0);
50937 init_window (1);
50940 /* This function returns true if a branch is detected. End of a basic block
50941 does not have to be a branch, but here we assume only branches end a
50942 window. */
50944 static bool
50945 is_end_basic_block (enum dispatch_group group)
50947 return group == disp_branch;
50950 /* This function is called when the end of a window processing is reached. */
50952 static void
50953 process_end_window (void)
50955 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
50956 if (dispatch_window_list->next)
50958 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
50959 gcc_assert (dispatch_window_list->window_size
50960 + dispatch_window_list1->window_size <= 48);
50961 init_window (1);
50963 init_window (0);
50966 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
50967 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
50968 for 48 bytes of instructions. Note that these windows are not dispatch
50969 windows whose sizes are DISPATCH_WINDOW_SIZE. */
50971 static dispatch_windows *
50972 allocate_next_window (int window_num)
50974 if (window_num == 0)
50976 if (dispatch_window_list->next)
50977 init_window (1);
50978 init_window (0);
50979 return dispatch_window_list;
50982 dispatch_window_list->next = dispatch_window_list1;
50983 dispatch_window_list1->prev = dispatch_window_list;
50985 return dispatch_window_list1;
50988 /* Compute number of immediate operands of an instruction. */
50990 static void
50991 find_constant (rtx in_rtx, imm_info *imm_values)
50993 if (INSN_P (in_rtx))
50994 in_rtx = PATTERN (in_rtx);
50995 subrtx_iterator::array_type array;
50996 FOR_EACH_SUBRTX (iter, array, in_rtx, ALL)
50997 if (const_rtx x = *iter)
50998 switch (GET_CODE (x))
51000 case CONST:
51001 case SYMBOL_REF:
51002 case CONST_INT:
51003 (imm_values->imm)++;
51004 if (x86_64_immediate_operand (CONST_CAST_RTX (x), SImode))
51005 (imm_values->imm32)++;
51006 else
51007 (imm_values->imm64)++;
51008 break;
51010 case CONST_DOUBLE:
51011 case CONST_WIDE_INT:
51012 (imm_values->imm)++;
51013 (imm_values->imm64)++;
51014 break;
51016 case CODE_LABEL:
51017 if (LABEL_KIND (x) == LABEL_NORMAL)
51019 (imm_values->imm)++;
51020 (imm_values->imm32)++;
51022 break;
51024 default:
51025 break;
51029 /* Return total size of immediate operands of an instruction along with number
51030 of corresponding immediate operands. It initializes its parameters to zero
51031 before calling FIND_CONSTANT.
51032 INSN is the input instruction. IMM is the total of immediates.
51033 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
51034 bit immediates. */
51036 static int
51037 get_num_immediates (rtx_insn *insn, int *imm, int *imm32, int *imm64)
51039 imm_info imm_values = {0, 0, 0};
51041 find_constant (insn, &imm_values);
51042 *imm = imm_values.imm;
51043 *imm32 = imm_values.imm32;
51044 *imm64 = imm_values.imm64;
51045 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
51048 /* This function indicates whether an instruction has an immediate
51049 operand. */
51051 static bool
51052 has_immediate (rtx_insn *insn)
51054 int num_imm_operand;
51055 int num_imm32_operand;
51056 int num_imm64_operand;
51058 if (insn)
51059 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
51060 &num_imm64_operand);
51061 return false;
51064 /* Return single or double path for instructions. */
51066 static enum insn_path
51067 get_insn_path (rtx_insn *insn)
51069 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
51071 if ((int)path == 0)
51072 return path_single;
51074 if ((int)path == 1)
51075 return path_double;
51077 return path_multi;
51080 /* Return insn dispatch group. */
51082 static enum dispatch_group
51083 get_insn_group (rtx_insn *insn)
51085 enum dispatch_group group = get_mem_group (insn);
51086 if (group)
51087 return group;
51089 if (is_branch (insn))
51090 return disp_branch;
51092 if (is_cmp (insn))
51093 return disp_cmp;
51095 if (has_immediate (insn))
51096 return disp_imm;
51098 if (is_prefetch (insn))
51099 return disp_prefetch;
51101 return disp_no_group;
51104 /* Count number of GROUP restricted instructions in a dispatch
51105 window WINDOW_LIST. */
51107 static int
51108 count_num_restricted (rtx_insn *insn, dispatch_windows *window_list)
51110 enum dispatch_group group = get_insn_group (insn);
51111 int imm_size;
51112 int num_imm_operand;
51113 int num_imm32_operand;
51114 int num_imm64_operand;
51116 if (group == disp_no_group)
51117 return 0;
51119 if (group == disp_imm)
51121 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
51122 &num_imm64_operand);
51123 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
51124 || num_imm_operand + window_list->num_imm > MAX_IMM
51125 || (num_imm32_operand > 0
51126 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
51127 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
51128 || (num_imm64_operand > 0
51129 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
51130 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
51131 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
51132 && num_imm64_operand > 0
51133 && ((window_list->num_imm_64 > 0
51134 && window_list->num_insn >= 2)
51135 || window_list->num_insn >= 3)))
51136 return BIG;
51138 return 1;
51141 if ((group == disp_load_store
51142 && (window_list->num_loads >= MAX_LOAD
51143 || window_list->num_stores >= MAX_STORE))
51144 || ((group == disp_load
51145 || group == disp_prefetch)
51146 && window_list->num_loads >= MAX_LOAD)
51147 || (group == disp_store
51148 && window_list->num_stores >= MAX_STORE))
51149 return BIG;
51151 return 1;
51154 /* This function returns true if insn satisfies dispatch rules on the
51155 last window scheduled. */
51157 static bool
51158 fits_dispatch_window (rtx_insn *insn)
51160 dispatch_windows *window_list = dispatch_window_list;
51161 dispatch_windows *window_list_next = dispatch_window_list->next;
51162 unsigned int num_restrict;
51163 enum dispatch_group group = get_insn_group (insn);
51164 enum insn_path path = get_insn_path (insn);
51165 int sum;
51167 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
51168 instructions should be given the lowest priority in the
51169 scheduling process in the Haifa scheduler to make sure they will be
51170 scheduled in the same dispatch window as the reference to them. */
51171 if (group == disp_jcc || group == disp_cmp)
51172 return false;
51174 /* Check nonrestricted. */
51175 if (group == disp_no_group || group == disp_branch)
51176 return true;
51178 /* Get last dispatch window. */
51179 if (window_list_next)
51180 window_list = window_list_next;
51182 if (window_list->window_num == 1)
51184 sum = window_list->prev->window_size + window_list->window_size;
51186 if (sum == 32
51187 || (min_insn_size (insn) + sum) >= 48)
51188 /* Window 1 is full. Go for next window. */
51189 return true;
51192 num_restrict = count_num_restricted (insn, window_list);
51194 if (num_restrict > num_allowable_groups[group])
51195 return false;
51197 /* See if it fits in the first window. */
51198 if (window_list->window_num == 0)
51200 /* The first widow should have only single and double path
51201 uops. */
51202 if (path == path_double
51203 && (window_list->num_uops + 2) > MAX_INSN)
51204 return false;
51205 else if (path != path_single)
51206 return false;
51208 return true;
51211 /* Add an instruction INSN with NUM_UOPS micro-operations to the
51212 dispatch window WINDOW_LIST. */
51214 static void
51215 add_insn_window (rtx_insn *insn, dispatch_windows *window_list, int num_uops)
51217 int byte_len = min_insn_size (insn);
51218 int num_insn = window_list->num_insn;
51219 int imm_size;
51220 sched_insn_info *window = window_list->window;
51221 enum dispatch_group group = get_insn_group (insn);
51222 enum insn_path path = get_insn_path (insn);
51223 int num_imm_operand;
51224 int num_imm32_operand;
51225 int num_imm64_operand;
51227 if (!window_list->violation && group != disp_cmp
51228 && !fits_dispatch_window (insn))
51229 window_list->violation = true;
51231 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
51232 &num_imm64_operand);
51234 /* Initialize window with new instruction. */
51235 window[num_insn].insn = insn;
51236 window[num_insn].byte_len = byte_len;
51237 window[num_insn].group = group;
51238 window[num_insn].path = path;
51239 window[num_insn].imm_bytes = imm_size;
51241 window_list->window_size += byte_len;
51242 window_list->num_insn = num_insn + 1;
51243 window_list->num_uops = window_list->num_uops + num_uops;
51244 window_list->imm_size += imm_size;
51245 window_list->num_imm += num_imm_operand;
51246 window_list->num_imm_32 += num_imm32_operand;
51247 window_list->num_imm_64 += num_imm64_operand;
51249 if (group == disp_store)
51250 window_list->num_stores += 1;
51251 else if (group == disp_load
51252 || group == disp_prefetch)
51253 window_list->num_loads += 1;
51254 else if (group == disp_load_store)
51256 window_list->num_stores += 1;
51257 window_list->num_loads += 1;
51261 /* Adds a scheduled instruction, INSN, to the current dispatch window.
51262 If the total bytes of instructions or the number of instructions in
51263    the window exceed the allowable limit, it allocates a new window.  */
51265 static void
51266 add_to_dispatch_window (rtx_insn *insn)
51268 int byte_len;
51269 dispatch_windows *window_list;
51270 dispatch_windows *next_list;
51271 dispatch_windows *window0_list;
51272 enum insn_path path;
51273 enum dispatch_group insn_group;
51274 bool insn_fits;
51275 int num_insn;
51276 int num_uops;
51277 int window_num;
51278 int insn_num_uops;
51279 int sum;
51281 if (INSN_CODE (insn) < 0)
51282 return;
51284 byte_len = min_insn_size (insn);
51285 window_list = dispatch_window_list;
51286 next_list = window_list->next;
51287 path = get_insn_path (insn);
51288 insn_group = get_insn_group (insn);
51290 /* Get the last dispatch window. */
51291 if (next_list)
51292 window_list = dispatch_window_list->next;
51294 if (path == path_single)
51295 insn_num_uops = 1;
51296 else if (path == path_double)
51297 insn_num_uops = 2;
51298 else
51299 insn_num_uops = (int) path;
51301    /* If the current window is full, get a new window.
51302       Window number zero is full if MAX_INSN uops are scheduled in it.
51303       Window number one is full if window zero's bytes plus window
51304       one's bytes equal 32, or if adding the new instruction's bytes
51305       to that total makes it 48 or more, or if it already has MAX_INSN
51306       instructions in it.  */
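  /* Editorial illustration (not part of the original source), based on the
     checks below: if window 0 already holds 20 bytes and window 1 holds 12,
     their sum is 32 and window 1 is treated as full; likewise a sum of 40
     plus an 8-byte instruction reaches the 48-byte cap.  */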
51307 num_insn = window_list->num_insn;
51308 num_uops = window_list->num_uops;
51309 window_num = window_list->window_num;
51310 insn_fits = fits_dispatch_window (insn);
51312 if (num_insn >= MAX_INSN
51313 || num_uops + insn_num_uops > MAX_INSN
51314 || !(insn_fits))
51316 window_num = ~window_num & 1;
51317 window_list = allocate_next_window (window_num);
51320 if (window_num == 0)
51322 add_insn_window (insn, window_list, insn_num_uops);
51323 if (window_list->num_insn >= MAX_INSN
51324 && insn_group == disp_branch)
51326 process_end_window ();
51327 return;
51330 else if (window_num == 1)
51332 window0_list = window_list->prev;
51333 sum = window0_list->window_size + window_list->window_size;
51334 if (sum == 32
51335 || (byte_len + sum) >= 48)
51337 process_end_window ();
51338 window_list = dispatch_window_list;
51341 add_insn_window (insn, window_list, insn_num_uops);
51343 else
51344 gcc_unreachable ();
51346 if (is_end_basic_block (insn_group))
51348      /* End of basic block is reached; do end-basic-block processing.  */
51349 process_end_window ();
51350 return;
51354 /* Print the dispatch window, WINDOW_NUM, to FILE. */
51356 DEBUG_FUNCTION static void
51357 debug_dispatch_window_file (FILE *file, int window_num)
51359 dispatch_windows *list;
51360 int i;
51362 if (window_num == 0)
51363 list = dispatch_window_list;
51364 else
51365 list = dispatch_window_list1;
51367 fprintf (file, "Window #%d:\n", list->window_num);
51368 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
51369 list->num_insn, list->num_uops, list->window_size);
51370 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
51371 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
51373 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
51374 list->num_stores);
51375 fprintf (file, " insn info:\n");
51377 for (i = 0; i < MAX_INSN; i++)
51379 if (!list->window[i].insn)
51380 break;
51381 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
51382 i, group_name[list->window[i].group],
51383 i, (void *)list->window[i].insn,
51384 i, list->window[i].path,
51385 i, list->window[i].byte_len,
51386 i, list->window[i].imm_bytes);
51390 /* Print to stdout a dispatch window. */
51392 DEBUG_FUNCTION void
51393 debug_dispatch_window (int window_num)
51395 debug_dispatch_window_file (stdout, window_num);
51398 /* Print INSN dispatch information to FILE. */
51400 DEBUG_FUNCTION static void
51401 debug_insn_dispatch_info_file (FILE *file, rtx_insn *insn)
51403 int byte_len;
51404 enum insn_path path;
51405 enum dispatch_group group;
51406 int imm_size;
51407 int num_imm_operand;
51408 int num_imm32_operand;
51409 int num_imm64_operand;
51411 if (INSN_CODE (insn) < 0)
51412 return;
51414 byte_len = min_insn_size (insn);
51415 path = get_insn_path (insn);
51416 group = get_insn_group (insn);
51417 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
51418 &num_imm64_operand);
51420 fprintf (file, " insn info:\n");
51421 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
51422 group_name[group], path, byte_len);
51423 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
51424 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
51427 /* Print to STDERR the status of the ready list with respect to
51428 dispatch windows. */
51430 DEBUG_FUNCTION void
51431 debug_ready_dispatch (void)
51433 int i;
51434 int no_ready = number_in_ready ();
51436 fprintf (stdout, "Number of ready: %d\n", no_ready);
51438 for (i = 0; i < no_ready; i++)
51439 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
51442 /* This routine is the driver of the dispatch scheduler. */
51444 static void
51445 do_dispatch (rtx_insn *insn, int mode)
51447 if (mode == DISPATCH_INIT)
51448 init_dispatch_sched ();
51449 else if (mode == ADD_TO_DISPATCH_WINDOW)
51450 add_to_dispatch_window (insn);
51453 /* Return TRUE if Dispatch Scheduling is supported. */
51455 static bool
51456 has_dispatch (rtx_insn *insn, int action)
51458 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3
51459 || TARGET_BDVER4 || TARGET_ZNVER1) && flag_dispatch_scheduler)
51460 switch (action)
51462 default:
51463 return false;
51465 case IS_DISPATCH_ON:
51466 return true;
51468 case IS_CMP:
51469 return is_cmp (insn);
51471 case DISPATCH_VIOLATION:
51472 return dispatch_violation ();
51474 case FITS_DISPATCH_WINDOW:
51475 return fits_dispatch_window (insn);
51478 return false;
51481 /* Implementation of reassociation_width target hook used by
51482 reassoc phase to identify parallelism level in reassociated
51483    tree.  The statement's tree_code is passed in OPC.  The argument's type
51484 is passed in MODE.
51486 Currently parallel reassociation is enabled for Atom
51487 processors only and we set reassociation width to be 2
51488 because Atom may issue up to 2 instructions per cycle.
51490    The return value should be adjusted if parallel reassociation is
51491 enabled for other processors. */
51493 static int
51494 ix86_reassociation_width (unsigned int, machine_mode mode)
51496 /* Vector part. */
51497 if (VECTOR_MODE_P (mode))
51499 if (TARGET_VECTOR_PARALLEL_EXECUTION)
51500 return 2;
51501 else
51502 return 1;
51505 /* Scalar part. */
51506 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
51507 return 2;
51508 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
51509 return ((TARGET_64BIT && ix86_tune == PROCESSOR_HASWELL)? 4 : 2);
51510 else
51511 return 1;
51514 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
51515 place emms and femms instructions. */
51517 static machine_mode
51518 ix86_preferred_simd_mode (machine_mode mode)
51520 if (!TARGET_SSE)
51521 return word_mode;
51523 switch (mode)
51525 case E_QImode:
51526 return TARGET_AVX512BW ? V64QImode :
51527 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
51528 case E_HImode:
51529 return TARGET_AVX512BW ? V32HImode :
51530 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
51531 case E_SImode:
51532 return TARGET_AVX512F ? V16SImode :
51533 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
51534 case E_DImode:
51535 return TARGET_AVX512F ? V8DImode :
51536 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
51538 case E_SFmode:
51539 if (TARGET_AVX512F)
51540 return V16SFmode;
51541 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
51542 return V8SFmode;
51543 else
51544 return V4SFmode;
51546 case E_DFmode:
51547 if (TARGET_AVX512F)
51548 return V8DFmode;
51549 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
51550 return V4DFmode;
51551 else if (TARGET_SSE2)
51552 return V2DFmode;
51553 /* FALLTHRU */
51555 default:
51556 return word_mode;
51560 /* If AVX is enabled then try vectorizing with both 256-bit and 128-bit
51561    vectors.  If AVX512F is enabled then try vectorizing with 512-bit,
51562    256-bit and 128-bit vectors.  */
51564 static unsigned int
51565 ix86_autovectorize_vector_sizes (void)
51567 return TARGET_AVX512F ? 64 | 32 | 16 :
51568 (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
51571 /* Implementation of targetm.vectorize.get_mask_mode.  */
51573 static machine_mode
51574 ix86_get_mask_mode (unsigned nunits, unsigned vector_size)
51576 unsigned elem_size = vector_size / nunits;
51578 /* Scalar mask case. */
51579 if ((TARGET_AVX512F && vector_size == 64)
51580 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
51582 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
51583 return smallest_mode_for_size (nunits, MODE_INT);
51586 machine_mode elem_mode
51587 = smallest_mode_for_size (elem_size * BITS_PER_UNIT, MODE_INT);
51589 gcc_assert (elem_size * nunits == vector_size);
51591 return mode_for_vector (elem_mode, nunits);
51596 /* Return class of registers which could be used for pseudo of MODE
51597 and of class RCLASS for spilling instead of memory. Return NO_REGS
51598 if it is not possible or non-profitable. */
51600 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
51602 static reg_class_t
51603 ix86_spill_class (reg_class_t rclass, machine_mode mode)
51605 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
51606 && TARGET_SSE2
51607 && TARGET_INTER_UNIT_MOVES_TO_VEC
51608 && TARGET_INTER_UNIT_MOVES_FROM_VEC
51609 && (mode == SImode || (TARGET_64BIT && mode == DImode))
51610 && INTEGER_CLASS_P (rclass))
51611 return ALL_SSE_REGS;
51612 return NO_REGS;
51615 /* Implement TARGET_MAX_NOCE_IFCVT_SEQ_COST. Like the default implementation,
51616 but returns a lower bound. */
51618 static unsigned int
51619 ix86_max_noce_ifcvt_seq_cost (edge e)
51621 bool predictable_p = predictable_edge_p (e);
51623 enum compiler_param param
51624 = (predictable_p
51625 ? PARAM_MAX_RTL_IF_CONVERSION_PREDICTABLE_COST
51626 : PARAM_MAX_RTL_IF_CONVERSION_UNPREDICTABLE_COST);
51628 /* If we have a parameter set, use that, otherwise take a guess using
51629 BRANCH_COST. */
51630 if (global_options_set.x_param_values[param])
51631 return PARAM_VALUE (param);
51632 else
51633 return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
51636 /* Return true if SEQ is a good candidate as a replacement for the
51637 if-convertible sequence described in IF_INFO. */
51639 static bool
51640 ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
51642 if (TARGET_ONE_IF_CONV_INSN && if_info->speed_p)
51644 int cmov_cnt = 0;
51645 /* Punt if SEQ contains more than one CMOV or FCMOV instruction.
51646 Maybe we should allow even more conditional moves as long as they
51647 are used far enough not to stall the CPU, or also consider
51648 IF_INFO->TEST_BB succ edge probabilities. */
51649 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
51651 rtx set = single_set (insn);
51652 if (!set)
51653 continue;
51654 if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
51655 continue;
51656 rtx src = SET_SRC (set);
51657 machine_mode mode = GET_MODE (src);
51658 if (GET_MODE_CLASS (mode) != MODE_INT
51659 && GET_MODE_CLASS (mode) != MODE_FLOAT)
51660 continue;
51661 if ((!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
51662 || (!REG_P (XEXP (src, 2)) && !MEM_P (XEXP (src, 2))))
51663 continue;
51664 /* insn is CMOV or FCMOV. */
51665 if (++cmov_cnt > 1)
51666 return false;
51669 return default_noce_conversion_profitable_p (seq, if_info);
51672 /* Implement targetm.vectorize.init_cost. */
51674 static void *
51675 ix86_init_cost (struct loop *)
51677 unsigned *cost = XNEWVEC (unsigned, 3);
51678 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
51679 return cost;
51682 /* Implement targetm.vectorize.add_stmt_cost. */
51684 static unsigned
51685 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
51686 struct _stmt_vec_info *stmt_info, int misalign,
51687 enum vect_cost_model_location where)
51689 unsigned *cost = (unsigned *) data;
51690 unsigned retval = 0;
51692 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
51693 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
51695 /* Penalize DFmode vector operations for Bonnell. */
51696 if (TARGET_BONNELL && kind == vector_stmt
51697 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
51698 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
51700 /* Statements in an inner loop relative to the loop being
51701 vectorized are weighted more heavily. The value here is
51702 arbitrary and could potentially be improved with analysis. */
51703 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
51704 count *= 50; /* FIXME. */
51706 retval = (unsigned) (count * stmt_cost);
51708   /* We need to multiply all vector stmt costs by 1.7 (estimated cost)
51709      for Silvermont, as it has an out-of-order integer pipeline and can execute
51710      2 scalar instructions per tick, but has an in-order SIMD pipeline.  */
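  /* Worked example (editorial): with the 1.7 factor applied below, a vector
     statement with an integer result whose accumulated cost is 10 is
     accounted as 10 * 17 / 10 = 17.  */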
51711 if ((TARGET_SILVERMONT || TARGET_INTEL)
51712 && stmt_info && stmt_info->stmt)
51714 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
51715 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
51716 retval = (retval * 17) / 10;
51719 cost[where] += retval;
51721 return retval;
51724 /* Implement targetm.vectorize.finish_cost. */
51726 static void
51727 ix86_finish_cost (void *data, unsigned *prologue_cost,
51728 unsigned *body_cost, unsigned *epilogue_cost)
51730 unsigned *cost = (unsigned *) data;
51731 *prologue_cost = cost[vect_prologue];
51732 *body_cost = cost[vect_body];
51733 *epilogue_cost = cost[vect_epilogue];
51736 /* Implement targetm.vectorize.destroy_cost_data. */
51738 static void
51739 ix86_destroy_cost_data (void *data)
51741 free (data);
51744 /* Validate target specific memory model bits in VAL. */
51746 static unsigned HOST_WIDE_INT
51747 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
51749 enum memmodel model = memmodel_from_int (val);
51750 bool strong;
51752 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
51753 |MEMMODEL_MASK)
51754 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
51756 warning (OPT_Winvalid_memory_model,
51757 "Unknown architecture specific memory model");
51758 return MEMMODEL_SEQ_CST;
51760 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
51761 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
51763 warning (OPT_Winvalid_memory_model,
51764 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
51765 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
51767 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
51769 warning (OPT_Winvalid_memory_model,
51770 "HLE_RELEASE not used with RELEASE or stronger memory model");
51771 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
51773 return val;
51776 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
51777 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
51778 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
51779 or number of vecsize_mangle variants that should be emitted. */
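/* Editorial sketch of the usual flow (not from the original source): SIMD
   clones typically originate from functions marked with
   "#pragma omp declare simd"; for an exported function the final "else"
   branch below requests all four vecsize_mangle variants 'b'..'e'
   (ret = 4), i.e. separate SSE2, AVX, AVX2 and AVX-512 clones.  */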
51781 static int
51782 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
51783 struct cgraph_simd_clone *clonei,
51784 tree base_type, int num)
51786 int ret = 1;
51788 if (clonei->simdlen
51789 && (clonei->simdlen < 2
51790 || clonei->simdlen > 1024
51791 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
51793 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51794 "unsupported simdlen %d", clonei->simdlen);
51795 return 0;
51798 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
51799 if (TREE_CODE (ret_type) != VOID_TYPE)
51800 switch (TYPE_MODE (ret_type))
51802 case E_QImode:
51803 case E_HImode:
51804 case E_SImode:
51805 case E_DImode:
51806 case E_SFmode:
51807 case E_DFmode:
51808 /* case E_SCmode: */
51809 /* case E_DCmode: */
51810 break;
51811 default:
51812 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51813 "unsupported return type %qT for simd\n", ret_type);
51814 return 0;
51817 tree t;
51818 int i;
51820 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
51821 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
51822 switch (TYPE_MODE (TREE_TYPE (t)))
51824 case E_QImode:
51825 case E_HImode:
51826 case E_SImode:
51827 case E_DImode:
51828 case E_SFmode:
51829 case E_DFmode:
51830 /* case E_SCmode: */
51831 /* case E_DCmode: */
51832 break;
51833 default:
51834 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51835 "unsupported argument type %qT for simd\n", TREE_TYPE (t));
51836 return 0;
51839 if (clonei->cilk_elemental)
51841       /* Parse the processor clause here.  If not present, default to 'b'.  */
51842 clonei->vecsize_mangle = 'b';
51844 else if (!TREE_PUBLIC (node->decl))
51846 /* If the function isn't exported, we can pick up just one ISA
51847 for the clones. */
51848 if (TARGET_AVX512F)
51849 clonei->vecsize_mangle = 'e';
51850 else if (TARGET_AVX2)
51851 clonei->vecsize_mangle = 'd';
51852 else if (TARGET_AVX)
51853 clonei->vecsize_mangle = 'c';
51854 else
51855 clonei->vecsize_mangle = 'b';
51856 ret = 1;
51858 else
51860 clonei->vecsize_mangle = "bcde"[num];
51861 ret = 4;
51863 clonei->mask_mode = VOIDmode;
51864 switch (clonei->vecsize_mangle)
51866 case 'b':
51867 clonei->vecsize_int = 128;
51868 clonei->vecsize_float = 128;
51869 break;
51870 case 'c':
51871 clonei->vecsize_int = 128;
51872 clonei->vecsize_float = 256;
51873 break;
51874 case 'd':
51875 clonei->vecsize_int = 256;
51876 clonei->vecsize_float = 256;
51877 break;
51878 case 'e':
51879 clonei->vecsize_int = 512;
51880 clonei->vecsize_float = 512;
51881 if (TYPE_MODE (base_type) == QImode)
51882 clonei->mask_mode = DImode;
51883 else
51884 clonei->mask_mode = SImode;
51885 break;
51887 if (clonei->simdlen == 0)
51889 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
51890 clonei->simdlen = clonei->vecsize_int;
51891 else
51892 clonei->simdlen = clonei->vecsize_float;
51893 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
51895 else if (clonei->simdlen > 16)
51897       /* For compatibility with ICC, use the same upper bounds
51898 	 for simdlen.  In particular, for CTYPE below, use the return type,
51899 	 unless the function returns void, in which case use the characteristic
51900 	 type.  If it is possible for the given SIMDLEN to pass a CTYPE value
51901 	 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
51902 	 for 64-bit code), accept that SIMDLEN, otherwise warn and don't
51903 	 emit the corresponding clone.  */
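      /* Worked example (editorial): for a clone returning double (a 64-bit
	 CTYPE) with simdlen 32 and vecsize_float 256, cnt = 64 * 32 / 256 = 8,
	 which is accepted even for 32-bit code; simdlen 128 would give
	 cnt = 32 > 16 and the clone would be rejected with the warning below.  */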
51904 tree ctype = ret_type;
51905 if (TREE_CODE (ret_type) == VOID_TYPE)
51906 ctype = base_type;
51907 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
51908 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
51909 cnt /= clonei->vecsize_int;
51910 else
51911 cnt /= clonei->vecsize_float;
51912 if (cnt > (TARGET_64BIT ? 16 : 8))
51914 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51915 "unsupported simdlen %d", clonei->simdlen);
51916 return 0;
51919 return ret;
51922 /* Add target attribute to SIMD clone NODE if needed. */
51924 static void
51925 ix86_simd_clone_adjust (struct cgraph_node *node)
51927 const char *str = NULL;
51928 gcc_assert (node->decl == cfun->decl);
51929 switch (node->simdclone->vecsize_mangle)
51931 case 'b':
51932 if (!TARGET_SSE2)
51933 str = "sse2";
51934 break;
51935 case 'c':
51936 if (!TARGET_AVX)
51937 str = "avx";
51938 break;
51939 case 'd':
51940 if (!TARGET_AVX2)
51941 str = "avx2";
51942 break;
51943 case 'e':
51944 if (!TARGET_AVX512F)
51945 str = "avx512f";
51946 break;
51947 default:
51948 gcc_unreachable ();
51950 if (str == NULL)
51951 return;
51952 push_cfun (NULL);
51953 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
51954 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
51955 gcc_assert (ok);
51956 pop_cfun ();
51957 ix86_reset_previous_fndecl ();
51958 ix86_set_current_function (node->decl);
51961 /* If SIMD clone NODE can't be used in a vectorized loop
51962    in the current function, return -1, otherwise return a badness measure
51963    for using it (0 if it is the most desirable from the vecsize_mangle
51964    point of view, 1 slightly less desirable, etc.).  */
51966 static int
51967 ix86_simd_clone_usable (struct cgraph_node *node)
51969 switch (node->simdclone->vecsize_mangle)
51971 case 'b':
51972 if (!TARGET_SSE2)
51973 return -1;
51974 if (!TARGET_AVX)
51975 return 0;
51976 return TARGET_AVX2 ? 2 : 1;
51977 case 'c':
51978 if (!TARGET_AVX)
51979 return -1;
51980 return TARGET_AVX2 ? 1 : 0;
51981 case 'd':
51982 if (!TARGET_AVX2)
51983 return -1;
51984 return 0;
51985 case 'e':
51986 if (!TARGET_AVX512F)
51987 return -1;
51988 return 0;
51989 default:
51990 gcc_unreachable ();
51994 /* This function adjusts the unroll factor based on
51995    the hardware capabilities.  For example, bdver3 has
51996    a loop buffer which makes unrolling of smaller
51997    loops less important.  This function decides the
51998    unroll factor using the number of memory references
51999    (the value 32 is used) as a heuristic.  */
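/* Editorial illustration, based on the counting loop below: a loop body
   with 8 counted memory references is capped at an unroll factor of
   32 / 8 = 4; references wider than four words count twice.  */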
52001 static unsigned
52002 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
52004 basic_block *bbs;
52005 rtx_insn *insn;
52006 unsigned i;
52007 unsigned mem_count = 0;
52009 if (!TARGET_ADJUST_UNROLL)
52010 return nunroll;
52012 /* Count the number of memory references within the loop body.
52013 This value determines the unrolling factor for bdver3 and bdver4
52014 architectures. */
52015 subrtx_iterator::array_type array;
52016 bbs = get_loop_body (loop);
52017 for (i = 0; i < loop->num_nodes; i++)
52018 FOR_BB_INSNS (bbs[i], insn)
52019 if (NONDEBUG_INSN_P (insn))
52020 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
52021 if (const_rtx x = *iter)
52022 if (MEM_P (x))
52024 machine_mode mode = GET_MODE (x);
52025 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
52026 if (n_words > 4)
52027 mem_count += 2;
52028 else
52029 mem_count += 1;
52031 free (bbs);
52033   if (mem_count && mem_count <= 32)
52034     return 32 / mem_count;
52036 return nunroll;
52040 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
52042 static bool
52043 ix86_float_exceptions_rounding_supported_p (void)
52045 /* For x87 floating point with standard excess precision handling,
52046 there is no adddf3 pattern (since x87 floating point only has
52047 XFmode operations) so the default hook implementation gets this
52048 wrong. */
52049 return TARGET_80387 || TARGET_SSE_MATH;
52052 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
52054 static void
52055 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
52057 if (!TARGET_80387 && !TARGET_SSE_MATH)
52058 return;
52059 tree exceptions_var = create_tmp_var_raw (integer_type_node);
52060 if (TARGET_80387)
52062 tree fenv_index_type = build_index_type (size_int (6));
52063 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
52064 tree fenv_var = create_tmp_var_raw (fenv_type);
52065 TREE_ADDRESSABLE (fenv_var) = 1;
52066 tree fenv_ptr = build_pointer_type (fenv_type);
52067 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
52068 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
52069 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
52070 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
52071 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
52072 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
52073 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
52074 tree hold_fnclex = build_call_expr (fnclex, 0);
52075 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
52076 NULL_TREE, NULL_TREE);
52077 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
52078 hold_fnclex);
52079 *clear = build_call_expr (fnclex, 0);
52080 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
52081 tree fnstsw_call = build_call_expr (fnstsw, 0);
52082 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
52083 sw_var, fnstsw_call);
52084 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
52085 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
52086 exceptions_var, exceptions_x87);
52087 *update = build2 (COMPOUND_EXPR, integer_type_node,
52088 sw_mod, update_mod);
52089 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
52090 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
52092 if (TARGET_SSE_MATH)
52094 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
52095 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
52096 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
52097 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
52098 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
52099 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
52100 mxcsr_orig_var, stmxcsr_hold_call);
52101 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
52102 mxcsr_orig_var,
52103 build_int_cst (unsigned_type_node, 0x1f80));
52104 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
52105 build_int_cst (unsigned_type_node, 0xffffffc0));
52106 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
52107 mxcsr_mod_var, hold_mod_val);
52108 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
52109 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
52110 hold_assign_orig, hold_assign_mod);
52111 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
52112 ldmxcsr_hold_call);
52113 if (*hold)
52114 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
52115 else
52116 *hold = hold_all;
52117 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
52118 if (*clear)
52119 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
52120 ldmxcsr_clear_call);
52121 else
52122 *clear = ldmxcsr_clear_call;
52123 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
52124 tree exceptions_sse = fold_convert (integer_type_node,
52125 stxmcsr_update_call);
52126 if (*update)
52128 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
52129 exceptions_var, exceptions_sse);
52130 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
52131 exceptions_var, exceptions_mod);
52132 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
52133 exceptions_assign);
52135 else
52136 *update = build2 (MODIFY_EXPR, integer_type_node,
52137 exceptions_var, exceptions_sse);
52138 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
52139 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
52140 ldmxcsr_update_call);
52142 tree atomic_feraiseexcept
52143 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
52144 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
52145 1, exceptions_var);
52146 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
52147 atomic_feraiseexcept_call);
52150 /* Return the mode to be used for bounds, or VOIDmode
52151    if bounds are not supported.  */
52153 static machine_mode
52154 ix86_mpx_bound_mode ()
52156 /* Do not support pointer checker if MPX
52157 is not enabled. */
52158 if (!TARGET_MPX)
52160 if (flag_check_pointer_bounds)
52161 warning (0, "Pointer Checker requires MPX support on this target."
52162 		 " Use the -mmpx option to enable MPX.");
52163 return VOIDmode;
52166 return BNDmode;
52169 /* Return constant used to statically initialize constant bounds.
52171 This function is used to create special bound values. For now
52172 only INIT bounds and NONE bounds are expected. More special
52173 values may be added later. */
52175 static tree
52176 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
52178 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
52179 : build_zero_cst (pointer_sized_int_node);
52180 tree high = ub ? build_zero_cst (pointer_sized_int_node)
52181 : build_minus_one_cst (pointer_sized_int_node);
52183 /* This function is supposed to be used to create INIT and
52184 NONE bounds only. */
52185 gcc_assert ((lb == 0 && ub == -1)
52186 || (lb == -1 && ub == 0));
52188 return build_complex (NULL, low, high);
52191 /* Generate a list of statements STMTS to initialize pointer bounds
52192 variable VAR with bounds LB and UB. Return the number of generated
52193 statements. */
52195 static int
52196 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
52198 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
52199 tree lhs, modify, var_p;
52201 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
52202 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
52204 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
52205 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
52206 append_to_statement_list (modify, stmts);
52208 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
52209 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
52210 TYPE_SIZE_UNIT (pointer_sized_int_node)));
52211 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
52212 append_to_statement_list (modify, stmts);
52214 return 2;
52217 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
52218 /* For i386, a common symbol is local only for non-PIE binaries.  For
52219    x86-64, a common symbol is local only for non-PIE binaries or if the
52220    linker supports copy relocations in PIE binaries.  */
52222 static bool
52223 ix86_binds_local_p (const_tree exp)
52225 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
52226 (!flag_pic
52227 || (TARGET_64BIT
52228 && HAVE_LD_PIE_COPYRELOC != 0)));
52230 #endif
52232 /* If MEM is in the form of [base+offset], extract the two parts
52233    of the address into BASE and OFFSET, otherwise return false.  */
52235 static bool
52236 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
52238 rtx addr;
52240 gcc_assert (MEM_P (mem));
52242 addr = XEXP (mem, 0);
52244 if (GET_CODE (addr) == CONST)
52245 addr = XEXP (addr, 0);
52247 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
52249 *base = addr;
52250 *offset = const0_rtx;
52251 return true;
52254 if (GET_CODE (addr) == PLUS
52255 && (REG_P (XEXP (addr, 0))
52256 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
52257 && CONST_INT_P (XEXP (addr, 1)))
52259 *base = XEXP (addr, 0);
52260 *offset = XEXP (addr, 1);
52261 return true;
52264 return false;
52267 /* Given OPERANDS of consecutive load/store instructions, check if we can
52268    merge them into a move-multiple.  LOAD is true if they are load instructions.
52269    MODE is the mode of the memory operands.  */
52271 bool
52272 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
52273 machine_mode mode)
52275 HOST_WIDE_INT offval_1, offval_2, msize;
52276 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
52278 if (load)
52280 mem_1 = operands[1];
52281 mem_2 = operands[3];
52282 reg_1 = operands[0];
52283 reg_2 = operands[2];
52285 else
52287 mem_1 = operands[0];
52288 mem_2 = operands[2];
52289 reg_1 = operands[1];
52290 reg_2 = operands[3];
52293 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
52295 if (REGNO (reg_1) != REGNO (reg_2))
52296 return false;
52298 /* Check if the addresses are in the form of [base+offset]. */
52299 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
52300 return false;
52301 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
52302 return false;
52304 /* Check if the bases are the same. */
52305 if (!rtx_equal_p (base_1, base_2))
52306 return false;
52308 offval_1 = INTVAL (offset_1);
52309 offval_2 = INTVAL (offset_2);
52310 msize = GET_MODE_SIZE (mode);
52311 /* Check if mem_1 is adjacent to mem_2 and mem_1 has lower address. */
52312 if (offval_1 + msize != offval_2)
52313 return false;
52315 return true;
52318 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
52320 static bool
52321 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
52322 optimization_type opt_type)
52324 switch (op)
52326 case asin_optab:
52327 case acos_optab:
52328 case log1p_optab:
52329 case exp_optab:
52330 case exp10_optab:
52331 case exp2_optab:
52332 case expm1_optab:
52333 case ldexp_optab:
52334 case scalb_optab:
52335 case round_optab:
52336 return opt_type == OPTIMIZE_FOR_SPEED;
52338 case rint_optab:
52339 if (SSE_FLOAT_MODE_P (mode1)
52340 && TARGET_SSE_MATH
52341 && !flag_trapping_math
52342 && !TARGET_SSE4_1)
52343 return opt_type == OPTIMIZE_FOR_SPEED;
52344 return true;
52346 case floor_optab:
52347 case ceil_optab:
52348 case btrunc_optab:
52349 if (SSE_FLOAT_MODE_P (mode1)
52350 && TARGET_SSE_MATH
52351 && !flag_trapping_math
52352 && TARGET_SSE4_1)
52353 return true;
52354 return opt_type == OPTIMIZE_FOR_SPEED;
52356 case rsqrt_optab:
52357 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
52359 default:
52360 return true;
52364 /* Address space support.
52366 This is not "far pointers" in the 16-bit sense, but an easy way
52367 to use %fs and %gs segment prefixes. Therefore:
52369 (a) All address spaces have the same modes,
52370    (b) All address spaces have the same address forms,
52371 (c) While %fs and %gs are technically subsets of the generic
52372 address space, they are probably not subsets of each other.
52373 (d) Since we have no access to the segment base register values
52374 without resorting to a system call, we cannot convert a
52375 non-default address space to a default address space.
52376 Therefore we do not claim %fs or %gs are subsets of generic.
52378 Therefore we can (mostly) use the default hooks. */
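/* Usage sketch (editorial, assuming the __seg_fs/__seg_gs named address
   space extension): a declaration such as

     int __seg_gs *p = (int __seg_gs *) 16;

   makes *p access offset 16 relative to the %gs base.  Because the
   segment base offsets every access, address 0 is considered valid in
   these spaces; see the hook below.  */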
52380 /* All use of segmentation is assumed to make address 0 valid. */
52382 static bool
52383 ix86_addr_space_zero_address_valid (addr_space_t as)
52385 return as != ADDR_SPACE_GENERIC;
52388 static void
52389 ix86_init_libfuncs (void)
52391 if (TARGET_64BIT)
52393 set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
52394 set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
52396 else
52398 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
52399 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
52402 #if TARGET_MACHO
52403 darwin_rename_builtins ();
52404 #endif
52407 /* Generate call to __divmoddi4. */
52409 static void
52410 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
52411 rtx op0, rtx op1,
52412 rtx *quot_p, rtx *rem_p)
52414 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
52416 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
52417 mode, 3,
52418 op0, GET_MODE (op0),
52419 op1, GET_MODE (op1),
52420 XEXP (rem, 0), Pmode);
52421 *quot_p = quot;
52422 *rem_p = rem;
52425 /* Set the value of FLT_EVAL_METHOD in float.h. When using only the
52426 FPU, assume that the fpcw is set to extended precision; when using
52427 only SSE, rounding is correct; when using both SSE and the FPU,
52428 the rounding precision is indeterminate, since either may be chosen
52429 apparently at random. */
52431 static enum flt_eval_method
52432 ix86_excess_precision (enum excess_precision_type type)
52434 switch (type)
52436 case EXCESS_PRECISION_TYPE_FAST:
52437 /* The fastest type to promote to will always be the native type,
52438 whether that occurs with implicit excess precision or
52439 otherwise. */
52440 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
52441 case EXCESS_PRECISION_TYPE_STANDARD:
52442 case EXCESS_PRECISION_TYPE_IMPLICIT:
52443 /* Otherwise, the excess precision we want when we are
52444 in a standards compliant mode, and the implicit precision we
52445 provide would be identical were it not for the unpredictable
52446 cases. */
52447 if (!TARGET_80387)
52448 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
52449 else if (!TARGET_MIX_SSE_I387)
52451 if (!TARGET_SSE_MATH)
52452 return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
52453 else if (TARGET_SSE2)
52454 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
52457 /* If we are in standards compliant mode, but we know we will
52458 calculate in unpredictable precision, return
52459 	 FLT_EVAL_METHOD_PROMOTE_TO_FLOAT.  There is no reason to introduce explicit
52460 excess precision if the target can't guarantee it will honor
52461 it. */
52462 return (type == EXCESS_PRECISION_TYPE_STANDARD
52463 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
52464 : FLT_EVAL_METHOD_UNPREDICTABLE);
52465 default:
52466 gcc_unreachable ();
52469 return FLT_EVAL_METHOD_UNPREDICTABLE;
52472 /* Target-specific selftests. */
52474 #if CHECKING_P
52476 namespace selftest {
52478 /* Verify that hard regs are dumped as expected (in compact mode). */
52480 static void
52481 ix86_test_dumping_hard_regs ()
52483 ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
52484 ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
52487 /* Test dumping an insn with repeated references to the same SCRATCH,
52488 to verify the rtx_reuse code. */
52490 static void
52491 ix86_test_dumping_memory_blockage ()
52493 set_new_first_and_last_insn (NULL, NULL);
52495 rtx pat = gen_memory_blockage ();
52496 rtx_reuse_manager r;
52497 r.preprocess (pat);
52499   /* Verify that the repeated references to the SCRATCH are dumped using
52500      reuse IDs.  The first should be prefixed with a reuse ID,
52501 and the second should be dumped as a "reuse_rtx" of that ID.
52502 The expected string assumes Pmode == DImode. */
52503 if (Pmode == DImode)
52504 ASSERT_RTL_DUMP_EQ_WITH_REUSE
52505 ("(cinsn 1 (set (mem/v:BLK (0|scratch:DI) [0 A8])\n"
52506 " (unspec:BLK [\n"
52507 " (mem/v:BLK (reuse_rtx 0) [0 A8])\n"
52508 " ] UNSPEC_MEMORY_BLOCKAGE)))\n", pat, &r);
52511 /* Verify loading an RTL dump; specifically a dump of copying
52512 a param on x86_64 from a hard reg into the frame.
52513 This test is target-specific since the dump contains target-specific
52514 hard reg names. */
52516 static void
52517 ix86_test_loading_dump_fragment_1 ()
52519 rtl_dump_test t (SELFTEST_LOCATION,
52520 locate_file ("x86_64/copy-hard-reg-into-frame.rtl"));
52522 rtx_insn *insn = get_insn_by_uid (1);
52524   /* The block structure and indentation here are purely for
52525      readability; they mirror the structure of the rtx.  */
52526 tree mem_expr;
52528 rtx pat = PATTERN (insn);
52529 ASSERT_EQ (SET, GET_CODE (pat));
52531 rtx dest = SET_DEST (pat);
52532 ASSERT_EQ (MEM, GET_CODE (dest));
52533 /* Verify the "/c" was parsed. */
52534 ASSERT_TRUE (RTX_FLAG (dest, call));
52535 ASSERT_EQ (SImode, GET_MODE (dest));
52537 rtx addr = XEXP (dest, 0);
52538 ASSERT_EQ (PLUS, GET_CODE (addr));
52539 ASSERT_EQ (DImode, GET_MODE (addr));
52541 rtx lhs = XEXP (addr, 0);
52542 /* Verify that the "frame" REG was consolidated. */
52543 ASSERT_RTX_PTR_EQ (frame_pointer_rtx, lhs);
52546 rtx rhs = XEXP (addr, 1);
52547 ASSERT_EQ (CONST_INT, GET_CODE (rhs));
52548 ASSERT_EQ (-4, INTVAL (rhs));
52551 /* Verify the "[1 i+0 S4 A32]" was parsed. */
52552 ASSERT_EQ (1, MEM_ALIAS_SET (dest));
52553 /* "i" should have been handled by synthesizing a global int
52554 variable named "i". */
52555 mem_expr = MEM_EXPR (dest);
52556 ASSERT_NE (mem_expr, NULL);
52557 ASSERT_EQ (VAR_DECL, TREE_CODE (mem_expr));
52558 ASSERT_EQ (integer_type_node, TREE_TYPE (mem_expr));
52559 ASSERT_EQ (IDENTIFIER_NODE, TREE_CODE (DECL_NAME (mem_expr)));
52560 ASSERT_STREQ ("i", IDENTIFIER_POINTER (DECL_NAME (mem_expr)));
52561 /* "+0". */
52562 ASSERT_TRUE (MEM_OFFSET_KNOWN_P (dest));
52563 ASSERT_EQ (0, MEM_OFFSET (dest));
52564 /* "S4". */
52565 ASSERT_EQ (4, MEM_SIZE (dest));
52566     /* "A32".  */
52567 ASSERT_EQ (32, MEM_ALIGN (dest));
52570 rtx src = SET_SRC (pat);
52571 ASSERT_EQ (REG, GET_CODE (src));
52572 ASSERT_EQ (SImode, GET_MODE (src));
52573 ASSERT_EQ (5, REGNO (src));
52574 tree reg_expr = REG_EXPR (src);
52575 /* "i" here should point to the same var as for the MEM_EXPR. */
52576 ASSERT_EQ (reg_expr, mem_expr);
52581 /* Verify that the RTL loader copes with a call_insn dump.
52582 This test is target-specific since the dump contains a target-specific
52583 hard reg name. */
52585 static void
52586 ix86_test_loading_call_insn ()
52588   /* The test dump includes register "xmm0", which requires TARGET_SSE
52589 to exist. */
52590 if (!TARGET_SSE)
52591 return;
52593 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/call-insn.rtl"));
52595 rtx_insn *insn = get_insns ();
52596 ASSERT_EQ (CALL_INSN, GET_CODE (insn));
52598 /* "/j". */
52599 ASSERT_TRUE (RTX_FLAG (insn, jump));
52601 rtx pat = PATTERN (insn);
52602 ASSERT_EQ (CALL, GET_CODE (SET_SRC (pat)));
52604 /* Verify REG_NOTES. */
52606 /* "(expr_list:REG_CALL_DECL". */
52607 ASSERT_EQ (EXPR_LIST, GET_CODE (REG_NOTES (insn)));
52608 rtx_expr_list *note0 = as_a <rtx_expr_list *> (REG_NOTES (insn));
52609 ASSERT_EQ (REG_CALL_DECL, REG_NOTE_KIND (note0));
52611 /* "(expr_list:REG_EH_REGION (const_int 0 [0])". */
52612 rtx_expr_list *note1 = note0->next ();
52613 ASSERT_EQ (REG_EH_REGION, REG_NOTE_KIND (note1));
52615 ASSERT_EQ (NULL, note1->next ());
52618 /* Verify CALL_INSN_FUNCTION_USAGE. */
52620 /* "(expr_list:DF (use (reg:DF 21 xmm0))". */
52621 rtx_expr_list *usage
52622 = as_a <rtx_expr_list *> (CALL_INSN_FUNCTION_USAGE (insn));
52623 ASSERT_EQ (EXPR_LIST, GET_CODE (usage));
52624 ASSERT_EQ (DFmode, GET_MODE (usage));
52625 ASSERT_EQ (USE, GET_CODE (usage->element ()));
52626 ASSERT_EQ (NULL, usage->next ());
52630 /* Verify that the RTL loader copes with a dump from print_rtx_function.
52631 This test is target-specific since the dump contains target-specific
52632 hard reg names. */
52634 static void
52635 ix86_test_loading_full_dump ()
52637 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/times-two.rtl"));
52639 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
52641 rtx_insn *insn_1 = get_insn_by_uid (1);
52642 ASSERT_EQ (NOTE, GET_CODE (insn_1));
52644 rtx_insn *insn_7 = get_insn_by_uid (7);
52645 ASSERT_EQ (INSN, GET_CODE (insn_7));
52646 ASSERT_EQ (PARALLEL, GET_CODE (PATTERN (insn_7)));
52648 rtx_insn *insn_15 = get_insn_by_uid (15);
52649 ASSERT_EQ (INSN, GET_CODE (insn_15));
52650 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
52652 /* Verify crtl->return_rtx. */
52653 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
52654 ASSERT_EQ (0, REGNO (crtl->return_rtx));
52655 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
52658 /* Verify that the RTL loader copes with UNSPEC and UNSPEC_VOLATILE insns.
52659 In particular, verify that it correctly loads the 2nd operand.
52660 This test is target-specific since these are machine-specific
52661 operands (and enums). */
52663 static void
52664 ix86_test_loading_unspec ()
52666 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/unspec.rtl"));
52668 ASSERT_STREQ ("test_unspec", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
52670 ASSERT_TRUE (cfun);
52672 /* Test of an UNSPEC. */
52673 rtx_insn *insn = get_insns ();
52674 ASSERT_EQ (INSN, GET_CODE (insn));
52675 rtx set = single_set (insn);
52676 ASSERT_NE (NULL, set);
52677 rtx dst = SET_DEST (set);
52678 ASSERT_EQ (MEM, GET_CODE (dst));
52679 rtx src = SET_SRC (set);
52680 ASSERT_EQ (UNSPEC, GET_CODE (src));
52681 ASSERT_EQ (BLKmode, GET_MODE (src));
52682 ASSERT_EQ (UNSPEC_MEMORY_BLOCKAGE, XINT (src, 1));
52684 rtx v0 = XVECEXP (src, 0, 0);
52686 /* Verify that the two uses of the first SCRATCH have pointer
52687 equality. */
52688 rtx scratch_a = XEXP (dst, 0);
52689 ASSERT_EQ (SCRATCH, GET_CODE (scratch_a));
52691 rtx scratch_b = XEXP (v0, 0);
52692 ASSERT_EQ (SCRATCH, GET_CODE (scratch_b));
52694 ASSERT_EQ (scratch_a, scratch_b);
52696 /* Verify that the two mems are thus treated as equal. */
52697 ASSERT_TRUE (rtx_equal_p (dst, v0));
52699   /* Verify that the insn is recognized.  */
52700   ASSERT_NE (-1, recog_memoized (insn));
52702 /* Test of an UNSPEC_VOLATILE, which has its own enum values. */
52703 insn = NEXT_INSN (insn);
52704 ASSERT_EQ (INSN, GET_CODE (insn));
52706 set = single_set (insn);
52707 ASSERT_NE (NULL, set);
52709 src = SET_SRC (set);
52710 ASSERT_EQ (UNSPEC_VOLATILE, GET_CODE (src));
52711 ASSERT_EQ (UNSPECV_RDTSCP, XINT (src, 1));
52714 /* Run all target-specific selftests. */
52716 static void
52717 ix86_run_selftests (void)
52719 ix86_test_dumping_hard_regs ();
52720 ix86_test_dumping_memory_blockage ();
52722 /* Various tests of loading RTL dumps, here because they contain
52723 ix86-isms (e.g. names of hard regs). */
52724 ix86_test_loading_dump_fragment_1 ();
52725 ix86_test_loading_call_insn ();
52726 ix86_test_loading_full_dump ();
52727 ix86_test_loading_unspec ();
52730 } // namespace selftest
52732 #endif /* CHECKING_P */
52734 /* Initialize the GCC target structure. */
52735 #undef TARGET_RETURN_IN_MEMORY
52736 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
52738 #undef TARGET_LEGITIMIZE_ADDRESS
52739 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
52741 #undef TARGET_ATTRIBUTE_TABLE
52742 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
52743 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
52744 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
52745 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
52746 # undef TARGET_MERGE_DECL_ATTRIBUTES
52747 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
52748 #endif
52750 #undef TARGET_COMP_TYPE_ATTRIBUTES
52751 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
52753 #undef TARGET_INIT_BUILTINS
52754 #define TARGET_INIT_BUILTINS ix86_init_builtins
52755 #undef TARGET_BUILTIN_DECL
52756 #define TARGET_BUILTIN_DECL ix86_builtin_decl
52757 #undef TARGET_EXPAND_BUILTIN
52758 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
52760 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
52761 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
52762 ix86_builtin_vectorized_function
52764 #undef TARGET_VECTORIZE_BUILTIN_GATHER
52765 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
52767 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
52768 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
52770 #undef TARGET_BUILTIN_RECIPROCAL
52771 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
52773 #undef TARGET_ASM_FUNCTION_EPILOGUE
52774 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
52776 #undef TARGET_ENCODE_SECTION_INFO
52777 #ifndef SUBTARGET_ENCODE_SECTION_INFO
52778 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
52779 #else
52780 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
52781 #endif
52783 #undef TARGET_ASM_OPEN_PAREN
52784 #define TARGET_ASM_OPEN_PAREN ""
52785 #undef TARGET_ASM_CLOSE_PAREN
52786 #define TARGET_ASM_CLOSE_PAREN ""
52788 #undef TARGET_ASM_BYTE_OP
52789 #define TARGET_ASM_BYTE_OP ASM_BYTE
52791 #undef TARGET_ASM_ALIGNED_HI_OP
52792 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
52793 #undef TARGET_ASM_ALIGNED_SI_OP
52794 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
52795 #ifdef ASM_QUAD
52796 #undef TARGET_ASM_ALIGNED_DI_OP
52797 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
52798 #endif
52800 #undef TARGET_PROFILE_BEFORE_PROLOGUE
52801 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
52803 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
52804 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
52806 #undef TARGET_ASM_UNALIGNED_HI_OP
52807 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
52808 #undef TARGET_ASM_UNALIGNED_SI_OP
52809 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
52810 #undef TARGET_ASM_UNALIGNED_DI_OP
52811 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
52813 #undef TARGET_PRINT_OPERAND
52814 #define TARGET_PRINT_OPERAND ix86_print_operand
52815 #undef TARGET_PRINT_OPERAND_ADDRESS
52816 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
52817 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
52818 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
52819 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
52820 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
52822 #undef TARGET_SCHED_INIT_GLOBAL
52823 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
52824 #undef TARGET_SCHED_ADJUST_COST
52825 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
52826 #undef TARGET_SCHED_ISSUE_RATE
52827 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
52828 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
52829 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
52830 ia32_multipass_dfa_lookahead
52831 #undef TARGET_SCHED_MACRO_FUSION_P
52832 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
52833 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
52834 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
52836 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
52837 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
52839 #undef TARGET_MEMMODEL_CHECK
52840 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
52842 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
52843 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
52845 #ifdef HAVE_AS_TLS
52846 #undef TARGET_HAVE_TLS
52847 #define TARGET_HAVE_TLS true
52848 #endif
52849 #undef TARGET_CANNOT_FORCE_CONST_MEM
52850 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
52851 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
52852 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
52854 #undef TARGET_DELEGITIMIZE_ADDRESS
52855 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
52857 #undef TARGET_MS_BITFIELD_LAYOUT_P
52858 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
52860 #if TARGET_MACHO
52861 #undef TARGET_BINDS_LOCAL_P
52862 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
52863 #else
52864 #undef TARGET_BINDS_LOCAL_P
52865 #define TARGET_BINDS_LOCAL_P ix86_binds_local_p
52866 #endif
52867 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
52868 #undef TARGET_BINDS_LOCAL_P
52869 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
52870 #endif
52872 #undef TARGET_ASM_OUTPUT_MI_THUNK
52873 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
52874 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
52875 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
52877 #undef TARGET_ASM_FILE_START
52878 #define TARGET_ASM_FILE_START x86_file_start
52880 #undef TARGET_OPTION_OVERRIDE
52881 #define TARGET_OPTION_OVERRIDE ix86_option_override
52883 #undef TARGET_REGISTER_MOVE_COST
52884 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
52885 #undef TARGET_MEMORY_MOVE_COST
52886 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
52887 #undef TARGET_RTX_COSTS
52888 #define TARGET_RTX_COSTS ix86_rtx_costs
52889 #undef TARGET_ADDRESS_COST
52890 #define TARGET_ADDRESS_COST ix86_address_cost
52892 #undef TARGET_FLAGS_REGNUM
52893 #define TARGET_FLAGS_REGNUM FLAGS_REG
52894 #undef TARGET_FIXED_CONDITION_CODE_REGS
52895 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
52896 #undef TARGET_CC_MODES_COMPATIBLE
52897 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
52899 #undef TARGET_MACHINE_DEPENDENT_REORG
52900 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
52902 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
52903 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
52905 #undef TARGET_BUILD_BUILTIN_VA_LIST
52906 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
52908 #undef TARGET_FOLD_BUILTIN
52909 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
52911 #undef TARGET_GIMPLE_FOLD_BUILTIN
52912 #define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin
52914 #undef TARGET_COMPARE_VERSION_PRIORITY
52915 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
52917 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
52918 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
52919 ix86_generate_version_dispatcher_body
52921 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
52922 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
52923 ix86_get_function_versions_dispatcher
52925 #undef TARGET_ENUM_VA_LIST_P
52926 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
52928 #undef TARGET_FN_ABI_VA_LIST
52929 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
52931 #undef TARGET_CANONICAL_VA_LIST_TYPE
52932 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
52934 #undef TARGET_EXPAND_BUILTIN_VA_START
52935 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
52937 #undef TARGET_MD_ASM_ADJUST
52938 #define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust
52940 #undef TARGET_C_EXCESS_PRECISION
52941 #define TARGET_C_EXCESS_PRECISION ix86_excess_precision
52942 #undef TARGET_PROMOTE_PROTOTYPES
52943 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
52944 #undef TARGET_SETUP_INCOMING_VARARGS
52945 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
52946 #undef TARGET_MUST_PASS_IN_STACK
52947 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
52948 #undef TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS
52949 #define TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS ix86_allocate_stack_slots_for_args
52950 #undef TARGET_FUNCTION_ARG_ADVANCE
52951 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
52952 #undef TARGET_FUNCTION_ARG
52953 #define TARGET_FUNCTION_ARG ix86_function_arg
52954 #undef TARGET_INIT_PIC_REG
52955 #define TARGET_INIT_PIC_REG ix86_init_pic_reg
52956 #undef TARGET_USE_PSEUDO_PIC_REG
52957 #define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
52958 #undef TARGET_FUNCTION_ARG_BOUNDARY
52959 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
52960 #undef TARGET_PASS_BY_REFERENCE
52961 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
52962 #undef TARGET_INTERNAL_ARG_POINTER
52963 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
52964 #undef TARGET_UPDATE_STACK_BOUNDARY
52965 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
52966 #undef TARGET_GET_DRAP_RTX
52967 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
52968 #undef TARGET_STRICT_ARGUMENT_NAMING
52969 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
52970 #undef TARGET_STATIC_CHAIN
52971 #define TARGET_STATIC_CHAIN ix86_static_chain
52972 #undef TARGET_TRAMPOLINE_INIT
52973 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
52974 #undef TARGET_RETURN_POPS_ARGS
52975 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
52977 #undef TARGET_WARN_FUNC_RETURN
52978 #define TARGET_WARN_FUNC_RETURN ix86_warn_func_return
52980 #undef TARGET_LEGITIMATE_COMBINED_INSN
52981 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
52983 #undef TARGET_ASAN_SHADOW_OFFSET
52984 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
52986 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
52987 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
52989 #undef TARGET_SCALAR_MODE_SUPPORTED_P
52990 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
52992 #undef TARGET_VECTOR_MODE_SUPPORTED_P
52993 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
52995 #undef TARGET_C_MODE_FOR_SUFFIX
52996 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
52998 #ifdef HAVE_AS_TLS
52999 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
53000 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
53001 #endif
53003 #ifdef SUBTARGET_INSERT_ATTRIBUTES
53004 #undef TARGET_INSERT_ATTRIBUTES
53005 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
53006 #endif
53008 #undef TARGET_MANGLE_TYPE
53009 #define TARGET_MANGLE_TYPE ix86_mangle_type
53011 #undef TARGET_STACK_PROTECT_GUARD
53012 #define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard
53014 #if !TARGET_MACHO
53015 #undef TARGET_STACK_PROTECT_FAIL
53016 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
53017 #endif
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE ix86_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change

#undef TARGET_MEMBER_TYPE_FORCES_BLK
#define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk

#undef TARGET_INSTANTIATE_DECLS
#define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
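
/* Register class and reload hooks.  */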
#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD ix86_secondary_reload

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
#undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
#define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
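
/* Auto-vectorizer hooks: cost model and supported vector and mask
   modes.  */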
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  ix86_builtin_vectorization_cost
#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
  ix86_vectorize_vec_perm_const_ok
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
  ix86_preferred_simd_mode
#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  ix86_autovectorize_vector_sizes
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
#undef TARGET_VECTORIZE_INIT_COST
#define TARGET_VECTORIZE_INIT_COST ix86_init_cost
#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
#undef TARGET_VECTORIZE_FINISH_COST
#define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
#undef TARGET_VECTORIZE_DESTROY_COST_DATA
#define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
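
/* Hooks for saving, restoring and switching between the per-function
   target options set by the "target" attribute and -m flags, and for
   deciding whether such functions can be inlined into one another.  */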
#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE ix86_function_specific_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE ix86_function_specific_restore

#undef TARGET_OPTION_POST_STREAM_IN
#define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT ix86_function_specific_print

#undef TARGET_OPTION_FUNCTION_VERSIONS
#define TARGET_OPTION_FUNCTION_VERSIONS common_function_versions

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P ix86_can_inline_p

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p

#undef TARGET_REGISTER_PRIORITY
#define TARGET_REGISTER_PRIORITY ix86_register_priority

#undef TARGET_REGISTER_USAGE_LEVELING_P
#define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p

#undef TARGET_COMPUTE_FRAME_LAYOUT
#define TARGET_COMPUTE_FRAME_LAYOUT ix86_compute_frame_layout

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE ix86_can_eliminate

#undef TARGET_EXTRA_LIVE_ON_ENTRY
#define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry

#undef TARGET_ASM_CODE_END
#define TARGET_ASM_CODE_END ix86_code_end

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage

#undef TARGET_LOOP_UNROLL_ADJUST
#define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust

/* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657.  */
#undef TARGET_SPILL_CLASS
#define TARGET_SPILL_CLASS ix86_spill_class
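
/* Hooks that control the vector size and usability of SIMD clones
   ("declare simd" functions).  */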
#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  ix86_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST \
  ix86_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE \
  ix86_simd_clone_usable

#undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
#define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
  ix86_float_exceptions_rounding_supported_p
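
/* Hooks used by the mode-switching pass.  */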
#undef TARGET_MODE_EMIT
#define TARGET_MODE_EMIT ix86_emit_mode_set

#undef TARGET_MODE_NEEDED
#define TARGET_MODE_NEEDED ix86_mode_needed

#undef TARGET_MODE_AFTER
#define TARGET_MODE_AFTER ix86_mode_after

#undef TARGET_MODE_ENTRY
#define TARGET_MODE_ENTRY ix86_mode_entry

#undef TARGET_MODE_EXIT
#define TARGET_MODE_EXIT ix86_mode_exit

#undef TARGET_MODE_PRIORITY
#define TARGET_MODE_PRIORITY ix86_mode_priority

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
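
/* Pointer Bounds Checker (Intel MPX) hooks for passing, returning and
   initializing pointer bounds.  */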
#undef TARGET_LOAD_BOUNDS_FOR_ARG
#define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds

#undef TARGET_STORE_BOUNDS_FOR_ARG
#define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds

#undef TARGET_LOAD_RETURNED_BOUNDS
#define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds

#undef TARGET_STORE_RETURNED_BOUNDS
#define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds

#undef TARGET_CHKP_BOUND_MODE
#define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode

#undef TARGET_BUILTIN_CHKP_FUNCTION
#define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function

#undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
#define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds

#undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
#define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant

#undef TARGET_CHKP_INITIALIZE_BOUNDS
#define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds

#undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
#define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds

#undef TARGET_OFFLOAD_OPTIONS
#define TARGET_OFFLOAD_OPTIONS \
  ix86_offload_options

#undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
#define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p

#undef TARGET_HARD_REGNO_SCRATCH_OK
#define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok

#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1

#undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
#define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p

#undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
#define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS ix86_init_libfuncs

#undef TARGET_EXPAND_DIVMOD_LIBFUNC
#define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc

#undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
#define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost

#undef TARGET_NOCE_CONVERSION_PROFITABLE_P
#define TARGET_NOCE_CONVERSION_PROFITABLE_P ix86_noce_conversion_profitable_p
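
/* Target-specific selftests, registered only when GCC is built with
   checking enabled.  */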
#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
#endif /* #if CHECKING_P */
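
/* Build the target hook vector: TARGET_INITIALIZER (from target-def.h)
   collects the TARGET_* macros defined above and supplies the documented
   defaults for every hook not overridden here.  */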
struct gcc_target targetm = TARGET_INITIALIZER;
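
/* Include the garbage-collection root tables that gengtype generates
   for this file.  */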
#include "gt-i386.h"