[2/77] Add an E_ prefix to case statements
[official-gcc.git] / gcc / config / i386 / i386.c
blob 69d6be4c3d44817541e8d54f9e75b4509bd63556
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2017 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "backend.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "memmodel.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "cfgloop.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "stringpool.h"
33 #include "expmed.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "cgraph.h"
39 #include "diagnostic.h"
40 #include "cfgbuild.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "attribs.h"
44 #include "calls.h"
45 #include "stor-layout.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "insn-attr.h"
49 #include "flags.h"
50 #include "except.h"
51 #include "explow.h"
52 #include "expr.h"
53 #include "cfgrtl.h"
54 #include "common/common-target.h"
55 #include "langhooks.h"
56 #include "reload.h"
57 #include "gimplify.h"
58 #include "dwarf2.h"
59 #include "tm-constrs.h"
60 #include "params.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "tree-chkp.h"
75 #include "rtl-chkp.h"
76 #include "dbgcnt.h"
77 #include "case-cfn-macros.h"
78 #include "regrename.h"
79 #include "dojump.h"
80 #include "fold-const-call.h"
81 #include "tree-vrp.h"
82 #include "tree-ssanames.h"
83 #include "selftest.h"
84 #include "selftest-rtl.h"
85 #include "print-rtl.h"
86 #include "intl.h"
87 #include "ifcvt.h"
88 #include "symbol-summary.h"
89 #include "ipa-prop.h"
90 #include "ipa-fnsummary.h"
92 /* This file should be included last. */
93 #include "target-def.h"
95 static rtx legitimize_dllimport_symbol (rtx, bool);
96 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
97 static rtx legitimize_pe_coff_symbol (rtx, bool);
98 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
99 static bool ix86_save_reg (unsigned int, bool, bool);
100 static bool ix86_function_naked (const_tree);
102 #ifndef CHECK_STACK_LIMIT
103 #define CHECK_STACK_LIMIT (-1)
104 #endif
106 /* Return index of given mode in mult and division cost tables. */
107 #define MODE_INDEX(mode) \
108 ((mode) == QImode ? 0 \
109 : (mode) == HImode ? 1 \
110 : (mode) == SImode ? 2 \
111 : (mode) == DImode ? 3 \
112 : 4)
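/* Illustrative sketch (editorial addition, not part of i386.c): how a
   MODE_INDEX-style lookup pairs with the five-entry {QI, HI, SI, DI,
   other} cost arrays used by the processor_costs tables below.  The
   enum, macro and table here are hypothetical stand-ins; only the
   indexing scheme mirrors the real code.  */
#include <stdio.h>

enum ex_mode { EX_QImode, EX_HImode, EX_SImode, EX_DImode, EX_TImode };

#define EX_MODE_INDEX(mode) \
  ((mode) == EX_QImode ? 0  \
   : (mode) == EX_HImode ? 1 \
   : (mode) == EX_SImode ? 2 \
   : (mode) == EX_DImode ? 3 \
   : 4)                       /* anything wider falls into "other" */

int
main (void)
{
  /* Hypothetical multiply-start costs in the QI/HI/SI/DI/other layout.  */
  static const int ex_mult_cost[5] = { 3, 3, 3, 4, 5 };

  printf ("SImode multiply cost: %d\n",
	  ex_mult_cost[EX_MODE_INDEX (EX_SImode)]);   /* prints 3 */
  printf ("TImode multiply cost: %d\n",
	  ex_mult_cost[EX_MODE_INDEX (EX_TImode)]);   /* prints 5 */
  return 0;
}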
114 /* Processor costs (relative to an add) */
115 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
116 #define COSTS_N_BYTES(N) ((N) * 2)
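/* Worked example (editorial addition, not part of i386.c): rtl.h defines
   COSTS_N_INSNS (N) as ((N) * 4), so one "average" instruction costs 4
   units in the speed tables.  For size tuning the unit is reinterpreted
   as bytes relative to a 2-byte addition, so COSTS_N_BYTES (2) evaluates
   to the same 4 units as COSTS_N_INSNS (1).  The EX_ macros below are
   hypothetical mirrors used only for this check.  */
#define EX_COSTS_N_INSNS(N) ((N) * 4)   /* mirrors COSTS_N_INSNS in rtl.h */
#define EX_COSTS_N_BYTES(N) ((N) * 2)   /* mirrors COSTS_N_BYTES above */

/* A 2-byte add in the size tables weighs the same as one instruction
   in the speed tables: both evaluate to 4.  */
_Static_assert (EX_COSTS_N_BYTES (2) == EX_COSTS_N_INSNS (1),
		"size and speed cost units line up on a 2-byte add");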
118 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
120 static stringop_algs ix86_size_memcpy[2] = {
121 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
122 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
123 static stringop_algs ix86_size_memset[2] = {
124 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
125 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
127 const
128 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
129 COSTS_N_BYTES (2), /* cost of an add instruction */
130 COSTS_N_BYTES (3), /* cost of a lea instruction */
131 COSTS_N_BYTES (2), /* variable shift costs */
132 COSTS_N_BYTES (3), /* constant shift costs */
133 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
134 COSTS_N_BYTES (3), /* HI */
135 COSTS_N_BYTES (3), /* SI */
136 COSTS_N_BYTES (3), /* DI */
137 COSTS_N_BYTES (5)}, /* other */
138 0, /* cost of multiply per each bit set */
139 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
140 COSTS_N_BYTES (3), /* HI */
141 COSTS_N_BYTES (3), /* SI */
142 COSTS_N_BYTES (3), /* DI */
143 COSTS_N_BYTES (5)}, /* other */
144 COSTS_N_BYTES (3), /* cost of movsx */
145 COSTS_N_BYTES (3), /* cost of movzx */
146 0, /* "large" insn */
147 2, /* MOVE_RATIO */
148 2, /* cost for loading QImode using movzbl */
149 {2, 2, 2}, /* cost of loading integer registers
150 in QImode, HImode and SImode.
151 Relative to reg-reg move (2). */
152 {2, 2, 2}, /* cost of storing integer registers */
153 2, /* cost of reg,reg fld/fst */
154 {2, 2, 2}, /* cost of loading fp registers
155 in SFmode, DFmode and XFmode */
156 {2, 2, 2}, /* cost of storing fp registers
157 in SFmode, DFmode and XFmode */
158 3, /* cost of moving MMX register */
159 {3, 3}, /* cost of loading MMX registers
160 in SImode and DImode */
161 {3, 3}, /* cost of storing MMX registers
162 in SImode and DImode */
163 3, /* cost of moving SSE register */
164 {3, 3, 3}, /* cost of loading SSE registers
165 in SImode, DImode and TImode */
166 {3, 3, 3}, /* cost of storing SSE registers
167 in SImode, DImode and TImode */
168 3, /* MMX or SSE register to integer */
169 0, /* size of l1 cache */
170 0, /* size of l2 cache */
171 0, /* size of prefetch block */
172 0, /* number of parallel prefetches */
173 2, /* Branch cost */
174 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
175 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
176 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
177 COSTS_N_BYTES (2), /* cost of FABS instruction. */
178 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
179 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
180 ix86_size_memcpy,
181 ix86_size_memset,
182 1, /* scalar_stmt_cost. */
183 1, /* scalar load_cost. */
184 1, /* scalar_store_cost. */
185 1, /* vec_stmt_cost. */
186 1, /* vec_to_scalar_cost. */
187 1, /* scalar_to_vec_cost. */
188 1, /* vec_align_load_cost. */
189 1, /* vec_unalign_load_cost. */
190 1, /* vec_store_cost. */
191 1, /* cond_taken_branch_cost. */
 192 1, /* cond_not_taken_branch_cost. */
 193 };
195 /* Processor costs (relative to an add) */
196 static stringop_algs i386_memcpy[2] = {
197 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
198 DUMMY_STRINGOP_ALGS};
199 static stringop_algs i386_memset[2] = {
200 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
201 DUMMY_STRINGOP_ALGS};
203 static const
204 struct processor_costs i386_cost = { /* 386 specific costs */
205 COSTS_N_INSNS (1), /* cost of an add instruction */
206 COSTS_N_INSNS (1), /* cost of a lea instruction */
207 COSTS_N_INSNS (3), /* variable shift costs */
208 COSTS_N_INSNS (2), /* constant shift costs */
209 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
210 COSTS_N_INSNS (6), /* HI */
211 COSTS_N_INSNS (6), /* SI */
212 COSTS_N_INSNS (6), /* DI */
213 COSTS_N_INSNS (6)}, /* other */
214 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
215 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
216 COSTS_N_INSNS (23), /* HI */
217 COSTS_N_INSNS (23), /* SI */
218 COSTS_N_INSNS (23), /* DI */
219 COSTS_N_INSNS (23)}, /* other */
220 COSTS_N_INSNS (3), /* cost of movsx */
221 COSTS_N_INSNS (2), /* cost of movzx */
222 15, /* "large" insn */
223 3, /* MOVE_RATIO */
224 4, /* cost for loading QImode using movzbl */
225 {2, 4, 2}, /* cost of loading integer registers
226 in QImode, HImode and SImode.
227 Relative to reg-reg move (2). */
228 {2, 4, 2}, /* cost of storing integer registers */
229 2, /* cost of reg,reg fld/fst */
230 {8, 8, 8}, /* cost of loading fp registers
231 in SFmode, DFmode and XFmode */
232 {8, 8, 8}, /* cost of storing fp registers
233 in SFmode, DFmode and XFmode */
234 2, /* cost of moving MMX register */
235 {4, 8}, /* cost of loading MMX registers
236 in SImode and DImode */
237 {4, 8}, /* cost of storing MMX registers
238 in SImode and DImode */
239 2, /* cost of moving SSE register */
240 {4, 8, 16}, /* cost of loading SSE registers
241 in SImode, DImode and TImode */
242 {4, 8, 16}, /* cost of storing SSE registers
243 in SImode, DImode and TImode */
244 3, /* MMX or SSE register to integer */
245 0, /* size of l1 cache */
246 0, /* size of l2 cache */
247 0, /* size of prefetch block */
248 0, /* number of parallel prefetches */
249 1, /* Branch cost */
250 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
251 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
252 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
253 COSTS_N_INSNS (22), /* cost of FABS instruction. */
254 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
255 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
256 i386_memcpy,
257 i386_memset,
258 1, /* scalar_stmt_cost. */
259 1, /* scalar load_cost. */
260 1, /* scalar_store_cost. */
261 1, /* vec_stmt_cost. */
262 1, /* vec_to_scalar_cost. */
263 1, /* scalar_to_vec_cost. */
264 1, /* vec_align_load_cost. */
265 2, /* vec_unalign_load_cost. */
266 1, /* vec_store_cost. */
267 3, /* cond_taken_branch_cost. */
 268 1, /* cond_not_taken_branch_cost. */
 269 };
271 static stringop_algs i486_memcpy[2] = {
272 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
273 DUMMY_STRINGOP_ALGS};
274 static stringop_algs i486_memset[2] = {
275 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
276 DUMMY_STRINGOP_ALGS};
278 static const
279 struct processor_costs i486_cost = { /* 486 specific costs */
280 COSTS_N_INSNS (1), /* cost of an add instruction */
281 COSTS_N_INSNS (1), /* cost of a lea instruction */
282 COSTS_N_INSNS (3), /* variable shift costs */
283 COSTS_N_INSNS (2), /* constant shift costs */
284 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
285 COSTS_N_INSNS (12), /* HI */
286 COSTS_N_INSNS (12), /* SI */
287 COSTS_N_INSNS (12), /* DI */
288 COSTS_N_INSNS (12)}, /* other */
289 1, /* cost of multiply per each bit set */
290 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
291 COSTS_N_INSNS (40), /* HI */
292 COSTS_N_INSNS (40), /* SI */
293 COSTS_N_INSNS (40), /* DI */
294 COSTS_N_INSNS (40)}, /* other */
295 COSTS_N_INSNS (3), /* cost of movsx */
296 COSTS_N_INSNS (2), /* cost of movzx */
297 15, /* "large" insn */
298 3, /* MOVE_RATIO */
299 4, /* cost for loading QImode using movzbl */
300 {2, 4, 2}, /* cost of loading integer registers
301 in QImode, HImode and SImode.
302 Relative to reg-reg move (2). */
303 {2, 4, 2}, /* cost of storing integer registers */
304 2, /* cost of reg,reg fld/fst */
305 {8, 8, 8}, /* cost of loading fp registers
306 in SFmode, DFmode and XFmode */
307 {8, 8, 8}, /* cost of storing fp registers
308 in SFmode, DFmode and XFmode */
309 2, /* cost of moving MMX register */
310 {4, 8}, /* cost of loading MMX registers
311 in SImode and DImode */
312 {4, 8}, /* cost of storing MMX registers
313 in SImode and DImode */
314 2, /* cost of moving SSE register */
315 {4, 8, 16}, /* cost of loading SSE registers
316 in SImode, DImode and TImode */
317 {4, 8, 16}, /* cost of storing SSE registers
318 in SImode, DImode and TImode */
319 3, /* MMX or SSE register to integer */
320 4, /* size of l1 cache. 486 has 8kB cache
321 shared for code and data, so 4kB is
322 not really precise. */
323 4, /* size of l2 cache */
324 0, /* size of prefetch block */
325 0, /* number of parallel prefetches */
326 1, /* Branch cost */
327 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
328 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
329 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
330 COSTS_N_INSNS (3), /* cost of FABS instruction. */
331 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
332 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
333 i486_memcpy,
334 i486_memset,
335 1, /* scalar_stmt_cost. */
336 1, /* scalar load_cost. */
337 1, /* scalar_store_cost. */
338 1, /* vec_stmt_cost. */
339 1, /* vec_to_scalar_cost. */
340 1, /* scalar_to_vec_cost. */
341 1, /* vec_align_load_cost. */
342 2, /* vec_unalign_load_cost. */
343 1, /* vec_store_cost. */
344 3, /* cond_taken_branch_cost. */
 345 1, /* cond_not_taken_branch_cost. */
 346 };
348 static stringop_algs pentium_memcpy[2] = {
349 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
350 DUMMY_STRINGOP_ALGS};
351 static stringop_algs pentium_memset[2] = {
352 {libcall, {{-1, rep_prefix_4_byte, false}}},
353 DUMMY_STRINGOP_ALGS};
355 static const
356 struct processor_costs pentium_cost = {
357 COSTS_N_INSNS (1), /* cost of an add instruction */
358 COSTS_N_INSNS (1), /* cost of a lea instruction */
359 COSTS_N_INSNS (4), /* variable shift costs */
360 COSTS_N_INSNS (1), /* constant shift costs */
361 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
362 COSTS_N_INSNS (11), /* HI */
363 COSTS_N_INSNS (11), /* SI */
364 COSTS_N_INSNS (11), /* DI */
365 COSTS_N_INSNS (11)}, /* other */
366 0, /* cost of multiply per each bit set */
367 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
368 COSTS_N_INSNS (25), /* HI */
369 COSTS_N_INSNS (25), /* SI */
370 COSTS_N_INSNS (25), /* DI */
371 COSTS_N_INSNS (25)}, /* other */
372 COSTS_N_INSNS (3), /* cost of movsx */
373 COSTS_N_INSNS (2), /* cost of movzx */
374 8, /* "large" insn */
375 6, /* MOVE_RATIO */
376 6, /* cost for loading QImode using movzbl */
377 {2, 4, 2}, /* cost of loading integer registers
378 in QImode, HImode and SImode.
379 Relative to reg-reg move (2). */
380 {2, 4, 2}, /* cost of storing integer registers */
381 2, /* cost of reg,reg fld/fst */
382 {2, 2, 6}, /* cost of loading fp registers
383 in SFmode, DFmode and XFmode */
384 {4, 4, 6}, /* cost of storing fp registers
385 in SFmode, DFmode and XFmode */
386 8, /* cost of moving MMX register */
387 {8, 8}, /* cost of loading MMX registers
388 in SImode and DImode */
389 {8, 8}, /* cost of storing MMX registers
390 in SImode and DImode */
391 2, /* cost of moving SSE register */
392 {4, 8, 16}, /* cost of loading SSE registers
393 in SImode, DImode and TImode */
394 {4, 8, 16}, /* cost of storing SSE registers
395 in SImode, DImode and TImode */
396 3, /* MMX or SSE register to integer */
397 8, /* size of l1 cache. */
398 8, /* size of l2 cache */
399 0, /* size of prefetch block */
400 0, /* number of parallel prefetches */
401 2, /* Branch cost */
402 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
403 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
404 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
405 COSTS_N_INSNS (1), /* cost of FABS instruction. */
406 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
407 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
408 pentium_memcpy,
409 pentium_memset,
410 1, /* scalar_stmt_cost. */
411 1, /* scalar load_cost. */
412 1, /* scalar_store_cost. */
413 1, /* vec_stmt_cost. */
414 1, /* vec_to_scalar_cost. */
415 1, /* scalar_to_vec_cost. */
416 1, /* vec_align_load_cost. */
417 2, /* vec_unalign_load_cost. */
418 1, /* vec_store_cost. */
419 3, /* cond_taken_branch_cost. */
 420 1, /* cond_not_taken_branch_cost. */
 421 };
423 static const
424 struct processor_costs lakemont_cost = {
425 COSTS_N_INSNS (1), /* cost of an add instruction */
426 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
427 COSTS_N_INSNS (1), /* variable shift costs */
428 COSTS_N_INSNS (1), /* constant shift costs */
429 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
430 COSTS_N_INSNS (11), /* HI */
431 COSTS_N_INSNS (11), /* SI */
432 COSTS_N_INSNS (11), /* DI */
433 COSTS_N_INSNS (11)}, /* other */
434 0, /* cost of multiply per each bit set */
435 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
436 COSTS_N_INSNS (25), /* HI */
437 COSTS_N_INSNS (25), /* SI */
438 COSTS_N_INSNS (25), /* DI */
439 COSTS_N_INSNS (25)}, /* other */
440 COSTS_N_INSNS (3), /* cost of movsx */
441 COSTS_N_INSNS (2), /* cost of movzx */
442 8, /* "large" insn */
443 17, /* MOVE_RATIO */
444 6, /* cost for loading QImode using movzbl */
445 {2, 4, 2}, /* cost of loading integer registers
446 in QImode, HImode and SImode.
447 Relative to reg-reg move (2). */
448 {2, 4, 2}, /* cost of storing integer registers */
449 2, /* cost of reg,reg fld/fst */
450 {2, 2, 6}, /* cost of loading fp registers
451 in SFmode, DFmode and XFmode */
452 {4, 4, 6}, /* cost of storing fp registers
453 in SFmode, DFmode and XFmode */
454 8, /* cost of moving MMX register */
455 {8, 8}, /* cost of loading MMX registers
456 in SImode and DImode */
457 {8, 8}, /* cost of storing MMX registers
458 in SImode and DImode */
459 2, /* cost of moving SSE register */
460 {4, 8, 16}, /* cost of loading SSE registers
461 in SImode, DImode and TImode */
462 {4, 8, 16}, /* cost of storing SSE registers
463 in SImode, DImode and TImode */
464 3, /* MMX or SSE register to integer */
465 8, /* size of l1 cache. */
466 8, /* size of l2 cache */
467 0, /* size of prefetch block */
468 0, /* number of parallel prefetches */
469 2, /* Branch cost */
470 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
471 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
472 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
473 COSTS_N_INSNS (1), /* cost of FABS instruction. */
474 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
475 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
476 pentium_memcpy,
477 pentium_memset,
478 1, /* scalar_stmt_cost. */
479 1, /* scalar load_cost. */
480 1, /* scalar_store_cost. */
481 1, /* vec_stmt_cost. */
482 1, /* vec_to_scalar_cost. */
483 1, /* scalar_to_vec_cost. */
484 1, /* vec_align_load_cost. */
485 2, /* vec_unalign_load_cost. */
486 1, /* vec_store_cost. */
487 3, /* cond_taken_branch_cost. */
 488 1, /* cond_not_taken_branch_cost. */
 489 };
491 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
492 (we ensure the alignment). For small blocks inline loop is still a
493 noticeable win, for bigger blocks either rep movsl or rep movsb is
494 way to go. Rep movsb has apparently more expensive startup time in CPU,
495 but after 4K the difference is down in the noise. */
496 static stringop_algs pentiumpro_memcpy[2] = {
497 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
498 {8192, rep_prefix_4_byte, false},
499 {-1, rep_prefix_1_byte, false}}},
500 DUMMY_STRINGOP_ALGS};
501 static stringop_algs pentiumpro_memset[2] = {
502 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
503 {8192, rep_prefix_4_byte, false},
504 {-1, libcall, false}}},
505 DUMMY_STRINGOP_ALGS};
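/* Editorial sketch, not GCC code: how a threshold table shaped like
   pentiumpro_memcpy above can be consulted.  Each two-element array in
   this file pairs a 32-bit entry with a 64-bit one (the 64-bit slot is
   DUMMY_STRINGOP_ALGS for CPUs that never run 64-bit code); within an
   entry, thresholds are scanned in order and the first whose size limit
   covers the requested byte count supplies the algorithm, with -1
   meaning "no upper bound".  The enum, struct and helper below are
   hypothetical mirrors of stringop_algs; the real selection logic
   (cf. decide_alg further down in this file) also weighs alignment,
   tuning flags and -mstringop-strategy.  */
#include <stdio.h>

enum ex_stringop_alg { ex_loop, ex_unrolled_loop, ex_rep_prefix_4_byte,
		       ex_rep_prefix_1_byte, ex_libcall };

struct ex_stringop_entry
{
  int max;			/* largest block size handled, -1 = any */
  enum ex_stringop_alg alg;
};

static enum ex_stringop_alg
ex_choose_alg (const struct ex_stringop_entry *table, int nbytes)
{
  for (;; table++)
    if (table->max == -1 || nbytes <= table->max)
      return table->alg;
}

int
main (void)
{
  /* Mirrors the 32-bit pentiumpro_memcpy thresholds above.  */
  static const struct ex_stringop_entry pentiumpro_like[] = {
    { 128, ex_loop }, { 1024, ex_unrolled_loop },
    { 8192, ex_rep_prefix_4_byte }, { -1, ex_rep_prefix_1_byte }
  };

  printf ("%d\n", ex_choose_alg (pentiumpro_like, 64));	    /* 0: ex_loop */
  printf ("%d\n", ex_choose_alg (pentiumpro_like, 4096));   /* 2: ex_rep_prefix_4_byte */
  printf ("%d\n", ex_choose_alg (pentiumpro_like, 100000)); /* 3: ex_rep_prefix_1_byte */
  return 0;
}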
506 static const
507 struct processor_costs pentiumpro_cost = {
508 COSTS_N_INSNS (1), /* cost of an add instruction */
509 COSTS_N_INSNS (1), /* cost of a lea instruction */
510 COSTS_N_INSNS (1), /* variable shift costs */
511 COSTS_N_INSNS (1), /* constant shift costs */
512 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
513 COSTS_N_INSNS (4), /* HI */
514 COSTS_N_INSNS (4), /* SI */
515 COSTS_N_INSNS (4), /* DI */
516 COSTS_N_INSNS (4)}, /* other */
517 0, /* cost of multiply per each bit set */
518 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
519 COSTS_N_INSNS (17), /* HI */
520 COSTS_N_INSNS (17), /* SI */
521 COSTS_N_INSNS (17), /* DI */
522 COSTS_N_INSNS (17)}, /* other */
523 COSTS_N_INSNS (1), /* cost of movsx */
524 COSTS_N_INSNS (1), /* cost of movzx */
525 8, /* "large" insn */
526 6, /* MOVE_RATIO */
527 2, /* cost for loading QImode using movzbl */
528 {4, 4, 4}, /* cost of loading integer registers
529 in QImode, HImode and SImode.
530 Relative to reg-reg move (2). */
531 {2, 2, 2}, /* cost of storing integer registers */
532 2, /* cost of reg,reg fld/fst */
533 {2, 2, 6}, /* cost of loading fp registers
534 in SFmode, DFmode and XFmode */
535 {4, 4, 6}, /* cost of storing fp registers
536 in SFmode, DFmode and XFmode */
537 2, /* cost of moving MMX register */
538 {2, 2}, /* cost of loading MMX registers
539 in SImode and DImode */
540 {2, 2}, /* cost of storing MMX registers
541 in SImode and DImode */
542 2, /* cost of moving SSE register */
543 {2, 2, 8}, /* cost of loading SSE registers
544 in SImode, DImode and TImode */
545 {2, 2, 8}, /* cost of storing SSE registers
546 in SImode, DImode and TImode */
547 3, /* MMX or SSE register to integer */
548 8, /* size of l1 cache. */
549 256, /* size of l2 cache */
550 32, /* size of prefetch block */
551 6, /* number of parallel prefetches */
552 2, /* Branch cost */
553 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
554 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
555 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
556 COSTS_N_INSNS (2), /* cost of FABS instruction. */
557 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
558 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
559 pentiumpro_memcpy,
560 pentiumpro_memset,
561 1, /* scalar_stmt_cost. */
562 1, /* scalar load_cost. */
563 1, /* scalar_store_cost. */
564 1, /* vec_stmt_cost. */
565 1, /* vec_to_scalar_cost. */
566 1, /* scalar_to_vec_cost. */
567 1, /* vec_align_load_cost. */
568 2, /* vec_unalign_load_cost. */
569 1, /* vec_store_cost. */
570 3, /* cond_taken_branch_cost. */
 571 1, /* cond_not_taken_branch_cost. */
 572 };
574 static stringop_algs geode_memcpy[2] = {
575 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
576 DUMMY_STRINGOP_ALGS};
577 static stringop_algs geode_memset[2] = {
578 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
579 DUMMY_STRINGOP_ALGS};
580 static const
581 struct processor_costs geode_cost = {
582 COSTS_N_INSNS (1), /* cost of an add instruction */
583 COSTS_N_INSNS (1), /* cost of a lea instruction */
584 COSTS_N_INSNS (2), /* variable shift costs */
585 COSTS_N_INSNS (1), /* constant shift costs */
586 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
587 COSTS_N_INSNS (4), /* HI */
588 COSTS_N_INSNS (7), /* SI */
589 COSTS_N_INSNS (7), /* DI */
590 COSTS_N_INSNS (7)}, /* other */
591 0, /* cost of multiply per each bit set */
592 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
593 COSTS_N_INSNS (23), /* HI */
594 COSTS_N_INSNS (39), /* SI */
595 COSTS_N_INSNS (39), /* DI */
596 COSTS_N_INSNS (39)}, /* other */
597 COSTS_N_INSNS (1), /* cost of movsx */
598 COSTS_N_INSNS (1), /* cost of movzx */
599 8, /* "large" insn */
600 4, /* MOVE_RATIO */
601 1, /* cost for loading QImode using movzbl */
602 {1, 1, 1}, /* cost of loading integer registers
603 in QImode, HImode and SImode.
604 Relative to reg-reg move (2). */
605 {1, 1, 1}, /* cost of storing integer registers */
606 1, /* cost of reg,reg fld/fst */
607 {1, 1, 1}, /* cost of loading fp registers
608 in SFmode, DFmode and XFmode */
609 {4, 6, 6}, /* cost of storing fp registers
610 in SFmode, DFmode and XFmode */
612 2, /* cost of moving MMX register */
613 {2, 2}, /* cost of loading MMX registers
614 in SImode and DImode */
615 {2, 2}, /* cost of storing MMX registers
616 in SImode and DImode */
617 2, /* cost of moving SSE register */
618 {2, 2, 8}, /* cost of loading SSE registers
619 in SImode, DImode and TImode */
620 {2, 2, 8}, /* cost of storing SSE registers
621 in SImode, DImode and TImode */
622 3, /* MMX or SSE register to integer */
623 64, /* size of l1 cache. */
624 128, /* size of l2 cache. */
625 32, /* size of prefetch block */
626 1, /* number of parallel prefetches */
627 1, /* Branch cost */
628 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
629 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
630 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
631 COSTS_N_INSNS (1), /* cost of FABS instruction. */
632 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
633 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
634 geode_memcpy,
635 geode_memset,
636 1, /* scalar_stmt_cost. */
637 1, /* scalar load_cost. */
638 1, /* scalar_store_cost. */
639 1, /* vec_stmt_cost. */
640 1, /* vec_to_scalar_cost. */
641 1, /* scalar_to_vec_cost. */
642 1, /* vec_align_load_cost. */
643 2, /* vec_unalign_load_cost. */
644 1, /* vec_store_cost. */
645 3, /* cond_taken_branch_cost. */
 646 1, /* cond_not_taken_branch_cost. */
 647 };
649 static stringop_algs k6_memcpy[2] = {
650 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
651 DUMMY_STRINGOP_ALGS};
652 static stringop_algs k6_memset[2] = {
653 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
654 DUMMY_STRINGOP_ALGS};
655 static const
656 struct processor_costs k6_cost = {
657 COSTS_N_INSNS (1), /* cost of an add instruction */
658 COSTS_N_INSNS (2), /* cost of a lea instruction */
659 COSTS_N_INSNS (1), /* variable shift costs */
660 COSTS_N_INSNS (1), /* constant shift costs */
661 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
662 COSTS_N_INSNS (3), /* HI */
663 COSTS_N_INSNS (3), /* SI */
664 COSTS_N_INSNS (3), /* DI */
665 COSTS_N_INSNS (3)}, /* other */
666 0, /* cost of multiply per each bit set */
667 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
668 COSTS_N_INSNS (18), /* HI */
669 COSTS_N_INSNS (18), /* SI */
670 COSTS_N_INSNS (18), /* DI */
671 COSTS_N_INSNS (18)}, /* other */
672 COSTS_N_INSNS (2), /* cost of movsx */
673 COSTS_N_INSNS (2), /* cost of movzx */
674 8, /* "large" insn */
675 4, /* MOVE_RATIO */
676 3, /* cost for loading QImode using movzbl */
677 {4, 5, 4}, /* cost of loading integer registers
678 in QImode, HImode and SImode.
679 Relative to reg-reg move (2). */
680 {2, 3, 2}, /* cost of storing integer registers */
681 4, /* cost of reg,reg fld/fst */
682 {6, 6, 6}, /* cost of loading fp registers
683 in SFmode, DFmode and XFmode */
684 {4, 4, 4}, /* cost of storing fp registers
685 in SFmode, DFmode and XFmode */
686 2, /* cost of moving MMX register */
687 {2, 2}, /* cost of loading MMX registers
688 in SImode and DImode */
689 {2, 2}, /* cost of storing MMX registers
690 in SImode and DImode */
691 2, /* cost of moving SSE register */
692 {2, 2, 8}, /* cost of loading SSE registers
693 in SImode, DImode and TImode */
694 {2, 2, 8}, /* cost of storing SSE registers
695 in SImode, DImode and TImode */
696 6, /* MMX or SSE register to integer */
697 32, /* size of l1 cache. */
698 32, /* size of l2 cache. Some models
699 have integrated l2 cache, but
700 optimizing for k6 is not important
701 enough to worry about that. */
702 32, /* size of prefetch block */
703 1, /* number of parallel prefetches */
704 1, /* Branch cost */
705 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
706 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
707 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
708 COSTS_N_INSNS (2), /* cost of FABS instruction. */
709 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
710 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
711 k6_memcpy,
712 k6_memset,
713 1, /* scalar_stmt_cost. */
714 1, /* scalar load_cost. */
715 1, /* scalar_store_cost. */
716 1, /* vec_stmt_cost. */
717 1, /* vec_to_scalar_cost. */
718 1, /* scalar_to_vec_cost. */
719 1, /* vec_align_load_cost. */
720 2, /* vec_unalign_load_cost. */
721 1, /* vec_store_cost. */
722 3, /* cond_taken_branch_cost. */
 723 1, /* cond_not_taken_branch_cost. */
 724 };
726 /* For some reason, Athlon deals better with REP prefix (relative to loops)
727 compared to K8. Alignment becomes important after 8 bytes for memcpy and
728 128 bytes for memset. */
729 static stringop_algs athlon_memcpy[2] = {
730 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
731 DUMMY_STRINGOP_ALGS};
732 static stringop_algs athlon_memset[2] = {
733 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
734 DUMMY_STRINGOP_ALGS};
735 static const
736 struct processor_costs athlon_cost = {
737 COSTS_N_INSNS (1), /* cost of an add instruction */
738 COSTS_N_INSNS (2), /* cost of a lea instruction */
739 COSTS_N_INSNS (1), /* variable shift costs */
740 COSTS_N_INSNS (1), /* constant shift costs */
741 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
742 COSTS_N_INSNS (5), /* HI */
743 COSTS_N_INSNS (5), /* SI */
744 COSTS_N_INSNS (5), /* DI */
745 COSTS_N_INSNS (5)}, /* other */
746 0, /* cost of multiply per each bit set */
747 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
748 COSTS_N_INSNS (26), /* HI */
749 COSTS_N_INSNS (42), /* SI */
750 COSTS_N_INSNS (74), /* DI */
751 COSTS_N_INSNS (74)}, /* other */
752 COSTS_N_INSNS (1), /* cost of movsx */
753 COSTS_N_INSNS (1), /* cost of movzx */
754 8, /* "large" insn */
755 9, /* MOVE_RATIO */
756 4, /* cost for loading QImode using movzbl */
757 {3, 4, 3}, /* cost of loading integer registers
758 in QImode, HImode and SImode.
759 Relative to reg-reg move (2). */
760 {3, 4, 3}, /* cost of storing integer registers */
761 4, /* cost of reg,reg fld/fst */
762 {4, 4, 12}, /* cost of loading fp registers
763 in SFmode, DFmode and XFmode */
764 {6, 6, 8}, /* cost of storing fp registers
765 in SFmode, DFmode and XFmode */
766 2, /* cost of moving MMX register */
767 {4, 4}, /* cost of loading MMX registers
768 in SImode and DImode */
769 {4, 4}, /* cost of storing MMX registers
770 in SImode and DImode */
771 2, /* cost of moving SSE register */
772 {4, 4, 6}, /* cost of loading SSE registers
773 in SImode, DImode and TImode */
774 {4, 4, 5}, /* cost of storing SSE registers
775 in SImode, DImode and TImode */
776 5, /* MMX or SSE register to integer */
777 64, /* size of l1 cache. */
778 256, /* size of l2 cache. */
779 64, /* size of prefetch block */
780 6, /* number of parallel prefetches */
781 5, /* Branch cost */
782 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
783 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
784 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
785 COSTS_N_INSNS (2), /* cost of FABS instruction. */
786 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
787 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
788 athlon_memcpy,
789 athlon_memset,
790 1, /* scalar_stmt_cost. */
791 1, /* scalar load_cost. */
792 1, /* scalar_store_cost. */
793 1, /* vec_stmt_cost. */
794 1, /* vec_to_scalar_cost. */
795 1, /* scalar_to_vec_cost. */
796 1, /* vec_align_load_cost. */
797 2, /* vec_unalign_load_cost. */
798 1, /* vec_store_cost. */
799 3, /* cond_taken_branch_cost. */
 800 1, /* cond_not_taken_branch_cost. */
 801 };
803 /* K8 has optimized REP instruction for medium sized blocks, but for very
804 small blocks it is better to use loop. For large blocks, libcall can
805 do nontemporary accesses and beat inline considerably. */
806 static stringop_algs k8_memcpy[2] = {
807 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
808 {-1, rep_prefix_4_byte, false}}},
809 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
810 {-1, libcall, false}}}};
811 static stringop_algs k8_memset[2] = {
812 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
813 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
814 {libcall, {{48, unrolled_loop, false},
815 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
816 static const
817 struct processor_costs k8_cost = {
818 COSTS_N_INSNS (1), /* cost of an add instruction */
819 COSTS_N_INSNS (2), /* cost of a lea instruction */
820 COSTS_N_INSNS (1), /* variable shift costs */
821 COSTS_N_INSNS (1), /* constant shift costs */
822 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
823 COSTS_N_INSNS (4), /* HI */
824 COSTS_N_INSNS (3), /* SI */
825 COSTS_N_INSNS (4), /* DI */
826 COSTS_N_INSNS (5)}, /* other */
827 0, /* cost of multiply per each bit set */
828 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
829 COSTS_N_INSNS (26), /* HI */
830 COSTS_N_INSNS (42), /* SI */
831 COSTS_N_INSNS (74), /* DI */
832 COSTS_N_INSNS (74)}, /* other */
833 COSTS_N_INSNS (1), /* cost of movsx */
834 COSTS_N_INSNS (1), /* cost of movzx */
835 8, /* "large" insn */
836 9, /* MOVE_RATIO */
837 4, /* cost for loading QImode using movzbl */
838 {3, 4, 3}, /* cost of loading integer registers
839 in QImode, HImode and SImode.
840 Relative to reg-reg move (2). */
841 {3, 4, 3}, /* cost of storing integer registers */
842 4, /* cost of reg,reg fld/fst */
843 {4, 4, 12}, /* cost of loading fp registers
844 in SFmode, DFmode and XFmode */
845 {6, 6, 8}, /* cost of storing fp registers
846 in SFmode, DFmode and XFmode */
847 2, /* cost of moving MMX register */
848 {3, 3}, /* cost of loading MMX registers
849 in SImode and DImode */
850 {4, 4}, /* cost of storing MMX registers
851 in SImode and DImode */
852 2, /* cost of moving SSE register */
853 {4, 3, 6}, /* cost of loading SSE registers
854 in SImode, DImode and TImode */
855 {4, 4, 5}, /* cost of storing SSE registers
856 in SImode, DImode and TImode */
857 5, /* MMX or SSE register to integer */
858 64, /* size of l1 cache. */
859 512, /* size of l2 cache. */
860 64, /* size of prefetch block */
861 /* New AMD processors never drop prefetches; if they cannot be performed
862 immediately, they are queued. We set number of simultaneous prefetches
863 to a large constant to reflect this (it probably is not a good idea not
864 to limit number of prefetches at all, as their execution also takes some
865 time). */
866 100, /* number of parallel prefetches */
867 3, /* Branch cost */
868 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
869 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
870 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
871 COSTS_N_INSNS (2), /* cost of FABS instruction. */
872 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
873 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
875 k8_memcpy,
876 k8_memset,
877 4, /* scalar_stmt_cost. */
878 2, /* scalar load_cost. */
879 2, /* scalar_store_cost. */
880 5, /* vec_stmt_cost. */
881 0, /* vec_to_scalar_cost. */
882 2, /* scalar_to_vec_cost. */
883 2, /* vec_align_load_cost. */
884 3, /* vec_unalign_load_cost. */
885 3, /* vec_store_cost. */
886 3, /* cond_taken_branch_cost. */
 887 2, /* cond_not_taken_branch_cost. */
 888 };
890 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
891 very small blocks it is better to use loop. For large blocks, libcall can
892 do nontemporary accesses and beat inline considerably. */
893 static stringop_algs amdfam10_memcpy[2] = {
894 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
895 {-1, rep_prefix_4_byte, false}}},
896 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
897 {-1, libcall, false}}}};
898 static stringop_algs amdfam10_memset[2] = {
899 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
900 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
901 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
902 {-1, libcall, false}}}};
903 struct processor_costs amdfam10_cost = {
904 COSTS_N_INSNS (1), /* cost of an add instruction */
905 COSTS_N_INSNS (2), /* cost of a lea instruction */
906 COSTS_N_INSNS (1), /* variable shift costs */
907 COSTS_N_INSNS (1), /* constant shift costs */
908 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
909 COSTS_N_INSNS (4), /* HI */
910 COSTS_N_INSNS (3), /* SI */
911 COSTS_N_INSNS (4), /* DI */
912 COSTS_N_INSNS (5)}, /* other */
913 0, /* cost of multiply per each bit set */
914 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
915 COSTS_N_INSNS (35), /* HI */
916 COSTS_N_INSNS (51), /* SI */
917 COSTS_N_INSNS (83), /* DI */
918 COSTS_N_INSNS (83)}, /* other */
919 COSTS_N_INSNS (1), /* cost of movsx */
920 COSTS_N_INSNS (1), /* cost of movzx */
921 8, /* "large" insn */
922 9, /* MOVE_RATIO */
923 4, /* cost for loading QImode using movzbl */
924 {3, 4, 3}, /* cost of loading integer registers
925 in QImode, HImode and SImode.
926 Relative to reg-reg move (2). */
927 {3, 4, 3}, /* cost of storing integer registers */
928 4, /* cost of reg,reg fld/fst */
929 {4, 4, 12}, /* cost of loading fp registers
930 in SFmode, DFmode and XFmode */
931 {6, 6, 8}, /* cost of storing fp registers
932 in SFmode, DFmode and XFmode */
933 2, /* cost of moving MMX register */
934 {3, 3}, /* cost of loading MMX registers
935 in SImode and DImode */
936 {4, 4}, /* cost of storing MMX registers
937 in SImode and DImode */
938 2, /* cost of moving SSE register */
939 {4, 4, 3}, /* cost of loading SSE registers
940 in SImode, DImode and TImode */
941 {4, 4, 5}, /* cost of storing SSE registers
942 in SImode, DImode and TImode */
943 3, /* MMX or SSE register to integer */
944 /* On K8:
945 MOVD reg64, xmmreg Double FSTORE 4
946 MOVD reg32, xmmreg Double FSTORE 4
947 On AMDFAM10:
948 MOVD reg64, xmmreg Double FADD 3
949 1/1 1/1
950 MOVD reg32, xmmreg Double FADD 3
951 1/1 1/1 */
952 64, /* size of l1 cache. */
953 512, /* size of l2 cache. */
954 64, /* size of prefetch block */
955 /* New AMD processors never drop prefetches; if they cannot be performed
956 immediately, they are queued. We set number of simultaneous prefetches
957 to a large constant to reflect this (it probably is not a good idea not
958 to limit number of prefetches at all, as their execution also takes some
959 time). */
960 100, /* number of parallel prefetches */
961 2, /* Branch cost */
962 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
963 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
964 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
965 COSTS_N_INSNS (2), /* cost of FABS instruction. */
966 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
967 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
969 amdfam10_memcpy,
970 amdfam10_memset,
971 4, /* scalar_stmt_cost. */
972 2, /* scalar load_cost. */
973 2, /* scalar_store_cost. */
974 6, /* vec_stmt_cost. */
975 0, /* vec_to_scalar_cost. */
976 2, /* scalar_to_vec_cost. */
977 2, /* vec_align_load_cost. */
978 2, /* vec_unalign_load_cost. */
979 2, /* vec_store_cost. */
980 2, /* cond_taken_branch_cost. */
 981 1, /* cond_not_taken_branch_cost. */
 982 };
984 /* BDVER1 has optimized REP instruction for medium sized blocks, but for
985 very small blocks it is better to use loop. For large blocks, libcall
986 can do nontemporary accesses and beat inline considerably. */
987 static stringop_algs bdver1_memcpy[2] = {
988 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
989 {-1, rep_prefix_4_byte, false}}},
990 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
991 {-1, libcall, false}}}};
992 static stringop_algs bdver1_memset[2] = {
993 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
994 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
995 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
996 {-1, libcall, false}}}};
998 const struct processor_costs bdver1_cost = {
999 COSTS_N_INSNS (1), /* cost of an add instruction */
1000 COSTS_N_INSNS (1), /* cost of a lea instruction */
1001 COSTS_N_INSNS (1), /* variable shift costs */
1002 COSTS_N_INSNS (1), /* constant shift costs */
1003 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1004 COSTS_N_INSNS (4), /* HI */
1005 COSTS_N_INSNS (4), /* SI */
1006 COSTS_N_INSNS (6), /* DI */
1007 COSTS_N_INSNS (6)}, /* other */
1008 0, /* cost of multiply per each bit set */
1009 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1010 COSTS_N_INSNS (35), /* HI */
1011 COSTS_N_INSNS (51), /* SI */
1012 COSTS_N_INSNS (83), /* DI */
1013 COSTS_N_INSNS (83)}, /* other */
1014 COSTS_N_INSNS (1), /* cost of movsx */
1015 COSTS_N_INSNS (1), /* cost of movzx */
1016 8, /* "large" insn */
1017 9, /* MOVE_RATIO */
1018 4, /* cost for loading QImode using movzbl */
1019 {5, 5, 4}, /* cost of loading integer registers
1020 in QImode, HImode and SImode.
1021 Relative to reg-reg move (2). */
1022 {4, 4, 4}, /* cost of storing integer registers */
1023 2, /* cost of reg,reg fld/fst */
1024 {5, 5, 12}, /* cost of loading fp registers
1025 in SFmode, DFmode and XFmode */
1026 {4, 4, 8}, /* cost of storing fp registers
1027 in SFmode, DFmode and XFmode */
1028 2, /* cost of moving MMX register */
1029 {4, 4}, /* cost of loading MMX registers
1030 in SImode and DImode */
1031 {4, 4}, /* cost of storing MMX registers
1032 in SImode and DImode */
1033 2, /* cost of moving SSE register */
1034 {4, 4, 4}, /* cost of loading SSE registers
1035 in SImode, DImode and TImode */
1036 {4, 4, 4}, /* cost of storing SSE registers
1037 in SImode, DImode and TImode */
1038 2, /* MMX or SSE register to integer */
1039 /* On K8:
1040 MOVD reg64, xmmreg Double FSTORE 4
1041 MOVD reg32, xmmreg Double FSTORE 4
1042 On AMDFAM10:
1043 MOVD reg64, xmmreg Double FADD 3
1044 1/1 1/1
1045 MOVD reg32, xmmreg Double FADD 3
1046 1/1 1/1 */
1047 16, /* size of l1 cache. */
1048 2048, /* size of l2 cache. */
1049 64, /* size of prefetch block */
1050 /* New AMD processors never drop prefetches; if they cannot be performed
1051 immediately, they are queued. We set number of simultaneous prefetches
1052 to a large constant to reflect this (it probably is not a good idea not
1053 to limit number of prefetches at all, as their execution also takes some
1054 time). */
1055 100, /* number of parallel prefetches */
1056 2, /* Branch cost */
1057 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1058 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1059 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1060 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1061 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1062 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1064 bdver1_memcpy,
1065 bdver1_memset,
1066 6, /* scalar_stmt_cost. */
1067 4, /* scalar load_cost. */
1068 4, /* scalar_store_cost. */
1069 6, /* vec_stmt_cost. */
1070 0, /* vec_to_scalar_cost. */
1071 2, /* scalar_to_vec_cost. */
1072 4, /* vec_align_load_cost. */
1073 4, /* vec_unalign_load_cost. */
1074 4, /* vec_store_cost. */
1075 4, /* cond_taken_branch_cost. */
 1076 2, /* cond_not_taken_branch_cost. */
 1077 };
1079 /* BDVER2 has optimized REP instruction for medium sized blocks, but for
1080 very small blocks it is better to use loop. For large blocks, libcall
1081 can do nontemporary accesses and beat inline considerably. */
1083 static stringop_algs bdver2_memcpy[2] = {
1084 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1085 {-1, rep_prefix_4_byte, false}}},
1086 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1087 {-1, libcall, false}}}};
1088 static stringop_algs bdver2_memset[2] = {
1089 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1090 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1091 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1092 {-1, libcall, false}}}};
1094 const struct processor_costs bdver2_cost = {
1095 COSTS_N_INSNS (1), /* cost of an add instruction */
1096 COSTS_N_INSNS (1), /* cost of a lea instruction */
1097 COSTS_N_INSNS (1), /* variable shift costs */
1098 COSTS_N_INSNS (1), /* constant shift costs */
1099 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1100 COSTS_N_INSNS (4), /* HI */
1101 COSTS_N_INSNS (4), /* SI */
1102 COSTS_N_INSNS (6), /* DI */
1103 COSTS_N_INSNS (6)}, /* other */
1104 0, /* cost of multiply per each bit set */
1105 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1106 COSTS_N_INSNS (35), /* HI */
1107 COSTS_N_INSNS (51), /* SI */
1108 COSTS_N_INSNS (83), /* DI */
1109 COSTS_N_INSNS (83)}, /* other */
1110 COSTS_N_INSNS (1), /* cost of movsx */
1111 COSTS_N_INSNS (1), /* cost of movzx */
1112 8, /* "large" insn */
1113 9, /* MOVE_RATIO */
1114 4, /* cost for loading QImode using movzbl */
1115 {5, 5, 4}, /* cost of loading integer registers
1116 in QImode, HImode and SImode.
1117 Relative to reg-reg move (2). */
1118 {4, 4, 4}, /* cost of storing integer registers */
1119 2, /* cost of reg,reg fld/fst */
1120 {5, 5, 12}, /* cost of loading fp registers
1121 in SFmode, DFmode and XFmode */
1122 {4, 4, 8}, /* cost of storing fp registers
1123 in SFmode, DFmode and XFmode */
1124 2, /* cost of moving MMX register */
1125 {4, 4}, /* cost of loading MMX registers
1126 in SImode and DImode */
1127 {4, 4}, /* cost of storing MMX registers
1128 in SImode and DImode */
1129 2, /* cost of moving SSE register */
1130 {4, 4, 4}, /* cost of loading SSE registers
1131 in SImode, DImode and TImode */
1132 {4, 4, 4}, /* cost of storing SSE registers
1133 in SImode, DImode and TImode */
1134 2, /* MMX or SSE register to integer */
1135 /* On K8:
1136 MOVD reg64, xmmreg Double FSTORE 4
1137 MOVD reg32, xmmreg Double FSTORE 4
1138 On AMDFAM10:
1139 MOVD reg64, xmmreg Double FADD 3
1140 1/1 1/1
1141 MOVD reg32, xmmreg Double FADD 3
1142 1/1 1/1 */
1143 16, /* size of l1 cache. */
1144 2048, /* size of l2 cache. */
1145 64, /* size of prefetch block */
1146 /* New AMD processors never drop prefetches; if they cannot be performed
1147 immediately, they are queued. We set number of simultaneous prefetches
1148 to a large constant to reflect this (it probably is not a good idea not
1149 to limit number of prefetches at all, as their execution also takes some
1150 time). */
1151 100, /* number of parallel prefetches */
1152 2, /* Branch cost */
1153 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1154 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1155 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1156 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1157 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1158 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1160 bdver2_memcpy,
1161 bdver2_memset,
1162 6, /* scalar_stmt_cost. */
1163 4, /* scalar load_cost. */
1164 4, /* scalar_store_cost. */
1165 6, /* vec_stmt_cost. */
1166 0, /* vec_to_scalar_cost. */
1167 2, /* scalar_to_vec_cost. */
1168 4, /* vec_align_load_cost. */
1169 4, /* vec_unalign_load_cost. */
1170 4, /* vec_store_cost. */
1171 4, /* cond_taken_branch_cost. */
 1172 2, /* cond_not_taken_branch_cost. */
 1173 };
1176 /* BDVER3 has optimized REP instruction for medium sized blocks, but for
1177 very small blocks it is better to use loop. For large blocks, libcall
1178 can do nontemporary accesses and beat inline considerably. */
1179 static stringop_algs bdver3_memcpy[2] = {
1180 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1181 {-1, rep_prefix_4_byte, false}}},
1182 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1183 {-1, libcall, false}}}};
1184 static stringop_algs bdver3_memset[2] = {
1185 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1186 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1187 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1188 {-1, libcall, false}}}};
1189 struct processor_costs bdver3_cost = {
1190 COSTS_N_INSNS (1), /* cost of an add instruction */
1191 COSTS_N_INSNS (1), /* cost of a lea instruction */
1192 COSTS_N_INSNS (1), /* variable shift costs */
1193 COSTS_N_INSNS (1), /* constant shift costs */
1194 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1195 COSTS_N_INSNS (4), /* HI */
1196 COSTS_N_INSNS (4), /* SI */
1197 COSTS_N_INSNS (6), /* DI */
1198 COSTS_N_INSNS (6)}, /* other */
1199 0, /* cost of multiply per each bit set */
1200 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1201 COSTS_N_INSNS (35), /* HI */
1202 COSTS_N_INSNS (51), /* SI */
1203 COSTS_N_INSNS (83), /* DI */
1204 COSTS_N_INSNS (83)}, /* other */
1205 COSTS_N_INSNS (1), /* cost of movsx */
1206 COSTS_N_INSNS (1), /* cost of movzx */
1207 8, /* "large" insn */
1208 9, /* MOVE_RATIO */
1209 4, /* cost for loading QImode using movzbl */
1210 {5, 5, 4}, /* cost of loading integer registers
1211 in QImode, HImode and SImode.
1212 Relative to reg-reg move (2). */
1213 {4, 4, 4}, /* cost of storing integer registers */
1214 2, /* cost of reg,reg fld/fst */
1215 {5, 5, 12}, /* cost of loading fp registers
1216 in SFmode, DFmode and XFmode */
1217 {4, 4, 8}, /* cost of storing fp registers
1218 in SFmode, DFmode and XFmode */
1219 2, /* cost of moving MMX register */
1220 {4, 4}, /* cost of loading MMX registers
1221 in SImode and DImode */
1222 {4, 4}, /* cost of storing MMX registers
1223 in SImode and DImode */
1224 2, /* cost of moving SSE register */
1225 {4, 4, 4}, /* cost of loading SSE registers
1226 in SImode, DImode and TImode */
1227 {4, 4, 4}, /* cost of storing SSE registers
1228 in SImode, DImode and TImode */
1229 2, /* MMX or SSE register to integer */
1230 16, /* size of l1 cache. */
1231 2048, /* size of l2 cache. */
1232 64, /* size of prefetch block */
1233 /* New AMD processors never drop prefetches; if they cannot be performed
1234 immediately, they are queued. We set number of simultaneous prefetches
1235 to a large constant to reflect this (it probably is not a good idea not
1236 to limit number of prefetches at all, as their execution also takes some
1237 time). */
1238 100, /* number of parallel prefetches */
1239 2, /* Branch cost */
1240 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1241 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1242 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1243 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1244 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1245 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1247 bdver3_memcpy,
1248 bdver3_memset,
1249 6, /* scalar_stmt_cost. */
1250 4, /* scalar load_cost. */
1251 4, /* scalar_store_cost. */
1252 6, /* vec_stmt_cost. */
1253 0, /* vec_to_scalar_cost. */
1254 2, /* scalar_to_vec_cost. */
1255 4, /* vec_align_load_cost. */
1256 4, /* vec_unalign_load_cost. */
1257 4, /* vec_store_cost. */
1258 4, /* cond_taken_branch_cost. */
 1259 2, /* cond_not_taken_branch_cost. */
 1260 };
1262 /* BDVER4 has optimized REP instruction for medium sized blocks, but for
1263 very small blocks it is better to use loop. For large blocks, libcall
1264 can do nontemporary accesses and beat inline considerably. */
1265 static stringop_algs bdver4_memcpy[2] = {
1266 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1267 {-1, rep_prefix_4_byte, false}}},
1268 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1269 {-1, libcall, false}}}};
1270 static stringop_algs bdver4_memset[2] = {
1271 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1272 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1273 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1274 {-1, libcall, false}}}};
1275 struct processor_costs bdver4_cost = {
1276 COSTS_N_INSNS (1), /* cost of an add instruction */
1277 COSTS_N_INSNS (1), /* cost of a lea instruction */
1278 COSTS_N_INSNS (1), /* variable shift costs */
1279 COSTS_N_INSNS (1), /* constant shift costs */
1280 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1281 COSTS_N_INSNS (4), /* HI */
1282 COSTS_N_INSNS (4), /* SI */
1283 COSTS_N_INSNS (6), /* DI */
1284 COSTS_N_INSNS (6)}, /* other */
1285 0, /* cost of multiply per each bit set */
1286 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1287 COSTS_N_INSNS (35), /* HI */
1288 COSTS_N_INSNS (51), /* SI */
1289 COSTS_N_INSNS (83), /* DI */
1290 COSTS_N_INSNS (83)}, /* other */
1291 COSTS_N_INSNS (1), /* cost of movsx */
1292 COSTS_N_INSNS (1), /* cost of movzx */
1293 8, /* "large" insn */
1294 9, /* MOVE_RATIO */
1295 4, /* cost for loading QImode using movzbl */
1296 {5, 5, 4}, /* cost of loading integer registers
1297 in QImode, HImode and SImode.
1298 Relative to reg-reg move (2). */
1299 {4, 4, 4}, /* cost of storing integer registers */
1300 2, /* cost of reg,reg fld/fst */
1301 {5, 5, 12}, /* cost of loading fp registers
1302 in SFmode, DFmode and XFmode */
1303 {4, 4, 8}, /* cost of storing fp registers
1304 in SFmode, DFmode and XFmode */
1305 2, /* cost of moving MMX register */
1306 {4, 4}, /* cost of loading MMX registers
1307 in SImode and DImode */
1308 {4, 4}, /* cost of storing MMX registers
1309 in SImode and DImode */
1310 2, /* cost of moving SSE register */
1311 {4, 4, 4}, /* cost of loading SSE registers
1312 in SImode, DImode and TImode */
1313 {4, 4, 4}, /* cost of storing SSE registers
1314 in SImode, DImode and TImode */
1315 2, /* MMX or SSE register to integer */
1316 16, /* size of l1 cache. */
1317 2048, /* size of l2 cache. */
1318 64, /* size of prefetch block */
1319 /* New AMD processors never drop prefetches; if they cannot be performed
1320 immediately, they are queued. We set number of simultaneous prefetches
1321 to a large constant to reflect this (it probably is not a good idea not
1322 to limit number of prefetches at all, as their execution also takes some
1323 time). */
1324 100, /* number of parallel prefetches */
1325 2, /* Branch cost */
1326 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1327 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1328 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1329 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1330 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1331 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1333 bdver4_memcpy,
1334 bdver4_memset,
1335 6, /* scalar_stmt_cost. */
1336 4, /* scalar load_cost. */
1337 4, /* scalar_store_cost. */
1338 6, /* vec_stmt_cost. */
1339 0, /* vec_to_scalar_cost. */
1340 2, /* scalar_to_vec_cost. */
1341 4, /* vec_align_load_cost. */
1342 4, /* vec_unalign_load_cost. */
1343 4, /* vec_store_cost. */
1344 4, /* cond_taken_branch_cost. */
 1345 2, /* cond_not_taken_branch_cost. */
 1346 };
1349 /* ZNVER1 has optimized REP instruction for medium sized blocks, but for
1350 very small blocks it is better to use loop. For large blocks, libcall
1351 can do nontemporary accesses and beat inline considerably. */
1352 static stringop_algs znver1_memcpy[2] = {
1353 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1354 {-1, rep_prefix_4_byte, false}}},
1355 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1356 {-1, libcall, false}}}};
1357 static stringop_algs znver1_memset[2] = {
1358 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1359 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1360 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1361 {-1, libcall, false}}}};
1362 struct processor_costs znver1_cost = {
1363 COSTS_N_INSNS (1), /* cost of an add instruction. */
1364 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1365 COSTS_N_INSNS (1), /* variable shift costs. */
1366 COSTS_N_INSNS (1), /* constant shift costs. */
1367 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1368 COSTS_N_INSNS (3), /* HI. */
1369 COSTS_N_INSNS (3), /* SI. */
1370 COSTS_N_INSNS (4), /* DI. */
1371 COSTS_N_INSNS (4)}, /* other. */
1372 0, /* cost of multiply per each bit
1373 set. */
1374 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI. */
1375 COSTS_N_INSNS (35), /* HI. */
1376 COSTS_N_INSNS (51), /* SI. */
1377 COSTS_N_INSNS (83), /* DI. */
1378 COSTS_N_INSNS (83)}, /* other. */
1379 COSTS_N_INSNS (1), /* cost of movsx. */
1380 COSTS_N_INSNS (1), /* cost of movzx. */
1381 8, /* "large" insn. */
1382 9, /* MOVE_RATIO. */
1383 4, /* cost for loading QImode using
1384 movzbl. */
1385 {5, 5, 4}, /* cost of loading integer registers
1386 in QImode, HImode and SImode.
1387 Relative to reg-reg move (2). */
1388 {4, 4, 4}, /* cost of storing integer
1389 registers. */
1390 2, /* cost of reg,reg fld/fst. */
1391 {5, 5, 12}, /* cost of loading fp registers
1392 in SFmode, DFmode and XFmode. */
1393 {4, 4, 8}, /* cost of storing fp registers
1394 in SFmode, DFmode and XFmode. */
1395 2, /* cost of moving MMX register. */
1396 {4, 4}, /* cost of loading MMX registers
1397 in SImode and DImode. */
1398 {4, 4}, /* cost of storing MMX registers
1399 in SImode and DImode. */
1400 2, /* cost of moving SSE register. */
1401 {4, 4, 4}, /* cost of loading SSE registers
1402 in SImode, DImode and TImode. */
1403 {4, 4, 4}, /* cost of storing SSE registers
1404 in SImode, DImode and TImode. */
1405 2, /* MMX or SSE register to integer. */
1406 32, /* size of l1 cache. */
1407 512, /* size of l2 cache. */
1408 64, /* size of prefetch block. */
1409 /* New AMD processors never drop prefetches; if they cannot be performed
1410 immediately, they are queued. We set number of simultaneous prefetches
1411 to a large constant to reflect this (it probably is not a good idea not
1412 to limit number of prefetches at all, as their execution also takes some
1413 time). */
1414 100, /* number of parallel prefetches. */
1415 2, /* Branch cost. */
1416 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1417 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1418 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1419 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1420 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1421 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1423 znver1_memcpy,
1424 znver1_memset,
1425 6, /* scalar_stmt_cost. */
1426 4, /* scalar load_cost. */
1427 4, /* scalar_store_cost. */
1428 6, /* vec_stmt_cost. */
1429 0, /* vec_to_scalar_cost. */
1430 2, /* scalar_to_vec_cost. */
1431 4, /* vec_align_load_cost. */
1432 4, /* vec_unalign_load_cost. */
1433 4, /* vec_store_cost. */
1434 4, /* cond_taken_branch_cost. */
1435 2, /* cond_not_taken_branch_cost. */
1438 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1439 very small blocks it is better to use a loop. For large blocks, a libcall can
1440 do nontemporal accesses and beat inline code considerably. */
1441 static stringop_algs btver1_memcpy[2] = {
1442 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1443 {-1, rep_prefix_4_byte, false}}},
1444 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1445 {-1, libcall, false}}}};
1446 static stringop_algs btver1_memset[2] = {
1447 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1448 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1449 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1450 {-1, libcall, false}}}};
1451 const struct processor_costs btver1_cost = {
1452 COSTS_N_INSNS (1), /* cost of an add instruction */
1453 COSTS_N_INSNS (2), /* cost of a lea instruction */
1454 COSTS_N_INSNS (1), /* variable shift costs */
1455 COSTS_N_INSNS (1), /* constant shift costs */
1456 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1457 COSTS_N_INSNS (4), /* HI */
1458 COSTS_N_INSNS (3), /* SI */
1459 COSTS_N_INSNS (4), /* DI */
1460 COSTS_N_INSNS (5)}, /* other */
1461 0, /* cost of multiply per each bit set */
1462 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1463 COSTS_N_INSNS (35), /* HI */
1464 COSTS_N_INSNS (51), /* SI */
1465 COSTS_N_INSNS (83), /* DI */
1466 COSTS_N_INSNS (83)}, /* other */
1467 COSTS_N_INSNS (1), /* cost of movsx */
1468 COSTS_N_INSNS (1), /* cost of movzx */
1469 8, /* "large" insn */
1470 9, /* MOVE_RATIO */
1471 4, /* cost for loading QImode using movzbl */
1472 {3, 4, 3}, /* cost of loading integer registers
1473 in QImode, HImode and SImode.
1474 Relative to reg-reg move (2). */
1475 {3, 4, 3}, /* cost of storing integer registers */
1476 4, /* cost of reg,reg fld/fst */
1477 {4, 4, 12}, /* cost of loading fp registers
1478 in SFmode, DFmode and XFmode */
1479 {6, 6, 8}, /* cost of storing fp registers
1480 in SFmode, DFmode and XFmode */
1481 2, /* cost of moving MMX register */
1482 {3, 3}, /* cost of loading MMX registers
1483 in SImode and DImode */
1484 {4, 4}, /* cost of storing MMX registers
1485 in SImode and DImode */
1486 2, /* cost of moving SSE register */
1487 {4, 4, 3}, /* cost of loading SSE registers
1488 in SImode, DImode and TImode */
1489 {4, 4, 5}, /* cost of storing SSE registers
1490 in SImode, DImode and TImode */
1491 3, /* MMX or SSE register to integer */
1492 /* On K8:
1493 MOVD reg64, xmmreg Double FSTORE 4
1494 MOVD reg32, xmmreg Double FSTORE 4
1495 On AMDFAM10:
1496 MOVD reg64, xmmreg Double FADD 3
1497 1/1 1/1
1498 MOVD reg32, xmmreg Double FADD 3
1499 1/1 1/1 */
1500 32, /* size of l1 cache. */
1501 512, /* size of l2 cache. */
1502 64, /* size of prefetch block */
1503 100, /* number of parallel prefetches */
1504 2, /* Branch cost */
1505 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1506 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1507 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1508 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1509 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1510 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1512 btver1_memcpy,
1513 btver1_memset,
1514 4, /* scalar_stmt_cost. */
1515 2, /* scalar load_cost. */
1516 2, /* scalar_store_cost. */
1517 6, /* vec_stmt_cost. */
1518 0, /* vec_to_scalar_cost. */
1519 2, /* scalar_to_vec_cost. */
1520 2, /* vec_align_load_cost. */
1521 2, /* vec_unalign_load_cost. */
1522 2, /* vec_store_cost. */
1523 2, /* cond_taken_branch_cost. */
1524 1, /* cond_not_taken_branch_cost. */
1527 static stringop_algs btver2_memcpy[2] = {
1528 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1529 {-1, rep_prefix_4_byte, false}}},
1530 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1531 {-1, libcall, false}}}};
1532 static stringop_algs btver2_memset[2] = {
1533 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1534 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1535 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1536 {-1, libcall, false}}}};
1537 const struct processor_costs btver2_cost = {
1538 COSTS_N_INSNS (1), /* cost of an add instruction */
1539 COSTS_N_INSNS (2), /* cost of a lea instruction */
1540 COSTS_N_INSNS (1), /* variable shift costs */
1541 COSTS_N_INSNS (1), /* constant shift costs */
1542 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1543 COSTS_N_INSNS (4), /* HI */
1544 COSTS_N_INSNS (3), /* SI */
1545 COSTS_N_INSNS (4), /* DI */
1546 COSTS_N_INSNS (5)}, /* other */
1547 0, /* cost of multiply per each bit set */
1548 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1549 COSTS_N_INSNS (35), /* HI */
1550 COSTS_N_INSNS (51), /* SI */
1551 COSTS_N_INSNS (83), /* DI */
1552 COSTS_N_INSNS (83)}, /* other */
1553 COSTS_N_INSNS (1), /* cost of movsx */
1554 COSTS_N_INSNS (1), /* cost of movzx */
1555 8, /* "large" insn */
1556 9, /* MOVE_RATIO */
1557 4, /* cost for loading QImode using movzbl */
1558 {3, 4, 3}, /* cost of loading integer registers
1559 in QImode, HImode and SImode.
1560 Relative to reg-reg move (2). */
1561 {3, 4, 3}, /* cost of storing integer registers */
1562 4, /* cost of reg,reg fld/fst */
1563 {4, 4, 12}, /* cost of loading fp registers
1564 in SFmode, DFmode and XFmode */
1565 {6, 6, 8}, /* cost of storing fp registers
1566 in SFmode, DFmode and XFmode */
1567 2, /* cost of moving MMX register */
1568 {3, 3}, /* cost of loading MMX registers
1569 in SImode and DImode */
1570 {4, 4}, /* cost of storing MMX registers
1571 in SImode and DImode */
1572 2, /* cost of moving SSE register */
1573 {4, 4, 3}, /* cost of loading SSE registers
1574 in SImode, DImode and TImode */
1575 {4, 4, 5}, /* cost of storing SSE registers
1576 in SImode, DImode and TImode */
1577 3, /* MMX or SSE register to integer */
1578 /* On K8:
1579 MOVD reg64, xmmreg Double FSTORE 4
1580 MOVD reg32, xmmreg Double FSTORE 4
1581 On AMDFAM10:
1582 MOVD reg64, xmmreg Double FADD 3
1583 1/1 1/1
1584 MOVD reg32, xmmreg Double FADD 3
1585 1/1 1/1 */
1586 32, /* size of l1 cache. */
1587 2048, /* size of l2 cache. */
1588 64, /* size of prefetch block */
1589 100, /* number of parallel prefetches */
1590 2, /* Branch cost */
1591 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1592 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1593 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1594 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1595 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1596 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1597 btver2_memcpy,
1598 btver2_memset,
1599 4, /* scalar_stmt_cost. */
1600 2, /* scalar load_cost. */
1601 2, /* scalar_store_cost. */
1602 6, /* vec_stmt_cost. */
1603 0, /* vec_to_scalar_cost. */
1604 2, /* scalar_to_vec_cost. */
1605 2, /* vec_align_load_cost. */
1606 2, /* vec_unalign_load_cost. */
1607 2, /* vec_store_cost. */
1608 2, /* cond_taken_branch_cost. */
1609 1, /* cond_not_taken_branch_cost. */
1612 static stringop_algs pentium4_memcpy[2] = {
1613 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1614 DUMMY_STRINGOP_ALGS};
1615 static stringop_algs pentium4_memset[2] = {
1616 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1617 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1618 DUMMY_STRINGOP_ALGS};
1620 static const
1621 struct processor_costs pentium4_cost = {
1622 COSTS_N_INSNS (1), /* cost of an add instruction */
1623 COSTS_N_INSNS (3), /* cost of a lea instruction */
1624 COSTS_N_INSNS (4), /* variable shift costs */
1625 COSTS_N_INSNS (4), /* constant shift costs */
1626 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1627 COSTS_N_INSNS (15), /* HI */
1628 COSTS_N_INSNS (15), /* SI */
1629 COSTS_N_INSNS (15), /* DI */
1630 COSTS_N_INSNS (15)}, /* other */
1631 0, /* cost of multiply per each bit set */
1632 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1633 COSTS_N_INSNS (56), /* HI */
1634 COSTS_N_INSNS (56), /* SI */
1635 COSTS_N_INSNS (56), /* DI */
1636 COSTS_N_INSNS (56)}, /* other */
1637 COSTS_N_INSNS (1), /* cost of movsx */
1638 COSTS_N_INSNS (1), /* cost of movzx */
1639 16, /* "large" insn */
1640 6, /* MOVE_RATIO */
1641 2, /* cost for loading QImode using movzbl */
1642 {4, 5, 4}, /* cost of loading integer registers
1643 in QImode, HImode and SImode.
1644 Relative to reg-reg move (2). */
1645 {2, 3, 2}, /* cost of storing integer registers */
1646 2, /* cost of reg,reg fld/fst */
1647 {2, 2, 6}, /* cost of loading fp registers
1648 in SFmode, DFmode and XFmode */
1649 {4, 4, 6}, /* cost of storing fp registers
1650 in SFmode, DFmode and XFmode */
1651 2, /* cost of moving MMX register */
1652 {2, 2}, /* cost of loading MMX registers
1653 in SImode and DImode */
1654 {2, 2}, /* cost of storing MMX registers
1655 in SImode and DImode */
1656 12, /* cost of moving SSE register */
1657 {12, 12, 12}, /* cost of loading SSE registers
1658 in SImode, DImode and TImode */
1659 {2, 2, 8}, /* cost of storing SSE registers
1660 in SImode, DImode and TImode */
1661 10, /* MMX or SSE register to integer */
1662 8, /* size of l1 cache. */
1663 256, /* size of l2 cache. */
1664 64, /* size of prefetch block */
1665 6, /* number of parallel prefetches */
1666 2, /* Branch cost */
1667 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1668 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1669 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1670 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1671 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1672 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1673 pentium4_memcpy,
1674 pentium4_memset,
1675 1, /* scalar_stmt_cost. */
1676 1, /* scalar load_cost. */
1677 1, /* scalar_store_cost. */
1678 1, /* vec_stmt_cost. */
1679 1, /* vec_to_scalar_cost. */
1680 1, /* scalar_to_vec_cost. */
1681 1, /* vec_align_load_cost. */
1682 2, /* vec_unalign_load_cost. */
1683 1, /* vec_store_cost. */
1684 3, /* cond_taken_branch_cost. */
1685 1, /* cond_not_taken_branch_cost. */
1688 static stringop_algs nocona_memcpy[2] = {
1689 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1690 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1691 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1693 static stringop_algs nocona_memset[2] = {
1694 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1695 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1696 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1697 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1699 static const
1700 struct processor_costs nocona_cost = {
1701 COSTS_N_INSNS (1), /* cost of an add instruction */
1702 COSTS_N_INSNS (1), /* cost of a lea instruction */
1703 COSTS_N_INSNS (1), /* variable shift costs */
1704 COSTS_N_INSNS (1), /* constant shift costs */
1705 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1706 COSTS_N_INSNS (10), /* HI */
1707 COSTS_N_INSNS (10), /* SI */
1708 COSTS_N_INSNS (10), /* DI */
1709 COSTS_N_INSNS (10)}, /* other */
1710 0, /* cost of multiply per each bit set */
1711 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1712 COSTS_N_INSNS (66), /* HI */
1713 COSTS_N_INSNS (66), /* SI */
1714 COSTS_N_INSNS (66), /* DI */
1715 COSTS_N_INSNS (66)}, /* other */
1716 COSTS_N_INSNS (1), /* cost of movsx */
1717 COSTS_N_INSNS (1), /* cost of movzx */
1718 16, /* "large" insn */
1719 17, /* MOVE_RATIO */
1720 4, /* cost for loading QImode using movzbl */
1721 {4, 4, 4}, /* cost of loading integer registers
1722 in QImode, HImode and SImode.
1723 Relative to reg-reg move (2). */
1724 {4, 4, 4}, /* cost of storing integer registers */
1725 3, /* cost of reg,reg fld/fst */
1726 {12, 12, 12}, /* cost of loading fp registers
1727 in SFmode, DFmode and XFmode */
1728 {4, 4, 4}, /* cost of storing fp registers
1729 in SFmode, DFmode and XFmode */
1730 6, /* cost of moving MMX register */
1731 {12, 12}, /* cost of loading MMX registers
1732 in SImode and DImode */
1733 {12, 12}, /* cost of storing MMX registers
1734 in SImode and DImode */
1735 6, /* cost of moving SSE register */
1736 {12, 12, 12}, /* cost of loading SSE registers
1737 in SImode, DImode and TImode */
1738 {12, 12, 12}, /* cost of storing SSE registers
1739 in SImode, DImode and TImode */
1740 8, /* MMX or SSE register to integer */
1741 8, /* size of l1 cache. */
1742 1024, /* size of l2 cache. */
1743 64, /* size of prefetch block */
1744 8, /* number of parallel prefetches */
1745 1, /* Branch cost */
1746 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1747 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1748 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1749 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1750 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1751 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1752 nocona_memcpy,
1753 nocona_memset,
1754 1, /* scalar_stmt_cost. */
1755 1, /* scalar load_cost. */
1756 1, /* scalar_store_cost. */
1757 1, /* vec_stmt_cost. */
1758 1, /* vec_to_scalar_cost. */
1759 1, /* scalar_to_vec_cost. */
1760 1, /* vec_align_load_cost. */
1761 2, /* vec_unalign_load_cost. */
1762 1, /* vec_store_cost. */
1763 3, /* cond_taken_branch_cost. */
1764 1, /* cond_not_taken_branch_cost. */
1767 static stringop_algs atom_memcpy[2] = {
1768 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1769 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1770 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1771 static stringop_algs atom_memset[2] = {
1772 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1773 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1774 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1775 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1776 static const
1777 struct processor_costs atom_cost = {
1778 COSTS_N_INSNS (1), /* cost of an add instruction */
1779 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1780 COSTS_N_INSNS (1), /* variable shift costs */
1781 COSTS_N_INSNS (1), /* constant shift costs */
1782 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1783 COSTS_N_INSNS (4), /* HI */
1784 COSTS_N_INSNS (3), /* SI */
1785 COSTS_N_INSNS (4), /* DI */
1786 COSTS_N_INSNS (2)}, /* other */
1787 0, /* cost of multiply per each bit set */
1788 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1789 COSTS_N_INSNS (26), /* HI */
1790 COSTS_N_INSNS (42), /* SI */
1791 COSTS_N_INSNS (74), /* DI */
1792 COSTS_N_INSNS (74)}, /* other */
1793 COSTS_N_INSNS (1), /* cost of movsx */
1794 COSTS_N_INSNS (1), /* cost of movzx */
1795 8, /* "large" insn */
1796 17, /* MOVE_RATIO */
1797 4, /* cost for loading QImode using movzbl */
1798 {4, 4, 4}, /* cost of loading integer registers
1799 in QImode, HImode and SImode.
1800 Relative to reg-reg move (2). */
1801 {4, 4, 4}, /* cost of storing integer registers */
1802 4, /* cost of reg,reg fld/fst */
1803 {12, 12, 12}, /* cost of loading fp registers
1804 in SFmode, DFmode and XFmode */
1805 {6, 6, 8}, /* cost of storing fp registers
1806 in SFmode, DFmode and XFmode */
1807 2, /* cost of moving MMX register */
1808 {8, 8}, /* cost of loading MMX registers
1809 in SImode and DImode */
1810 {8, 8}, /* cost of storing MMX registers
1811 in SImode and DImode */
1812 2, /* cost of moving SSE register */
1813 {8, 8, 8}, /* cost of loading SSE registers
1814 in SImode, DImode and TImode */
1815 {8, 8, 8}, /* cost of storing SSE registers
1816 in SImode, DImode and TImode */
1817 5, /* MMX or SSE register to integer */
1818 32, /* size of l1 cache. */
1819 256, /* size of l2 cache. */
1820 64, /* size of prefetch block */
1821 6, /* number of parallel prefetches */
1822 3, /* Branch cost */
1823 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1824 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1825 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1826 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1827 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1828 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1829 atom_memcpy,
1830 atom_memset,
1831 1, /* scalar_stmt_cost. */
1832 1, /* scalar load_cost. */
1833 1, /* scalar_store_cost. */
1834 1, /* vec_stmt_cost. */
1835 1, /* vec_to_scalar_cost. */
1836 1, /* scalar_to_vec_cost. */
1837 1, /* vec_align_load_cost. */
1838 2, /* vec_unalign_load_cost. */
1839 1, /* vec_store_cost. */
1840 3, /* cond_taken_branch_cost. */
1841 1, /* cond_not_taken_branch_cost. */
1844 static stringop_algs slm_memcpy[2] = {
1845 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1846 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1847 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1848 static stringop_algs slm_memset[2] = {
1849 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1850 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1851 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1852 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1853 static const
1854 struct processor_costs slm_cost = {
1855 COSTS_N_INSNS (1), /* cost of an add instruction */
1856 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1857 COSTS_N_INSNS (1), /* variable shift costs */
1858 COSTS_N_INSNS (1), /* constant shift costs */
1859 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1860 COSTS_N_INSNS (3), /* HI */
1861 COSTS_N_INSNS (3), /* SI */
1862 COSTS_N_INSNS (4), /* DI */
1863 COSTS_N_INSNS (2)}, /* other */
1864 0, /* cost of multiply per each bit set */
1865 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1866 COSTS_N_INSNS (26), /* HI */
1867 COSTS_N_INSNS (42), /* SI */
1868 COSTS_N_INSNS (74), /* DI */
1869 COSTS_N_INSNS (74)}, /* other */
1870 COSTS_N_INSNS (1), /* cost of movsx */
1871 COSTS_N_INSNS (1), /* cost of movzx */
1872 8, /* "large" insn */
1873 17, /* MOVE_RATIO */
1874 4, /* cost for loading QImode using movzbl */
1875 {4, 4, 4}, /* cost of loading integer registers
1876 in QImode, HImode and SImode.
1877 Relative to reg-reg move (2). */
1878 {4, 4, 4}, /* cost of storing integer registers */
1879 4, /* cost of reg,reg fld/fst */
1880 {12, 12, 12}, /* cost of loading fp registers
1881 in SFmode, DFmode and XFmode */
1882 {6, 6, 8}, /* cost of storing fp registers
1883 in SFmode, DFmode and XFmode */
1884 2, /* cost of moving MMX register */
1885 {8, 8}, /* cost of loading MMX registers
1886 in SImode and DImode */
1887 {8, 8}, /* cost of storing MMX registers
1888 in SImode and DImode */
1889 2, /* cost of moving SSE register */
1890 {8, 8, 8}, /* cost of loading SSE registers
1891 in SImode, DImode and TImode */
1892 {8, 8, 8}, /* cost of storing SSE registers
1893 in SImode, DImode and TImode */
1894 5, /* MMX or SSE register to integer */
1895 32, /* size of l1 cache. */
1896 256, /* size of l2 cache. */
1897 64, /* size of prefetch block */
1898 6, /* number of parallel prefetches */
1899 3, /* Branch cost */
1900 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1901 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1902 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1903 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1904 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1905 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1906 slm_memcpy,
1907 slm_memset,
1908 1, /* scalar_stmt_cost. */
1909 1, /* scalar load_cost. */
1910 1, /* scalar_store_cost. */
1911 1, /* vec_stmt_cost. */
1912 4, /* vec_to_scalar_cost. */
1913 1, /* scalar_to_vec_cost. */
1914 1, /* vec_align_load_cost. */
1915 2, /* vec_unalign_load_cost. */
1916 1, /* vec_store_cost. */
1917 3, /* cond_taken_branch_cost. */
1918 1, /* cond_not_taken_branch_cost. */
1921 static stringop_algs intel_memcpy[2] = {
1922 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1923 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1924 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1925 static stringop_algs intel_memset[2] = {
1926 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1927 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1928 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1929 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1930 static const
1931 struct processor_costs intel_cost = {
1932 COSTS_N_INSNS (1), /* cost of an add instruction */
1933 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1934 COSTS_N_INSNS (1), /* variable shift costs */
1935 COSTS_N_INSNS (1), /* constant shift costs */
1936 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1937 COSTS_N_INSNS (3), /* HI */
1938 COSTS_N_INSNS (3), /* SI */
1939 COSTS_N_INSNS (4), /* DI */
1940 COSTS_N_INSNS (2)}, /* other */
1941 0, /* cost of multiply per each bit set */
1942 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1943 COSTS_N_INSNS (26), /* HI */
1944 COSTS_N_INSNS (42), /* SI */
1945 COSTS_N_INSNS (74), /* DI */
1946 COSTS_N_INSNS (74)}, /* other */
1947 COSTS_N_INSNS (1), /* cost of movsx */
1948 COSTS_N_INSNS (1), /* cost of movzx */
1949 8, /* "large" insn */
1950 17, /* MOVE_RATIO */
1951 4, /* cost for loading QImode using movzbl */
1952 {4, 4, 4}, /* cost of loading integer registers
1953 in QImode, HImode and SImode.
1954 Relative to reg-reg move (2). */
1955 {4, 4, 4}, /* cost of storing integer registers */
1956 4, /* cost of reg,reg fld/fst */
1957 {12, 12, 12}, /* cost of loading fp registers
1958 in SFmode, DFmode and XFmode */
1959 {6, 6, 8}, /* cost of storing fp registers
1960 in SFmode, DFmode and XFmode */
1961 2, /* cost of moving MMX register */
1962 {8, 8}, /* cost of loading MMX registers
1963 in SImode and DImode */
1964 {8, 8}, /* cost of storing MMX registers
1965 in SImode and DImode */
1966 2, /* cost of moving SSE register */
1967 {8, 8, 8}, /* cost of loading SSE registers
1968 in SImode, DImode and TImode */
1969 {8, 8, 8}, /* cost of storing SSE registers
1970 in SImode, DImode and TImode */
1971 5, /* MMX or SSE register to integer */
1972 32, /* size of l1 cache. */
1973 256, /* size of l2 cache. */
1974 64, /* size of prefetch block */
1975 6, /* number of parallel prefetches */
1976 3, /* Branch cost */
1977 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1978 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1979 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1980 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1981 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1982 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1983 intel_memcpy,
1984 intel_memset,
1985 1, /* scalar_stmt_cost. */
1986 1, /* scalar load_cost. */
1987 1, /* scalar_store_cost. */
1988 1, /* vec_stmt_cost. */
1989 4, /* vec_to_scalar_cost. */
1990 1, /* scalar_to_vec_cost. */
1991 1, /* vec_align_load_cost. */
1992 2, /* vec_unalign_load_cost. */
1993 1, /* vec_store_cost. */
1994 3, /* cond_taken_branch_cost. */
1995 1, /* cond_not_taken_branch_cost. */
1998 /* Generic should produce code tuned for Core-i7 (and newer chips)
1999 and btver1 (and newer chips). */
2001 static stringop_algs generic_memcpy[2] = {
2002 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2003 {-1, libcall, false}}},
2004 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2005 {-1, libcall, false}}}};
2006 static stringop_algs generic_memset[2] = {
2007 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2008 {-1, libcall, false}}},
2009 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2010 {-1, libcall, false}}}};
2011 static const
2012 struct processor_costs generic_cost = {
2013 COSTS_N_INSNS (1), /* cost of an add instruction */
2014 /* On all chips taken into consideration, lea is 2 cycles or more. With
2015 this cost, however, our current implementation of synth_mult results in
2016 the use of unnecessary temporary registers, causing regressions on several
2017 SPECfp benchmarks. */
2018 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2019 COSTS_N_INSNS (1), /* variable shift costs */
2020 COSTS_N_INSNS (1), /* constant shift costs */
2021 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2022 COSTS_N_INSNS (4), /* HI */
2023 COSTS_N_INSNS (3), /* SI */
2024 COSTS_N_INSNS (4), /* DI */
2025 COSTS_N_INSNS (2)}, /* other */
2026 0, /* cost of multiply per each bit set */
2027 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2028 COSTS_N_INSNS (26), /* HI */
2029 COSTS_N_INSNS (42), /* SI */
2030 COSTS_N_INSNS (74), /* DI */
2031 COSTS_N_INSNS (74)}, /* other */
2032 COSTS_N_INSNS (1), /* cost of movsx */
2033 COSTS_N_INSNS (1), /* cost of movzx */
2034 8, /* "large" insn */
2035 17, /* MOVE_RATIO */
2036 4, /* cost for loading QImode using movzbl */
2037 {4, 4, 4}, /* cost of loading integer registers
2038 in QImode, HImode and SImode.
2039 Relative to reg-reg move (2). */
2040 {4, 4, 4}, /* cost of storing integer registers */
2041 4, /* cost of reg,reg fld/fst */
2042 {12, 12, 12}, /* cost of loading fp registers
2043 in SFmode, DFmode and XFmode */
2044 {6, 6, 8}, /* cost of storing fp registers
2045 in SFmode, DFmode and XFmode */
2046 2, /* cost of moving MMX register */
2047 {8, 8}, /* cost of loading MMX registers
2048 in SImode and DImode */
2049 {8, 8}, /* cost of storing MMX registers
2050 in SImode and DImode */
2051 2, /* cost of moving SSE register */
2052 {8, 8, 8}, /* cost of loading SSE registers
2053 in SImode, DImode and TImode */
2054 {8, 8, 8}, /* cost of storing SSE registers
2055 in SImode, DImode and TImode */
2056 5, /* MMX or SSE register to integer */
2057 32, /* size of l1 cache. */
2058 512, /* size of l2 cache. */
2059 64, /* size of prefetch block */
2060 6, /* number of parallel prefetches */
2061 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
2062 value is increased to the perhaps more appropriate value of 5. */
2063 3, /* Branch cost */
2064 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2065 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2066 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2067 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2068 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2069 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2070 generic_memcpy,
2071 generic_memset,
2072 1, /* scalar_stmt_cost. */
2073 1, /* scalar load_cost. */
2074 1, /* scalar_store_cost. */
2075 1, /* vec_stmt_cost. */
2076 1, /* vec_to_scalar_cost. */
2077 1, /* scalar_to_vec_cost. */
2078 1, /* vec_align_load_cost. */
2079 2, /* vec_unalign_load_cost. */
2080 1, /* vec_store_cost. */
2081 3, /* cond_taken_branch_cost. */
2082 1, /* cond_not_taken_branch_cost. */
2085 /* core_cost should produce code tuned for the Core family of CPUs. */
2086 static stringop_algs core_memcpy[2] = {
2087 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2088 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2089 {-1, libcall, false}}}};
2090 static stringop_algs core_memset[2] = {
2091 {libcall, {{6, loop_1_byte, true},
2092 {24, loop, true},
2093 {8192, rep_prefix_4_byte, true},
2094 {-1, libcall, false}}},
2095 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2096 {-1, libcall, false}}}};
2098 static const
2099 struct processor_costs core_cost = {
2100 COSTS_N_INSNS (1), /* cost of an add instruction */
2101 /* On all chips taken into consideration, lea is 2 cycles or more. With
2102 this cost, however, our current implementation of synth_mult results in
2103 the use of unnecessary temporary registers, causing regressions on several
2104 SPECfp benchmarks. */
2105 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2106 COSTS_N_INSNS (1), /* variable shift costs */
2107 COSTS_N_INSNS (1), /* constant shift costs */
2108 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2109 COSTS_N_INSNS (4), /* HI */
2110 COSTS_N_INSNS (3), /* SI */
2111 COSTS_N_INSNS (4), /* DI */
2112 COSTS_N_INSNS (2)}, /* other */
2113 0, /* cost of multiply per each bit set */
2114 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2115 COSTS_N_INSNS (26), /* HI */
2116 COSTS_N_INSNS (42), /* SI */
2117 COSTS_N_INSNS (74), /* DI */
2118 COSTS_N_INSNS (74)}, /* other */
2119 COSTS_N_INSNS (1), /* cost of movsx */
2120 COSTS_N_INSNS (1), /* cost of movzx */
2121 8, /* "large" insn */
2122 17, /* MOVE_RATIO */
2123 4, /* cost for loading QImode using movzbl */
2124 {4, 4, 4}, /* cost of loading integer registers
2125 in QImode, HImode and SImode.
2126 Relative to reg-reg move (2). */
2127 {4, 4, 4}, /* cost of storing integer registers */
2128 4, /* cost of reg,reg fld/fst */
2129 {12, 12, 12}, /* cost of loading fp registers
2130 in SFmode, DFmode and XFmode */
2131 {6, 6, 8}, /* cost of storing fp registers
2132 in SFmode, DFmode and XFmode */
2133 2, /* cost of moving MMX register */
2134 {8, 8}, /* cost of loading MMX registers
2135 in SImode and DImode */
2136 {8, 8}, /* cost of storing MMX registers
2137 in SImode and DImode */
2138 2, /* cost of moving SSE register */
2139 {8, 8, 8}, /* cost of loading SSE registers
2140 in SImode, DImode and TImode */
2141 {8, 8, 8}, /* cost of storing SSE registers
2142 in SImode, DImode and TImode */
2143 5, /* MMX or SSE register to integer */
2144 64, /* size of l1 cache. */
2145 512, /* size of l2 cache. */
2146 64, /* size of prefetch block */
2147 6, /* number of parallel prefetches */
2148 /* FIXME: perhaps a more appropriate value is 5. */
2149 3, /* Branch cost */
2150 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2151 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2152 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2153 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2154 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2155 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2156 core_memcpy,
2157 core_memset,
2158 1, /* scalar_stmt_cost. */
2159 1, /* scalar load_cost. */
2160 1, /* scalar_store_cost. */
2161 1, /* vec_stmt_cost. */
2162 1, /* vec_to_scalar_cost. */
2163 1, /* scalar_to_vec_cost. */
2164 1, /* vec_align_load_cost. */
2165 2, /* vec_unalign_load_cost. */
2166 1, /* vec_store_cost. */
2167 3, /* cond_taken_branch_cost. */
2168 1, /* cond_not_taken_branch_cost. */
2172 /* Set by -mtune. */
2173 const struct processor_costs *ix86_tune_cost = &pentium_cost;
2175 /* Set by -mtune or -Os. */
2176 const struct processor_costs *ix86_cost = &pentium_cost;
2178 /* Processor feature/optimization bitmasks. */
2179 #define m_386 (1U<<PROCESSOR_I386)
2180 #define m_486 (1U<<PROCESSOR_I486)
2181 #define m_PENT (1U<<PROCESSOR_PENTIUM)
2182 #define m_LAKEMONT (1U<<PROCESSOR_LAKEMONT)
2183 #define m_PPRO (1U<<PROCESSOR_PENTIUMPRO)
2184 #define m_PENT4 (1U<<PROCESSOR_PENTIUM4)
2185 #define m_NOCONA (1U<<PROCESSOR_NOCONA)
2186 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
2187 #define m_CORE2 (1U<<PROCESSOR_CORE2)
2188 #define m_NEHALEM (1U<<PROCESSOR_NEHALEM)
2189 #define m_SANDYBRIDGE (1U<<PROCESSOR_SANDYBRIDGE)
2190 #define m_HASWELL (1U<<PROCESSOR_HASWELL)
2191 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
2192 #define m_BONNELL (1U<<PROCESSOR_BONNELL)
2193 #define m_SILVERMONT (1U<<PROCESSOR_SILVERMONT)
2194 #define m_KNL (1U<<PROCESSOR_KNL)
2195 #define m_SKYLAKE_AVX512 (1U<<PROCESSOR_SKYLAKE_AVX512)
2196 #define m_INTEL (1U<<PROCESSOR_INTEL)
2198 #define m_GEODE (1U<<PROCESSOR_GEODE)
2199 #define m_K6 (1U<<PROCESSOR_K6)
2200 #define m_K6_GEODE (m_K6 | m_GEODE)
2201 #define m_K8 (1U<<PROCESSOR_K8)
2202 #define m_ATHLON (1U<<PROCESSOR_ATHLON)
2203 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
2204 #define m_AMDFAM10 (1U<<PROCESSOR_AMDFAM10)
2205 #define m_BDVER1 (1U<<PROCESSOR_BDVER1)
2206 #define m_BDVER2 (1U<<PROCESSOR_BDVER2)
2207 #define m_BDVER3 (1U<<PROCESSOR_BDVER3)
2208 #define m_BDVER4 (1U<<PROCESSOR_BDVER4)
2209 #define m_ZNVER1 (1U<<PROCESSOR_ZNVER1)
2210 #define m_BTVER1 (1U<<PROCESSOR_BTVER1)
2211 #define m_BTVER2 (1U<<PROCESSOR_BTVER2)
2212 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
2213 #define m_BTVER (m_BTVER1 | m_BTVER2)
2214 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
2215 | m_ZNVER1)
2217 #define m_GENERIC (1U<<PROCESSOR_GENERIC)
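/* For illustration only: x86-tune.def selects processors by OR-ing these
   masks together, roughly along the lines of (a hypothetical entry, not
   taken from that file):

     DEF_TUNE (X86_TUNE_EXAMPLE, "example", m_CORE_ALL | m_BDVER | m_GENERIC)

   The third operand becomes the initializer used below to build
   initial_ix86_tune_features.  */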
2219 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
2220 #undef DEF_TUNE
2221 #define DEF_TUNE(tune, name, selector) name,
2222 #include "x86-tune.def"
2223 #undef DEF_TUNE
2226 /* Feature tests against the various tunings. */
2227 unsigned char ix86_tune_features[X86_TUNE_LAST];
2229 /* Feature tests against the various tunings used to create ix86_tune_features
2230 based on the processor mask. */
2231 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
2232 #undef DEF_TUNE
2233 #define DEF_TUNE(tune, name, selector) selector,
2234 #include "x86-tune.def"
2235 #undef DEF_TUNE
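/* A sketch of how these selectors are consumed (the actual code lives in
   the option handling later in this file; treat the details as approximate):
   each entry is tested against the bit of the processor chosen by -mtune,
   roughly

     ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);

   where ix86_tune_mask is (1U << ix86_tune).  */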
2238 /* Feature tests against the various architecture variations. */
2239 unsigned char ix86_arch_features[X86_ARCH_LAST];
2241 /* Feature tests against the various architecture variations, used to create
2242 ix86_arch_features based on the processor mask. */
2243 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2244 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2245 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
2247 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2248 ~m_386,
2250 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2251 ~(m_386 | m_486),
2253 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2254 ~m_386,
2256 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2257 ~m_386,
2260 /* If the average insn count for a single function invocation is
2261 lower than this constant, emit fast (but longer) prologue and
2262 epilogue code. */
2263 #define FAST_PROLOGUE_INSN_COUNT 20
2265 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2266 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2267 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2268 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2270 /* Array of the smallest class containing reg number REGNO, indexed by
2271 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2273 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2275 /* ax, dx, cx, bx */
2276 AREG, DREG, CREG, BREG,
2277 /* si, di, bp, sp */
2278 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2279 /* FP registers */
2280 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2281 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2282 /* arg pointer */
2283 NON_Q_REGS,
2284 /* flags, fpsr, fpcr, frame */
2285 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2286 /* SSE registers */
2287 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2288 SSE_REGS, SSE_REGS,
2289 /* MMX registers */
2290 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2291 MMX_REGS, MMX_REGS,
2292 /* REX registers */
2293 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2294 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2295 /* SSE REX registers */
2296 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2297 SSE_REGS, SSE_REGS,
2298 /* AVX-512 SSE registers */
2299 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2300 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2301 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2302 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2303 /* Mask registers. */
2304 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2305 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2306 /* MPX bound registers */
2307 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
2310 /* The "default" register map used in 32bit mode. */
2312 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2314 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2315 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2316 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2317 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2318 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2319 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2320 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2321 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2322 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2323 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2324 101, 102, 103, 104, /* bound registers */
2327 /* The "default" register map used in 64bit mode. */
2329 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2331 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2332 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2333 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2334 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2335 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2336 8,9,10,11,12,13,14,15, /* extended integer registers */
2337 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2338 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
2339 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
2340 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
2341 126, 127, 128, 129, /* bound registers */
2344 /* Define the register numbers to be used in Dwarf debugging information.
2345 The SVR4 reference port C compiler uses the following register numbers
2346 in its Dwarf output code:
2347 0 for %eax (gcc regno = 0)
2348 1 for %ecx (gcc regno = 2)
2349 2 for %edx (gcc regno = 1)
2350 3 for %ebx (gcc regno = 3)
2351 4 for %esp (gcc regno = 7)
2352 5 for %ebp (gcc regno = 6)
2353 6 for %esi (gcc regno = 4)
2354 7 for %edi (gcc regno = 5)
2355 The following three DWARF register numbers are never generated by
2356 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2357 believes these numbers have these meanings.
2358 8 for %eip (no gcc equivalent)
2359 9 for %eflags (gcc regno = 17)
2360 10 for %trapno (no gcc equivalent)
2361 It is not at all clear how we should number the FP stack registers
2362 for the x86 architecture. If the version of SDB on x86/svr4 were
2363 a bit less brain dead with respect to floating-point then we would
2364 have a precedent to follow with respect to DWARF register numbers
2365 for x86 FP registers, but the SDB on x86/svr4 is so completely
2366 broken with respect to FP registers that it is hardly worth thinking
2367 of it as something to strive for compatibility with.
2368 The version of x86/svr4 SDB I have at the moment does (partially)
2369 seem to believe that DWARF register number 11 is associated with
2370 the x86 register %st(0), but that's about all. Higher DWARF
2371 register numbers don't seem to be associated with anything in
2372 particular, and even for DWARF regno 11, SDB only seems to under-
2373 stand that it should say that a variable lives in %st(0) (when
2374 asked via an `=' command) if we said it was in DWARF regno 11,
2375 but SDB still prints garbage when asked for the value of the
2376 variable in question (via a `/' command).
2377 (Also note that the labels SDB prints for various FP stack regs
2378 when doing an `x' command are all wrong.)
2379 Note that these problems generally don't affect the native SVR4
2380 C compiler because it doesn't allow the use of -O with -g and
2381 because when it is *not* optimizing, it allocates a memory
2382 location for each floating-point variable, and the memory
2383 location is what gets described in the DWARF AT_location
2384 attribute for the variable in question.
2385 Regardless of the severe mental illness of the x86/svr4 SDB, we
2386 do something sensible here and we use the following DWARF
2387 register numbers. Note that these are all stack-top-relative
2388 numbers.
2389 11 for %st(0) (gcc regno = 8)
2390 12 for %st(1) (gcc regno = 9)
2391 13 for %st(2) (gcc regno = 10)
2392 14 for %st(3) (gcc regno = 11)
2393 15 for %st(4) (gcc regno = 12)
2394 16 for %st(5) (gcc regno = 13)
2395 17 for %st(6) (gcc regno = 14)
2396 18 for %st(7) (gcc regno = 15)
2398 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2400 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2401 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2402 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2403 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2404 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2405 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2406 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2407 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2408 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2409 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2410 101, 102, 103, 104, /* bound registers */
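/* Example of reading the table above: gcc regno 1 is %edx and maps to DWARF
   register 2, while gcc regno 7 (%esp) maps to DWARF register 4, matching
   the SVR4 numbering described in the comment before the table.  */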
2413 /* Define parameter passing and return registers. */
2415 static int const x86_64_int_parameter_registers[6] =
2417 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2420 static int const x86_64_ms_abi_int_parameter_registers[4] =
2422 CX_REG, DX_REG, R8_REG, R9_REG
2425 static int const x86_64_int_return_registers[4] =
2427 AX_REG, DX_REG, DI_REG, SI_REG
2430 /* Additional registers that are clobbered by SYSV calls. */
2432 #define NUM_X86_64_MS_CLOBBERED_REGS 12
2433 static int const x86_64_ms_sysv_extra_clobbered_registers
2434 [NUM_X86_64_MS_CLOBBERED_REGS] =
2436 SI_REG, DI_REG,
2437 XMM6_REG, XMM7_REG,
2438 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2439 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
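/* Presumably these are the registers that the Microsoft x64 ABI treats as
   callee-saved but the SysV ABI treats as call-clobbered, so an ms_abi
   function must preserve them around calls into sysv_abi code; the xlogue
   machinery below emits out-of-line stubs to do that saving and restoring.  */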
2442 enum xlogue_stub {
2443 XLOGUE_STUB_SAVE,
2444 XLOGUE_STUB_RESTORE,
2445 XLOGUE_STUB_RESTORE_TAIL,
2446 XLOGUE_STUB_SAVE_HFP,
2447 XLOGUE_STUB_RESTORE_HFP,
2448 XLOGUE_STUB_RESTORE_HFP_TAIL,
2450 XLOGUE_STUB_COUNT
2453 enum xlogue_stub_sets {
2454 XLOGUE_SET_ALIGNED,
2455 XLOGUE_SET_ALIGNED_PLUS_8,
2456 XLOGUE_SET_HFP_ALIGNED_OR_REALIGN,
2457 XLOGUE_SET_HFP_ALIGNED_PLUS_8,
2459 XLOGUE_SET_COUNT
2462 /* Register save/restore layout used by out-of-line stubs. */
2463 class xlogue_layout {
2464 public:
2465 struct reginfo
2467 unsigned regno;
2468 HOST_WIDE_INT offset; /* Offset used by stub base pointer (rax or
2469 rsi) to where each register is stored. */
2472 unsigned get_nregs () const {return m_nregs;}
2473 HOST_WIDE_INT get_stack_align_off_in () const {return m_stack_align_off_in;}
2475 const reginfo &get_reginfo (unsigned reg) const
2477 gcc_assert (reg < m_nregs);
2478 return m_regs[reg];
2481 static const char *get_stub_name (enum xlogue_stub stub,
2482 unsigned n_extra_args);
2484 /* Returns an rtx for the stub's symbol based upon
2485 1.) the specified stub (save, restore or restore_ret) and
2486 2.) the value of cfun->machine->call_ms2sysv_extra_regs and
2487 3.) whether or not stack alignment is being performed. */
2488 static rtx get_stub_rtx (enum xlogue_stub stub);
2490 /* Returns the amount of stack space (including padding) that the stub
2491 needs to store registers based upon data in the machine_function. */
2492 HOST_WIDE_INT get_stack_space_used () const
2494 const struct machine_function *m = cfun->machine;
2495 unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1;
2497 gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS);
2498 return m_regs[last_reg].offset + STUB_INDEX_OFFSET;
2501 /* Returns the offset for the base pointer used by the stub. */
2502 HOST_WIDE_INT get_stub_ptr_offset () const
2504 return STUB_INDEX_OFFSET + m_stack_align_off_in;
2507 static const struct xlogue_layout &get_instance ();
2508 static unsigned count_stub_managed_regs ();
2509 static bool is_stub_managed_reg (unsigned regno, unsigned count);
2511 static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70;
2512 static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS;
2513 static const unsigned MAX_REGS = 18;
2514 static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS;
2515 static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1;
2516 static const unsigned STUB_NAME_MAX_LEN = 16;
2517 static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT];
2518 static const unsigned REG_ORDER[MAX_REGS];
2519 static const unsigned REG_ORDER_REALIGN[MAX_REGS];
2521 private:
2522 xlogue_layout ();
2523 xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp);
2524 xlogue_layout (const xlogue_layout &);
2526 /* True if hard frame pointer is used. */
2527 bool m_hfp;
2529 /* Max number of registers this layout manages. */
2530 unsigned m_nregs;
2532 /* Incoming offset from 16-byte alignment. */
2533 HOST_WIDE_INT m_stack_align_off_in;
2535 /* Register order and offsets. */
2536 struct reginfo m_regs[MAX_REGS];
2538 /* Lazy-inited cache of symbol names for stubs. */
2539 static char s_stub_names[XLOGUE_STUB_COUNT][VARIANT_COUNT]
2540 [STUB_NAME_MAX_LEN];
2542 static const xlogue_layout s_instances[XLOGUE_SET_COUNT];
2545 const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
2546 "savms64",
2547 "resms64",
2548 "resms64x",
2549 "savms64f",
2550 "resms64f",
2551 "resms64fx"
2554 const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
2555 /* The below offset values are where each register is stored for the layout
2556 relative to incoming stack pointer. The value of each m_regs[].offset will
2557 be relative to the incoming base pointer (rax or rsi) used by the stub.
2559 s_instances: 0 1 2 3
2560 Offset: realigned or aligned + 8
2561 Register aligned aligned + 8 aligned w/HFP w/HFP */
2562 XMM15_REG, /* 0x10 0x18 0x10 0x18 */
2563 XMM14_REG, /* 0x20 0x28 0x20 0x28 */
2564 XMM13_REG, /* 0x30 0x38 0x30 0x38 */
2565 XMM12_REG, /* 0x40 0x48 0x40 0x48 */
2566 XMM11_REG, /* 0x50 0x58 0x50 0x58 */
2567 XMM10_REG, /* 0x60 0x68 0x60 0x68 */
2568 XMM9_REG, /* 0x70 0x78 0x70 0x78 */
2569 XMM8_REG, /* 0x80 0x88 0x80 0x88 */
2570 XMM7_REG, /* 0x90 0x98 0x90 0x98 */
2571 XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */
2572 SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */
2573 DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */
2574 BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */
2575 BP_REG, /* 0xc0 0xc8 N/A N/A */
2576 R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */
2577 R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */
2578 R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */
2579 R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */
2582 /* Instantiate static const values. */
2583 const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
2584 const unsigned xlogue_layout::MIN_REGS;
2585 const unsigned xlogue_layout::MAX_REGS;
2586 const unsigned xlogue_layout::MAX_EXTRA_REGS;
2587 const unsigned xlogue_layout::VARIANT_COUNT;
2588 const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
2590 /* Initialize xlogue_layout::s_stub_names to zero. */
2591 char xlogue_layout::s_stub_names[XLOGUE_STUB_COUNT][VARIANT_COUNT]
2592 [STUB_NAME_MAX_LEN];
2594 /* Instantiates all xlogue_layout instances. */
2595 const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
2596 xlogue_layout (0, false),
2597 xlogue_layout (8, false),
2598 xlogue_layout (0, true),
2599 xlogue_layout (8, true)
2602 /* Return an appropriate const instance of xlogue_layout based upon values
2603 in cfun->machine and crtl. */
2604 const struct xlogue_layout &
2605 xlogue_layout::get_instance ()
2607 enum xlogue_stub_sets stub_set;
2608 bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
2610 if (stack_realign_fp)
2611 stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
2612 else if (frame_pointer_needed)
2613 stub_set = aligned_plus_8
2614 ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
2615 : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
2616 else
2617 stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
2619 return s_instances[stub_set];
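/* For example (assuming a frame pointer is needed, the stack is not being
   realigned, and call_ms2sysv_pad_in is set), the function above returns
   s_instances[XLOGUE_SET_HFP_ALIGNED_PLUS_8], i.e. the layout constructed
   as xlogue_layout (8, true).  */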
2622 /* Determine how many clobbered registers can be saved by the stub.
2623 Returns the count of registers the stub will save and restore. */
2624 unsigned
2625 xlogue_layout::count_stub_managed_regs ()
2627 bool hfp = frame_pointer_needed || stack_realign_fp;
2628 unsigned i, count;
2629 unsigned regno;
2631 for (count = i = MIN_REGS; i < MAX_REGS; ++i)
2633 regno = REG_ORDER[i];
2634 if (regno == BP_REG && hfp)
2635 continue;
2636 if (!ix86_save_reg (regno, false, false))
2637 break;
2638 ++count;
2640 return count;
2643 /* Determine if register REGNO is a stub managed register given the
2644 total COUNT of stub managed registers. */
2645 bool
2646 xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
2648 bool hfp = frame_pointer_needed || stack_realign_fp;
2649 unsigned i;
2651 for (i = 0; i < count; ++i)
2653 gcc_assert (i < MAX_REGS);
2654 if (REG_ORDER[i] == BP_REG && hfp)
2655 ++count;
2656 else if (REG_ORDER[i] == regno)
2657 return true;
2659 return false;
2662 /* Constructor for xlogue_layout. */
2663 xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
2664 : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
2665 m_stack_align_off_in (stack_align_off_in)
2667 HOST_WIDE_INT offset = stack_align_off_in;
2668 unsigned i, j;
2670 for (i = j = 0; i < MAX_REGS; ++i)
2672 unsigned regno = REG_ORDER[i];
2674 if (regno == BP_REG && hfp)
2675 continue;
2676 if (SSE_REGNO_P (regno))
2678 offset += 16;
2679 /* Verify that SSE regs are always aligned. */
2680 gcc_assert (!((stack_align_off_in + offset) & 15));
2682 else
2683 offset += 8;
2685 m_regs[j].regno = regno;
2686 m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
2688 gcc_assert (j == m_nregs);
2691 const char *
2692 xlogue_layout::get_stub_name (enum xlogue_stub stub,
2693 unsigned n_extra_regs)
2695 char *name = s_stub_names[stub][n_extra_regs];
2697 /* Lazy init */
2698 if (!*name)
2700 int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%u",
2701 STUB_BASE_NAMES[stub], MIN_REGS + n_extra_regs);
2702 gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
2705 return name;
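/* For instance, get_stub_name (XLOGUE_STUB_SAVE, 2) would yield
   "__savms64_14", since MIN_REGS is NUM_X86_64_MS_CLOBBERED_REGS (12) and
   the save stub's base name is "savms64"; the result is cached in
   s_stub_names for later calls.  */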
2708 /* Return rtx of a symbol ref for the entry point (based upon
2709 cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
2711 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
2713 const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
2714 gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
2715 gcc_assert (stub < XLOGUE_STUB_COUNT);
2716 gcc_assert (crtl->stack_realign_finalized);
2718 return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
2721 /* Define the structure for the machine field in struct function. */
2723 struct GTY(()) stack_local_entry {
2724 unsigned short mode;
2725 unsigned short n;
2726 rtx rtl;
2727 struct stack_local_entry *next;
2730 /* Which cpu are we scheduling for. */
2731 enum attr_cpu ix86_schedule;
2733 /* Which cpu are we optimizing for. */
2734 enum processor_type ix86_tune;
2736 /* Which instruction set architecture to use. */
2737 enum processor_type ix86_arch;
2739 /* True if processor has SSE prefetch instruction. */
2740 unsigned char x86_prefetch_sse;
2742 /* -mstackrealign option */
2743 static const char ix86_force_align_arg_pointer_string[]
2744 = "force_align_arg_pointer";
2746 static rtx (*ix86_gen_leave) (void);
2747 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2748 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2749 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2750 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2751 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2752 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
2753 static rtx (*ix86_gen_clzero) (rtx);
2754 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2755 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2756 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2757 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2758 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2759 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2761 /* Preferred alignment for stack boundary in bits. */
2762 unsigned int ix86_preferred_stack_boundary;
2764 /* Alignment for incoming stack boundary in bits specified at
2765 command line. */
2766 static unsigned int ix86_user_incoming_stack_boundary;
2768 /* Default alignment for incoming stack boundary in bits. */
2769 static unsigned int ix86_default_incoming_stack_boundary;
2771 /* Alignment for incoming stack boundary in bits. */
2772 unsigned int ix86_incoming_stack_boundary;
2774 /* Calling-ABI-specific va_list type nodes. */
2775 static GTY(()) tree sysv_va_list_type_node;
2776 static GTY(()) tree ms_va_list_type_node;
2778 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2779 char internal_label_prefix[16];
2780 int internal_label_prefix_len;
2782 /* Fence to use after loop using movnt. */
2783 tree x86_mfence;
2785 /* Register class used for passing a given 64-bit part of the argument.
2786 These represent classes as documented by the psABI, with the exception
2787 of the SSESF and SSEDF classes, which are basically the SSE class; gcc just
2788 uses an SFmode or DFmode move instead of DImode to avoid reformatting penalties.
2790 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2791 whenever possible (the upper half contains only padding). */
2792 enum x86_64_reg_class
2794 X86_64_NO_CLASS,
2795 X86_64_INTEGER_CLASS,
2796 X86_64_INTEGERSI_CLASS,
2797 X86_64_SSE_CLASS,
2798 X86_64_SSESF_CLASS,
2799 X86_64_SSEDF_CLASS,
2800 X86_64_SSEUP_CLASS,
2801 X86_64_X87_CLASS,
2802 X86_64_X87UP_CLASS,
2803 X86_64_COMPLEX_X87_CLASS,
2804 X86_64_MEMORY_CLASS
2807 #define MAX_CLASSES 8
2809 /* Table of constants used by fldpi, fldln2, etc.... */
2810 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2811 static bool ext_80387_constants_init;
2814 static struct machine_function * ix86_init_machine_status (void);
2815 static rtx ix86_function_value (const_tree, const_tree, bool);
2816 static bool ix86_function_value_regno_p (const unsigned int);
2817 static unsigned int ix86_function_arg_boundary (machine_mode,
2818 const_tree);
2819 static rtx ix86_static_chain (const_tree, bool);
2820 static int ix86_function_regparm (const_tree, const_tree);
2821 static void ix86_compute_frame_layout (void);
2822 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
2823 rtx, rtx, int);
2824 static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
2825 static tree ix86_canonical_va_list_type (tree);
2826 static void predict_jump (int);
2827 static unsigned int split_stack_prologue_scratch_regno (void);
2828 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2830 enum ix86_function_specific_strings
2832 IX86_FUNCTION_SPECIFIC_ARCH,
2833 IX86_FUNCTION_SPECIFIC_TUNE,
2834 IX86_FUNCTION_SPECIFIC_MAX
2837 static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
2838 const char *, const char *, enum fpmath_unit,
2839 bool);
2840 static void ix86_function_specific_save (struct cl_target_option *,
2841 struct gcc_options *opts);
2842 static void ix86_function_specific_restore (struct gcc_options *opts,
2843 struct cl_target_option *);
2844 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
2845 static void ix86_function_specific_print (FILE *, int,
2846 struct cl_target_option *);
2847 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2848 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2849 struct gcc_options *,
2850 struct gcc_options *,
2851 struct gcc_options *);
2852 static bool ix86_can_inline_p (tree, tree);
2853 static void ix86_set_current_function (tree);
2854 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2856 static enum calling_abi ix86_function_abi (const_tree);
2859 #ifndef SUBTARGET32_DEFAULT_CPU
2860 #define SUBTARGET32_DEFAULT_CPU "i386"
2861 #endif
2863 /* Whether -mtune= or -march= were specified. */
2864 static int ix86_tune_defaulted;
2865 static int ix86_arch_specified;
2867 /* Vectorization library interface and handlers. */
2868 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
2870 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
2871 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
2873 /* Processor target table, indexed by processor number */
2874 struct ptt
2876 const char *const name; /* processor name */
2877 const struct processor_costs *cost; /* Processor costs */
2878 const int align_loop; /* Default alignments. */
2879 const int align_loop_max_skip;
2880 const int align_jump;
2881 const int align_jump_max_skip;
2882 const int align_func;
2885 /* This table must be in sync with enum processor_type in i386.h. */
2886 static const struct ptt processor_target_table[PROCESSOR_max] =
2888 {"generic", &generic_cost, 16, 10, 16, 10, 16},
2889 {"i386", &i386_cost, 4, 3, 4, 3, 4},
2890 {"i486", &i486_cost, 16, 15, 16, 15, 16},
2891 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
2892 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
2893 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
2894 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
2895 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
2896 {"core2", &core_cost, 16, 10, 16, 10, 16},
2897 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
2898 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
2899 {"haswell", &core_cost, 16, 10, 16, 10, 16},
2900 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
2901 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
2902 {"knl", &slm_cost, 16, 15, 16, 7, 16},
2903 {"skylake-avx512", &core_cost, 16, 10, 16, 10, 16},
2904 {"intel", &intel_cost, 16, 15, 16, 7, 16},
2905 {"geode", &geode_cost, 0, 0, 0, 0, 0},
2906 {"k6", &k6_cost, 32, 7, 32, 7, 32},
2907 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
2908 {"k8", &k8_cost, 16, 7, 16, 7, 16},
2909 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
2910 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
2911 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
2912 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
2913 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
2914 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
2915 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
2916 {"znver1", &znver1_cost, 16, 15, 16, 15, 16}
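/* The alignment columns above act as defaults: when -falign-loops,
   -falign-jumps or -falign-functions are left at 0, ix86_default_align
   (further below) fills them in from this table for the selected
   -mtune processor.  */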
2919 static unsigned int
2920 rest_of_handle_insert_vzeroupper (void)
2922 int i;
2924 /* vzeroupper instructions are inserted immediately after reload to
2925 account for possible spills from 256-bit registers. The pass
2926 reuses the mode switching infrastructure by re-running the mode
2927 insertion pass, so disable entities that have already been processed. */
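/* (Background: vzeroupper clears the upper halves of the YMM registers;
   without it, mixing 256-bit AVX code with legacy SSE code can incur
   AVX/SSE transition stalls on some CPUs, which is why this pass inserts
   it at the mode-switching points computed below.)  */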
2928 for (i = 0; i < MAX_386_ENTITIES; i++)
2929 ix86_optimize_mode_switching[i] = 0;
2931 ix86_optimize_mode_switching[AVX_U128] = 1;
2933 /* Call optimize_mode_switching. */
2934 g->get_passes ()->execute_pass_mode_switching ();
2935 return 0;
2938 /* Return true if INSN uses or defines a hard register.
2939 Hard register uses in a memory address are ignored.
2940 Clobbers and flags definitions are ignored. */
2942 static bool
2943 has_non_address_hard_reg (rtx_insn *insn)
2945 df_ref ref;
2946 FOR_EACH_INSN_DEF (ref, insn)
2947 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
2948 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
2949 && DF_REF_REGNO (ref) != FLAGS_REG)
2950 return true;
2952 FOR_EACH_INSN_USE (ref, insn)
2953 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
2954 return true;
2956 return false;
2959 /* Check whether comparison INSN may be transformed
2960 into a vector comparison. Currently we transform
2961 only zero checks that look like:
2963 (set (reg:CCZ 17 flags)
2964 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
2965 (subreg:SI (reg:DI x) 0))
2966 (const_int 0 [0]))) */
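/* For example, on a 32-bit target a test such as

      unsigned long long x;
      if (x == 0) ...

   is typically expanded into exactly this shape: the two SImode halves
   of the DImode register are IORed together and the result is compared
   against zero.  */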
2968 static bool
2969 convertible_comparison_p (rtx_insn *insn)
2971 if (!TARGET_SSE4_1)
2972 return false;
2974 rtx def_set = single_set (insn);
2976 gcc_assert (def_set);
2978 rtx src = SET_SRC (def_set);
2979 rtx dst = SET_DEST (def_set);
2981 gcc_assert (GET_CODE (src) == COMPARE);
2983 if (GET_CODE (dst) != REG
2984 || REGNO (dst) != FLAGS_REG
2985 || GET_MODE (dst) != CCZmode)
2986 return false;
2988 rtx op1 = XEXP (src, 0);
2989 rtx op2 = XEXP (src, 1);
2991 if (op2 != CONST0_RTX (GET_MODE (op2)))
2992 return false;
2994 if (GET_CODE (op1) != IOR)
2995 return false;
2997 op2 = XEXP (op1, 1);
2998 op1 = XEXP (op1, 0);
3000 if (!SUBREG_P (op1)
3001 || !SUBREG_P (op2)
3002 || GET_MODE (op1) != SImode
3003 || GET_MODE (op2) != SImode
3004 || ((SUBREG_BYTE (op1) != 0
3005 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
3006 && (SUBREG_BYTE (op2) != 0
3007 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
3008 return false;
3010 op1 = SUBREG_REG (op1);
3011 op2 = SUBREG_REG (op2);
3013 if (op1 != op2
3014 || !REG_P (op1)
3015 || GET_MODE (op1) != DImode)
3016 return false;
3018 return true;
3021 /* The DImode version of scalar_to_vector_candidate_p. */
3023 static bool
3024 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
3026 rtx def_set = single_set (insn);
3028 if (!def_set)
3029 return false;
3031 if (has_non_address_hard_reg (insn))
3032 return false;
3034 rtx src = SET_SRC (def_set);
3035 rtx dst = SET_DEST (def_set);
3037 if (GET_CODE (src) == COMPARE)
3038 return convertible_comparison_p (insn);
3040 /* We are interested in DImode promotion only. */
3041 if ((GET_MODE (src) != DImode
3042 && !CONST_INT_P (src))
3043 || GET_MODE (dst) != DImode)
3044 return false;
3046 if (!REG_P (dst) && !MEM_P (dst))
3047 return false;
3049 switch (GET_CODE (src))
3051 case ASHIFTRT:
3052 if (!TARGET_AVX512VL)
3053 return false;
3054 /* FALLTHRU */
3056 case ASHIFT:
3057 case LSHIFTRT:
3058 if (!REG_P (XEXP (src, 1))
3059 && (!SUBREG_P (XEXP (src, 1))
3060 || SUBREG_BYTE (XEXP (src, 1)) != 0
3061 || !REG_P (SUBREG_REG (XEXP (src, 1))))
3062 && (!CONST_INT_P (XEXP (src, 1))
3063 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)))
3064 return false;
3066 if (GET_MODE (XEXP (src, 1)) != QImode
3067 && !CONST_INT_P (XEXP (src, 1)))
3068 return false;
3069 break;
3071 case PLUS:
3072 case MINUS:
3073 case IOR:
3074 case XOR:
3075 case AND:
3076 if (!REG_P (XEXP (src, 1))
3077 && !MEM_P (XEXP (src, 1))
3078 && !CONST_INT_P (XEXP (src, 1)))
3079 return false;
3081 if (GET_MODE (XEXP (src, 1)) != DImode
3082 && !CONST_INT_P (XEXP (src, 1)))
3083 return false;
3084 break;
3086 case NEG:
3087 case NOT:
3088 break;
3090 case REG:
3091 return true;
3093 case MEM:
3094 case CONST_INT:
3095 return REG_P (dst);
3097 default:
3098 return false;
3101 if (!REG_P (XEXP (src, 0))
3102 && !MEM_P (XEXP (src, 0))
3103 && !CONST_INT_P (XEXP (src, 0))
3104 /* Check for andnot case. */
3105 && (GET_CODE (src) != AND
3106 || GET_CODE (XEXP (src, 0)) != NOT
3107 || !REG_P (XEXP (XEXP (src, 0), 0))))
3108 return false;
3110 if (GET_MODE (XEXP (src, 0)) != DImode
3111 && !CONST_INT_P (XEXP (src, 0)))
3112 return false;
3114 return true;
3117 /* The TImode version of scalar_to_vector_candidate_p. */
3119 static bool
3120 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
3122 rtx def_set = single_set (insn);
3124 if (!def_set)
3125 return false;
3127 if (has_non_address_hard_reg (insn))
3128 return false;
3130 rtx src = SET_SRC (def_set);
3131 rtx dst = SET_DEST (def_set);
3133 /* Only TImode load and store are allowed. */
3134 if (GET_MODE (dst) != TImode)
3135 return false;
3137 if (MEM_P (dst))
3139 /* Check for a store. The memory must be aligned, or the unaligned
3140 store must be optimal. Only stores from a register, a standard SSE
3141 constant, or a CONST_WIDE_INT generated from a piecewise store are supported.
3143 ??? Verify performance impact before enabling CONST_INT for
3144 __int128 store. */
3145 if (misaligned_operand (dst, TImode)
3146 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
3147 return false;
3149 switch (GET_CODE (src))
3151 default:
3152 return false;
3154 case REG:
3155 case CONST_WIDE_INT:
3156 return true;
3158 case CONST_INT:
3159 return standard_sse_constant_p (src, TImode);
3162 else if (MEM_P (src))
3164 /* Check for a load. The memory must be aligned, or the unaligned
3165 load must be optimal. */
3166 return (REG_P (dst)
3167 && (!misaligned_operand (src, TImode)
3168 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
3171 return false;
3174 /* Return true if INSN may be converted into a vector
3175 instruction. */
3177 static bool
3178 scalar_to_vector_candidate_p (rtx_insn *insn)
3180 if (TARGET_64BIT)
3181 return timode_scalar_to_vector_candidate_p (insn);
3182 else
3183 return dimode_scalar_to_vector_candidate_p (insn);
3186 /* The DImode version of remove_non_convertible_regs. */
3188 static void
3189 dimode_remove_non_convertible_regs (bitmap candidates)
3191 bitmap_iterator bi;
3192 unsigned id;
3193 bitmap regs = BITMAP_ALLOC (NULL);
3195 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
3197 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
3198 rtx reg = SET_DEST (def_set);
3200 if (!REG_P (reg)
3201 || bitmap_bit_p (regs, REGNO (reg))
3202 || HARD_REGISTER_P (reg))
3203 continue;
3205 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
3206 def;
3207 def = DF_REF_NEXT_REG (def))
3209 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3211 if (dump_file)
3212 fprintf (dump_file,
3213 "r%d has non convertible definition in insn %d\n",
3214 REGNO (reg), DF_REF_INSN_UID (def));
3216 bitmap_set_bit (regs, REGNO (reg));
3217 break;
3222 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
3224 for (df_ref def = DF_REG_DEF_CHAIN (id);
3225 def;
3226 def = DF_REF_NEXT_REG (def))
3227 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3229 if (dump_file)
3230 fprintf (dump_file, "Removing insn %d from candidates list\n",
3231 DF_REF_INSN_UID (def));
3233 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
3237 BITMAP_FREE (regs);
3240 /* For a register REGNO, scan instructions for its defs and uses.
3241 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
3243 static void
3244 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
3245 unsigned int regno)
3247 for (df_ref def = DF_REG_DEF_CHAIN (regno);
3248 def;
3249 def = DF_REF_NEXT_REG (def))
3251 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3253 if (dump_file)
3254 fprintf (dump_file,
3255 "r%d has non convertible def in insn %d\n",
3256 regno, DF_REF_INSN_UID (def));
3258 bitmap_set_bit (regs, regno);
3259 break;
3263 for (df_ref ref = DF_REG_USE_CHAIN (regno);
3264 ref;
3265 ref = DF_REF_NEXT_REG (ref))
3267 /* Debug instructions are skipped. */
3268 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
3269 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3271 if (dump_file)
3272 fprintf (dump_file,
3273 "r%d has non convertible use in insn %d\n",
3274 regno, DF_REF_INSN_UID (ref));
3276 bitmap_set_bit (regs, regno);
3277 break;
3282 /* The TImode version of remove_non_convertible_regs. */
3284 static void
3285 timode_remove_non_convertible_regs (bitmap candidates)
3287 bitmap_iterator bi;
3288 unsigned id;
3289 bitmap regs = BITMAP_ALLOC (NULL);
3291 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
3293 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
3294 rtx dest = SET_DEST (def_set);
3295 rtx src = SET_SRC (def_set);
3297 if ((!REG_P (dest)
3298 || bitmap_bit_p (regs, REGNO (dest))
3299 || HARD_REGISTER_P (dest))
3300 && (!REG_P (src)
3301 || bitmap_bit_p (regs, REGNO (src))
3302 || HARD_REGISTER_P (src)))
3303 continue;
3305 if (REG_P (dest))
3306 timode_check_non_convertible_regs (candidates, regs,
3307 REGNO (dest));
3309 if (REG_P (src))
3310 timode_check_non_convertible_regs (candidates, regs,
3311 REGNO (src));
3314 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
3316 for (df_ref def = DF_REG_DEF_CHAIN (id);
3317 def;
3318 def = DF_REF_NEXT_REG (def))
3319 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
3321 if (dump_file)
3322 fprintf (dump_file, "Removing insn %d from candidates list\n",
3323 DF_REF_INSN_UID (def));
3325 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
3328 for (df_ref ref = DF_REG_USE_CHAIN (id);
3329 ref;
3330 ref = DF_REF_NEXT_REG (ref))
3331 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
3333 if (dump_file)
3334 fprintf (dump_file, "Removing insn %d from candidates list\n",
3335 DF_REF_INSN_UID (ref));
3337 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
3341 BITMAP_FREE (regs);
3344 /* For a given bitmap of insn UIDs, scan all instructions and
3345 remove an insn from CANDIDATES if it has both convertible
3346 and non-convertible definitions.
3348 All insns in the bitmap are conversion candidates according to
3349 scalar_to_vector_candidate_p. Currently this implies all insns
3350 are single_set. */
3352 static void
3353 remove_non_convertible_regs (bitmap candidates)
3355 if (TARGET_64BIT)
3356 timode_remove_non_convertible_regs (candidates);
3357 else
3358 dimode_remove_non_convertible_regs (candidates);
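/* The classes below implement the actual conversion.  A scalar_chain is
   seeded from one candidate insn (build), extended over all dependent
   defs and uses, costed (compute_convert_gain) and, if profitable,
   rewritten (convert): DImode chains become V2DImode operations on
   32-bit targets, while TImode chains become V1TImode moves on 64-bit
   targets.  */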
3361 class scalar_chain
3363 public:
3364 scalar_chain ();
3365 virtual ~scalar_chain ();
3367 static unsigned max_id;
3369 /* ID of a chain. */
3370 unsigned int chain_id;
3371 /* A queue of instructions to be included into a chain. */
3372 bitmap queue;
3373 /* Instructions included into a chain. */
3374 bitmap insns;
3375 /* All registers defined by a chain. */
3376 bitmap defs;
3377 /* Registers used in both vector and scalar modes. */
3378 bitmap defs_conv;
3380 void build (bitmap candidates, unsigned insn_uid);
3381 virtual int compute_convert_gain () = 0;
3382 int convert ();
3384 protected:
3385 void add_to_queue (unsigned insn_uid);
3386 void emit_conversion_insns (rtx insns, rtx_insn *pos);
3388 private:
3389 void add_insn (bitmap candidates, unsigned insn_uid);
3390 void analyze_register_chain (bitmap candidates, df_ref ref);
3391 virtual void mark_dual_mode_def (df_ref def) = 0;
3392 virtual void convert_insn (rtx_insn *insn) = 0;
3393 virtual void convert_registers () = 0;
3396 class dimode_scalar_chain : public scalar_chain
3398 public:
3399 int compute_convert_gain ();
3400 private:
3401 void mark_dual_mode_def (df_ref def);
3402 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
3403 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
3404 void convert_insn (rtx_insn *insn);
3405 void convert_op (rtx *op, rtx_insn *insn);
3406 void convert_reg (unsigned regno);
3407 void make_vector_copies (unsigned regno);
3408 void convert_registers ();
3409 int vector_const_cost (rtx exp);
3412 class timode_scalar_chain : public scalar_chain
3414 public:
3415 /* Converting from TImode to V1TImode is always faster. */
3416 int compute_convert_gain () { return 1; }
3418 private:
3419 void mark_dual_mode_def (df_ref def);
3420 void fix_debug_reg_uses (rtx reg);
3421 void convert_insn (rtx_insn *insn);
3422 /* We don't convert registers to a different size. */
3423 void convert_registers () {}
3426 unsigned scalar_chain::max_id = 0;
3428 /* Initialize new chain. */
3430 scalar_chain::scalar_chain ()
3432 chain_id = ++max_id;
3434 if (dump_file)
3435 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
3437 bitmap_obstack_initialize (NULL);
3438 insns = BITMAP_ALLOC (NULL);
3439 defs = BITMAP_ALLOC (NULL);
3440 defs_conv = BITMAP_ALLOC (NULL);
3441 queue = NULL;
3444 /* Free chain's data. */
3446 scalar_chain::~scalar_chain ()
3448 BITMAP_FREE (insns);
3449 BITMAP_FREE (defs);
3450 BITMAP_FREE (defs_conv);
3451 bitmap_obstack_release (NULL);
3454 /* Add an instruction to the chain's queue. */
3456 void
3457 scalar_chain::add_to_queue (unsigned insn_uid)
3459 if (bitmap_bit_p (insns, insn_uid)
3460 || bitmap_bit_p (queue, insn_uid))
3461 return;
3463 if (dump_file)
3464 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
3465 insn_uid, chain_id);
3466 bitmap_set_bit (queue, insn_uid);
3469 /* For DImode conversion, mark register defined by DEF as requiring
3470 conversion. */
3472 void
3473 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
3475 gcc_assert (DF_REF_REG_DEF_P (def));
3477 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
3478 return;
3480 if (dump_file)
3481 fprintf (dump_file,
3482 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
3483 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
3485 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
3488 /* For TImode conversion, it is unused. */
3490 void
3491 timode_scalar_chain::mark_dual_mode_def (df_ref)
3493 gcc_unreachable ();
3496 /* Check REF's chain to add new insns into a queue
3497 and find registers requiring conversion. */
3499 void
3500 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
3502 df_link *chain;
3504 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
3505 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
3506 add_to_queue (DF_REF_INSN_UID (ref));
3508 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
3510 unsigned uid = DF_REF_INSN_UID (chain->ref);
3512 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
3513 continue;
3515 if (!DF_REF_REG_MEM_P (chain->ref))
3517 if (bitmap_bit_p (insns, uid))
3518 continue;
3520 if (bitmap_bit_p (candidates, uid))
3522 add_to_queue (uid);
3523 continue;
3527 if (DF_REF_REG_DEF_P (chain->ref))
3529 if (dump_file)
3530 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
3531 DF_REF_REGNO (chain->ref), uid);
3532 mark_dual_mode_def (chain->ref);
3534 else
3536 if (dump_file)
3537 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
3538 DF_REF_REGNO (chain->ref), uid);
3539 mark_dual_mode_def (ref);
3544 /* Add an instruction to a chain. */
3546 void
3547 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
3549 if (bitmap_bit_p (insns, insn_uid))
3550 return;
3552 if (dump_file)
3553 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
3555 bitmap_set_bit (insns, insn_uid);
3557 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3558 rtx def_set = single_set (insn);
3559 if (def_set && REG_P (SET_DEST (def_set))
3560 && !HARD_REGISTER_P (SET_DEST (def_set)))
3561 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
3563 df_ref ref;
3564 df_ref def;
3565 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3566 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
3567 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
3568 def;
3569 def = DF_REF_NEXT_REG (def))
3570 analyze_register_chain (candidates, def);
3571 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
3572 if (!DF_REF_REG_MEM_P (ref))
3573 analyze_register_chain (candidates, ref);
3576 /* Build a new chain starting from insn INSN_UID, recursively
3577 adding all dependent uses and definitions. */
3579 void
3580 scalar_chain::build (bitmap candidates, unsigned insn_uid)
3582 queue = BITMAP_ALLOC (NULL);
3583 bitmap_set_bit (queue, insn_uid);
3585 if (dump_file)
3586 fprintf (dump_file, "Building chain #%d...\n", chain_id);
3588 while (!bitmap_empty_p (queue))
3590 insn_uid = bitmap_first_set_bit (queue);
3591 bitmap_clear_bit (queue, insn_uid);
3592 bitmap_clear_bit (candidates, insn_uid);
3593 add_insn (candidates, insn_uid);
3596 if (dump_file)
3598 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
3599 fprintf (dump_file, " insns: ");
3600 dump_bitmap (dump_file, insns);
3601 if (!bitmap_empty_p (defs_conv))
3603 bitmap_iterator bi;
3604 unsigned id;
3605 const char *comma = "";
3606 fprintf (dump_file, " defs to convert: ");
3607 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
3609 fprintf (dump_file, "%sr%d", comma, id);
3610 comma = ", ";
3612 fprintf (dump_file, "\n");
3616 BITMAP_FREE (queue);
3619 /* Return the cost of building a vector constant
3620 instead of using a scalar one. */
3623 dimode_scalar_chain::vector_const_cost (rtx exp)
3625 gcc_assert (CONST_INT_P (exp));
3627 if (standard_sse_constant_p (exp, V2DImode))
3628 return COSTS_N_INSNS (1);
3629 return ix86_cost->sse_load[1];
3632 /* Compute a gain for chain conversion. */
3635 dimode_scalar_chain::compute_convert_gain ()
3637 bitmap_iterator bi;
3638 unsigned insn_uid;
3639 int gain = 0;
3640 int cost = 0;
3642 if (dump_file)
3643 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
3645 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
3647 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
3648 rtx def_set = single_set (insn);
3649 rtx src = SET_SRC (def_set);
3650 rtx dst = SET_DEST (def_set);
3652 if (REG_P (src) && REG_P (dst))
3653 gain += COSTS_N_INSNS (2) - ix86_cost->sse_move;
3654 else if (REG_P (src) && MEM_P (dst))
3655 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3656 else if (MEM_P (src) && REG_P (dst))
3657 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
3658 else if (GET_CODE (src) == ASHIFT
3659 || GET_CODE (src) == ASHIFTRT
3660 || GET_CODE (src) == LSHIFTRT)
3662 if (CONST_INT_P (XEXP (src, 0)))
3663 gain -= vector_const_cost (XEXP (src, 0));
3664 if (CONST_INT_P (XEXP (src, 1)))
3666 gain += ix86_cost->shift_const;
3667 if (INTVAL (XEXP (src, 1)) >= 32)
3668 gain -= COSTS_N_INSNS (1);
3670 else
3671 /* Additional gain for omitting two CMOVs. */
3672 gain += ix86_cost->shift_var + COSTS_N_INSNS (2);
3674 else if (GET_CODE (src) == PLUS
3675 || GET_CODE (src) == MINUS
3676 || GET_CODE (src) == IOR
3677 || GET_CODE (src) == XOR
3678 || GET_CODE (src) == AND)
3680 gain += ix86_cost->add;
3681 /* Additional gain for andnot for targets without BMI. */
3682 if (GET_CODE (XEXP (src, 0)) == NOT
3683 && !TARGET_BMI)
3684 gain += 2 * ix86_cost->add;
3686 if (CONST_INT_P (XEXP (src, 0)))
3687 gain -= vector_const_cost (XEXP (src, 0));
3688 if (CONST_INT_P (XEXP (src, 1)))
3689 gain -= vector_const_cost (XEXP (src, 1));
3691 else if (GET_CODE (src) == NEG
3692 || GET_CODE (src) == NOT)
3693 gain += ix86_cost->add - COSTS_N_INSNS (1);
3694 else if (GET_CODE (src) == COMPARE)
3696 /* Assume comparison cost is the same. */
3698 else if (CONST_INT_P (src))
3700 if (REG_P (dst))
3701 gain += COSTS_N_INSNS (2);
3702 else if (MEM_P (dst))
3703 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
3704 gain -= vector_const_cost (src);
3706 else
3707 gcc_unreachable ();
3710 if (dump_file)
3711 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
3713 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
3714 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
3716 if (dump_file)
3717 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
3719 gain -= cost;
3721 if (dump_file)
3722 fprintf (dump_file, " Total gain: %d\n", gain);
3724 return gain;
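/* A rough illustration of the trade-off above: a single DImode addition
   with register operands contributes ix86_cost->add to the gain, while
   every register that must also stay live in scalar mode costs
   DF_REG_DEF_COUNT * ix86_cost->mmxsse_to_integer, so short chains whose
   results escape back into scalar code are usually rejected.  */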
3727 /* Replace REG in X with a V2DI subreg of NEW_REG. */
3730 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
3732 if (x == reg)
3733 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
3735 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
3736 int i, j;
3737 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
3739 if (fmt[i] == 'e')
3740 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
3741 else if (fmt[i] == 'E')
3742 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
3743 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
3744 reg, new_reg);
3747 return x;
3750 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
3752 void
3753 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
3754 rtx reg, rtx new_reg)
3756 replace_with_subreg (single_set (insn), reg, new_reg);
3759 /* Insert generated conversion instruction sequence INSNS
3760 after instruction AFTER. A new BB may be required if the
3761 instruction has an EH region attached. */
3763 void
3764 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
3766 if (!control_flow_insn_p (after))
3768 emit_insn_after (insns, after);
3769 return;
3772 basic_block bb = BLOCK_FOR_INSN (after);
3773 edge e = find_fallthru_edge (bb->succs);
3774 gcc_assert (e);
3776 basic_block new_bb = split_edge (e);
3777 emit_insn_after (insns, BB_HEAD (new_bb));
3780 /* Make vector copies for all definitions of register REGNO
3781 and replace its uses in the chain. */
3783 void
3784 dimode_scalar_chain::make_vector_copies (unsigned regno)
3786 rtx reg = regno_reg_rtx[regno];
3787 rtx vreg = gen_reg_rtx (DImode);
3788 bool count_reg = false;
3789 df_ref ref;
3791 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3792 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3794 df_ref use;
3796 /* Detect the count register of a shift instruction. */
3797 for (use = DF_REG_USE_CHAIN (regno); use; use = DF_REF_NEXT_REG (use))
3798 if (bitmap_bit_p (insns, DF_REF_INSN_UID (use)))
3800 rtx_insn *insn = DF_REF_INSN (use);
3801 rtx def_set = single_set (insn);
3803 gcc_assert (def_set);
3805 rtx src = SET_SRC (def_set);
3807 if ((GET_CODE (src) == ASHIFT
3808 || GET_CODE (src) == ASHIFTRT
3809 || GET_CODE (src) == LSHIFTRT)
3810 && !CONST_INT_P (XEXP (src, 1))
3811 && reg_or_subregno (XEXP (src, 1)) == regno)
3812 count_reg = true;
3815 start_sequence ();
3816 if (count_reg)
3818 rtx qreg = gen_lowpart (QImode, reg);
3819 rtx tmp = gen_reg_rtx (SImode);
3821 if (TARGET_ZERO_EXTEND_WITH_AND
3822 && optimize_function_for_speed_p (cfun))
3824 emit_move_insn (tmp, const0_rtx);
3825 emit_insn (gen_movstrictqi
3826 (gen_lowpart (QImode, tmp), qreg));
3828 else
3829 emit_insn (gen_rtx_SET
3830 (tmp, gen_rtx_ZERO_EXTEND (SImode, qreg)));
3832 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
3834 rtx slot = assign_386_stack_local (SImode, SLOT_STV_TEMP);
3835 emit_move_insn (slot, tmp);
3836 tmp = copy_rtx (slot);
3839 emit_insn (gen_zero_extendsidi2 (vreg, tmp));
3841 else if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
3843 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
3844 emit_move_insn (adjust_address (tmp, SImode, 0),
3845 gen_rtx_SUBREG (SImode, reg, 0));
3846 emit_move_insn (adjust_address (tmp, SImode, 4),
3847 gen_rtx_SUBREG (SImode, reg, 4));
3848 emit_move_insn (vreg, tmp);
3850 else if (TARGET_SSE4_1)
3852 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3853 CONST0_RTX (V4SImode),
3854 gen_rtx_SUBREG (SImode, reg, 0)));
3855 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
3856 gen_rtx_SUBREG (V4SImode, vreg, 0),
3857 gen_rtx_SUBREG (SImode, reg, 4),
3858 GEN_INT (2)));
3860 else
3862 rtx tmp = gen_reg_rtx (DImode);
3863 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
3864 CONST0_RTX (V4SImode),
3865 gen_rtx_SUBREG (SImode, reg, 0)));
3866 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
3867 CONST0_RTX (V4SImode),
3868 gen_rtx_SUBREG (SImode, reg, 4)));
3869 emit_insn (gen_vec_interleave_lowv4si
3870 (gen_rtx_SUBREG (V4SImode, vreg, 0),
3871 gen_rtx_SUBREG (V4SImode, vreg, 0),
3872 gen_rtx_SUBREG (V4SImode, tmp, 0)));
3874 rtx_insn *seq = get_insns ();
3875 end_sequence ();
3876 rtx_insn *insn = DF_REF_INSN (ref);
3877 emit_conversion_insns (seq, insn);
3879 if (dump_file)
3880 fprintf (dump_file,
3881 " Copied r%d to a vector register r%d for insn %d\n",
3882 regno, REGNO (vreg), INSN_UID (insn));
3885 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3886 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3888 rtx_insn *insn = DF_REF_INSN (ref);
3889 if (count_reg)
3891 rtx def_set = single_set (insn);
3892 gcc_assert (def_set);
3894 rtx src = SET_SRC (def_set);
3896 if ((GET_CODE (src) == ASHIFT
3897 || GET_CODE (src) == ASHIFTRT
3898 || GET_CODE (src) == LSHIFTRT)
3899 && !CONST_INT_P (XEXP (src, 1))
3900 && reg_or_subregno (XEXP (src, 1)) == regno)
3901 XEXP (src, 1) = vreg;
3903 else
3904 replace_with_subreg_in_insn (insn, reg, vreg);
3906 if (dump_file)
3907 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
3908 regno, REGNO (vreg), INSN_UID (insn));
3912 /* Convert all definitions of register REGNO
3913 and fix its uses. Scalar copies may be created
3914 if the register is used in a non-convertible insn. */
3916 void
3917 dimode_scalar_chain::convert_reg (unsigned regno)
3919 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
3920 rtx reg = regno_reg_rtx[regno];
3921 rtx scopy = NULL_RTX;
3922 df_ref ref;
3923 bitmap conv;
3925 conv = BITMAP_ALLOC (NULL);
3926 bitmap_copy (conv, insns);
3928 if (scalar_copy)
3929 scopy = gen_reg_rtx (DImode);
3931 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3933 rtx_insn *insn = DF_REF_INSN (ref);
3934 rtx def_set = single_set (insn);
3935 rtx src = SET_SRC (def_set);
3936 rtx reg = DF_REF_REG (ref);
3938 if (!MEM_P (src))
3940 replace_with_subreg_in_insn (insn, reg, reg);
3941 bitmap_clear_bit (conv, INSN_UID (insn));
3944 if (scalar_copy)
3946 start_sequence ();
3947 if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
3949 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
3950 emit_move_insn (tmp, reg);
3951 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3952 adjust_address (tmp, SImode, 0));
3953 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3954 adjust_address (tmp, SImode, 4));
3956 else if (TARGET_SSE4_1)
3958 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
3959 emit_insn
3960 (gen_rtx_SET
3961 (gen_rtx_SUBREG (SImode, scopy, 0),
3962 gen_rtx_VEC_SELECT (SImode,
3963 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
3965 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
3966 emit_insn
3967 (gen_rtx_SET
3968 (gen_rtx_SUBREG (SImode, scopy, 4),
3969 gen_rtx_VEC_SELECT (SImode,
3970 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
3972 else
3974 rtx vcopy = gen_reg_rtx (V2DImode);
3975 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
3976 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
3977 gen_rtx_SUBREG (SImode, vcopy, 0));
3978 emit_move_insn (vcopy,
3979 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
3980 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
3981 gen_rtx_SUBREG (SImode, vcopy, 0));
3983 rtx_insn *seq = get_insns ();
3984 end_sequence ();
3985 emit_conversion_insns (seq, insn);
3987 if (dump_file)
3988 fprintf (dump_file,
3989 " Copied r%d to a scalar register r%d for insn %d\n",
3990 regno, REGNO (scopy), INSN_UID (insn));
3994 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
3995 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
3997 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
3999 rtx_insn *insn = DF_REF_INSN (ref);
4001 rtx def_set = single_set (insn);
4002 gcc_assert (def_set);
4004 rtx src = SET_SRC (def_set);
4005 rtx dst = SET_DEST (def_set);
4007 if ((GET_CODE (src) == ASHIFT
4008 || GET_CODE (src) == ASHIFTRT
4009 || GET_CODE (src) == LSHIFTRT)
4010 && !CONST_INT_P (XEXP (src, 1))
4011 && reg_or_subregno (XEXP (src, 1)) == regno)
4013 rtx tmp2 = gen_reg_rtx (V2DImode);
4015 start_sequence ();
4017 if (TARGET_SSE4_1)
4018 emit_insn (gen_sse4_1_zero_extendv2qiv2di2
4019 (tmp2, gen_rtx_SUBREG (V16QImode, reg, 0)));
4020 else
4022 rtx vec_cst
4023 = gen_rtx_CONST_VECTOR (V2DImode,
4024 gen_rtvec (2, GEN_INT (0xff),
4025 const0_rtx));
4026 vec_cst
4027 = validize_mem (force_const_mem (V2DImode, vec_cst));
4029 emit_insn (gen_rtx_SET
4030 (tmp2,
4031 gen_rtx_AND (V2DImode,
4032 gen_rtx_SUBREG (V2DImode, reg, 0),
4033 vec_cst)));
4035 rtx_insn *seq = get_insns ();
4036 end_sequence ();
4038 emit_insn_before (seq, insn);
4040 XEXP (src, 1) = gen_rtx_SUBREG (DImode, tmp2, 0);
4042 else if (!MEM_P (dst) || !REG_P (src))
4043 replace_with_subreg_in_insn (insn, reg, reg);
4045 bitmap_clear_bit (conv, INSN_UID (insn));
4048 /* Skip debug insns and uninitialized uses. */
4049 else if (DF_REF_CHAIN (ref)
4050 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
4052 gcc_assert (scopy);
4053 replace_rtx (DF_REF_INSN (ref), reg, scopy);
4054 df_insn_rescan (DF_REF_INSN (ref));
4057 BITMAP_FREE (conv);
4060 /* Convert operand OP in INSN. We should handle
4061 memory operands and uninitialized registers.
4062 All other register uses are converted during
4063 register conversion. */
4065 void
4066 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
4068 *op = copy_rtx_if_shared (*op);
4070 if (GET_CODE (*op) == NOT)
4072 convert_op (&XEXP (*op, 0), insn);
4073 PUT_MODE (*op, V2DImode);
4075 else if (MEM_P (*op))
4077 rtx tmp = gen_reg_rtx (DImode);
4079 emit_insn_before (gen_move_insn (tmp, *op), insn);
4080 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
4082 if (dump_file)
4083 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
4084 INSN_UID (insn), REGNO (tmp));
4086 else if (REG_P (*op))
4088 /* We may not have converted this register use if the
4089 register has no definition. Otherwise it should have
4090 been converted in convert_reg. */
4091 df_ref ref;
4092 FOR_EACH_INSN_USE (ref, insn)
4093 if (DF_REF_REGNO (ref) == REGNO (*op))
4095 gcc_assert (!DF_REF_CHAIN (ref));
4096 break;
4098 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
4100 else if (CONST_INT_P (*op))
4102 rtx vec_cst;
4103 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
4105 /* Prefer all ones vector in case of -1. */
4106 if (constm1_operand (*op, GET_MODE (*op)))
4107 vec_cst = CONSTM1_RTX (V2DImode);
4108 else
4109 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
4110 gen_rtvec (2, *op, const0_rtx));
4112 if (!standard_sse_constant_p (vec_cst, V2DImode))
4114 start_sequence ();
4115 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
4116 rtx_insn *seq = get_insns ();
4117 end_sequence ();
4118 emit_insn_before (seq, insn);
4121 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
4122 *op = tmp;
4124 else
4126 gcc_assert (SUBREG_P (*op));
4127 gcc_assert (GET_MODE (*op) == V2DImode);
4131 /* Convert INSN to vector mode. */
4133 void
4134 dimode_scalar_chain::convert_insn (rtx_insn *insn)
4136 rtx def_set = single_set (insn);
4137 rtx src = SET_SRC (def_set);
4138 rtx dst = SET_DEST (def_set);
4139 rtx subreg;
4141 if (MEM_P (dst) && !REG_P (src))
4143 /* There are no scalar integer instructions and therefore
4144 temporary register usage is required. */
4145 rtx tmp = gen_reg_rtx (DImode);
4146 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
4147 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
4150 switch (GET_CODE (src))
4152 case ASHIFT:
4153 case ASHIFTRT:
4154 case LSHIFTRT:
4155 convert_op (&XEXP (src, 0), insn);
4156 PUT_MODE (src, V2DImode);
4157 break;
4159 case PLUS:
4160 case MINUS:
4161 case IOR:
4162 case XOR:
4163 case AND:
4164 convert_op (&XEXP (src, 0), insn);
4165 convert_op (&XEXP (src, 1), insn);
4166 PUT_MODE (src, V2DImode);
4167 break;
4169 case NEG:
4170 src = XEXP (src, 0);
4171 convert_op (&src, insn);
4172 subreg = gen_reg_rtx (V2DImode);
4173 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
4174 src = gen_rtx_MINUS (V2DImode, subreg, src);
4175 break;
4177 case NOT:
4178 src = XEXP (src, 0);
4179 convert_op (&src, insn);
4180 subreg = gen_reg_rtx (V2DImode);
4181 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
4182 src = gen_rtx_XOR (V2DImode, src, subreg);
4183 break;
4185 case MEM:
4186 if (!REG_P (dst))
4187 convert_op (&src, insn);
4188 break;
4190 case REG:
4191 if (!MEM_P (dst))
4192 convert_op (&src, insn);
4193 break;
4195 case SUBREG:
4196 gcc_assert (GET_MODE (src) == V2DImode);
4197 break;
4199 case COMPARE:
4200 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
4202 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
4203 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
4205 if (REG_P (src))
4206 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
4207 else
4208 subreg = copy_rtx_if_shared (src);
4209 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
4210 copy_rtx_if_shared (subreg),
4211 copy_rtx_if_shared (subreg)),
4212 insn);
4213 dst = gen_rtx_REG (CCmode, FLAGS_REG);
4214 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
4215 copy_rtx_if_shared (src)),
4216 UNSPEC_PTEST);
4217 break;
4219 case CONST_INT:
4220 convert_op (&src, insn);
4221 break;
4223 default:
4224 gcc_unreachable ();
4227 SET_SRC (def_set) = src;
4228 SET_DEST (def_set) = dst;
4230 /* Drop possible dead definitions. */
4231 PATTERN (insn) = def_set;
4233 INSN_CODE (insn) = -1;
4234 recog_memoized (insn);
4235 df_insn_rescan (insn);
4238 /* Fix uses of converted REG in debug insns. */
4240 void
4241 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
4243 if (!flag_var_tracking)
4244 return;
4246 df_ref ref, next;
4247 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
4249 rtx_insn *insn = DF_REF_INSN (ref);
4250 /* Make sure the next ref is for a different instruction,
4251 so that we're not affected by the rescan. */
4252 next = DF_REF_NEXT_REG (ref);
4253 while (next && DF_REF_INSN (next) == insn)
4254 next = DF_REF_NEXT_REG (next);
4256 if (DEBUG_INSN_P (insn))
4258 /* It may be a debug insn with a TImode variable in
4259 register. */
4260 bool changed = false;
4261 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
4263 rtx *loc = DF_REF_LOC (ref);
4264 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
4266 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
4267 changed = true;
4270 if (changed)
4271 df_insn_rescan (insn);
4276 /* Convert INSN from TImode to V1TImode. */
4278 void
4279 timode_scalar_chain::convert_insn (rtx_insn *insn)
4281 rtx def_set = single_set (insn);
4282 rtx src = SET_SRC (def_set);
4283 rtx dst = SET_DEST (def_set);
4285 switch (GET_CODE (dst))
4287 case REG:
4289 rtx tmp = find_reg_equal_equiv_note (insn);
4290 if (tmp)
4291 PUT_MODE (XEXP (tmp, 0), V1TImode);
4292 PUT_MODE (dst, V1TImode);
4293 fix_debug_reg_uses (dst);
4295 break;
4296 case MEM:
4297 PUT_MODE (dst, V1TImode);
4298 break;
4300 default:
4301 gcc_unreachable ();
4304 switch (GET_CODE (src))
4306 case REG:
4307 PUT_MODE (src, V1TImode);
4308 /* Call fix_debug_reg_uses only if SRC is never defined. */
4309 if (!DF_REG_DEF_CHAIN (REGNO (src)))
4310 fix_debug_reg_uses (src);
4311 break;
4313 case MEM:
4314 PUT_MODE (src, V1TImode);
4315 break;
4317 case CONST_WIDE_INT:
4318 if (NONDEBUG_INSN_P (insn))
4320 /* Since there are no instructions to store 128-bit constant,
4321 temporary register usage is required. */
4322 rtx tmp = gen_reg_rtx (V1TImode);
4323 start_sequence ();
4324 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
4325 src = validize_mem (force_const_mem (V1TImode, src));
4326 rtx_insn *seq = get_insns ();
4327 end_sequence ();
4328 if (seq)
4329 emit_insn_before (seq, insn);
4330 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
4331 dst = tmp;
4333 break;
4335 case CONST_INT:
4336 switch (standard_sse_constant_p (src, TImode))
4338 case 1:
4339 src = CONST0_RTX (GET_MODE (dst));
4340 break;
4341 case 2:
4342 src = CONSTM1_RTX (GET_MODE (dst));
4343 break;
4344 default:
4345 gcc_unreachable ();
4347 if (NONDEBUG_INSN_P (insn))
4349 rtx tmp = gen_reg_rtx (V1TImode);
4350 /* Since there are no instructions to store standard SSE
4351 constant, temporary register usage is required. */
4352 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
4353 dst = tmp;
4355 break;
4357 default:
4358 gcc_unreachable ();
4361 SET_SRC (def_set) = src;
4362 SET_DEST (def_set) = dst;
4364 /* Drop possible dead definitions. */
4365 PATTERN (insn) = def_set;
4367 INSN_CODE (insn) = -1;
4368 recog_memoized (insn);
4369 df_insn_rescan (insn);
4372 void
4373 dimode_scalar_chain::convert_registers ()
4375 bitmap_iterator bi;
4376 unsigned id;
4378 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
4379 convert_reg (id);
4381 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
4382 make_vector_copies (id);
4385 /* Convert the whole chain, creating the required register
4386 conversions and copies. */
4389 scalar_chain::convert ()
4391 bitmap_iterator bi;
4392 unsigned id;
4393 int converted_insns = 0;
4395 if (!dbg_cnt (stv_conversion))
4396 return 0;
4398 if (dump_file)
4399 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
4401 convert_registers ();
4403 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
4405 convert_insn (DF_INSN_UID_GET (id)->insn);
4406 converted_insns++;
4409 return converted_insns;
4412 /* Main STV pass function. Find and convert scalar
4413 instructions into vector mode when profitable. */
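/* Roughly: compute DF def-use/use-def chains, collect candidate insns
   via scalar_to_vector_candidate_p, drop registers that have mixed
   convertible and non-convertible definitions, then build chains and
   convert the profitable ones; if anything was converted, the stack is
   realigned to 128 bits for vector spills.  */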
4415 static unsigned int
4416 convert_scalars_to_vector ()
4418 basic_block bb;
4419 bitmap candidates;
4420 int converted_insns = 0;
4422 bitmap_obstack_initialize (NULL);
4423 candidates = BITMAP_ALLOC (NULL);
4425 calculate_dominance_info (CDI_DOMINATORS);
4426 df_set_flags (DF_DEFER_INSN_RESCAN);
4427 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
4428 df_md_add_problem ();
4429 df_analyze ();
4431 /* Find all instructions we want to convert into vector mode. */
4432 if (dump_file)
4433 fprintf (dump_file, "Searching for mode conversion candidates...\n");
4435 FOR_EACH_BB_FN (bb, cfun)
4437 rtx_insn *insn;
4438 FOR_BB_INSNS (bb, insn)
4439 if (scalar_to_vector_candidate_p (insn))
4441 if (dump_file)
4442 fprintf (dump_file, " insn %d is marked as a candidate\n",
4443 INSN_UID (insn));
4445 bitmap_set_bit (candidates, INSN_UID (insn));
4449 remove_non_convertible_regs (candidates);
4451 if (bitmap_empty_p (candidates))
4452 if (dump_file)
4453 fprintf (dump_file, "There are no candidates for optimization.\n");
4455 while (!bitmap_empty_p (candidates))
4457 unsigned uid = bitmap_first_set_bit (candidates);
4458 scalar_chain *chain;
4460 if (TARGET_64BIT)
4461 chain = new timode_scalar_chain;
4462 else
4463 chain = new dimode_scalar_chain;
4465 /* Find instructions chain we want to convert to vector mode.
4466 Check all uses and definitions to estimate all required
4467 conversions. */
4468 chain->build (candidates, uid);
4470 if (chain->compute_convert_gain () > 0)
4471 converted_insns += chain->convert ();
4472 else
4473 if (dump_file)
4474 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
4475 chain->chain_id);
4477 delete chain;
4480 if (dump_file)
4481 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
4483 BITMAP_FREE (candidates);
4484 bitmap_obstack_release (NULL);
4485 df_process_deferred_rescans ();
4487 /* Conversion means we may have 128-bit register spills/fills,
4488 which require an aligned stack. */
4489 if (converted_insns)
4491 if (crtl->stack_alignment_needed < 128)
4492 crtl->stack_alignment_needed = 128;
4493 if (crtl->stack_alignment_estimated < 128)
4494 crtl->stack_alignment_estimated = 128;
4495 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
4496 if (TARGET_64BIT)
4497 for (tree parm = DECL_ARGUMENTS (current_function_decl);
4498 parm; parm = DECL_CHAIN (parm))
4500 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
4501 continue;
4502 if (DECL_RTL_SET_P (parm)
4503 && GET_MODE (DECL_RTL (parm)) == V1TImode)
4505 rtx r = DECL_RTL (parm);
4506 if (REG_P (r))
4507 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
4509 if (DECL_INCOMING_RTL (parm)
4510 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
4512 rtx r = DECL_INCOMING_RTL (parm);
4513 if (REG_P (r))
4514 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
4519 return 0;
4522 namespace {
4524 const pass_data pass_data_insert_vzeroupper =
4526 RTL_PASS, /* type */
4527 "vzeroupper", /* name */
4528 OPTGROUP_NONE, /* optinfo_flags */
4529 TV_MACH_DEP, /* tv_id */
4530 0, /* properties_required */
4531 0, /* properties_provided */
4532 0, /* properties_destroyed */
4533 0, /* todo_flags_start */
4534 TODO_df_finish, /* todo_flags_finish */
4537 class pass_insert_vzeroupper : public rtl_opt_pass
4539 public:
4540 pass_insert_vzeroupper(gcc::context *ctxt)
4541 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
4544 /* opt_pass methods: */
4545 virtual bool gate (function *)
4547 return TARGET_AVX && !TARGET_AVX512F
4548 && TARGET_VZEROUPPER && flag_expensive_optimizations
4549 && !optimize_size;
4552 virtual unsigned int execute (function *)
4554 return rest_of_handle_insert_vzeroupper ();
4557 }; // class pass_insert_vzeroupper
4559 const pass_data pass_data_stv =
4561 RTL_PASS, /* type */
4562 "stv", /* name */
4563 OPTGROUP_NONE, /* optinfo_flags */
4564 TV_MACH_DEP, /* tv_id */
4565 0, /* properties_required */
4566 0, /* properties_provided */
4567 0, /* properties_destroyed */
4568 0, /* todo_flags_start */
4569 TODO_df_finish, /* todo_flags_finish */
4572 class pass_stv : public rtl_opt_pass
4574 public:
4575 pass_stv (gcc::context *ctxt)
4576 : rtl_opt_pass (pass_data_stv, ctxt),
4577 timode_p (false)
4580 /* opt_pass methods: */
4581 virtual bool gate (function *)
4583 return (timode_p == !!TARGET_64BIT
4584 && TARGET_STV && TARGET_SSE2 && optimize > 1);
4587 virtual unsigned int execute (function *)
4589 return convert_scalars_to_vector ();
4592 opt_pass *clone ()
4594 return new pass_stv (m_ctxt);
4597 void set_pass_param (unsigned int n, bool param)
4599 gcc_assert (n == 0);
4600 timode_p = param;
4603 private:
4604 bool timode_p;
4605 }; // class pass_stv
4607 } // anon namespace
4609 rtl_opt_pass *
4610 make_pass_insert_vzeroupper (gcc::context *ctxt)
4612 return new pass_insert_vzeroupper (ctxt);
4615 rtl_opt_pass *
4616 make_pass_stv (gcc::context *ctxt)
4618 return new pass_stv (ctxt);
4621 /* Return true if a red-zone is in use. */
4623 bool
4624 ix86_using_red_zone (void)
4626 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
4629 /* Return a string that documents the current -m options. The caller is
4630 responsible for freeing the string. */
4632 static char *
4633 ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
4634 int flags, int flags2,
4635 const char *arch, const char *tune,
4636 enum fpmath_unit fpmath, bool add_nl_p)
4638 struct ix86_target_opts
4640 const char *option; /* option string */
4641 HOST_WIDE_INT mask; /* isa mask options */
4644 /* This table is ordered so that options like -msse4.2 that imply other
4645 ISAs come first. Target string will be displayed in the same order. */
4646 static struct ix86_target_opts isa2_opts[] =
4648 { "-mrdpid", OPTION_MASK_ISA_RDPID },
4649 { "-msgx", OPTION_MASK_ISA_SGX },
4650 { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
4651 { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
4652 { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ }
4654 static struct ix86_target_opts isa_opts[] =
4656 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
4657 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
4658 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
4659 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
4660 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
4661 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
4662 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
4663 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
4664 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
4665 { "-mavx2", OPTION_MASK_ISA_AVX2 },
4666 { "-mfma", OPTION_MASK_ISA_FMA },
4667 { "-mxop", OPTION_MASK_ISA_XOP },
4668 { "-mfma4", OPTION_MASK_ISA_FMA4 },
4669 { "-mf16c", OPTION_MASK_ISA_F16C },
4670 { "-mavx", OPTION_MASK_ISA_AVX },
4671 /* { "-msse4" OPTION_MASK_ISA_SSE4 }, */
4672 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
4673 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
4674 { "-msse4a", OPTION_MASK_ISA_SSE4A },
4675 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
4676 { "-msse3", OPTION_MASK_ISA_SSE3 },
4677 { "-maes", OPTION_MASK_ISA_AES },
4678 { "-msha", OPTION_MASK_ISA_SHA },
4679 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
4680 { "-msse2", OPTION_MASK_ISA_SSE2 },
4681 { "-msse", OPTION_MASK_ISA_SSE },
4682 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
4683 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
4684 { "-mmmx", OPTION_MASK_ISA_MMX },
4685 { "-mrtm", OPTION_MASK_ISA_RTM },
4686 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
4687 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
4688 { "-madx", OPTION_MASK_ISA_ADX },
4689 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
4690 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
4691 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
4692 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
4693 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
4694 { "-mxsave", OPTION_MASK_ISA_XSAVE },
4695 { "-mabm", OPTION_MASK_ISA_ABM },
4696 { "-mbmi", OPTION_MASK_ISA_BMI },
4697 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
4698 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
4699 { "-mtbm", OPTION_MASK_ISA_TBM },
4700 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
4701 { "-mcx16", OPTION_MASK_ISA_CX16 },
4702 { "-msahf", OPTION_MASK_ISA_SAHF },
4703 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
4704 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
4705 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
4706 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
4707 { "-mmwaitx", OPTION_MASK_ISA_MWAITX },
4708 { "-mclzero", OPTION_MASK_ISA_CLZERO },
4709 { "-mpku", OPTION_MASK_ISA_PKU },
4710 { "-mlwp", OPTION_MASK_ISA_LWP },
4711 { "-mhle", OPTION_MASK_ISA_HLE },
4712 { "-mfxsr", OPTION_MASK_ISA_FXSR },
4713 { "-mmpx", OPTION_MASK_ISA_MPX },
4714 { "-mclwb", OPTION_MASK_ISA_CLWB }
4717 /* Flag options. */
4718 static struct ix86_target_opts flag_opts[] =
4720 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
4721 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
4722 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
4723 { "-m80387", MASK_80387 },
4724 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
4725 { "-malign-double", MASK_ALIGN_DOUBLE },
4726 { "-mcld", MASK_CLD },
4727 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
4728 { "-mieee-fp", MASK_IEEE_FP },
4729 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
4730 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
4731 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
4732 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
4733 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
4734 { "-mno-push-args", MASK_NO_PUSH_ARGS },
4735 { "-mno-red-zone", MASK_NO_RED_ZONE },
4736 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
4737 { "-mrecip", MASK_RECIP },
4738 { "-mrtd", MASK_RTD },
4739 { "-msseregparm", MASK_SSEREGPARM },
4740 { "-mstack-arg-probe", MASK_STACK_PROBE },
4741 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
4742 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
4743 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
4744 { "-mvzeroupper", MASK_VZEROUPPER },
4745 { "-mstv", MASK_STV },
4746 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
4747 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE },
4748 { "-mprefer-avx128", MASK_PREFER_AVX128 },
4749 { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES }
4752 /* Additional flag options. */
4753 static struct ix86_target_opts flag2_opts[] =
4755 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY },
4758 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts)
4759 + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2];
4761 char isa_other[40];
4762 char isa2_other[40];
4763 char flags_other[40];
4764 char flags2_other[40];
4765 unsigned num = 0;
4766 unsigned i, j;
4767 char *ret;
4768 char *ptr;
4769 size_t len;
4770 size_t line_len;
4771 size_t sep_len;
4772 const char *abi;
4774 memset (opts, '\0', sizeof (opts));
4776 /* Add -march= option. */
4777 if (arch)
4779 opts[num][0] = "-march=";
4780 opts[num++][1] = arch;
4783 /* Add -mtune= option. */
4784 if (tune)
4786 opts[num][0] = "-mtune=";
4787 opts[num++][1] = tune;
4790 /* Add -m32/-m64/-mx32. */
4791 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
4793 if ((isa & OPTION_MASK_ABI_64) != 0)
4794 abi = "-m64";
4795 else
4796 abi = "-mx32";
4797 isa &= ~ (OPTION_MASK_ISA_64BIT
4798 | OPTION_MASK_ABI_64
4799 | OPTION_MASK_ABI_X32);
4801 else
4802 abi = "-m32";
4803 opts[num++][0] = abi;
4805 /* Pick out the options in isa2 options. */
4806 for (i = 0; i < ARRAY_SIZE (isa2_opts); i++)
4808 if ((isa2 & isa2_opts[i].mask) != 0)
4810 opts[num++][0] = isa2_opts[i].option;
4811 isa2 &= ~ isa2_opts[i].mask;
4815 if (isa2 && add_nl_p)
4817 opts[num++][0] = isa2_other;
4818 sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2);
4821 /* Pick out the options in isa options. */
4822 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
4824 if ((isa & isa_opts[i].mask) != 0)
4826 opts[num++][0] = isa_opts[i].option;
4827 isa &= ~ isa_opts[i].mask;
4831 if (isa && add_nl_p)
4833 opts[num++][0] = isa_other;
4834 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa);
4837 /* Add flag options. */
4838 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
4840 if ((flags & flag_opts[i].mask) != 0)
4842 opts[num++][0] = flag_opts[i].option;
4843 flags &= ~ flag_opts[i].mask;
4847 if (flags && add_nl_p)
4849 opts[num++][0] = flags_other;
4850 sprintf (flags_other, "(other flags: %#x)", flags);
4853 /* Add additional flag options. */
4854 for (i = 0; i < ARRAY_SIZE (flag2_opts); i++)
4856 if ((flags2 & flag2_opts[i].mask) != 0)
4858 opts[num++][0] = flag2_opts[i].option;
4859 flags2 &= ~ flag2_opts[i].mask;
4863 if (flags2 && add_nl_p)
4865 opts[num++][0] = flags2_other;
4866 sprintf (flags2_other, "(other flags2: %#x)", flags2);
4869 /* Add -fpmath= option. */
4870 if (fpmath)
4872 opts[num][0] = "-mfpmath=";
4873 switch ((int) fpmath)
4875 case FPMATH_387:
4876 opts[num++][1] = "387";
4877 break;
4879 case FPMATH_SSE:
4880 opts[num++][1] = "sse";
4881 break;
4883 case FPMATH_387 | FPMATH_SSE:
4884 opts[num++][1] = "sse+387";
4885 break;
4887 default:
4888 gcc_unreachable ();
4892 /* Any options? */
4893 if (num == 0)
4894 return NULL;
4896 gcc_assert (num < ARRAY_SIZE (opts));
4898 /* Size the string. */
4899 len = 0;
4900 sep_len = (add_nl_p) ? 3 : 1;
4901 for (i = 0; i < num; i++)
4903 len += sep_len;
4904 for (j = 0; j < 2; j++)
4905 if (opts[i][j])
4906 len += strlen (opts[i][j]);
4909 /* Build the string. */
4910 ret = ptr = (char *) xmalloc (len);
4911 line_len = 0;
4913 for (i = 0; i < num; i++)
4915 size_t len2[2];
4917 for (j = 0; j < 2; j++)
4918 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
4920 if (i != 0)
4922 *ptr++ = ' ';
4923 line_len++;
4925 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
4927 *ptr++ = '\\';
4928 *ptr++ = '\n';
4929 line_len = 0;
4933 for (j = 0; j < 2; j++)
4934 if (opts[i][j])
4936 memcpy (ptr, opts[i][j], len2[j]);
4937 ptr += len2[j];
4938 line_len += len2[j];
4942 *ptr = '\0';
4943 gcc_assert (ret + len >= ptr);
4945 return ret;
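/* As an illustration (the exact contents depend on the enabled ISA and
   flag bits), a 64-bit compile with -march=haswell might yield something
   like "-march=haswell -mtune=haswell -m64 -mavx2 -mfma ... -mfpmath=sse":
   -march/-mtune come first, then the ABI, ISA and flag options, and
   -mfpmath last.  */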
4948 /* Return true if profiling code should be emitted before
4949 the prologue, and false otherwise.
4950 Note: For x86 with "hotfix" it is sorried. */
4951 static bool
4952 ix86_profile_before_prologue (void)
4954 return flag_fentry != 0;
4957 /* Function that is callable from the debugger to print the current
4958 options. */
4959 void ATTRIBUTE_UNUSED
4960 ix86_debug_options (void)
4962 char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
4963 target_flags, ix86_target_flags,
4964 ix86_arch_string,ix86_tune_string,
4965 ix86_fpmath, true);
4967 if (opts)
4969 fprintf (stderr, "%s\n\n", opts);
4970 free (opts);
4972 else
4973 fputs ("<no options>\n\n", stderr);
4975 return;
4978 /* Return true if T is one of the bytes we should avoid with
4979 -fmitigate-rop. */
4981 static bool
4982 ix86_rop_should_change_byte_p (int t)
4984 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
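/* These byte values are the near and far RET opcodes (0xc2/0xc3 and
   0xca/0xcb); avoiding them in the generated code stream removes easy
   gadget terminators for return-oriented programming.  */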
4987 static const char *stringop_alg_names[] = {
4988 #define DEF_ENUM
4989 #define DEF_ALG(alg, name) #name,
4990 #include "stringop.def"
4991 #undef DEF_ENUM
4992 #undef DEF_ALG
4995 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
4996 The string has the following form (or is a comma-separated list of such entries):
4998 strategy_alg:max_size:[align|noalign]
5000 where the full size range for the strategy is either [0, max_size] or
5001 [min_size, max_size], in which min_size is the max_size + 1 of the
5002 preceding range. The last size range must have max_size == -1.
5004 Examples:
5007 -mmemcpy-strategy=libcall:-1:noalign
5009 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
5013 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
5015 This is to tell the compiler to use the following strategy for memset
5016 1) when the expected size is between [1, 16], use rep_8byte strategy;
5017 2) when the size is between [17, 2048], use vector_loop;
5018 3) when the size is > 2048, use libcall. */
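/* Another illustrative example, using the strategy names already shown
   above:

      -mmemcpy-strategy=vector_loop:4096:align,libcall:-1:noalign

   asks for vector_loop for sizes up to 4096 bytes and libcall for
   anything larger.  */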
5020 struct stringop_size_range
5022 int max;
5023 stringop_alg alg;
5024 bool noalign;
5027 static void
5028 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
5030 const struct stringop_algs *default_algs;
5031 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
5032 char *curr_range_str, *next_range_str;
5033 const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=";
5034 int i = 0, n = 0;
5036 if (is_memset)
5037 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
5038 else
5039 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
5041 curr_range_str = strategy_str;
5045 int maxs;
5046 char alg_name[128];
5047 char align[16];
5048 next_range_str = strchr (curr_range_str, ',');
5049 if (next_range_str)
5050 *next_range_str++ = '\0';
5052 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
5053 alg_name, &maxs, align))
5055 error ("wrong argument %qs to option %qs", curr_range_str, opt);
5056 return;
5059 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
5061 error ("size ranges of option %qs should be increasing", opt);
5062 return;
5065 for (i = 0; i < last_alg; i++)
5066 if (!strcmp (alg_name, stringop_alg_names[i]))
5067 break;
5069 if (i == last_alg)
5071 error ("wrong strategy name %qs specified for option %qs",
5072 alg_name, opt);
5074 auto_vec <const char *> candidates;
5075 for (i = 0; i < last_alg; i++)
5076 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
5077 candidates.safe_push (stringop_alg_names[i]);
5079 char *s;
5080 const char *hint
5081 = candidates_list_and_hint (alg_name, s, candidates);
5082 if (hint)
5083 inform (input_location,
5084 "valid arguments to %qs are: %s; did you mean %qs?",
5085 opt, s, hint);
5086 else
5087 inform (input_location, "valid arguments to %qs are: %s",
5088 opt, s);
5089 XDELETEVEC (s);
5090 return;
5093 if ((stringop_alg) i == rep_prefix_8_byte
5094 && !TARGET_64BIT)
5096 /* rep; movq isn't available in 32-bit code. */
5097 error ("strategy name %qs specified for option %qs "
5098 "not supported for 32-bit code", alg_name, opt);
5099 return;
5102 input_ranges[n].max = maxs;
5103 input_ranges[n].alg = (stringop_alg) i;
5104 if (!strcmp (align, "align"))
5105 input_ranges[n].noalign = false;
5106 else if (!strcmp (align, "noalign"))
5107 input_ranges[n].noalign = true;
5108 else
5110 error ("unknown alignment %qs specified for option %qs", align, opt);
5111 return;
5113 n++;
5114 curr_range_str = next_range_str;
5116 while (curr_range_str);
5118 if (input_ranges[n - 1].max != -1)
5120 error ("the max value for the last size range should be -1"
5121 " for option %qs", opt);
5122 return;
5125 if (n > MAX_STRINGOP_ALGS)
5127 error ("too many size ranges specified in option %qs", opt);
5128 return;
5131 /* Now override the default algs array. */
5132 for (i = 0; i < n; i++)
5134 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
5135 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
5136 = input_ranges[i].alg;
5137 *const_cast<int *>(&default_algs->size[i].noalign)
5138 = input_ranges[i].noalign;
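/* The default_algs cost tables are declared const; the const_casts above are
   what allow the command-line strategy to patch the table entries in place.  */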
 5143 /* Parse the -mtune-ctrl= option. When DUMP is true,
 5144 print the features that are explicitly set. */
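/* The option argument is a comma-separated list of tuning feature names (as
   spelled in ix86_tune_feature_names), each optionally prefixed with '^' to
   clear the feature instead of setting it; for example, something like
   -mtune-ctrl=use_leave,^partial_reg_stall (feature names illustrative).  */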
5146 static void
5147 parse_mtune_ctrl_str (bool dump)
5149 if (!ix86_tune_ctrl_string)
5150 return;
5152 char *next_feature_string = NULL;
5153 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
5154 char *orig = curr_feature_string;
5155 int i;
5158 bool clear = false;
5160 next_feature_string = strchr (curr_feature_string, ',');
5161 if (next_feature_string)
5162 *next_feature_string++ = '\0';
5163 if (*curr_feature_string == '^')
5165 curr_feature_string++;
5166 clear = true;
5168 for (i = 0; i < X86_TUNE_LAST; i++)
5170 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
5172 ix86_tune_features[i] = !clear;
5173 if (dump)
5174 fprintf (stderr, "Explicitly %s feature %s\n",
5175 clear ? "clear" : "set", ix86_tune_feature_names[i]);
5176 break;
5179 if (i == X86_TUNE_LAST)
5180 error ("Unknown parameter to option -mtune-ctrl: %s",
5181 clear ? curr_feature_string - 1 : curr_feature_string);
5182 curr_feature_string = next_feature_string;
5184 while (curr_feature_string);
5185 free (orig);
5188 /* Helper function to set ix86_tune_features. IX86_TUNE is the
5189 processor type. */
5191 static void
5192 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
5194 unsigned int ix86_tune_mask = 1u << ix86_tune;
5195 int i;
5197 for (i = 0; i < X86_TUNE_LAST; ++i)
5199 if (ix86_tune_no_default)
5200 ix86_tune_features[i] = 0;
5201 else
5202 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
5205 if (dump)
5207 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
5208 for (i = 0; i < X86_TUNE_LAST; i++)
5209 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
5210 ix86_tune_features[i] ? "on" : "off");
5213 parse_mtune_ctrl_str (dump);
5217 /* Default align_* from the processor table. */
5219 static void
5220 ix86_default_align (struct gcc_options *opts)
5222 if (opts->x_align_loops == 0)
5224 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
5225 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
5227 if (opts->x_align_jumps == 0)
5229 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
5230 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
5232 if (opts->x_align_functions == 0)
5234 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
5238 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
5240 static void
5241 ix86_override_options_after_change (void)
5243 ix86_default_align (&global_options);
5246 /* Override various settings based on options. If MAIN_ARGS_P, the
5247 options are from the command line, otherwise they are from
5248 attributes. Return true if there's an error related to march
5249 option. */
5251 static bool
5252 ix86_option_override_internal (bool main_args_p,
5253 struct gcc_options *opts,
5254 struct gcc_options *opts_set)
5256 int i;
5257 unsigned int ix86_arch_mask;
5258 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
5260 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
5261 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
5262 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
5263 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
5264 #define PTA_AES (HOST_WIDE_INT_1 << 4)
5265 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
5266 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
5267 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
5268 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
5269 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
5270 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
5271 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
5272 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
5273 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
5274 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
5275 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
5276 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
5277 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
5278 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
5279 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
5280 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
5281 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
5282 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
5283 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
5284 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
5285 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
5286 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
5287 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
5288 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
5289 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
5290 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
5291 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
5292 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
5293 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
5294 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
5295 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
5296 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
5297 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
5298 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
5299 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
5300 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
5301 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
5302 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
5303 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
5304 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
5305 #define PTA_SHA (HOST_WIDE_INT_1 << 45)
5306 #define PTA_PREFETCHWT1 (HOST_WIDE_INT_1 << 46)
5307 #define PTA_CLFLUSHOPT (HOST_WIDE_INT_1 << 47)
5308 #define PTA_XSAVEC (HOST_WIDE_INT_1 << 48)
5309 #define PTA_XSAVES (HOST_WIDE_INT_1 << 49)
5310 #define PTA_AVX512DQ (HOST_WIDE_INT_1 << 50)
5311 #define PTA_AVX512BW (HOST_WIDE_INT_1 << 51)
5312 #define PTA_AVX512VL (HOST_WIDE_INT_1 << 52)
5313 #define PTA_AVX512IFMA (HOST_WIDE_INT_1 << 53)
5314 #define PTA_AVX512VBMI (HOST_WIDE_INT_1 << 54)
5315 #define PTA_CLWB (HOST_WIDE_INT_1 << 55)
5316 #define PTA_MWAITX (HOST_WIDE_INT_1 << 56)
5317 #define PTA_CLZERO (HOST_WIDE_INT_1 << 57)
5318 #define PTA_NO_80387 (HOST_WIDE_INT_1 << 58)
5319 #define PTA_PKU (HOST_WIDE_INT_1 << 59)
5320 #define PTA_AVX5124VNNIW (HOST_WIDE_INT_1 << 60)
5321 #define PTA_AVX5124FMAPS (HOST_WIDE_INT_1 << 61)
5322 #define PTA_AVX512VPOPCNTDQ (HOST_WIDE_INT_1 << 62)
5323 #define PTA_SGX (HOST_WIDE_INT_1 << 63)
5325 #define PTA_CORE2 \
5326 (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \
5327 | PTA_CX16 | PTA_FXSR)
5328 #define PTA_NEHALEM \
5329 (PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_POPCNT)
5330 #define PTA_WESTMERE \
5331 (PTA_NEHALEM | PTA_AES | PTA_PCLMUL)
5332 #define PTA_SANDYBRIDGE \
5333 (PTA_WESTMERE | PTA_AVX | PTA_XSAVE | PTA_XSAVEOPT)
5334 #define PTA_IVYBRIDGE \
5335 (PTA_SANDYBRIDGE | PTA_FSGSBASE | PTA_RDRND | PTA_F16C)
5336 #define PTA_HASWELL \
5337 (PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_LZCNT \
5338 | PTA_FMA | PTA_MOVBE | PTA_HLE)
5339 #define PTA_BROADWELL \
5340 (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
5341 #define PTA_SKYLAKE \
5342 (PTA_BROADWELL | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES)
5343 #define PTA_SKYLAKE_AVX512 \
5344 (PTA_SKYLAKE | PTA_AVX512F | PTA_AVX512CD | PTA_AVX512VL \
5345 | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU)
5346 #define PTA_KNL \
5347 (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
5348 #define PTA_BONNELL \
5349 (PTA_CORE2 | PTA_MOVBE)
5350 #define PTA_SILVERMONT \
5351 (PTA_WESTMERE | PTA_MOVBE)
 5353 /* If this reaches 64, the flags field of struct pta below needs to be widened. */
5355 static struct pta
5357 const char *const name; /* processor name or nickname. */
5358 const enum processor_type processor;
5359 const enum attr_cpu schedule;
5360 const unsigned HOST_WIDE_INT flags;
5362 const processor_alias_table[] =
5364 {"i386", PROCESSOR_I386, CPU_NONE, 0},
5365 {"i486", PROCESSOR_I486, CPU_NONE, 0},
5366 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
5367 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
5368 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
5369 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
5370 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
5371 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5372 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5373 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
5374 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5375 PTA_MMX | PTA_SSE | PTA_FXSR},
5376 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5377 PTA_MMX | PTA_SSE | PTA_FXSR},
5378 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5379 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5380 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5381 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5382 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
5383 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
5384 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
5385 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5386 PTA_MMX | PTA_SSE | PTA_FXSR},
5387 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5388 PTA_MMX | PTA_SSE | PTA_FXSR},
5389 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
5390 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
5391 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
 5392 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
5393 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
5394 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
5395 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
5396 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5397 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
5398 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5399 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
5400 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
5401 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
5402 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
5403 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
5404 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5405 PTA_SANDYBRIDGE},
5406 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5407 PTA_SANDYBRIDGE},
5408 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5409 PTA_IVYBRIDGE},
5410 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
5411 PTA_IVYBRIDGE},
5412 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
5413 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
5414 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
5415 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
5416 {"skylake-avx512", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE_AVX512},
5417 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
5418 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
5419 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
5420 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
5421 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
5422 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
5423 {"geode", PROCESSOR_GEODE, CPU_GEODE,
5424 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5425 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
5426 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
5427 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
5428 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
5429 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5430 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
5431 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
5432 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
5433 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5434 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
5435 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5436 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
5437 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
5438 {"x86-64", PROCESSOR_K8, CPU_K8,
5439 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5440 {"eden-x2", PROCESSOR_K8, CPU_K8,
5441 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
5442 {"nano", PROCESSOR_K8, CPU_K8,
5443 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5444 | PTA_SSSE3 | PTA_FXSR},
5445 {"nano-1000", PROCESSOR_K8, CPU_K8,
5446 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5447 | PTA_SSSE3 | PTA_FXSR},
5448 {"nano-2000", PROCESSOR_K8, CPU_K8,
5449 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5450 | PTA_SSSE3 | PTA_FXSR},
5451 {"nano-3000", PROCESSOR_K8, CPU_K8,
5452 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5453 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5454 {"nano-x2", PROCESSOR_K8, CPU_K8,
5455 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5456 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5457 {"eden-x4", PROCESSOR_K8, CPU_K8,
5458 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5459 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5460 {"nano-x4", PROCESSOR_K8, CPU_K8,
5461 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5462 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
5463 {"k8", PROCESSOR_K8, CPU_K8,
5464 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5465 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5466 {"k8-sse3", PROCESSOR_K8, CPU_K8,
5467 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5468 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5469 {"opteron", PROCESSOR_K8, CPU_K8,
5470 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5471 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5472 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
5473 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5474 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5475 {"athlon64", PROCESSOR_K8, CPU_K8,
5476 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5477 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5478 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
5479 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5480 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
5481 {"athlon-fx", PROCESSOR_K8, CPU_K8,
5482 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
5483 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
5484 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
5485 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
5486 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
5487 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
5488 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
5489 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
5490 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
5491 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5492 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5493 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5494 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
5495 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
5496 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5497 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5498 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5499 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
5500 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
5501 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
5502 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5503 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5504 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
5505 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
5506 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
5507 | PTA_XSAVEOPT | PTA_FSGSBASE},
5508 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
5509 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5510 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5511 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5512 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
5513 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
5514 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
5515 | PTA_MOVBE | PTA_MWAITX},
5516 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
5517 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
5518 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
5519 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
5520 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
5521 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
5522 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
5523 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
5524 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
5525 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
5526 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 5527 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
5528 | PTA_FXSR | PTA_XSAVE},
5529 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
5530 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 5531 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
5532 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
5533 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
5534 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
5536 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
5537 PTA_64BIT
5538 | PTA_HLE /* flags are only used for -march switch. */ },
5541 /* -mrecip options. */
5542 static struct
5544 const char *string; /* option name */
5545 unsigned int mask; /* mask bits to set */
5547 const recip_options[] =
5549 { "all", RECIP_MASK_ALL },
5550 { "none", RECIP_MASK_NONE },
5551 { "div", RECIP_MASK_DIV },
5552 { "sqrt", RECIP_MASK_SQRT },
5553 { "vec-div", RECIP_MASK_VEC_DIV },
5554 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
5557 int const pta_size = ARRAY_SIZE (processor_alias_table);
5559 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
5560 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
5561 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5562 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
5563 #ifdef TARGET_BI_ARCH
5564 else
5566 #if TARGET_BI_ARCH == 1
5567 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
5568 is on and OPTION_MASK_ABI_X32 is off. We turn off
5569 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
5570 -mx32. */
5571 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5572 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5573 #else
5574 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
5575 on and OPTION_MASK_ABI_64 is off. We turn off
5576 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
5577 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
5578 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
5579 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
5580 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5581 #endif
5582 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5583 && TARGET_IAMCU_P (opts->x_target_flags))
5584 sorry ("Intel MCU psABI isn%'t supported in %s mode",
5585 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
5587 #endif
5589 if (TARGET_X32_P (opts->x_ix86_isa_flags))
5591 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5592 OPTION_MASK_ABI_64 for TARGET_X32. */
5593 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5594 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
5596 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
5597 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
5598 | OPTION_MASK_ABI_X32
5599 | OPTION_MASK_ABI_64);
5600 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
5602 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
5603 OPTION_MASK_ABI_X32 for TARGET_LP64. */
5604 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
5605 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
5608 #ifdef SUBTARGET_OVERRIDE_OPTIONS
5609 SUBTARGET_OVERRIDE_OPTIONS;
5610 #endif
5612 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
5613 SUBSUBTARGET_OVERRIDE_OPTIONS;
5614 #endif
 5616 /* On Darwin, -fPIC is the default for x86_64. */
5617 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
5618 opts->x_flag_pic = 2;
5620 /* Need to check -mtune=generic first. */
5621 if (opts->x_ix86_tune_string)
 5623 /* As special support for cross compilers we read -mtune=native
 5624 as -mtune=generic. With native compilers we won't see
 5625 -mtune=native, as the driver has already rewritten it. */
5626 if (!strcmp (opts->x_ix86_tune_string, "native"))
5628 opts->x_ix86_tune_string = "generic";
5630 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5631 warning (OPT_Wdeprecated,
5632 main_args_p
5633 ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
5634 "or %<-mtune=generic%> instead as appropriate")
5635 : G_("%<target(\"tune=x86-64\")%> is deprecated; use "
5636 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%>"
5637 " instead as appropriate"));
5639 else
5641 if (opts->x_ix86_arch_string)
5642 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
5643 if (!opts->x_ix86_tune_string)
5645 opts->x_ix86_tune_string
5646 = processor_target_table[TARGET_CPU_DEFAULT].name;
5647 ix86_tune_defaulted = 1;
5650 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
5651 or defaulted. We need to use a sensible tune option. */
5652 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
5654 opts->x_ix86_tune_string = "generic";
5658 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
5659 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
5661 /* rep; movq isn't available in 32-bit code. */
5662 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
5663 opts->x_ix86_stringop_alg = no_stringop;
5666 if (!opts->x_ix86_arch_string)
5667 opts->x_ix86_arch_string
5668 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
5669 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
5670 else
5671 ix86_arch_specified = 1;
5673 if (opts_set->x_ix86_pmode)
5675 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
5676 && opts->x_ix86_pmode == PMODE_SI)
5677 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
5678 && opts->x_ix86_pmode == PMODE_DI))
5679 error ("address mode %qs not supported in the %s bit mode",
5680 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
5681 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
5683 else
5684 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
5685 ? PMODE_DI : PMODE_SI;
5687 if (!opts_set->x_ix86_abi)
5688 opts->x_ix86_abi = DEFAULT_ABI;
5690 if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags))
5691 error ("-mabi=ms not supported with X32 ABI");
5692 gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI);
 5694 /* For targets using the MS ABI, enable MS extensions if not
 5695 explicitly turned off. For non-MS ABI we turn off this
 5696 option. */
5697 if (!opts_set->x_flag_ms_extensions)
5698 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
5700 if (opts_set->x_ix86_cmodel)
5702 switch (opts->x_ix86_cmodel)
5704 case CM_SMALL:
5705 case CM_SMALL_PIC:
5706 if (opts->x_flag_pic)
5707 opts->x_ix86_cmodel = CM_SMALL_PIC;
5708 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5709 error ("code model %qs not supported in the %s bit mode",
5710 "small", "32");
5711 break;
5713 case CM_MEDIUM:
5714 case CM_MEDIUM_PIC:
5715 if (opts->x_flag_pic)
5716 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
5717 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5718 error ("code model %qs not supported in the %s bit mode",
5719 "medium", "32");
5720 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5721 error ("code model %qs not supported in x32 mode",
5722 "medium");
5723 break;
5725 case CM_LARGE:
5726 case CM_LARGE_PIC:
5727 if (opts->x_flag_pic)
5728 opts->x_ix86_cmodel = CM_LARGE_PIC;
5729 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5730 error ("code model %qs not supported in the %s bit mode",
5731 "large", "32");
5732 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
5733 error ("code model %qs not supported in x32 mode",
5734 "large");
5735 break;
5737 case CM_32:
5738 if (opts->x_flag_pic)
5739 error ("code model %s does not support PIC mode", "32");
5740 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5741 error ("code model %qs not supported in the %s bit mode",
5742 "32", "64");
5743 break;
5745 case CM_KERNEL:
5746 if (opts->x_flag_pic)
5748 error ("code model %s does not support PIC mode", "kernel");
5749 opts->x_ix86_cmodel = CM_32;
5751 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
5752 error ("code model %qs not supported in the %s bit mode",
5753 "kernel", "32");
5754 break;
5756 default:
5757 gcc_unreachable ();
5760 else
5762 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
5763 use of rip-relative addressing. This eliminates fixups that
5764 would otherwise be needed if this object is to be placed in a
5765 DLL, and is essentially just as efficient as direct addressing. */
5766 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5767 && (TARGET_RDOS || TARGET_PECOFF))
5768 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
5769 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
5770 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
5771 else
5772 opts->x_ix86_cmodel = CM_32;
5774 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
5776 error ("-masm=intel not supported in this configuration");
5777 opts->x_ix86_asm_dialect = ASM_ATT;
5779 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
5780 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
5781 sorry ("%i-bit mode not compiled in",
5782 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
5784 for (i = 0; i < pta_size; i++)
5785 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
5787 if (!strcmp (opts->x_ix86_arch_string, "generic"))
5789 error (main_args_p
5790 ? G_("%<generic%> CPU can be used only for %<-mtune=%> "
5791 "switch")
5792 : G_("%<generic%> CPU can be used only for "
5793 "%<target(\"tune=\")%> attribute"));
5794 return false;
5796 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
5798 error (main_args_p
5799 ? G_("%<intel%> CPU can be used only for %<-mtune=%> "
5800 "switch")
5801 : G_("%<intel%> CPU can be used only for "
5802 "%<target(\"tune=\")%> attribute"));
5803 return false;
5806 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
5807 && !(processor_alias_table[i].flags & PTA_64BIT))
5809 error ("CPU you selected does not support x86-64 "
5810 "instruction set");
5811 return false;
5814 ix86_schedule = processor_alias_table[i].schedule;
5815 ix86_arch = processor_alias_table[i].processor;
5816 /* Default cpu tuning to the architecture. */
5817 ix86_tune = ix86_arch;
5819 if (processor_alias_table[i].flags & PTA_MMX
5820 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
5821 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
5822 if (processor_alias_table[i].flags & PTA_3DNOW
5823 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
5824 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
5825 if (processor_alias_table[i].flags & PTA_3DNOW_A
5826 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
5827 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
5828 if (processor_alias_table[i].flags & PTA_SSE
5829 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
5830 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
5831 if (processor_alias_table[i].flags & PTA_SSE2
5832 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
5833 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
5834 if (processor_alias_table[i].flags & PTA_SSE3
5835 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
5836 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
5837 if (processor_alias_table[i].flags & PTA_SSSE3
5838 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
5839 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
5840 if (processor_alias_table[i].flags & PTA_SSE4_1
5841 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
5842 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
5843 if (processor_alias_table[i].flags & PTA_SSE4_2
5844 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
5845 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
5846 if (processor_alias_table[i].flags & PTA_AVX
5847 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
5848 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
5849 if (processor_alias_table[i].flags & PTA_AVX2
5850 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
5851 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
5852 if (processor_alias_table[i].flags & PTA_FMA
5853 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
5854 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
5855 if (processor_alias_table[i].flags & PTA_SSE4A
5856 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
5857 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
5858 if (processor_alias_table[i].flags & PTA_FMA4
5859 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
5860 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
5861 if (processor_alias_table[i].flags & PTA_XOP
5862 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
5863 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
5864 if (processor_alias_table[i].flags & PTA_LWP
5865 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
5866 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
5867 if (processor_alias_table[i].flags & PTA_ABM
5868 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
5869 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
5870 if (processor_alias_table[i].flags & PTA_BMI
5871 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
5872 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
5873 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
5874 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
5875 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
5876 if (processor_alias_table[i].flags & PTA_TBM
5877 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
5878 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
5879 if (processor_alias_table[i].flags & PTA_BMI2
5880 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
5881 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
5882 if (processor_alias_table[i].flags & PTA_CX16
5883 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
5884 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
5885 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
5886 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
5887 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
5888 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
5889 && (processor_alias_table[i].flags & PTA_NO_SAHF))
5890 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
5891 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
5892 if (processor_alias_table[i].flags & PTA_MOVBE
5893 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
5894 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
5895 if (processor_alias_table[i].flags & PTA_AES
5896 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
5897 ix86_isa_flags |= OPTION_MASK_ISA_AES;
5898 if (processor_alias_table[i].flags & PTA_SHA
5899 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
5900 ix86_isa_flags |= OPTION_MASK_ISA_SHA;
5901 if (processor_alias_table[i].flags & PTA_PCLMUL
5902 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
5903 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
5904 if (processor_alias_table[i].flags & PTA_FSGSBASE
5905 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
5906 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
5907 if (processor_alias_table[i].flags & PTA_RDRND
5908 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
5909 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
5910 if (processor_alias_table[i].flags & PTA_F16C
5911 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
5912 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
5913 if (processor_alias_table[i].flags & PTA_RTM
5914 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
5915 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
5916 if (processor_alias_table[i].flags & PTA_HLE
5917 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
5918 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
5919 if (processor_alias_table[i].flags & PTA_PRFCHW
5920 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
5921 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
5922 if (processor_alias_table[i].flags & PTA_RDSEED
5923 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
5924 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
5925 if (processor_alias_table[i].flags & PTA_ADX
5926 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
5927 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
5928 if (processor_alias_table[i].flags & PTA_FXSR
5929 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
5930 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
5931 if (processor_alias_table[i].flags & PTA_XSAVE
5932 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
5933 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
5934 if (processor_alias_table[i].flags & PTA_XSAVEOPT
5935 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
5936 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
5937 if (processor_alias_table[i].flags & PTA_AVX512F
5938 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
5939 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
5940 if (processor_alias_table[i].flags & PTA_AVX512ER
5941 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
5942 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
5943 if (processor_alias_table[i].flags & PTA_AVX512PF
5944 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
5945 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
5946 if (processor_alias_table[i].flags & PTA_AVX512CD
5947 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
5948 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
5949 if (processor_alias_table[i].flags & PTA_PREFETCHWT1
5950 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
5951 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
5952 if (processor_alias_table[i].flags & PTA_CLWB
5953 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
5954 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
5955 if (processor_alias_table[i].flags & PTA_CLFLUSHOPT
5956 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
5957 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
5958 if (processor_alias_table[i].flags & PTA_CLZERO
5959 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLZERO))
5960 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLZERO;
5961 if (processor_alias_table[i].flags & PTA_XSAVEC
5962 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
5963 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
5964 if (processor_alias_table[i].flags & PTA_XSAVES
5965 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
5966 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
5967 if (processor_alias_table[i].flags & PTA_AVX512DQ
5968 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
5969 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
5970 if (processor_alias_table[i].flags & PTA_AVX512BW
5971 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
5972 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
5973 if (processor_alias_table[i].flags & PTA_AVX512VL
5974 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
5975 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
5976 if (processor_alias_table[i].flags & PTA_MPX
5977 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MPX))
5978 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MPX;
5979 if (processor_alias_table[i].flags & PTA_AVX512VBMI
5980 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
5981 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
5982 if (processor_alias_table[i].flags & PTA_AVX512IFMA
5983 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
5984 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
5986 if (processor_alias_table[i].flags & PTA_AVX5124VNNIW
5987 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124VNNIW))
5988 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
5989 if (processor_alias_table[i].flags & PTA_AVX5124FMAPS
5990 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124FMAPS))
5991 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
5992 if (processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ
5993 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
5994 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
5995 if (processor_alias_table[i].flags & PTA_SGX
5996 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
5997 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
5999 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
6000 x86_prefetch_sse = true;
6001 if (processor_alias_table[i].flags & PTA_MWAITX
6002 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MWAITX))
6003 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MWAITX;
6004 if (processor_alias_table[i].flags & PTA_PKU
6005 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
6006 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
6008 /* Don't enable x87 instructions if only
6009 general registers are allowed. */
6010 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
6011 && !(opts_set->x_target_flags & MASK_80387))
6013 if (processor_alias_table[i].flags & PTA_NO_80387)
6014 opts->x_target_flags &= ~MASK_80387;
6015 else
6016 opts->x_target_flags |= MASK_80387;
6018 break;
6021 if (TARGET_X32 && (opts->x_ix86_isa_flags & OPTION_MASK_ISA_MPX))
6022 error ("Intel MPX does not support x32");
6027 if (i == pta_size)
6029 error (main_args_p
6030 ? G_("bad value (%qs) for %<-march=%> switch")
6031 : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"),
6032 opts->x_ix86_arch_string);
6034 auto_vec <const char *> candidates;
6035 for (i = 0; i < pta_size; i++)
6036 if (strcmp (processor_alias_table[i].name, "generic")
6037 && strcmp (processor_alias_table[i].name, "intel")
6038 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
6039 || (processor_alias_table[i].flags & PTA_64BIT)))
6040 candidates.safe_push (processor_alias_table[i].name);
6042 char *s;
6043 const char *hint
6044 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
6045 if (hint)
6046 inform (input_location,
6047 main_args_p
6048 ? G_("valid arguments to %<-march=%> switch are: "
6049 "%s; did you mean %qs?")
6050 : G_("valid arguments to %<target(\"arch=\")%> attribute are: "
6051 "%s; did you mean %qs?"), s, hint);
6052 else
6053 inform (input_location,
6054 main_args_p
6055 ? G_("valid arguments to %<-march=%> switch are: %s")
6056 : G_("valid arguments to %<target(\"arch=\")%> attribute "
6057 "are: %s"), s);
6058 XDELETEVEC (s);
6061 ix86_arch_mask = 1u << ix86_arch;
6062 for (i = 0; i < X86_ARCH_LAST; ++i)
6063 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
6065 for (i = 0; i < pta_size; i++)
6066 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
6068 ix86_schedule = processor_alias_table[i].schedule;
6069 ix86_tune = processor_alias_table[i].processor;
6070 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6072 if (!(processor_alias_table[i].flags & PTA_64BIT))
6074 if (ix86_tune_defaulted)
6076 opts->x_ix86_tune_string = "x86-64";
6077 for (i = 0; i < pta_size; i++)
6078 if (! strcmp (opts->x_ix86_tune_string,
6079 processor_alias_table[i].name))
6080 break;
6081 ix86_schedule = processor_alias_table[i].schedule;
6082 ix86_tune = processor_alias_table[i].processor;
6084 else
6085 error ("CPU you selected does not support x86-64 "
6086 "instruction set");
6089 /* Intel CPUs have always interpreted SSE prefetch instructions as
6090 NOPs; so, we can enable SSE prefetch instructions even when
6091 -mtune (rather than -march) points us to a processor that has them.
6092 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
6093 higher processors. */
6094 if (TARGET_CMOV
6095 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
6096 x86_prefetch_sse = true;
6097 break;
6100 if (ix86_tune_specified && i == pta_size)
6102 error (main_args_p
6103 ? G_("bad value (%qs) for %<-mtune=%> switch")
6104 : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"),
6105 opts->x_ix86_tune_string);
6107 auto_vec <const char *> candidates;
6108 for (i = 0; i < pta_size; i++)
6109 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
6110 || (processor_alias_table[i].flags & PTA_64BIT))
6111 candidates.safe_push (processor_alias_table[i].name);
6113 char *s;
6114 const char *hint
6115 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
6116 if (hint)
6117 inform (input_location,
6118 main_args_p
6119 ? G_("valid arguments to %<-mtune=%> switch are: "
6120 "%s; did you mean %qs?")
6121 : G_("valid arguments to %<target(\"tune=\")%> attribute are: "
6122 "%s; did you mean %qs?"), s, hint);
6123 else
6124 inform (input_location,
6125 main_args_p
6126 ? G_("valid arguments to %<-mtune=%> switch are: %s")
6127 : G_("valid arguments to %<target(\"tune=\")%> attribute "
6128 "are: %s"), s);
6129 XDELETEVEC (s);
6132 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
6134 #ifndef USE_IX86_FRAME_POINTER
6135 #define USE_IX86_FRAME_POINTER 0
6136 #endif
6138 #ifndef USE_X86_64_FRAME_POINTER
6139 #define USE_X86_64_FRAME_POINTER 0
6140 #endif
6142 /* Set the default values for switches whose default depends on TARGET_64BIT
6143 in case they weren't overwritten by command line options. */
6144 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6146 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
6147 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
6148 if (opts->x_flag_asynchronous_unwind_tables
6149 && !opts_set->x_flag_unwind_tables
6150 && TARGET_64BIT_MS_ABI)
6151 opts->x_flag_unwind_tables = 1;
6152 if (opts->x_flag_asynchronous_unwind_tables == 2)
6153 opts->x_flag_unwind_tables
6154 = opts->x_flag_asynchronous_unwind_tables = 1;
6155 if (opts->x_flag_pcc_struct_return == 2)
6156 opts->x_flag_pcc_struct_return = 0;
6158 else
6160 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
6161 opts->x_flag_omit_frame_pointer
6162 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
6163 if (opts->x_flag_asynchronous_unwind_tables == 2)
6164 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
6165 if (opts->x_flag_pcc_struct_return == 2)
6167 /* Intel MCU psABI specifies that -freg-struct-return should
6168 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
6169 we check -miamcu so that -freg-struct-return is always
6170 turned on if -miamcu is used. */
6171 if (TARGET_IAMCU_P (opts->x_target_flags))
6172 opts->x_flag_pcc_struct_return = 0;
6173 else
6174 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
6178 ix86_tune_cost = processor_target_table[ix86_tune].cost;
 6179 /* TODO: ix86_cost should be chosen at instruction or function granularity
 6180 so that for cold code we use size_cost even in !optimize_size compilation. */
6181 if (opts->x_optimize_size)
6182 ix86_cost = &ix86_size_cost;
6183 else
6184 ix86_cost = ix86_tune_cost;
6186 /* Arrange to set up i386_stack_locals for all functions. */
6187 init_machine_status = ix86_init_machine_status;
6189 /* Validate -mregparm= value. */
6190 if (opts_set->x_ix86_regparm)
6192 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6193 warning (0, "-mregparm is ignored in 64-bit mode");
6194 else if (TARGET_IAMCU_P (opts->x_target_flags))
6195 warning (0, "-mregparm is ignored for Intel MCU psABI");
6196 if (opts->x_ix86_regparm > REGPARM_MAX)
6198 error ("-mregparm=%d is not between 0 and %d",
6199 opts->x_ix86_regparm, REGPARM_MAX);
6200 opts->x_ix86_regparm = 0;
6203 if (TARGET_IAMCU_P (opts->x_target_flags)
6204 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
6205 opts->x_ix86_regparm = REGPARM_MAX;
6207 /* Default align_* from the processor table. */
6208 ix86_default_align (opts);
6210 /* Provide default for -mbranch-cost= value. */
6211 if (!opts_set->x_ix86_branch_cost)
6212 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
6214 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6216 opts->x_target_flags
6217 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
6219 /* Enable by default the SSE and MMX builtins. Do allow the user to
6220 explicitly disable any of these. In particular, disabling SSE and
6221 MMX for kernel code is extremely useful. */
6222 if (!ix86_arch_specified)
6223 opts->x_ix86_isa_flags
6224 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
6225 | TARGET_SUBTARGET64_ISA_DEFAULT)
6226 & ~opts->x_ix86_isa_flags_explicit);
6228 if (TARGET_RTD_P (opts->x_target_flags))
6229 warning (0,
6230 main_args_p
6231 ? G_("%<-mrtd%> is ignored in 64bit mode")
6232 : G_("%<target(\"rtd\")%> is ignored in 64bit mode"));
6234 else
6236 opts->x_target_flags
6237 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
6239 if (!ix86_arch_specified)
6240 opts->x_ix86_isa_flags
6241 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
 6243 /* The i386 ABI does not specify a red zone. It still makes sense to use it
 6244 when the programmer takes care to keep the stack from being destroyed. */
6245 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
6246 opts->x_target_flags |= MASK_NO_RED_ZONE;
6249 /* Keep nonleaf frame pointers. */
6250 if (opts->x_flag_omit_frame_pointer)
6251 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
6252 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
6253 opts->x_flag_omit_frame_pointer = 1;
6255 /* If we're doing fast math, we don't care about comparison order
6256 wrt NaNs. This lets us use a shorter comparison sequence. */
6257 if (opts->x_flag_finite_math_only)
6258 opts->x_target_flags &= ~MASK_IEEE_FP;
6260 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
6261 since the insns won't need emulation. */
6262 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
6263 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
6265 /* Likewise, if the target doesn't have a 387, or we've specified
6266 software floating point, don't use 387 inline intrinsics. */
6267 if (!TARGET_80387_P (opts->x_target_flags))
6268 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
6270 /* Turn on MMX builtins for -msse. */
6271 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
6272 opts->x_ix86_isa_flags
6273 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
6275 /* Enable SSE prefetch. */
6276 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
6277 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
6278 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
6279 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
6280 x86_prefetch_sse = true;
6282 /* Enable popcnt instruction for -msse4.2 or -mabm. */
6283 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
6284 || TARGET_ABM_P (opts->x_ix86_isa_flags))
6285 opts->x_ix86_isa_flags
6286 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
6288 /* Enable lzcnt instruction for -mabm. */
6289 if (TARGET_ABM_P(opts->x_ix86_isa_flags))
6290 opts->x_ix86_isa_flags
6291 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
6293 /* Disable BMI, BMI2 and TBM instructions for -m16. */
6294 if (TARGET_16BIT_P(opts->x_ix86_isa_flags))
6295 opts->x_ix86_isa_flags
6296 &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM)
6297 & ~opts->x_ix86_isa_flags_explicit);
6299 /* Validate -mpreferred-stack-boundary= value or default it to
6300 PREFERRED_STACK_BOUNDARY_DEFAULT. */
6301 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
6302 if (opts_set->x_ix86_preferred_stack_boundary_arg)
6304 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags)? 3 : 2;
6305 int max = TARGET_SEH ? 4 : 12;
6307 if (opts->x_ix86_preferred_stack_boundary_arg < min
6308 || opts->x_ix86_preferred_stack_boundary_arg > max)
6310 if (min == max)
6311 error ("-mpreferred-stack-boundary is not supported "
6312 "for this target");
6313 else
6314 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
6315 opts->x_ix86_preferred_stack_boundary_arg, min, max);
6317 else
6318 ix86_preferred_stack_boundary
6319 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
6322 /* Set the default value for -mstackrealign. */
6323 if (!opts_set->x_ix86_force_align_arg_pointer)
6324 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
6326 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
6328 /* Validate -mincoming-stack-boundary= value or default it to
6329 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
6330 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
6331 if (opts_set->x_ix86_incoming_stack_boundary_arg)
6333 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
6335 if (opts->x_ix86_incoming_stack_boundary_arg < min
6336 || opts->x_ix86_incoming_stack_boundary_arg > 12)
6337 error ("-mincoming-stack-boundary=%d is not between %d and 12",
6338 opts->x_ix86_incoming_stack_boundary_arg, min);
6339 else
6341 ix86_user_incoming_stack_boundary
6342 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
6343 ix86_incoming_stack_boundary
6344 = ix86_user_incoming_stack_boundary;
6348 #ifndef NO_PROFILE_COUNTERS
6349 if (flag_nop_mcount)
6350 error ("-mnop-mcount is not compatible with this target");
6351 #endif
6352 if (flag_nop_mcount && flag_pic)
6353 error ("-mnop-mcount is not implemented for -fPIC");
6355 /* Accept -msseregparm only if at least SSE support is enabled. */
6356 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
6357 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
6358 error (main_args_p
6359 ? G_("%<-msseregparm%> used without SSE enabled")
6360 : G_("%<target(\"sseregparm\")%> used without SSE enabled"));
6362 if (opts_set->x_ix86_fpmath)
6364 if (opts->x_ix86_fpmath & FPMATH_SSE)
6366 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
6368 if (TARGET_80387_P (opts->x_target_flags))
 6370 warning (0, "SSE instruction set disabled, using 387 arithmetic");
6371 opts->x_ix86_fpmath = FPMATH_387;
6374 else if ((opts->x_ix86_fpmath & FPMATH_387)
6375 && !TARGET_80387_P (opts->x_target_flags))
 6377 warning (0, "387 instruction set disabled, using SSE arithmetic");
6378 opts->x_ix86_fpmath = FPMATH_SSE;
 6382 /* For all chips supporting SSE2, -mfpmath=sse performs better than
 6383 -mfpmath=387. The latter is nevertheless the default on many targets,
 6384 since the extra 80-bit precision of temporaries is considered part of
 6385 the ABI. Override the default at least for -ffast-math.
 6386 TODO: -mfpmath=both seems to produce similarly performing code with
 6387 slightly smaller binaries. It is however not clear whether register
 6388 allocation is ready for this setting.
 6389 Also, -mfpmath=387 codegen is overall noticeably more compact (about
 6390 4-5%) than SSE codegen. We may switch to 387 with -ffast-math for
 6391 size-optimized functions. */
6392 else if (fast_math_flags_set_p (&global_options)
6393 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
6394 opts->x_ix86_fpmath = FPMATH_SSE;
6395 else
6396 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
6398 /* Use external vectorized library in vectorizing intrinsics. */
6399 if (opts_set->x_ix86_veclibabi_type)
6400 switch (opts->x_ix86_veclibabi_type)
6402 case ix86_veclibabi_type_svml:
6403 ix86_veclib_handler = ix86_veclibabi_svml;
6404 break;
6406 case ix86_veclibabi_type_acml:
6407 ix86_veclib_handler = ix86_veclibabi_acml;
6408 break;
6410 default:
6411 gcc_unreachable ();
6414 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
6415 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6416 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6418 /* If stack probes are required, the space used for large function
6419 arguments on the stack must also be probed, so enable
6420 -maccumulate-outgoing-args so this happens in the prologue. */
6421 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
6422 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6424 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
6425 warning (0,
6426 main_args_p
6427 ? G_("stack probing requires %<-maccumulate-outgoing-args%> "
6428 "for correctness")
6429 : G_("stack probing requires "
6430 "%<target(\"accumulate-outgoing-args\")%> for "
6431 "correctness"));
6432 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6435 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
6436 so enable -maccumulate-outgoing-args when %ebp is fixed. */
6437 if (fixed_regs[BP_REG]
6438 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
6440 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
6441 warning (0,
6442 main_args_p
6443 ? G_("fixed ebp register requires "
6444 "%<-maccumulate-outgoing-args%>")
6445 : G_("fixed ebp register requires "
6446 "%<target(\"accumulate-outgoing-args\")%>"));
6447 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
6450 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
6452 char *p;
6453 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
6454 p = strchr (internal_label_prefix, 'X');
6455 internal_label_prefix_len = p - internal_label_prefix;
6456 *p = '\0';
 6459 /* When no scheduling description is available, disable the scheduler pass
 6460 so it won't slow down compilation and make x87 code slower. */
6461 if (!TARGET_SCHEDULE)
6462 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
6464 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
6465 ix86_tune_cost->simultaneous_prefetches,
6466 opts->x_param_values,
6467 opts_set->x_param_values);
6468 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
6469 ix86_tune_cost->prefetch_block,
6470 opts->x_param_values,
6471 opts_set->x_param_values);
6472 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
6473 ix86_tune_cost->l1_cache_size,
6474 opts->x_param_values,
6475 opts_set->x_param_values);
6476 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
6477 ix86_tune_cost->l2_cache_size,
6478 opts->x_param_values,
6479 opts_set->x_param_values);
 6481 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
6482 if (opts->x_flag_prefetch_loop_arrays < 0
6483 && HAVE_prefetch
6484 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
6485 && !opts->x_optimize_size
6486 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
6487 opts->x_flag_prefetch_loop_arrays = 1;
 6489 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
 6490 can be optimized to ap = __builtin_next_arg (0). */
6491 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
6492 targetm.expand_builtin_va_start = NULL;
6494 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
6496 ix86_gen_leave = gen_leave_rex64;
6497 if (Pmode == DImode)
6499 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
6500 ix86_gen_tls_local_dynamic_base_64
6501 = gen_tls_local_dynamic_base_64_di;
6503 else
6505 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
6506 ix86_gen_tls_local_dynamic_base_64
6507 = gen_tls_local_dynamic_base_64_si;
6510 else
6511 ix86_gen_leave = gen_leave;
6513 if (Pmode == DImode)
6515 ix86_gen_add3 = gen_adddi3;
6516 ix86_gen_sub3 = gen_subdi3;
6517 ix86_gen_sub3_carry = gen_subdi3_carry;
6518 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
6519 ix86_gen_andsp = gen_anddi3;
6520 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
6521 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
6522 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
6523 ix86_gen_monitor = gen_sse3_monitor_di;
6524 ix86_gen_monitorx = gen_monitorx_di;
6525 ix86_gen_clzero = gen_clzero_di;
6527 else
6529 ix86_gen_add3 = gen_addsi3;
6530 ix86_gen_sub3 = gen_subsi3;
6531 ix86_gen_sub3_carry = gen_subsi3_carry;
6532 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
6533 ix86_gen_andsp = gen_andsi3;
6534 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
6535 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
6536 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
6537 ix86_gen_monitor = gen_sse3_monitor_si;
6538 ix86_gen_monitorx = gen_monitorx_si;
6539 ix86_gen_clzero = gen_clzero_si;
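/* A sketch of how these hooks are typically consumed later in this file
   (illustrative only): they let code emit a pointer-sized operation without
   testing Pmode at every call site, e.g.
     emit_insn (ix86_gen_add3 (dest, src, offset));
   expands via adddi3 when Pmode == DImode and via addsi3 otherwise.  */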
6542 #ifdef USE_IX86_CLD
6543 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
6544 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
6545 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
6546 #endif
6548 /* Set the default value for -mfentry. */
6549 if (!opts_set->x_flag_fentry)
6550 opts->x_flag_fentry = TARGET_SEH;
6551 else
6553 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic
6554 && opts->x_flag_fentry)
6555 sorry ("-mfentry isn%'t supported for 32-bit in combination "
6556 "with -fpic");
6557 else if (TARGET_SEH && !opts->x_flag_fentry)
6558 sorry ("-mno-fentry isn%'t compatible with SEH");
6561 if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES)
6562 sorry ("-mcall-ms2sysv-xlogues isn%'t currently supported with SEH");
6564 if (!(opts_set->x_target_flags & MASK_VZEROUPPER))
6565 opts->x_target_flags |= MASK_VZEROUPPER;
6566 if (!(opts_set->x_target_flags & MASK_STV))
6567 opts->x_target_flags |= MASK_STV;
6568 /* Disable STV if -mpreferred-stack-boundary={2,3} or
6569 -mincoming-stack-boundary={2,3} or -mstackrealign is used - the needed
6570 stack realignment is an extra cost the pass doesn't take into
6571 account, and the pass can't realign the stack. */
6572 if (ix86_preferred_stack_boundary < 128
6573 || ix86_incoming_stack_boundary < 128
6574 || opts->x_ix86_force_align_arg_pointer)
6575 opts->x_target_flags &= ~MASK_STV;
6576 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
6577 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
6578 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
6579 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
6580 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
6581 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
6582 /* Enable 128-bit AVX instruction generation
6583 for the auto-vectorizer. */
6584 if (TARGET_AVX128_OPTIMAL
6585 && !(opts_set->x_target_flags & MASK_PREFER_AVX128))
6586 opts->x_target_flags |= MASK_PREFER_AVX128;
6588 if (opts->x_ix86_recip_name)
6590 char *p = ASTRDUP (opts->x_ix86_recip_name);
6591 char *q;
6592 unsigned int mask, i;
6593 bool invert;
6595 while ((q = strtok (p, ",")) != NULL)
6597 p = NULL;
6598 if (*q == '!')
6600 invert = true;
6601 q++;
6603 else
6604 invert = false;
6606 if (!strcmp (q, "default"))
6607 mask = RECIP_MASK_ALL;
6608 else
6610 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
6611 if (!strcmp (q, recip_options[i].string))
6613 mask = recip_options[i].mask;
6614 break;
6617 if (i == ARRAY_SIZE (recip_options))
6619 error ("unknown option for -mrecip=%s", q);
6620 invert = false;
6621 mask = RECIP_MASK_NONE;
6625 opts->x_recip_mask_explicit |= mask;
6626 if (invert)
6627 opts->x_recip_mask &= ~mask;
6628 else
6629 opts->x_recip_mask |= mask;
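/* For example, -mrecip=all,!sqrt enables all of the reciprocal
   approximations except the square-root ones: "all" selects RECIP_MASK_ALL
   and a leading '!' inverts the mask that follows it.  */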
6633 if (TARGET_RECIP_P (opts->x_target_flags))
6634 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
6635 else if (opts_set->x_target_flags & MASK_RECIP)
6636 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
6638 /* Default long double to 64-bit for 32-bit Bionic and to __float128
6639 for 64-bit Bionic. Also default long double to 64-bit for Intel
6640 MCU psABI. */
6641 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
6642 && !(opts_set->x_target_flags
6643 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
6644 opts->x_target_flags |= (TARGET_64BIT
6645 ? MASK_LONG_DOUBLE_128
6646 : MASK_LONG_DOUBLE_64);
6648 /* Only one of them can be active. */
6649 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
6650 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
6652 /* Handle stack protector */
6653 if (!opts_set->x_ix86_stack_protector_guard)
6654 opts->x_ix86_stack_protector_guard
6655 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
6657 #ifdef TARGET_THREAD_SSP_OFFSET
6658 ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET;
6659 #endif
6661 if (global_options_set.x_ix86_stack_protector_guard_offset_str)
6663 char *endp;
6664 const char *str = ix86_stack_protector_guard_offset_str;
6666 errno = 0;
6667 int64_t offset;
6669 #if defined(INT64_T_IS_LONG)
6670 offset = strtol (str, &endp, 0);
6671 #else
6672 offset = strtoll (str, &endp, 0);
6673 #endif
6675 if (!*str || *endp || errno)
6676 error ("%qs is not a valid number "
6677 "in -mstack-protector-guard-offset=", str);
6679 if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000),
6680 HOST_WIDE_INT_C (0x7fffffff)))
6681 error ("%qs is not a valid offset "
6682 "in -mstack-protector-guard-offset=", str);
6684 ix86_stack_protector_guard_offset = offset;
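/* For example (illustrative), a kernel build might pass
   -mstack-protector-guard-offset=0x28 so the canary is loaded from
   %gs:0x28 instead of the default TLS slot; as checked above, the offset
   must fit in a signed 32-bit displacement.  */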
6687 ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG;
6689 /* The kernel uses a different segment register for performance
6690 reasons; a system call would not have to trash the userspace
6691 segment register, which would be expensive. */
6692 if (ix86_cmodel == CM_KERNEL)
6693 ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS;
6695 if (global_options_set.x_ix86_stack_protector_guard_reg_str)
6697 const char *str = ix86_stack_protector_guard_reg_str;
6698 addr_space_t seg = ADDR_SPACE_GENERIC;
6700 /* Discard optional register prefix. */
6701 if (str[0] == '%')
6702 str++;
6704 if (strlen (str) == 2 && str[1] == 's')
6706 if (str[0] == 'f')
6707 seg = ADDR_SPACE_SEG_FS;
6708 else if (str[0] == 'g')
6709 seg = ADDR_SPACE_SEG_GS;
6712 if (seg == ADDR_SPACE_GENERIC)
6713 error ("%qs is not a valid base register "
6714 "in -mstack-protector-guard-reg=",
6715 ix86_stack_protector_guard_reg_str);
6717 ix86_stack_protector_guard_reg = seg;
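/* Only "fs" and "gs" (optionally written "%fs"/"%gs") are accepted here;
   e.g. -mstack-protector-guard-reg=gs selects the gs segment.  */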
6720 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
6721 if (opts->x_ix86_tune_memcpy_strategy)
6723 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
6724 ix86_parse_stringop_strategy_string (str, false);
6725 free (str);
6728 if (opts->x_ix86_tune_memset_strategy)
6730 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
6731 ix86_parse_stringop_strategy_string (str, true);
6732 free (str);
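/* Both options take a comma-separated list of alg:max_size:dest_align
   triplets; e.g. (illustrative)
     -mmemcpy-strategy=unrolled_loop:256:noalign,libcall:-1:noalign
   uses an unrolled loop for copies up to 256 bytes and a library call for
   anything larger.  */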
6735 /* Save the initial options in case the user does function specific
6736 options. */
6737 if (main_args_p)
6738 target_option_default_node = target_option_current_node
6739 = build_target_option_node (opts);
6741 return true;
6744 /* Implement the TARGET_OPTION_OVERRIDE hook. */
6746 static void
6747 ix86_option_override (void)
6749 ix86_option_override_internal (true, &global_options, &global_options_set);
6752 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
6753 static char *
6754 ix86_offload_options (void)
6756 if (TARGET_LP64)
6757 return xstrdup ("-foffload-abi=lp64");
6758 return xstrdup ("-foffload-abi=ilp32");
6761 /* Update register usage after having seen the compiler flags. */
6763 static void
6764 ix86_conditional_register_usage (void)
6766 int i, c_mask;
6768 /* If there are no caller-saved registers, preserve all registers
6769 except fixed_regs and registers used for the function return value,
6770 since aggregate_value_p checks call_used_regs[regno] on the return
6771 value. */
6772 if (cfun && cfun->machine->no_caller_saved_registers)
6773 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6774 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
6775 call_used_regs[i] = 0;
6777 /* For 32-bit targets, squash the REX registers. */
6778 if (! TARGET_64BIT)
6780 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
6781 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6782 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
6783 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6784 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6785 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6788 /* See the definition of CALL_USED_REGISTERS in i386.h. */
6789 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
6791 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
6793 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6795 /* Set/reset conditionally defined registers from
6796 CALL_USED_REGISTERS initializer. */
6797 if (call_used_regs[i] > 1)
6798 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
6800 /* Calculate registers of CLOBBERED_REGS register set
6801 as call used registers from GENERAL_REGS register set. */
6802 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
6803 && call_used_regs[i])
6804 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
6807 /* If MMX is disabled, squash the registers. */
6808 if (! TARGET_MMX)
6809 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6810 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
6811 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6813 /* If SSE is disabled, squash the registers. */
6814 if (! TARGET_SSE)
6815 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6816 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
6817 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6819 /* If the FPU is disabled, squash the registers. */
6820 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
6821 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
6822 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
6823 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6825 /* If AVX512F is disabled, squash the registers. */
6826 if (! TARGET_AVX512F)
6828 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
6829 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6831 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
6832 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6835 /* If MPX is disabled, squash the registers. */
6836 if (! TARGET_MPX)
6837 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
6838 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
6842 /* Save the current options */
6844 static void
6845 ix86_function_specific_save (struct cl_target_option *ptr,
6846 struct gcc_options *opts)
6848 ptr->arch = ix86_arch;
6849 ptr->schedule = ix86_schedule;
6850 ptr->prefetch_sse = x86_prefetch_sse;
6851 ptr->tune = ix86_tune;
6852 ptr->branch_cost = ix86_branch_cost;
6853 ptr->tune_defaulted = ix86_tune_defaulted;
6854 ptr->arch_specified = ix86_arch_specified;
6855 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
6856 ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
6857 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
6858 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
6859 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
6860 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
6861 ptr->x_ix86_abi = opts->x_ix86_abi;
6862 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
6863 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
6864 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
6865 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
6866 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
6867 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
6868 ptr->x_ix86_pmode = opts->x_ix86_pmode;
6869 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
6870 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
6871 ptr->x_ix86_regparm = opts->x_ix86_regparm;
6872 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
6873 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
6874 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
6875 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
6876 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
6877 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
6878 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
6879 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
6880 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
6881 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
6883 /* The fields are char but the variables are not; make sure the
6884 values fit in the fields. */
6885 gcc_assert (ptr->arch == ix86_arch);
6886 gcc_assert (ptr->schedule == ix86_schedule);
6887 gcc_assert (ptr->tune == ix86_tune);
6888 gcc_assert (ptr->branch_cost == ix86_branch_cost);
6891 /* Restore the current options */
6893 static void
6894 ix86_function_specific_restore (struct gcc_options *opts,
6895 struct cl_target_option *ptr)
6897 enum processor_type old_tune = ix86_tune;
6898 enum processor_type old_arch = ix86_arch;
6899 unsigned int ix86_arch_mask;
6900 int i;
6902 /* We don't change -fPIC. */
6903 opts->x_flag_pic = flag_pic;
6905 ix86_arch = (enum processor_type) ptr->arch;
6906 ix86_schedule = (enum attr_cpu) ptr->schedule;
6907 ix86_tune = (enum processor_type) ptr->tune;
6908 x86_prefetch_sse = ptr->prefetch_sse;
6909 opts->x_ix86_branch_cost = ptr->branch_cost;
6910 ix86_tune_defaulted = ptr->tune_defaulted;
6911 ix86_arch_specified = ptr->arch_specified;
6912 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
6913 opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
6914 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
6915 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
6916 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
6917 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
6918 opts->x_ix86_abi = ptr->x_ix86_abi;
6919 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
6920 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
6921 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
6922 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
6923 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
6924 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
6925 opts->x_ix86_pmode = ptr->x_ix86_pmode;
6926 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
6927 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
6928 opts->x_ix86_regparm = ptr->x_ix86_regparm;
6929 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
6930 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
6931 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
6932 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
6933 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
6934 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
6935 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
6936 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
6937 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
6938 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
6939 ix86_tune_cost = processor_target_table[ix86_tune].cost;
6940 /* TODO: ix86_cost should be chosen at instruction or function granularity,
6941 so for cold code we use size_cost even in !optimize_size compilation. */
6942 if (opts->x_optimize_size)
6943 ix86_cost = &ix86_size_cost;
6944 else
6945 ix86_cost = ix86_tune_cost;
6947 /* Recreate the arch feature tests if the arch changed */
6948 if (old_arch != ix86_arch)
6950 ix86_arch_mask = 1u << ix86_arch;
6951 for (i = 0; i < X86_ARCH_LAST; ++i)
6952 ix86_arch_features[i]
6953 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
6956 /* Recreate the tune optimization tests */
6957 if (old_tune != ix86_tune)
6958 set_ix86_tune_features (ix86_tune, false);
6961 /* Adjust target options after streaming them in. This is mainly about
6962 reconciling them with global options. */
6964 static void
6965 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
6967 /* flag_pic is a global option, but ix86_cmodel is target saved option
6968 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
6969 for PIC, or error out. */
6970 if (flag_pic)
6971 switch (ptr->x_ix86_cmodel)
6973 case CM_SMALL:
6974 ptr->x_ix86_cmodel = CM_SMALL_PIC;
6975 break;
6977 case CM_MEDIUM:
6978 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
6979 break;
6981 case CM_LARGE:
6982 ptr->x_ix86_cmodel = CM_LARGE_PIC;
6983 break;
6985 case CM_KERNEL:
6986 error ("code model %s does not support PIC mode", "kernel");
6987 break;
6989 default:
6990 break;
6992 else
6993 switch (ptr->x_ix86_cmodel)
6995 case CM_SMALL_PIC:
6996 ptr->x_ix86_cmodel = CM_SMALL;
6997 break;
6999 case CM_MEDIUM_PIC:
7000 ptr->x_ix86_cmodel = CM_MEDIUM;
7001 break;
7003 case CM_LARGE_PIC:
7004 ptr->x_ix86_cmodel = CM_LARGE;
7005 break;
7007 default:
7008 break;
7012 /* Print the current options */
7014 static void
7015 ix86_function_specific_print (FILE *file, int indent,
7016 struct cl_target_option *ptr)
7018 char *target_string
7019 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
7020 ptr->x_target_flags, ptr->x_ix86_target_flags,
7021 NULL, NULL, ptr->x_ix86_fpmath, false);
7023 gcc_assert (ptr->arch < PROCESSOR_max);
7024 fprintf (file, "%*sarch = %d (%s)\n",
7025 indent, "",
7026 ptr->arch, processor_target_table[ptr->arch].name);
7028 gcc_assert (ptr->tune < PROCESSOR_max);
7029 fprintf (file, "%*stune = %d (%s)\n",
7030 indent, "",
7031 ptr->tune, processor_target_table[ptr->tune].name);
7033 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
7035 if (target_string)
7037 fprintf (file, "%*s%s\n", indent, "", target_string);
7038 free (target_string);
7043 /* Inner function to process the attribute((target(...))), take an argument and
7044 set the current options from the argument. If we have a list, recursively go
7045 over the list. */
7047 static bool
7048 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
7049 struct gcc_options *opts,
7050 struct gcc_options *opts_set,
7051 struct gcc_options *enum_opts_set)
7053 char *next_optstr;
7054 bool ret = true;
7056 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
7057 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
7058 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
7059 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
7060 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
7062 enum ix86_opt_type
7064 ix86_opt_unknown,
7065 ix86_opt_yes,
7066 ix86_opt_no,
7067 ix86_opt_str,
7068 ix86_opt_enum,
7069 ix86_opt_isa
7072 static const struct
7074 const char *string;
7075 size_t len;
7076 enum ix86_opt_type type;
7077 int opt;
7078 int mask;
7079 } attrs[] = {
7080 /* isa options */
7081 IX86_ATTR_ISA ("sgx", OPT_msgx),
7082 IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
7083 IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
7084 IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq),
7086 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
7087 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
7088 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
7089 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
7090 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
7091 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
7092 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
7093 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
7094 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
7095 IX86_ATTR_ISA ("avx2", OPT_mavx2),
7096 IX86_ATTR_ISA ("fma", OPT_mfma),
7097 IX86_ATTR_ISA ("xop", OPT_mxop),
7098 IX86_ATTR_ISA ("fma4", OPT_mfma4),
7099 IX86_ATTR_ISA ("f16c", OPT_mf16c),
7100 IX86_ATTR_ISA ("avx", OPT_mavx),
7101 IX86_ATTR_ISA ("sse4", OPT_msse4),
7102 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
7103 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
7104 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
7105 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
7106 IX86_ATTR_ISA ("sse3", OPT_msse3),
7107 IX86_ATTR_ISA ("aes", OPT_maes),
7108 IX86_ATTR_ISA ("sha", OPT_msha),
7109 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
7110 IX86_ATTR_ISA ("sse2", OPT_msse2),
7111 IX86_ATTR_ISA ("sse", OPT_msse),
7112 IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa),
7113 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
7114 IX86_ATTR_ISA ("mmx", OPT_mmmx),
7115 IX86_ATTR_ISA ("rtm", OPT_mrtm),
7116 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
7117 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
7118 IX86_ATTR_ISA ("adx", OPT_madx),
7119 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
7120 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
7121 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
7122 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
7123 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
7124 IX86_ATTR_ISA ("xsave", OPT_mxsave),
7125 IX86_ATTR_ISA ("abm", OPT_mabm),
7126 IX86_ATTR_ISA ("bmi", OPT_mbmi),
7127 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
7128 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
7129 IX86_ATTR_ISA ("tbm", OPT_mtbm),
7130 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
7131 IX86_ATTR_ISA ("cx16", OPT_mcx16),
7132 IX86_ATTR_ISA ("sahf", OPT_msahf),
7133 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
7134 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
7135 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
7136 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
7137 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
7138 IX86_ATTR_ISA ("clzero", OPT_mclzero),
7139 IX86_ATTR_ISA ("pku", OPT_mpku),
7140 IX86_ATTR_ISA ("lwp", OPT_mlwp),
7141 IX86_ATTR_ISA ("hle", OPT_mhle),
7142 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
7143 IX86_ATTR_ISA ("mpx", OPT_mmpx),
7144 IX86_ATTR_ISA ("clwb", OPT_mclwb),
7145 IX86_ATTR_ISA ("rdpid", OPT_mrdpid),
7147 /* enum options */
7148 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
7150 /* string options */
7151 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
7152 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
7154 /* flag options */
7155 IX86_ATTR_YES ("cld",
7156 OPT_mcld,
7157 MASK_CLD),
7159 IX86_ATTR_NO ("fancy-math-387",
7160 OPT_mfancy_math_387,
7161 MASK_NO_FANCY_MATH_387),
7163 IX86_ATTR_YES ("ieee-fp",
7164 OPT_mieee_fp,
7165 MASK_IEEE_FP),
7167 IX86_ATTR_YES ("inline-all-stringops",
7168 OPT_minline_all_stringops,
7169 MASK_INLINE_ALL_STRINGOPS),
7171 IX86_ATTR_YES ("inline-stringops-dynamically",
7172 OPT_minline_stringops_dynamically,
7173 MASK_INLINE_STRINGOPS_DYNAMICALLY),
7175 IX86_ATTR_NO ("align-stringops",
7176 OPT_mno_align_stringops,
7177 MASK_NO_ALIGN_STRINGOPS),
7179 IX86_ATTR_YES ("recip",
7180 OPT_mrecip,
7181 MASK_RECIP),
7185 /* If this is a list, recurse to get the options. */
7186 if (TREE_CODE (args) == TREE_LIST)
7188 bool ret = true;
7190 for (; args; args = TREE_CHAIN (args))
7191 if (TREE_VALUE (args)
7192 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
7193 p_strings, opts, opts_set,
7194 enum_opts_set))
7195 ret = false;
7197 return ret;
7200 else if (TREE_CODE (args) != STRING_CST)
7202 error ("attribute %<target%> argument not a string");
7203 return false;
7206 /* Handle multiple arguments separated by commas. */
7207 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
7209 while (next_optstr && *next_optstr != '\0')
7211 char *p = next_optstr;
7212 char *orig_p = p;
7213 char *comma = strchr (next_optstr, ',');
7214 const char *opt_string;
7215 size_t len, opt_len;
7216 int opt;
7217 bool opt_set_p;
7218 char ch;
7219 unsigned i;
7220 enum ix86_opt_type type = ix86_opt_unknown;
7221 int mask = 0;
7223 if (comma)
7225 *comma = '\0';
7226 len = comma - next_optstr;
7227 next_optstr = comma + 1;
7229 else
7231 len = strlen (p);
7232 next_optstr = NULL;
7235 /* Recognize no-xxx. */
7236 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
7238 opt_set_p = false;
7239 p += 3;
7240 len -= 3;
7242 else
7243 opt_set_p = true;
7245 /* Find the option. */
7246 ch = *p;
7247 opt = N_OPTS;
7248 for (i = 0; i < ARRAY_SIZE (attrs); i++)
7250 type = attrs[i].type;
7251 opt_len = attrs[i].len;
7252 if (ch == attrs[i].string[0]
7253 && ((type != ix86_opt_str && type != ix86_opt_enum)
7254 ? len == opt_len
7255 : len > opt_len)
7256 && memcmp (p, attrs[i].string, opt_len) == 0)
7258 opt = attrs[i].opt;
7259 mask = attrs[i].mask;
7260 opt_string = attrs[i].string;
7261 break;
7265 /* Process the option. */
7266 if (opt == N_OPTS)
7268 error ("attribute(target(\"%s\")) is unknown", orig_p);
7269 ret = false;
7272 else if (type == ix86_opt_isa)
7274 struct cl_decoded_option decoded;
7276 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
7277 ix86_handle_option (opts, opts_set,
7278 &decoded, input_location);
7281 else if (type == ix86_opt_yes || type == ix86_opt_no)
7283 if (type == ix86_opt_no)
7284 opt_set_p = !opt_set_p;
7286 if (opt_set_p)
7287 opts->x_target_flags |= mask;
7288 else
7289 opts->x_target_flags &= ~mask;
7292 else if (type == ix86_opt_str)
7294 if (p_strings[opt])
7296 error ("option(\"%s\") was already specified", opt_string);
7297 ret = false;
7299 else
7300 p_strings[opt] = xstrdup (p + opt_len);
7303 else if (type == ix86_opt_enum)
7305 bool arg_ok;
7306 int value;
7308 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
7309 if (arg_ok)
7310 set_option (opts, enum_opts_set, opt, value,
7311 p + opt_len, DK_UNSPECIFIED, input_location,
7312 global_dc);
7313 else
7315 error ("attribute(target(\"%s\")) is unknown", orig_p);
7316 ret = false;
7320 else
7321 gcc_unreachable ();
7324 return ret;
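/* Illustrative use of the parsing above: an attribute such as
   __attribute__((target("arch=haswell,avx2,no-rtm"))) is split on commas;
   "arch=" is handled as a string option, "avx2" as an ISA option, and the
   "no-" prefix clears the corresponding flag.  */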
7327 /* Release allocated strings. */
7328 static void
7329 release_options_strings (char **option_strings)
7331 /* Free up memory allocated to hold the strings */
7332 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
7333 free (option_strings[i]);
7336 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
7338 tree
7339 ix86_valid_target_attribute_tree (tree args,
7340 struct gcc_options *opts,
7341 struct gcc_options *opts_set)
7343 const char *orig_arch_string = opts->x_ix86_arch_string;
7344 const char *orig_tune_string = opts->x_ix86_tune_string;
7345 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
7346 int orig_tune_defaulted = ix86_tune_defaulted;
7347 int orig_arch_specified = ix86_arch_specified;
7348 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
7349 tree t = NULL_TREE;
7350 struct cl_target_option *def
7351 = TREE_TARGET_OPTION (target_option_default_node);
7352 struct gcc_options enum_opts_set;
7354 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
7356 /* Process each of the options on the chain. */
7357 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
7358 opts_set, &enum_opts_set))
7359 return error_mark_node;
7361 /* If the changed options are different from the default, rerun
7362 ix86_option_override_internal, and then save the options away.
7363 The string options are attribute options, and will be undone
7364 when we copy the save structure. */
7365 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
7366 || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
7367 || opts->x_target_flags != def->x_target_flags
7368 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
7369 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
7370 || enum_opts_set.x_ix86_fpmath)
7372 /* If we are using the default tune= or arch=, undo the string assigned,
7373 and use the default. */
7374 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
7376 opts->x_ix86_arch_string
7377 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
7379 /* If arch= is set, clear all bits in x_ix86_isa_flags,
7380 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
7381 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
7382 | OPTION_MASK_ABI_64
7383 | OPTION_MASK_ABI_X32
7384 | OPTION_MASK_CODE16);
7385 opts->x_ix86_isa_flags2 = 0;
7387 else if (!orig_arch_specified)
7388 opts->x_ix86_arch_string = NULL;
7390 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
7391 opts->x_ix86_tune_string
7392 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
7393 else if (orig_tune_defaulted)
7394 opts->x_ix86_tune_string = NULL;
7396 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
7397 if (enum_opts_set.x_ix86_fpmath)
7398 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
7400 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
7401 bool r = ix86_option_override_internal (false, opts, opts_set);
7402 if (!r)
7404 release_options_strings (option_strings);
7405 return error_mark_node;
7408 /* Add any builtin functions with the new isa if any. */
7409 ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
7411 /* Save the current options unless we are validating options for
7412 #pragma. */
7413 t = build_target_option_node (opts);
7415 opts->x_ix86_arch_string = orig_arch_string;
7416 opts->x_ix86_tune_string = orig_tune_string;
7417 opts_set->x_ix86_fpmath = orig_fpmath_set;
7419 release_options_strings (option_strings);
7422 return t;
7425 /* Hook to validate attribute((target("string"))). */
7427 static bool
7428 ix86_valid_target_attribute_p (tree fndecl,
7429 tree ARG_UNUSED (name),
7430 tree args,
7431 int ARG_UNUSED (flags))
7433 struct gcc_options func_options;
7434 tree new_target, new_optimize;
7435 bool ret = true;
7437 /* attribute((target("default"))) does nothing, beyond
7438 affecting multi-versioning. */
7439 if (TREE_VALUE (args)
7440 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
7441 && TREE_CHAIN (args) == NULL_TREE
7442 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
7443 return true;
7445 tree old_optimize = build_optimization_node (&global_options);
7447 /* Get the optimization options of the current function. */
7448 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
7450 if (!func_optimize)
7451 func_optimize = old_optimize;
7453 /* Init func_options. */
7454 memset (&func_options, 0, sizeof (func_options));
7455 init_options_struct (&func_options, NULL);
7456 lang_hooks.init_options_struct (&func_options);
7458 cl_optimization_restore (&func_options,
7459 TREE_OPTIMIZATION (func_optimize));
7461 /* Initialize func_options to the default before its target options can
7462 be set. */
7463 cl_target_option_restore (&func_options,
7464 TREE_TARGET_OPTION (target_option_default_node));
7466 new_target = ix86_valid_target_attribute_tree (args, &func_options,
7467 &global_options_set);
7469 new_optimize = build_optimization_node (&func_options);
7471 if (new_target == error_mark_node)
7472 ret = false;
7474 else if (fndecl && new_target)
7476 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
7478 if (old_optimize != new_optimize)
7479 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
7482 finalize_options_struct (&func_options);
7484 return ret;
7488 /* Hook to determine if one function can safely inline another. */
7490 static bool
7491 ix86_can_inline_p (tree caller, tree callee)
7493 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
7494 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
7495 if (!callee_tree)
7496 callee_tree = target_option_default_node;
7497 if (!caller_tree)
7498 caller_tree = target_option_default_node;
7499 if (callee_tree == caller_tree)
7500 return true;
7502 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
7503 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
7504 bool ret = false;
7506 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4
7507 function can inline an SSE2 function but an SSE2 function can't inline
7508 an SSE4 function. */
7509 if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
7510 != callee_opts->x_ix86_isa_flags)
7511 || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
7512 != callee_opts->x_ix86_isa_flags2))
7513 ret = false;
7515 /* See if we have the same non-isa options. */
7516 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
7517 ret = false;
7519 /* See if arch, tune, etc. are the same. */
7520 else if (caller_opts->arch != callee_opts->arch)
7521 ret = false;
7523 else if (caller_opts->tune != callee_opts->tune)
7524 ret = false;
7526 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
7527 /* If the callee doesn't use FP expressions, differences in
7528 ix86_fpmath can be ignored. We are called from FEs
7529 for multi-versioning call optimization, so beware of
7530 ipa_fn_summaries not available. */
7531 && (! ipa_fn_summaries
7532 || ipa_fn_summaries->get
7533 (cgraph_node::get (callee))->fp_expressions))
7534 ret = false;
7536 else if (caller_opts->branch_cost != callee_opts->branch_cost)
7537 ret = false;
7539 else
7540 ret = true;
7542 return ret;
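/* For example, a callee declared with __attribute__((target("sse4.2")))
   can typically be inlined into a caller compiled with -mavx2 (whose ISA
   flags are a superset), while the reverse inlining is rejected here.  */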
7546 /* Remember the last target of ix86_set_current_function. */
7547 static GTY(()) tree ix86_previous_fndecl;
7549 /* Set targets globals to the default (or current #pragma GCC target
7550 if active). Invalidate ix86_previous_fndecl cache. */
7552 void
7553 ix86_reset_previous_fndecl (void)
7555 tree new_tree = target_option_current_node;
7556 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7557 if (TREE_TARGET_GLOBALS (new_tree))
7558 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7559 else if (new_tree == target_option_default_node)
7560 restore_target_globals (&default_target_globals);
7561 else
7562 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7563 ix86_previous_fndecl = NULL_TREE;
7566 /* Set the func_type field from the function FNDECL. */
7568 static void
7569 ix86_set_func_type (tree fndecl)
7571 if (cfun->machine->func_type == TYPE_UNKNOWN)
7573 if (lookup_attribute ("interrupt",
7574 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7576 if (ix86_function_naked (fndecl))
7577 error_at (DECL_SOURCE_LOCATION (fndecl),
7578 "interrupt and naked attributes are not compatible");
7580 int nargs = 0;
7581 for (tree arg = DECL_ARGUMENTS (fndecl);
7582 arg;
7583 arg = TREE_CHAIN (arg))
7584 nargs++;
7585 cfun->machine->no_caller_saved_registers = true;
7586 cfun->machine->func_type
7587 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
7589 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
7591 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
7592 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
7593 sorry ("Only DWARF debug format is supported for interrupt "
7594 "service routine.");
7596 else
7598 cfun->machine->func_type = TYPE_NORMAL;
7599 if (lookup_attribute ("no_caller_saved_registers",
7600 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
7601 cfun->machine->no_caller_saved_registers = true;
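/* For example (names illustrative), a handler declared as
     void __attribute__((interrupt)) f (struct iframe *frame);
   is classified as TYPE_INTERRUPT, while adding a second (error code)
   argument makes it TYPE_EXCEPTION; both imply no_caller_saved_registers.  */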
7606 /* Establish appropriate back-end context for processing the function
7607 FNDECL. The argument might be NULL to indicate processing at top
7608 level, outside of any function scope. */
7609 static void
7610 ix86_set_current_function (tree fndecl)
7612 /* Only change the context if the function changes. This hook is called
7613 several times in the course of compiling a function, and we don't want to
7614 slow things down too much or call target_reinit when it isn't safe. */
7615 if (fndecl == ix86_previous_fndecl)
7617 /* There may be 2 function bodies for the same function FNDECL,
7618 one is extern inline and one isn't. Call ix86_set_func_type
7619 to set the func_type field. */
7620 if (fndecl != NULL_TREE)
7621 ix86_set_func_type (fndecl);
7622 return;
7625 tree old_tree;
7626 if (ix86_previous_fndecl == NULL_TREE)
7627 old_tree = target_option_current_node;
7628 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
7629 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
7630 else
7631 old_tree = target_option_default_node;
7633 if (fndecl == NULL_TREE)
7635 if (old_tree != target_option_current_node)
7636 ix86_reset_previous_fndecl ();
7637 return;
7640 ix86_set_func_type (fndecl);
7642 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
7643 if (new_tree == NULL_TREE)
7644 new_tree = target_option_default_node;
7646 if (old_tree != new_tree)
7648 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
7649 if (TREE_TARGET_GLOBALS (new_tree))
7650 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7651 else if (new_tree == target_option_default_node)
7652 restore_target_globals (&default_target_globals);
7653 else
7654 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
7656 ix86_previous_fndecl = fndecl;
7658 static bool prev_no_caller_saved_registers;
7660 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
7661 Avoid expensive re-initialization of init_regs each time we switch
7662 function context. */
7663 if (TARGET_64BIT
7664 && (call_used_regs[SI_REG]
7665 == (cfun->machine->call_abi == MS_ABI)))
7666 reinit_regs ();
7667 /* Need to re-initialize init_regs if caller-saved registers are
7668 changed. */
7669 else if (prev_no_caller_saved_registers
7670 != cfun->machine->no_caller_saved_registers)
7671 reinit_regs ();
7673 if (cfun->machine->func_type != TYPE_NORMAL
7674 || cfun->machine->no_caller_saved_registers)
7676 /* Don't allow MPX, SSE, MMX or x87 instructions since they
7677 may change the processor state. */
7678 const char *isa;
7679 if (TARGET_MPX)
7680 isa = "MPX";
7681 else if (TARGET_SSE)
7682 isa = "SSE";
7683 else if (TARGET_MMX)
7684 isa = "MMX/3Dnow";
7685 else if (TARGET_80387)
7686 isa = "80387";
7687 else
7688 isa = NULL;
7689 if (isa != NULL)
7691 if (cfun->machine->func_type != TYPE_NORMAL)
7692 sorry ("%s instructions aren't allowed in %s service routine",
7693 isa, (cfun->machine->func_type == TYPE_EXCEPTION
7694 ? "exception" : "interrupt"));
7695 else
7696 sorry ("%s instructions aren't allowed in function with "
7697 "no_caller_saved_registers attribute", isa);
7698 /* Don't issue the same error twice. */
7699 cfun->machine->func_type = TYPE_NORMAL;
7700 cfun->machine->no_caller_saved_registers = false;
7704 prev_no_caller_saved_registers
7705 = cfun->machine->no_caller_saved_registers;
7709 /* Return true if this goes in large data/bss. */
7711 static bool
7712 ix86_in_large_data_p (tree exp)
7714 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
7715 return false;
7717 if (exp == NULL_TREE)
7718 return false;
7720 /* Functions are never large data. */
7721 if (TREE_CODE (exp) == FUNCTION_DECL)
7722 return false;
7724 /* Automatic variables are never large data. */
7725 if (VAR_P (exp) && !is_global_var (exp))
7726 return false;
7728 if (VAR_P (exp) && DECL_SECTION_NAME (exp))
7730 const char *section = DECL_SECTION_NAME (exp);
7731 if (strcmp (section, ".ldata") == 0
7732 || strcmp (section, ".lbss") == 0)
7733 return true;
7734 return false;
7736 else
7738 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
7740 /* If this is an incomplete type with size 0, then we can't put it
7741 in data because it might be too big when completed. Also,
7742 int_size_in_bytes returns -1 if the size can vary or is larger than
7743 an integer, in which case it is also safer to assume that it goes in
7744 large data. */
7745 if (size <= 0 || size > ix86_section_threshold)
7746 return true;
7749 return false;
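/* For instance, with -mcmodel=medium and the default
   -mlarge-data-threshold, a global array larger than the threshold (or an
   object explicitly placed in ".ldata"/".lbss") is treated as large data
   by this predicate.  */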
7752 /* i386-specific section flag to mark large sections. */
7753 #define SECTION_LARGE SECTION_MACH_DEP
7755 /* Switch to the appropriate section for output of DECL.
7756 DECL is either a `VAR_DECL' node or a constant of some sort.
7757 RELOC indicates whether forming the initial value of DECL requires
7758 link-time relocations. */
7760 ATTRIBUTE_UNUSED static section *
7761 x86_64_elf_select_section (tree decl, int reloc,
7762 unsigned HOST_WIDE_INT align)
7764 if (ix86_in_large_data_p (decl))
7766 const char *sname = NULL;
7767 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
7768 switch (categorize_decl_for_section (decl, reloc))
7770 case SECCAT_DATA:
7771 sname = ".ldata";
7772 break;
7773 case SECCAT_DATA_REL:
7774 sname = ".ldata.rel";
7775 break;
7776 case SECCAT_DATA_REL_LOCAL:
7777 sname = ".ldata.rel.local";
7778 break;
7779 case SECCAT_DATA_REL_RO:
7780 sname = ".ldata.rel.ro";
7781 break;
7782 case SECCAT_DATA_REL_RO_LOCAL:
7783 sname = ".ldata.rel.ro.local";
7784 break;
7785 case SECCAT_BSS:
7786 sname = ".lbss";
7787 flags |= SECTION_BSS;
7788 break;
7789 case SECCAT_RODATA:
7790 case SECCAT_RODATA_MERGE_STR:
7791 case SECCAT_RODATA_MERGE_STR_INIT:
7792 case SECCAT_RODATA_MERGE_CONST:
7793 sname = ".lrodata";
7794 flags &= ~SECTION_WRITE;
7795 break;
7796 case SECCAT_SRODATA:
7797 case SECCAT_SDATA:
7798 case SECCAT_SBSS:
7799 gcc_unreachable ();
7800 case SECCAT_TEXT:
7801 case SECCAT_TDATA:
7802 case SECCAT_TBSS:
7803 /* We don't split these for the medium model. Place them into
7804 default sections and hope for the best. */
7805 break;
7807 if (sname)
7809 /* We might get called with string constants, but get_named_section
7810 doesn't like them as they are not DECLs. Also, we need to set
7811 flags in that case. */
7812 if (!DECL_P (decl))
7813 return get_section (sname, flags, NULL);
7814 return get_named_section (decl, sname, reloc);
7817 return default_elf_select_section (decl, reloc, align);
7820 /* Select a set of attributes for section NAME based on the properties
7821 of DECL and whether or not RELOC indicates that DECL's initializer
7822 might contain runtime relocations. */
7824 static unsigned int ATTRIBUTE_UNUSED
7825 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
7827 unsigned int flags = default_section_type_flags (decl, name, reloc);
7829 if (ix86_in_large_data_p (decl))
7830 flags |= SECTION_LARGE;
7832 if (decl == NULL_TREE
7833 && (strcmp (name, ".ldata.rel.ro") == 0
7834 || strcmp (name, ".ldata.rel.ro.local") == 0))
7835 flags |= SECTION_RELRO;
7837 if (strcmp (name, ".lbss") == 0
7838 || strncmp (name, ".lbss.", 5) == 0
7839 || strncmp (name, ".gnu.linkonce.lb.", 16) == 0)
7840 flags |= SECTION_BSS;
7842 return flags;
7845 /* Build up a unique section name, expressed as a
7846 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
7847 RELOC indicates whether the initial value of EXP requires
7848 link-time relocations. */
7850 static void ATTRIBUTE_UNUSED
7851 x86_64_elf_unique_section (tree decl, int reloc)
7853 if (ix86_in_large_data_p (decl))
7855 const char *prefix = NULL;
7856 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
7857 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
7859 switch (categorize_decl_for_section (decl, reloc))
7861 case SECCAT_DATA:
7862 case SECCAT_DATA_REL:
7863 case SECCAT_DATA_REL_LOCAL:
7864 case SECCAT_DATA_REL_RO:
7865 case SECCAT_DATA_REL_RO_LOCAL:
7866 prefix = one_only ? ".ld" : ".ldata";
7867 break;
7868 case SECCAT_BSS:
7869 prefix = one_only ? ".lb" : ".lbss";
7870 break;
7871 case SECCAT_RODATA:
7872 case SECCAT_RODATA_MERGE_STR:
7873 case SECCAT_RODATA_MERGE_STR_INIT:
7874 case SECCAT_RODATA_MERGE_CONST:
7875 prefix = one_only ? ".lr" : ".lrodata";
7876 break;
7877 case SECCAT_SRODATA:
7878 case SECCAT_SDATA:
7879 case SECCAT_SBSS:
7880 gcc_unreachable ();
7881 case SECCAT_TEXT:
7882 case SECCAT_TDATA:
7883 case SECCAT_TBSS:
7884 /* We don't split these for the medium model. Place them into
7885 default sections and hope for the best. */
7886 break;
7888 if (prefix)
7890 const char *name, *linkonce;
7891 char *string;
7893 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
7894 name = targetm.strip_name_encoding (name);
7896 /* If we're using one_only, then there needs to be a .gnu.linkonce
7897 prefix to the section name. */
7898 linkonce = one_only ? ".gnu.linkonce" : "";
7900 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
7902 set_decl_section_name (decl, string);
7903 return;
7906 default_unique_section (decl, reloc);
7909 #ifdef COMMON_ASM_OP
7911 #ifndef LARGECOMM_SECTION_ASM_OP
7912 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
7913 #endif
7915 /* This says how to output assembler code to declare an
7916 uninitialized external linkage data object.
7918 For the x86-64 medium model we need to use the LARGECOMM_SECTION_ASM_OP
7919 directive for large objects. */
7920 void
7921 x86_elf_aligned_decl_common (FILE *file, tree decl,
7922 const char *name, unsigned HOST_WIDE_INT size,
7923 int align)
7925 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7926 && size > (unsigned int)ix86_section_threshold)
7928 switch_to_section (get_named_section (decl, ".lbss", 0));
7929 fputs (LARGECOMM_SECTION_ASM_OP, file);
7931 else
7932 fputs (COMMON_ASM_OP, file);
7933 assemble_name (file, name);
7934 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
7935 size, align / BITS_PER_UNIT);
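/* For a large object this emits something like (illustrative)
     .largecomm buf,1048576,32
   whereas small objects get the usual .comm directive.  */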
7937 #endif
7939 /* Utility function for targets to use in implementing
7940 ASM_OUTPUT_ALIGNED_BSS. */
7942 void
7943 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
7944 unsigned HOST_WIDE_INT size, int align)
7946 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
7947 && size > (unsigned int)ix86_section_threshold)
7948 switch_to_section (get_named_section (decl, ".lbss", 0));
7949 else
7950 switch_to_section (bss_section);
7951 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
7952 #ifdef ASM_DECLARE_OBJECT_NAME
7953 last_assemble_variable_decl = decl;
7954 ASM_DECLARE_OBJECT_NAME (file, name, decl);
7955 #else
7956 /* The standard thing is to just output a label for the object. */
7957 ASM_OUTPUT_LABEL (file, name);
7958 #endif /* ASM_DECLARE_OBJECT_NAME */
7959 ASM_OUTPUT_SKIP (file, size ? size : 1);
7962 /* Decide whether we must probe the stack before any space allocation
7963 on this target. It's essentially TARGET_STACK_PROBE except when
7964 -fstack-check causes the stack to be already probed differently. */
7966 bool
7967 ix86_target_stack_probe (void)
7969 /* Do not probe the stack twice if static stack checking is enabled. */
7970 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7971 return false;
7973 return TARGET_STACK_PROBE;
7976 /* Decide whether we can make a sibling call to a function. DECL is the
7977 declaration of the function being targeted by the call and EXP is the
7978 CALL_EXPR representing the call. */
7980 static bool
7981 ix86_function_ok_for_sibcall (tree decl, tree exp)
7983 tree type, decl_or_type;
7984 rtx a, b;
7985 bool bind_global = decl && !targetm.binds_local_p (decl);
7987 if (ix86_function_naked (current_function_decl))
7988 return false;
7990 /* Sibling call isn't OK if there are no caller-saved registers
7991 since all registers must be preserved before return. */
7992 if (cfun->machine->no_caller_saved_registers)
7993 return false;
7995 /* If we are generating position-independent code, we cannot sibcall
7996 optimize direct calls to global functions, as the PLT requires
7997 %ebx be live. (Darwin does not have a PLT.) */
7998 if (!TARGET_MACHO
7999 && !TARGET_64BIT
8000 && flag_pic
8001 && flag_plt
8002 && bind_global)
8003 return false;
8005 /* If we need to align the outgoing stack, then sibcalling would
8006 unalign the stack, which may break the called function. */
8007 if (ix86_minimum_incoming_stack_boundary (true)
8008 < PREFERRED_STACK_BOUNDARY)
8009 return false;
8011 if (decl)
8013 decl_or_type = decl;
8014 type = TREE_TYPE (decl);
8016 else
8018 /* We're looking at the CALL_EXPR, we need the type of the function. */
8019 type = CALL_EXPR_FN (exp); /* pointer expression */
8020 type = TREE_TYPE (type); /* pointer type */
8021 type = TREE_TYPE (type); /* function type */
8022 decl_or_type = type;
8025 /* Check that the return value locations are the same. Like
8026 if we are returning floats on the 80387 register stack, we cannot
8027 make a sibcall from a function that doesn't return a float to a
8028 function that does or, conversely, from a function that does return
8029 a float to a function that doesn't; the necessary stack adjustment
8030 would not be executed. This is also the place we notice
8031 differences in the return value ABI. Note that it is ok for one
8032 of the functions to have void return type as long as the return
8033 value of the other is passed in a register. */
8034 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
8035 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
8036 cfun->decl, false);
8037 if (STACK_REG_P (a) || STACK_REG_P (b))
8039 if (!rtx_equal_p (a, b))
8040 return false;
8042 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
8044 else if (!rtx_equal_p (a, b))
8045 return false;
8047 if (TARGET_64BIT)
8049 /* The SYSV ABI has more call-clobbered registers;
8050 disallow sibcalls from MS to SYSV. */
8051 if (cfun->machine->call_abi == MS_ABI
8052 && ix86_function_type_abi (type) == SYSV_ABI)
8053 return false;
8055 else
8057 /* If this call is indirect, we'll need to be able to use a
8058 call-clobbered register for the address of the target function.
8059 Make sure that all such registers are not used for passing
8060 parameters. Note that DLLIMPORT functions and calls to global
8061 functions via the GOT slot are indirect. */
8062 if (!decl
8063 || (bind_global && flag_pic && !flag_plt)
8064 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
8066 /* Check if regparm >= 3 since arg_reg_available is set to
8067 false if regparm == 0. If regparm is 1 or 2, there is
8068 always a call-clobbered register available.
8070 ??? The symbol indirect call doesn't need a call-clobbered
8071 register. But we don't know if this is a symbol indirect
8072 call or not here. */
8073 if (ix86_function_regparm (type, NULL) >= 3
8074 && !cfun->machine->arg_reg_available)
8075 return false;
8079 /* Otherwise okay. That also includes certain types of indirect calls. */
8080 return true;
8083 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
8084 and "sseregparm" calling convention attributes;
8085 arguments as in struct attribute_spec.handler. */
8087 static tree
8088 ix86_handle_cconv_attribute (tree *node, tree name,
8089 tree args,
8090 int,
8091 bool *no_add_attrs)
8093 if (TREE_CODE (*node) != FUNCTION_TYPE
8094 && TREE_CODE (*node) != METHOD_TYPE
8095 && TREE_CODE (*node) != FIELD_DECL
8096 && TREE_CODE (*node) != TYPE_DECL)
8098 warning (OPT_Wattributes, "%qE attribute only applies to functions",
8099 name);
8100 *no_add_attrs = true;
8101 return NULL_TREE;
8104 /* Can combine regparm with all attributes but fastcall, and thiscall. */
8105 if (is_attribute_p ("regparm", name))
8107 tree cst;
8109 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8111 error ("fastcall and regparm attributes are not compatible");
8114 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8116 error ("regparam and thiscall attributes are not compatible");
8119 cst = TREE_VALUE (args);
8120 if (TREE_CODE (cst) != INTEGER_CST)
8122 warning (OPT_Wattributes,
8123 "%qE attribute requires an integer constant argument",
8124 name);
8125 *no_add_attrs = true;
8127 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
8129 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
8130 name, REGPARM_MAX);
8131 *no_add_attrs = true;
8134 return NULL_TREE;
8137 if (TARGET_64BIT)
8139 /* Do not warn when emulating the MS ABI. */
8140 if ((TREE_CODE (*node) != FUNCTION_TYPE
8141 && TREE_CODE (*node) != METHOD_TYPE)
8142 || ix86_function_type_abi (*node) != MS_ABI)
8143 warning (OPT_Wattributes, "%qE attribute ignored",
8144 name);
8145 *no_add_attrs = true;
8146 return NULL_TREE;
8149 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
8150 if (is_attribute_p ("fastcall", name))
8152 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
8154 error ("fastcall and cdecl attributes are not compatible");
8156 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
8158 error ("fastcall and stdcall attributes are not compatible");
8160 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
8162 error ("fastcall and regparm attributes are not compatible");
8164 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8166 error ("fastcall and thiscall attributes are not compatible");
8170 /* Can combine stdcall with fastcall (redundant), regparm and
8171 sseregparm. */
8172 else if (is_attribute_p ("stdcall", name))
8174 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
8176 error ("stdcall and cdecl attributes are not compatible");
8178 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8180 error ("stdcall and fastcall attributes are not compatible");
8182 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8184 error ("stdcall and thiscall attributes are not compatible");
8188 /* Can combine cdecl with regparm and sseregparm. */
8189 else if (is_attribute_p ("cdecl", name))
8191 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
8193 error ("stdcall and cdecl attributes are not compatible");
8195 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8197 error ("fastcall and cdecl attributes are not compatible");
8199 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
8201 error ("cdecl and thiscall attributes are not compatible");
8204 else if (is_attribute_p ("thiscall", name))
8206 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
8207 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
8208 name);
8209 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
8211 error ("stdcall and thiscall attributes are not compatible");
8213 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
8215 error ("fastcall and thiscall attributes are not compatible");
8217 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
8219 error ("cdecl and thiscall attributes are not compatible");
8223 /* Can combine sseregparm with all attributes. */
8225 return NULL_TREE;
8228 /* The transactional memory builtins are implicitly regparm or fastcall
8229 depending on the ABI. Override the generic do-nothing attribute that
8230 these builtins were declared with, and replace it with one of the two
8231 attributes that we expect elsewhere. */
8233 static tree
8234 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
8235 int flags, bool *no_add_attrs)
8237 tree alt;
8239 /* In no case do we want to add the placeholder attribute. */
8240 *no_add_attrs = true;
8242 /* The 64-bit ABI is unchanged for transactional memory. */
8243 if (TARGET_64BIT)
8244 return NULL_TREE;
8246 /* ??? Is there a better way to validate 32-bit Windows? We have
8247 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
8248 if (CHECK_STACK_LIMIT > 0)
8249 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
8250 else
8252 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
8253 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
8255 decl_attributes (node, alt, flags);
8257 return NULL_TREE;
8260 /* This function determines from TYPE the calling-convention. */
8262 unsigned int
8263 ix86_get_callcvt (const_tree type)
8265 unsigned int ret = 0;
8266 bool is_stdarg;
8267 tree attrs;
8269 if (TARGET_64BIT)
8270 return IX86_CALLCVT_CDECL;
8272 attrs = TYPE_ATTRIBUTES (type);
8273 if (attrs != NULL_TREE)
8275 if (lookup_attribute ("cdecl", attrs))
8276 ret |= IX86_CALLCVT_CDECL;
8277 else if (lookup_attribute ("stdcall", attrs))
8278 ret |= IX86_CALLCVT_STDCALL;
8279 else if (lookup_attribute ("fastcall", attrs))
8280 ret |= IX86_CALLCVT_FASTCALL;
8281 else if (lookup_attribute ("thiscall", attrs))
8282 ret |= IX86_CALLCVT_THISCALL;
8284 /* Regparm isn't allowed for thiscall and fastcall. */
8285 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
8287 if (lookup_attribute ("regparm", attrs))
8288 ret |= IX86_CALLCVT_REGPARM;
8289 if (lookup_attribute ("sseregparm", attrs))
8290 ret |= IX86_CALLCVT_SSEREGPARM;
8293 if (IX86_BASE_CALLCVT(ret) != 0)
8294 return ret;
8297 is_stdarg = stdarg_p (type);
8298 if (TARGET_RTD && !is_stdarg)
8299 return IX86_CALLCVT_STDCALL | ret;
8301 if (ret != 0
8302 || is_stdarg
8303 || TREE_CODE (type) != METHOD_TYPE
8304 || ix86_function_type_abi (type) != MS_ABI)
8305 return IX86_CALLCVT_CDECL | ret;
8307 return IX86_CALLCVT_THISCALL;
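/* For example, on 32-bit a prototype such as
     void __attribute__((stdcall)) f (int);
   yields IX86_CALLCVT_STDCALL, while an unadorned stdarg function gets
   IX86_CALLCVT_CDECL (and never the -mrtd STDCALL default), since a
   callee-pops convention cannot handle a variable argument list.  */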
8310 /* Return 0 if the attributes for two types are incompatible, 1 if they
8311 are compatible, and 2 if they are nearly compatible (which causes a
8312 warning to be generated). */
8314 static int
8315 ix86_comp_type_attributes (const_tree type1, const_tree type2)
8317 unsigned int ccvt1, ccvt2;
8319 if (TREE_CODE (type1) != FUNCTION_TYPE
8320 && TREE_CODE (type1) != METHOD_TYPE)
8321 return 1;
8323 ccvt1 = ix86_get_callcvt (type1);
8324 ccvt2 = ix86_get_callcvt (type2);
8325 if (ccvt1 != ccvt2)
8326 return 0;
8327 if (ix86_function_regparm (type1, NULL)
8328 != ix86_function_regparm (type2, NULL))
8329 return 0;
8331 return 1;
8334 /* Return the regparm value for a function with the indicated TYPE and DECL.
8335 DECL may be NULL when calling function indirectly
8336 or considering a libcall. */
8338 static int
8339 ix86_function_regparm (const_tree type, const_tree decl)
8341 tree attr;
8342 int regparm;
8343 unsigned int ccvt;
8345 if (TARGET_64BIT)
8346 return (ix86_function_type_abi (type) == SYSV_ABI
8347 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
8348 ccvt = ix86_get_callcvt (type);
8349 regparm = ix86_regparm;
8351 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
8353 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
8354 if (attr)
8356 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
8357 return regparm;
8360 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
8361 return 2;
8362 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
8363 return 1;
8365 /* Use register calling convention for local functions when possible. */
8366 if (decl
8367 && TREE_CODE (decl) == FUNCTION_DECL)
8369 cgraph_node *target = cgraph_node::get (decl);
8370 if (target)
8371 target = target->function_symbol ();
8373 /* Caller and callee must agree on the calling convention, so
8374 checking just `optimize' here would mean that with
8375 __attribute__((optimize (...))) the caller could use the regparm convention
8376 and the callee not, or vice versa. Instead look at whether the callee
8377 is optimized or not. */
8378 if (target && opt_for_fn (target->decl, optimize)
8379 && !(profile_flag && !flag_fentry))
8381 cgraph_local_info *i = &target->local;
8382 if (i && i->local && i->can_change_signature)
8384 int local_regparm, globals = 0, regno;
8386 /* Make sure no regparm register is taken by a
8387 fixed register variable. */
8388 for (local_regparm = 0; local_regparm < REGPARM_MAX;
8389 local_regparm++)
8390 if (fixed_regs[local_regparm])
8391 break;
8393 /* We don't want to use regparm(3) for nested functions as
8394 these use a static chain pointer in the third argument. */
8395 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
8396 local_regparm = 2;
8398 /* Save a register for the split stack. */
8399 if (flag_split_stack)
8401 if (local_regparm == 3)
8402 local_regparm = 2;
8403 else if (local_regparm == 2
8404 && DECL_STATIC_CHAIN (target->decl))
8405 local_regparm = 1;
8408 /* Each fixed register usage increases register pressure,
8409 so fewer registers should be used for argument passing.
8410 This functionality can be overridden by an explicit
8411 regparm value. */
8412 for (regno = AX_REG; regno <= DI_REG; regno++)
8413 if (fixed_regs[regno])
8414 globals++;
8416 local_regparm
8417 = globals < local_regparm ? local_regparm - globals : 0;
8419 if (local_regparm > regparm)
8420 regparm = local_regparm;
8425 return regparm;
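/* For example (a hypothetical declaration, not used in this file):

       int __attribute__((regparm(3))) sum3 (int a, int b, int c);

   makes the routine above return 3, so the three arguments travel in
   EAX, EDX and ECX instead of on the stack; the local-function path can
   derive a similar regparm value automatically for optimized local
   (static) functions.  */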
8428 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
8429 DFmode (2) arguments in SSE registers for a function with the
8430 indicated TYPE and DECL. DECL may be NULL when calling function
8431 indirectly or considering a libcall. Return -1 if any FP parameter
8432 should be rejected by error. This is used in situations where we imply the SSE
8433 calling convention but the function is called from another function with
8434 SSE disabled. Otherwise return 0. */
8436 static int
8437 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
8439 gcc_assert (!TARGET_64BIT);
8441 /* Use SSE registers to pass SFmode and DFmode arguments if requested
8442 by the sseregparm attribute. */
8443 if (TARGET_SSEREGPARM
8444 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
8446 if (!TARGET_SSE)
8448 if (warn)
8450 if (decl)
8451 error ("calling %qD with attribute sseregparm without "
8452 "SSE/SSE2 enabled", decl);
8453 else
8454 error ("calling %qT with attribute sseregparm without "
8455 "SSE/SSE2 enabled", type);
8457 return 0;
8460 return 2;
8463 if (!decl)
8464 return 0;
8466 cgraph_node *target = cgraph_node::get (decl);
8467 if (target)
8468 target = target->function_symbol ();
8470 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
8471 (and DFmode for SSE2) arguments in SSE registers. */
8472 if (target
8473 /* TARGET_SSE_MATH */
8474 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
8475 && opt_for_fn (target->decl, optimize)
8476 && !(profile_flag && !flag_fentry))
8478 cgraph_local_info *i = &target->local;
8479 if (i && i->local && i->can_change_signature)
8481 /* Refuse to produce wrong code when local function with SSE enabled
8482 is called from SSE disabled function.
8483 FIXME: We need a way to detect these cases cross-ltrans partition
8484 and avoid using SSE calling conventions on local functions called
8485 from function with SSE disabled. For now at least delay the
8486 warning until we know we are going to produce wrong code.
8487 See PR66047 */
8488 if (!TARGET_SSE && warn)
8489 return -1;
8490 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
8491 ->x_ix86_isa_flags) ? 2 : 1;
8495 return 0;
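/* Sketch of the attribute handled above, assuming a 32-bit target with
   SSE enabled (hypothetical declaration):

       double __attribute__((sseregparm)) scale (float x, double y);

   Here the routine returns 2, so X and Y are passed in SSE registers
   instead of on the stack; with SSE disabled the error or the -1 path
   above is taken instead.  */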
8498 /* Return true if EAX is live at the start of the function. Used by
8499 ix86_expand_prologue to determine if we need special help before
8500 calling allocate_stack_worker. */
8502 static bool
8503 ix86_eax_live_at_start_p (void)
8505 /* Cheat. Don't bother working forward from ix86_function_regparm
8506 to the function type to whether an actual argument is located in
8507 eax. Instead just look at cfg info, which is still close enough
8508 to correct at this point. This gives false positives for broken
8509 functions that might use uninitialized data that happens to be
8510 allocated in eax, but who cares? */
8511 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
8514 static bool
8515 ix86_keep_aggregate_return_pointer (tree fntype)
8517 tree attr;
8519 if (!TARGET_64BIT)
8521 attr = lookup_attribute ("callee_pop_aggregate_return",
8522 TYPE_ATTRIBUTES (fntype));
8523 if (attr)
8524 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
8526 /* For 32-bit MS-ABI the default is to keep aggregate
8527 return pointer. */
8528 if (ix86_function_type_abi (fntype) == MS_ABI)
8529 return true;
8531 return KEEP_AGGREGATE_RETURN_POINTER != 0;
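/* Illustration for the 32-bit attribute checked above (hypothetical):

       struct big { int v[4]; };
       struct big __attribute__((callee_pop_aggregate_return (1))) get (void);

   An argument of 1 asks the callee to pop the hidden return-slot pointer,
   so this predicate returns false; an argument of 0 keeps the pointer for
   the caller to pop and the predicate returns true.  */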
8534 /* Value is the number of bytes of arguments automatically
8535 popped when returning from a subroutine call.
8536 FUNDECL is the declaration node of the function (as a tree),
8537 FUNTYPE is the data type of the function (as a tree),
8538 or for a library call it is an identifier node for the subroutine name.
8539 SIZE is the number of bytes of arguments passed on the stack.
8541 On the 80386, the RTD insn may be used to pop them if the number
8542 of args is fixed, but if the number is variable then the caller
8543 must pop them all. RTD can't be used for library calls now
8544 because the library is compiled with the Unix compiler.
8545 Use of RTD is a selectable option, since it is incompatible with
8546 standard Unix calling sequences. If the option is not selected,
8547 the caller must always pop the args.
8549 The attribute stdcall is equivalent to RTD on a per module basis. */
8551 static int
8552 ix86_return_pops_args (tree fundecl, tree funtype, int size)
8554 unsigned int ccvt;
8556 /* None of the 64-bit ABIs pop arguments. */
8557 if (TARGET_64BIT)
8558 return 0;
8560 ccvt = ix86_get_callcvt (funtype);
8562 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
8563 | IX86_CALLCVT_THISCALL)) != 0
8564 && ! stdarg_p (funtype))
8565 return size;
8567 /* Lose any fake structure return argument if it is passed on the stack. */
8568 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
8569 && !ix86_keep_aggregate_return_pointer (funtype))
8571 int nregs = ix86_function_regparm (funtype, fundecl);
8572 if (nregs == 0)
8573 return GET_MODE_SIZE (Pmode);
8576 return 0;
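/* A small worked example of the stdcall rule above (hypothetical):

       void __attribute__((stdcall)) put2 (int a, int b);

   SIZE is 8 for such a call, the convention includes IX86_CALLCVT_STDCALL
   and the function is not variadic, so 8 is returned and the callee ends
   with "ret $8" rather than a plain "ret".  */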
8579 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
8581 static bool
8582 ix86_legitimate_combined_insn (rtx_insn *insn)
8584 int i;
8586 /* Check operand constraints in case hard registers were propagated
8587 into insn pattern. This check prevents combine pass from
8588 generating insn patterns with invalid hard register operands.
8589 These invalid insns can eventually confuse reload to error out
8590 with a spill failure. See also PRs 46829 and 46843. */
8592 gcc_assert (INSN_CODE (insn) >= 0);
8594 extract_insn (insn);
8595 preprocess_constraints (insn);
8597 int n_operands = recog_data.n_operands;
8598 int n_alternatives = recog_data.n_alternatives;
8599 for (i = 0; i < n_operands; i++)
8601 rtx op = recog_data.operand[i];
8602 machine_mode mode = GET_MODE (op);
8603 const operand_alternative *op_alt;
8604 int offset = 0;
8605 bool win;
8606 int j;
8608 /* A unary operator may be accepted by the predicate, but it
8609 is irrelevant for matching constraints. */
8610 if (UNARY_P (op))
8611 op = XEXP (op, 0);
8613 if (SUBREG_P (op))
8615 if (REG_P (SUBREG_REG (op))
8616 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
8617 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
8618 GET_MODE (SUBREG_REG (op)),
8619 SUBREG_BYTE (op),
8620 GET_MODE (op));
8621 op = SUBREG_REG (op);
8624 if (!(REG_P (op) && HARD_REGISTER_P (op)))
8625 continue;
8627 op_alt = recog_op_alt;
8629 /* Operand has no constraints, anything is OK. */
8630 win = !n_alternatives;
8632 alternative_mask preferred = get_preferred_alternatives (insn);
8633 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
8635 if (!TEST_BIT (preferred, j))
8636 continue;
8637 if (op_alt[i].anything_ok
8638 || (op_alt[i].matches != -1
8639 && operands_match_p
8640 (recog_data.operand[i],
8641 recog_data.operand[op_alt[i].matches]))
8642 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
8644 win = true;
8645 break;
8649 if (!win)
8650 return false;
8653 return true;
8656 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
8658 static unsigned HOST_WIDE_INT
8659 ix86_asan_shadow_offset (void)
8661 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
8662 : HOST_WIDE_INT_C (0x7fff8000))
8663 : (HOST_WIDE_INT_1 << 29);
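/* Roughly, AddressSanitizer maps each 8 bytes of application memory onto
   one shadow byte, so instrumented code computes something like

       shadow = (addr >> 3) + ix86_asan_shadow_offset ();

   with the offset above: 0x7fff8000 for LP64 Linux, 1 << 44 for LP64
   Mach-O and 1 << 29 for 32-bit targets.  */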
8666 /* Argument support functions. */
8668 /* Return true when register may be used to pass function parameters. */
8669 bool
8670 ix86_function_arg_regno_p (int regno)
8672 int i;
8673 enum calling_abi call_abi;
8674 const int *parm_regs;
8676 if (TARGET_MPX && BND_REGNO_P (regno))
8677 return true;
8679 if (!TARGET_64BIT)
8681 if (TARGET_MACHO)
8682 return (regno < REGPARM_MAX
8683 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
8684 else
8685 return (regno < REGPARM_MAX
8686 || (TARGET_MMX && MMX_REGNO_P (regno)
8687 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
8688 || (TARGET_SSE && SSE_REGNO_P (regno)
8689 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
8692 if (TARGET_SSE && SSE_REGNO_P (regno)
8693 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
8694 return true;
8696 /* TODO: The function should depend on current function ABI but
8697 builtins.c would need updating then. Therefore we use the
8698 default ABI. */
8699 call_abi = ix86_cfun_abi ();
8701 /* RAX is used as hidden argument to va_arg functions. */
8702 if (call_abi == SYSV_ABI && regno == AX_REG)
8703 return true;
8705 if (call_abi == MS_ABI)
8706 parm_regs = x86_64_ms_abi_int_parameter_registers;
8707 else
8708 parm_regs = x86_64_int_parameter_registers;
8710 for (i = 0; i < (call_abi == MS_ABI
8711 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
8712 if (regno == parm_regs[i])
8713 return true;
8714 return false;
8717 /* Return true if we do not know how to pass TYPE solely in registers. */
8719 static bool
8720 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
8722 if (must_pass_in_stack_var_size_or_pad (mode, type))
8723 return true;
8725 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
8726 The layout_type routine is crafty and tries to trick us into passing
8727 currently unsupported vector types on the stack by using TImode. */
8728 return (!TARGET_64BIT && mode == TImode
8729 && type && TREE_CODE (type) != VECTOR_TYPE);
8732 /* Return the size, in bytes, of the area reserved for arguments passed
8733 in registers for the function represented by FNDECL, depending on the
8734 ABI used. */
8735 int
8736 ix86_reg_parm_stack_space (const_tree fndecl)
8738 enum calling_abi call_abi = SYSV_ABI;
8739 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
8740 call_abi = ix86_function_abi (fndecl);
8741 else
8742 call_abi = ix86_function_type_abi (fndecl);
8743 if (TARGET_64BIT && call_abi == MS_ABI)
8744 return 32;
8745 return 0;
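/* For instance, a 64-bit MS-ABI call such as (hypothetical)

       void g (int a, int b, int c, int d);
       g (1, 2, 3, 4);

   passes all four values in registers, yet the caller still reserves the
   32 bytes returned above (the "home" or shadow area) just below the
   return address, where the callee may spill RCX, RDX, R8 and R9.  */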
8748 /* We add this as a workaround in order to use libc_has_function
8749 hook in i386.md. */
8750 bool
8751 ix86_libc_has_function (enum function_class fn_class)
8753 return targetm.libc_has_function (fn_class);
8756 /* Return the calling ABI, SYSV_ABI or MS_ABI, to use for the
8757 function type FNTYPE. */
8758 enum calling_abi
8759 ix86_function_type_abi (const_tree fntype)
8761 enum calling_abi abi = ix86_abi;
8763 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
8764 return abi;
8766 if (abi == SYSV_ABI
8767 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
8769 static int warned;
8770 if (TARGET_X32 && !warned)
8772 error ("X32 does not support ms_abi attribute");
8773 warned = 1;
8776 abi = MS_ABI;
8778 else if (abi == MS_ABI
8779 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
8780 abi = SYSV_ABI;
8782 return abi;
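/* Sketch of the per-type ABI switch handled above (hypothetical
   declarations on a SYSV-default 64-bit target):

       void __attribute__((ms_abi))   w (int, int, int, int, int);
       void __attribute__((sysv_abi)) u (int, int, int, int, int);

   W takes its first four arguments in RCX, RDX, R8 and R9 with the fifth
   on the stack, while U keeps the usual RDI, RSI, RDX, RCX, R8 assignment;
   on X32 the ms_abi attribute is rejected by the error above.  */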
8785 static enum calling_abi
8786 ix86_function_abi (const_tree fndecl)
8788 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
8791 /* Return the calling ABI, SYSV_ABI or MS_ABI, used by the
8792 current function (cfun). */
8793 enum calling_abi
8794 ix86_cfun_abi (void)
8796 return cfun ? cfun->machine->call_abi : ix86_abi;
8799 static bool
8800 ix86_function_ms_hook_prologue (const_tree fn)
8802 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
8804 if (decl_function_context (fn) != NULL_TREE)
8805 error_at (DECL_SOURCE_LOCATION (fn),
8806 "ms_hook_prologue is not compatible with nested function");
8807 else
8808 return true;
8810 return false;
8813 static bool
8814 ix86_function_naked (const_tree fn)
8816 if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn)))
8817 return true;
8819 return false;
8822 /* Write the extra assembler code needed to declare a function properly. */
8824 void
8825 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
8826 tree decl)
8828 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
8830 if (is_ms_hook)
8832 int i, filler_count = (TARGET_64BIT ? 32 : 16);
8833 unsigned int filler_cc = 0xcccccccc;
8835 for (i = 0; i < filler_count; i += 4)
8836 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
8839 #ifdef SUBTARGET_ASM_UNWIND_INIT
8840 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
8841 #endif
8843 ASM_OUTPUT_LABEL (asm_out_file, fname);
8845 /* Output magic byte marker, if hot-patch attribute is set. */
8846 if (is_ms_hook)
8848 if (TARGET_64BIT)
8850 /* leaq [%rsp + 0], %rsp */
8851 fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n",
8852 asm_out_file);
8854 else
8856 /* movl.s %edi, %edi
8857 push %ebp
8858 movl.s %esp, %ebp */
8859 fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file);
8864 /* Implementation of the call ABI switching target hook. The call
8865 register sets specific to FNDECL are selected here. See also
8866 ix86_conditional_register_usage for more details. */
8867 void
8868 ix86_call_abi_override (const_tree fndecl)
8870 cfun->machine->call_abi = ix86_function_abi (fndecl);
8873 /* Return true if a pseudo register should be created and used to hold
8874 the GOT address for PIC code. */
8875 bool
8876 ix86_use_pseudo_pic_reg (void)
8878 if ((TARGET_64BIT
8879 && (ix86_cmodel == CM_SMALL_PIC
8880 || TARGET_PECOFF))
8881 || !flag_pic)
8882 return false;
8883 return true;
8886 /* Initialize large model PIC register. */
8888 static void
8889 ix86_init_large_pic_reg (unsigned int tmp_regno)
8891 rtx_code_label *label;
8892 rtx tmp_reg;
8894 gcc_assert (Pmode == DImode);
8895 label = gen_label_rtx ();
8896 emit_label (label);
8897 LABEL_PRESERVE_P (label) = 1;
8898 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
8899 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
8900 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
8901 label));
8902 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
8903 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
8904 pic_offset_table_rtx, tmp_reg));
8907 /* Create and initialize PIC register if required. */
8908 static void
8909 ix86_init_pic_reg (void)
8911 edge entry_edge;
8912 rtx_insn *seq;
8914 if (!ix86_use_pseudo_pic_reg ())
8915 return;
8917 start_sequence ();
8919 if (TARGET_64BIT)
8921 if (ix86_cmodel == CM_LARGE_PIC)
8922 ix86_init_large_pic_reg (R11_REG);
8923 else
8924 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
8926 else
8928 /* If there is a future mcount call in the function it is more profitable
8929 to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM. */
8930 rtx reg = crtl->profile
8931 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
8932 : pic_offset_table_rtx;
8933 rtx_insn *insn = emit_insn (gen_set_got (reg));
8934 RTX_FRAME_RELATED_P (insn) = 1;
8935 if (crtl->profile)
8936 emit_move_insn (pic_offset_table_rtx, reg);
8937 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
8940 seq = get_insns ();
8941 end_sequence ();
8943 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8944 insert_insn_on_edge (seq, entry_edge);
8945 commit_one_edge_insertion (entry_edge);
8948 /* Initialize a variable CUM of type CUMULATIVE_ARGS
8949 for a call to a function whose data type is FNTYPE.
8950 For a library call, FNTYPE is 0. */
8952 void
8953 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
8954 tree fntype, /* tree ptr for function decl */
8955 rtx libname, /* SYMBOL_REF of library name or 0 */
8956 tree fndecl,
8957 int caller)
8959 struct cgraph_local_info *i = NULL;
8960 struct cgraph_node *target = NULL;
8962 memset (cum, 0, sizeof (*cum));
8964 if (fndecl)
8966 target = cgraph_node::get (fndecl);
8967 if (target)
8969 target = target->function_symbol ();
8970 i = cgraph_node::local_info (target->decl);
8971 cum->call_abi = ix86_function_abi (target->decl);
8973 else
8974 cum->call_abi = ix86_function_abi (fndecl);
8976 else
8977 cum->call_abi = ix86_function_type_abi (fntype);
8979 cum->caller = caller;
8981 /* Set up the number of registers to use for passing arguments. */
8982 cum->nregs = ix86_regparm;
8983 if (TARGET_64BIT)
8985 cum->nregs = (cum->call_abi == SYSV_ABI
8986 ? X86_64_REGPARM_MAX
8987 : X86_64_MS_REGPARM_MAX);
8989 if (TARGET_SSE)
8991 cum->sse_nregs = SSE_REGPARM_MAX;
8992 if (TARGET_64BIT)
8994 cum->sse_nregs = (cum->call_abi == SYSV_ABI
8995 ? X86_64_SSE_REGPARM_MAX
8996 : X86_64_MS_SSE_REGPARM_MAX);
8999 if (TARGET_MMX)
9000 cum->mmx_nregs = MMX_REGPARM_MAX;
9001 cum->warn_avx512f = true;
9002 cum->warn_avx = true;
9003 cum->warn_sse = true;
9004 cum->warn_mmx = true;
9006 /* Because types might mismatch between caller and callee, we need to
9007 use the actual type of the function for local calls.
9008 FIXME: cgraph_analyze can be told to actually record whether a function
9009 uses va_start, so for local functions maybe_vaarg can be made more
9010 aggressive, helping K&R code.
9011 FIXME: once the type system is fixed, we won't need this code anymore. */
9012 if (i && i->local && i->can_change_signature)
9013 fntype = TREE_TYPE (target->decl);
9014 cum->stdarg = stdarg_p (fntype);
9015 cum->maybe_vaarg = (fntype
9016 ? (!prototype_p (fntype) || stdarg_p (fntype))
9017 : !libname);
9019 cum->bnd_regno = FIRST_BND_REG;
9020 cum->bnds_in_bt = 0;
9021 cum->force_bnd_pass = 0;
9022 cum->decl = fndecl;
9024 if (!TARGET_64BIT)
9026 /* If there are variable arguments, then we won't pass anything
9027 in registers in 32-bit mode. */
9028 if (stdarg_p (fntype))
9030 cum->nregs = 0;
9031 /* Since in 32-bit mode variable arguments are always passed on
9032 the stack, there is a scratch register available for an indirect
9033 sibcall. */
9034 cfun->machine->arg_reg_available = true;
9035 cum->sse_nregs = 0;
9036 cum->mmx_nregs = 0;
9037 cum->warn_avx512f = false;
9038 cum->warn_avx = false;
9039 cum->warn_sse = false;
9040 cum->warn_mmx = false;
9041 return;
9044 /* Use the ecx and edx registers if the function has the fastcall
9045 attribute, else look for regparm information. */
9046 if (fntype)
9048 unsigned int ccvt = ix86_get_callcvt (fntype);
9049 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
9051 cum->nregs = 1;
9052 cum->fastcall = 1; /* Same first register as in fastcall. */
9054 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
9056 cum->nregs = 2;
9057 cum->fastcall = 1;
9059 else
9060 cum->nregs = ix86_function_regparm (fntype, fndecl);
9063 /* Set up the number of SSE registers used for passing SFmode
9064 and DFmode arguments. Warn for mismatching ABI. */
9065 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
9068 cfun->machine->arg_reg_available = (cum->nregs > 0);
9071 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
9072 But in the case of vector types, it is some vector mode.
9074 When we have only some of our vector isa extensions enabled, then there
9075 are some modes for which vector_mode_supported_p is false. For these
9076 modes, the generic vector support in gcc will choose some non-vector mode
9077 in order to implement the type. By computing the natural mode, we'll
9078 select the proper ABI location for the operand and not depend on whatever
9079 the middle-end decides to do with these vector types.
9081 The middle-end can't deal with vector types > 16 bytes. In this
9082 case, we return the original mode and warn of the ABI change if CUM
9083 isn't NULL.
9085 If IN_RETURN is true, warn of the ABI change if the vector mode isn't
9086 available for the function return value. */
9088 static machine_mode
9089 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
9090 bool in_return)
9092 machine_mode mode = TYPE_MODE (type);
9094 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
9096 HOST_WIDE_INT size = int_size_in_bytes (type);
9097 if ((size == 8 || size == 16 || size == 32 || size == 64)
9098 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
9099 && TYPE_VECTOR_SUBPARTS (type) > 1)
9101 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
9103 /* There are no XFmode vector modes. */
9104 if (innermode == XFmode)
9105 return mode;
9107 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
9108 mode = MIN_MODE_VECTOR_FLOAT;
9109 else
9110 mode = MIN_MODE_VECTOR_INT;
9112 /* Get the mode which has this inner mode and number of units. */
9113 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
9114 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
9115 && GET_MODE_INNER (mode) == innermode)
9117 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
9119 static bool warnedavx512f;
9120 static bool warnedavx512f_ret;
9122 if (cum && cum->warn_avx512f && !warnedavx512f)
9124 if (warning (OPT_Wpsabi, "AVX512F vector argument "
9125 "without AVX512F enabled changes the ABI"))
9126 warnedavx512f = true;
9128 else if (in_return && !warnedavx512f_ret)
9130 if (warning (OPT_Wpsabi, "AVX512F vector return "
9131 "without AVX512F enabled changes the ABI"))
9132 warnedavx512f_ret = true;
9135 return TYPE_MODE (type);
9137 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
9139 static bool warnedavx;
9140 static bool warnedavx_ret;
9142 if (cum && cum->warn_avx && !warnedavx)
9144 if (warning (OPT_Wpsabi, "AVX vector argument "
9145 "without AVX enabled changes the ABI"))
9146 warnedavx = true;
9148 else if (in_return && !warnedavx_ret)
9150 if (warning (OPT_Wpsabi, "AVX vector return "
9151 "without AVX enabled changes the ABI"))
9152 warnedavx_ret = true;
9155 return TYPE_MODE (type);
9157 else if (((size == 8 && TARGET_64BIT) || size == 16)
9158 && !TARGET_SSE
9159 && !TARGET_IAMCU)
9161 static bool warnedsse;
9162 static bool warnedsse_ret;
9164 if (cum && cum->warn_sse && !warnedsse)
9166 if (warning (OPT_Wpsabi, "SSE vector argument "
9167 "without SSE enabled changes the ABI"))
9168 warnedsse = true;
9170 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
9172 if (warning (OPT_Wpsabi, "SSE vector return "
9173 "without SSE enabled changes the ABI"))
9174 warnedsse_ret = true;
9177 else if ((size == 8 && !TARGET_64BIT)
9178 && (!cfun
9179 || cfun->machine->func_type == TYPE_NORMAL)
9180 && !TARGET_MMX
9181 && !TARGET_IAMCU)
9183 static bool warnedmmx;
9184 static bool warnedmmx_ret;
9186 if (cum && cum->warn_mmx && !warnedmmx)
9188 if (warning (OPT_Wpsabi, "MMX vector argument "
9189 "without MMX enabled changes the ABI"))
9190 warnedmmx = true;
9192 else if (in_return && !warnedmmx_ret)
9194 if (warning (OPT_Wpsabi, "MMX vector return "
9195 "without MMX enabled changes the ABI"))
9196 warnedmmx_ret = true;
9199 return mode;
9202 gcc_unreachable ();
9206 return mode;
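/* A sketch of the -Wpsabi situation described above: compiling

       typedef int v8si __attribute__((vector_size (32)));
       v8si add (v8si a, v8si b) { return a + b; }

   without -mavx leaves V8SImode unsupported, so the code above warns once
   that the ABI changes and falls back to TYPE_MODE (type) for the
   argument and return value.  */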
9209 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
9210 this may not agree with the mode that the type system has chosen for the
9211 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
9212 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
9214 static rtx
9215 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
9216 unsigned int regno)
9218 rtx tmp;
9220 if (orig_mode != BLKmode)
9221 tmp = gen_rtx_REG (orig_mode, regno);
9222 else
9224 tmp = gen_rtx_REG (mode, regno);
9225 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
9226 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
9229 return tmp;
9232 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
9233 of this code is to classify each 8bytes of incoming argument by the register
9234 class and assign registers accordingly. */
9236 /* Return the union class of CLASS1 and CLASS2.
9237 See the x86-64 PS ABI for details. */
9239 static enum x86_64_reg_class
9240 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
9242 /* Rule #1: If both classes are equal, this is the resulting class. */
9243 if (class1 == class2)
9244 return class1;
9246 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
9247 the other class. */
9248 if (class1 == X86_64_NO_CLASS)
9249 return class2;
9250 if (class2 == X86_64_NO_CLASS)
9251 return class1;
9253 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
9254 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
9255 return X86_64_MEMORY_CLASS;
9257 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
9258 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
9259 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
9260 return X86_64_INTEGERSI_CLASS;
9261 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
9262 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
9263 return X86_64_INTEGER_CLASS;
9265 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
9266 MEMORY is used. */
9267 if (class1 == X86_64_X87_CLASS
9268 || class1 == X86_64_X87UP_CLASS
9269 || class1 == X86_64_COMPLEX_X87_CLASS
9270 || class2 == X86_64_X87_CLASS
9271 || class2 == X86_64_X87UP_CLASS
9272 || class2 == X86_64_COMPLEX_X87_CLASS)
9273 return X86_64_MEMORY_CLASS;
9275 /* Rule #6: Otherwise class SSE is used. */
9276 return X86_64_SSE_CLASS;
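/* A small worked example of the classification below (sketch):

       struct s { double d; int i; };

   occupies two eightbytes; the first is classified as an SSE class from
   the double and the second as an integer class from the int, so under
   the SysV ABI the struct is passed in one SSE register plus one integer
   register rather than in memory.  */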
9279 /* Classify the argument of type TYPE and mode MODE.
9280 CLASSES will be filled by the register class used to pass each word
9281 of the operand. The number of words is returned. In case the parameter
9282 should be passed in memory, 0 is returned. As a special case for zero
9283 sized containers, classes[0] will be NO_CLASS and 1 is returned.
9285 BIT_OFFSET is used internally for handling records and specifies the
9286 offset in bits modulo 512, to avoid overflow cases.
9288 See the x86-64 PS ABI for details.
9291 static int
9292 classify_argument (machine_mode mode, const_tree type,
9293 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
9295 HOST_WIDE_INT bytes =
9296 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
9297 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
9299 /* Variable sized entities are always passed/returned in memory. */
9300 if (bytes < 0)
9301 return 0;
9303 if (mode != VOIDmode
9304 && targetm.calls.must_pass_in_stack (mode, type))
9305 return 0;
9307 if (type && AGGREGATE_TYPE_P (type))
9309 int i;
9310 tree field;
9311 enum x86_64_reg_class subclasses[MAX_CLASSES];
9313 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
9314 if (bytes > 64)
9315 return 0;
9317 for (i = 0; i < words; i++)
9318 classes[i] = X86_64_NO_CLASS;
9320 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
9321 signal the memory class, so handle this as a special case. */
9322 if (!words)
9324 classes[0] = X86_64_NO_CLASS;
9325 return 1;
9328 /* Classify each field of record and merge classes. */
9329 switch (TREE_CODE (type))
9331 case RECORD_TYPE:
9332 /* And now merge the fields of structure. */
9333 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
9335 if (TREE_CODE (field) == FIELD_DECL)
9337 int num;
9339 if (TREE_TYPE (field) == error_mark_node)
9340 continue;
9342 /* Bitfields are always classified as integer. Handle them
9343 early, since later code would consider them to be
9344 misaligned integers. */
9345 if (DECL_BIT_FIELD (field))
9347 for (i = (int_bit_position (field)
9348 + (bit_offset % 64)) / 8 / 8;
9349 i < ((int_bit_position (field) + (bit_offset % 64))
9350 + tree_to_shwi (DECL_SIZE (field))
9351 + 63) / 8 / 8; i++)
9352 classes[i] =
9353 merge_classes (X86_64_INTEGER_CLASS,
9354 classes[i]);
9356 else
9358 int pos;
9360 type = TREE_TYPE (field);
9362 /* Flexible array member is ignored. */
9363 if (TYPE_MODE (type) == BLKmode
9364 && TREE_CODE (type) == ARRAY_TYPE
9365 && TYPE_SIZE (type) == NULL_TREE
9366 && TYPE_DOMAIN (type) != NULL_TREE
9367 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
9368 == NULL_TREE))
9370 static bool warned;
9372 if (!warned && warn_psabi)
9374 warned = true;
9375 inform (input_location,
9376 "the ABI of passing struct with"
9377 " a flexible array member has"
9378 " changed in GCC 4.4");
9380 continue;
9382 num = classify_argument (TYPE_MODE (type), type,
9383 subclasses,
9384 (int_bit_position (field)
9385 + bit_offset) % 512);
9386 if (!num)
9387 return 0;
9388 pos = (int_bit_position (field)
9389 + (bit_offset % 64)) / 8 / 8;
9390 for (i = 0; i < num && (i + pos) < words; i++)
9391 classes[i + pos] =
9392 merge_classes (subclasses[i], classes[i + pos]);
9396 break;
9398 case ARRAY_TYPE:
9399 /* Arrays are handled as small records. */
9401 int num;
9402 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
9403 TREE_TYPE (type), subclasses, bit_offset);
9404 if (!num)
9405 return 0;
9407 /* The partial classes are now full classes. */
9408 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
9409 subclasses[0] = X86_64_SSE_CLASS;
9410 if (subclasses[0] == X86_64_INTEGERSI_CLASS
9411 && !((bit_offset % 64) == 0 && bytes == 4))
9412 subclasses[0] = X86_64_INTEGER_CLASS;
9414 for (i = 0; i < words; i++)
9415 classes[i] = subclasses[i % num];
9417 break;
9419 case UNION_TYPE:
9420 case QUAL_UNION_TYPE:
9421 /* Unions are similar to RECORD_TYPE but the offset is always 0. */
9423 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
9425 if (TREE_CODE (field) == FIELD_DECL)
9427 int num;
9429 if (TREE_TYPE (field) == error_mark_node)
9430 continue;
9432 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
9433 TREE_TYPE (field), subclasses,
9434 bit_offset);
9435 if (!num)
9436 return 0;
9437 for (i = 0; i < num && i < words; i++)
9438 classes[i] = merge_classes (subclasses[i], classes[i]);
9441 break;
9443 default:
9444 gcc_unreachable ();
9447 if (words > 2)
9449 /* When the size > 16 bytes, if the first eightbyte isn't
9450 X86_64_SSE_CLASS or any of the others isn't
9451 X86_64_SSEUP_CLASS, everything should be passed in
9452 memory. */
9453 if (classes[0] != X86_64_SSE_CLASS)
9454 return 0;
9456 for (i = 1; i < words; i++)
9457 if (classes[i] != X86_64_SSEUP_CLASS)
9458 return 0;
9461 /* Final merger cleanup. */
9462 for (i = 0; i < words; i++)
9464 /* If one class is MEMORY, everything should be passed in
9465 memory. */
9466 if (classes[i] == X86_64_MEMORY_CLASS)
9467 return 0;
9469 /* The X86_64_SSEUP_CLASS should always be preceded by
9470 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
9471 if (classes[i] == X86_64_SSEUP_CLASS
9472 && classes[i - 1] != X86_64_SSE_CLASS
9473 && classes[i - 1] != X86_64_SSEUP_CLASS)
9475 /* The first one should never be X86_64_SSEUP_CLASS. */
9476 gcc_assert (i != 0);
9477 classes[i] = X86_64_SSE_CLASS;
9480 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
9481 everything should be passed in memory. */
9482 if (classes[i] == X86_64_X87UP_CLASS
9483 && (classes[i - 1] != X86_64_X87_CLASS))
9485 static bool warned;
9487 /* The first one should never be X86_64_X87UP_CLASS. */
9488 gcc_assert (i != 0);
9489 if (!warned && warn_psabi)
9491 warned = true;
9492 inform (input_location,
9493 "the ABI of passing union with long double"
9494 " has changed in GCC 4.4");
9496 return 0;
9499 return words;
9502 /* Compute the alignment needed. We align all types to natural boundaries
9503 with the exception of XFmode, which is aligned to 64 bits. */
9504 if (mode != VOIDmode && mode != BLKmode)
9506 int mode_alignment = GET_MODE_BITSIZE (mode);
9508 if (mode == XFmode)
9509 mode_alignment = 128;
9510 else if (mode == XCmode)
9511 mode_alignment = 256;
9512 if (COMPLEX_MODE_P (mode))
9513 mode_alignment /= 2;
9514 /* Misaligned fields are always returned in memory. */
9515 if (bit_offset % mode_alignment)
9516 return 0;
9519 /* for V1xx modes, just use the base mode */
9520 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
9521 && GET_MODE_UNIT_SIZE (mode) == bytes)
9522 mode = GET_MODE_INNER (mode);
9524 /* Classification of atomic types. */
9525 switch (mode)
9527 case E_SDmode:
9528 case E_DDmode:
9529 classes[0] = X86_64_SSE_CLASS;
9530 return 1;
9531 case E_TDmode:
9532 classes[0] = X86_64_SSE_CLASS;
9533 classes[1] = X86_64_SSEUP_CLASS;
9534 return 2;
9535 case E_DImode:
9536 case E_SImode:
9537 case E_HImode:
9538 case E_QImode:
9539 case E_CSImode:
9540 case E_CHImode:
9541 case E_CQImode:
9543 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
9545 /* Analyze last 128 bits only. */
9546 size = (size - 1) & 0x7f;
9548 if (size < 32)
9550 classes[0] = X86_64_INTEGERSI_CLASS;
9551 return 1;
9553 else if (size < 64)
9555 classes[0] = X86_64_INTEGER_CLASS;
9556 return 1;
9558 else if (size < 64+32)
9560 classes[0] = X86_64_INTEGER_CLASS;
9561 classes[1] = X86_64_INTEGERSI_CLASS;
9562 return 2;
9564 else if (size < 64+64)
9566 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
9567 return 2;
9569 else
9570 gcc_unreachable ();
9572 case E_CDImode:
9573 case E_TImode:
9574 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
9575 return 2;
9576 case E_COImode:
9577 case E_OImode:
9578 /* OImode shouldn't be used directly. */
9579 gcc_unreachable ();
9580 case E_CTImode:
9581 return 0;
9582 case E_SFmode:
9583 if (!(bit_offset % 64))
9584 classes[0] = X86_64_SSESF_CLASS;
9585 else
9586 classes[0] = X86_64_SSE_CLASS;
9587 return 1;
9588 case E_DFmode:
9589 classes[0] = X86_64_SSEDF_CLASS;
9590 return 1;
9591 case E_XFmode:
9592 classes[0] = X86_64_X87_CLASS;
9593 classes[1] = X86_64_X87UP_CLASS;
9594 return 2;
9595 case E_TFmode:
9596 classes[0] = X86_64_SSE_CLASS;
9597 classes[1] = X86_64_SSEUP_CLASS;
9598 return 2;
9599 case E_SCmode:
9600 classes[0] = X86_64_SSE_CLASS;
9601 if (!(bit_offset % 64))
9602 return 1;
9603 else
9605 static bool warned;
9607 if (!warned && warn_psabi)
9609 warned = true;
9610 inform (input_location,
9611 "the ABI of passing structure with complex float"
9612 " member has changed in GCC 4.4");
9614 classes[1] = X86_64_SSESF_CLASS;
9615 return 2;
9617 case E_DCmode:
9618 classes[0] = X86_64_SSEDF_CLASS;
9619 classes[1] = X86_64_SSEDF_CLASS;
9620 return 2;
9621 case E_XCmode:
9622 classes[0] = X86_64_COMPLEX_X87_CLASS;
9623 return 1;
9624 case E_TCmode:
9625 /* This mode is larger than 16 bytes. */
9626 return 0;
9627 case E_V8SFmode:
9628 case E_V8SImode:
9629 case E_V32QImode:
9630 case E_V16HImode:
9631 case E_V4DFmode:
9632 case E_V4DImode:
9633 classes[0] = X86_64_SSE_CLASS;
9634 classes[1] = X86_64_SSEUP_CLASS;
9635 classes[2] = X86_64_SSEUP_CLASS;
9636 classes[3] = X86_64_SSEUP_CLASS;
9637 return 4;
9638 case E_V8DFmode:
9639 case E_V16SFmode:
9640 case E_V8DImode:
9641 case E_V16SImode:
9642 case E_V32HImode:
9643 case E_V64QImode:
9644 classes[0] = X86_64_SSE_CLASS;
9645 classes[1] = X86_64_SSEUP_CLASS;
9646 classes[2] = X86_64_SSEUP_CLASS;
9647 classes[3] = X86_64_SSEUP_CLASS;
9648 classes[4] = X86_64_SSEUP_CLASS;
9649 classes[5] = X86_64_SSEUP_CLASS;
9650 classes[6] = X86_64_SSEUP_CLASS;
9651 classes[7] = X86_64_SSEUP_CLASS;
9652 return 8;
9653 case E_V4SFmode:
9654 case E_V4SImode:
9655 case E_V16QImode:
9656 case E_V8HImode:
9657 case E_V2DFmode:
9658 case E_V2DImode:
9659 classes[0] = X86_64_SSE_CLASS;
9660 classes[1] = X86_64_SSEUP_CLASS;
9661 return 2;
9662 case E_V1TImode:
9663 case E_V1DImode:
9664 case E_V2SFmode:
9665 case E_V2SImode:
9666 case E_V4HImode:
9667 case E_V8QImode:
9668 classes[0] = X86_64_SSE_CLASS;
9669 return 1;
9670 case E_BLKmode:
9671 case E_VOIDmode:
9672 return 0;
9673 default:
9674 gcc_assert (VECTOR_MODE_P (mode));
9676 if (bytes > 16)
9677 return 0;
9679 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
9681 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
9682 classes[0] = X86_64_INTEGERSI_CLASS;
9683 else
9684 classes[0] = X86_64_INTEGER_CLASS;
9685 classes[1] = X86_64_INTEGER_CLASS;
9686 return 1 + (bytes > 8);
9690 /* Examine the argument and set the number of registers required in each
9691 class. Return true iff the parameter should be passed in memory. */
9693 static bool
9694 examine_argument (machine_mode mode, const_tree type, int in_return,
9695 int *int_nregs, int *sse_nregs)
9697 enum x86_64_reg_class regclass[MAX_CLASSES];
9698 int n = classify_argument (mode, type, regclass, 0);
9700 *int_nregs = 0;
9701 *sse_nregs = 0;
9703 if (!n)
9704 return true;
9705 for (n--; n >= 0; n--)
9706 switch (regclass[n])
9708 case X86_64_INTEGER_CLASS:
9709 case X86_64_INTEGERSI_CLASS:
9710 (*int_nregs)++;
9711 break;
9712 case X86_64_SSE_CLASS:
9713 case X86_64_SSESF_CLASS:
9714 case X86_64_SSEDF_CLASS:
9715 (*sse_nregs)++;
9716 break;
9717 case X86_64_NO_CLASS:
9718 case X86_64_SSEUP_CLASS:
9719 break;
9720 case X86_64_X87_CLASS:
9721 case X86_64_X87UP_CLASS:
9722 case X86_64_COMPLEX_X87_CLASS:
9723 if (!in_return)
9724 return true;
9725 break;
9726 case X86_64_MEMORY_CLASS:
9727 gcc_unreachable ();
9730 return false;
9733 /* Construct container for the argument used by GCC interface. See
9734 FUNCTION_ARG for the detailed description. */
9736 static rtx
9737 construct_container (machine_mode mode, machine_mode orig_mode,
9738 const_tree type, int in_return, int nintregs, int nsseregs,
9739 const int *intreg, int sse_regno)
9741 /* The following variables hold the static issued_error state. */
9742 static bool issued_sse_arg_error;
9743 static bool issued_sse_ret_error;
9744 static bool issued_x87_ret_error;
9746 machine_mode tmpmode;
9747 int bytes =
9748 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
9749 enum x86_64_reg_class regclass[MAX_CLASSES];
9750 int n;
9751 int i;
9752 int nexps = 0;
9753 int needed_sseregs, needed_intregs;
9754 rtx exp[MAX_CLASSES];
9755 rtx ret;
9757 n = classify_argument (mode, type, regclass, 0);
9758 if (!n)
9759 return NULL;
9760 if (examine_argument (mode, type, in_return, &needed_intregs,
9761 &needed_sseregs))
9762 return NULL;
9763 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
9764 return NULL;
9766 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
9767 some less clueful developer tries to use floating-point anyway. */
9768 if (needed_sseregs && !TARGET_SSE)
9770 if (in_return)
9772 if (!issued_sse_ret_error)
9774 error ("SSE register return with SSE disabled");
9775 issued_sse_ret_error = true;
9778 else if (!issued_sse_arg_error)
9780 error ("SSE register argument with SSE disabled");
9781 issued_sse_arg_error = true;
9783 return NULL;
9786 /* Likewise, error if the ABI requires us to return values in the
9787 x87 registers and the user specified -mno-80387. */
9788 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
9789 for (i = 0; i < n; i++)
9790 if (regclass[i] == X86_64_X87_CLASS
9791 || regclass[i] == X86_64_X87UP_CLASS
9792 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
9794 if (!issued_x87_ret_error)
9796 error ("x87 register return with x87 disabled");
9797 issued_x87_ret_error = true;
9799 return NULL;
9802 /* First construct simple cases. Avoid SCmode, since we want to use
9803 single register to pass this type. */
9804 if (n == 1 && mode != SCmode)
9805 switch (regclass[0])
9807 case X86_64_INTEGER_CLASS:
9808 case X86_64_INTEGERSI_CLASS:
9809 return gen_rtx_REG (mode, intreg[0]);
9810 case X86_64_SSE_CLASS:
9811 case X86_64_SSESF_CLASS:
9812 case X86_64_SSEDF_CLASS:
9813 if (mode != BLKmode)
9814 return gen_reg_or_parallel (mode, orig_mode,
9815 SSE_REGNO (sse_regno));
9816 break;
9817 case X86_64_X87_CLASS:
9818 case X86_64_COMPLEX_X87_CLASS:
9819 return gen_rtx_REG (mode, FIRST_STACK_REG);
9820 case X86_64_NO_CLASS:
9821 /* Zero sized array, struct or class. */
9822 return NULL;
9823 default:
9824 gcc_unreachable ();
9826 if (n == 2
9827 && regclass[0] == X86_64_SSE_CLASS
9828 && regclass[1] == X86_64_SSEUP_CLASS
9829 && mode != BLKmode)
9830 return gen_reg_or_parallel (mode, orig_mode,
9831 SSE_REGNO (sse_regno));
9832 if (n == 4
9833 && regclass[0] == X86_64_SSE_CLASS
9834 && regclass[1] == X86_64_SSEUP_CLASS
9835 && regclass[2] == X86_64_SSEUP_CLASS
9836 && regclass[3] == X86_64_SSEUP_CLASS
9837 && mode != BLKmode)
9838 return gen_reg_or_parallel (mode, orig_mode,
9839 SSE_REGNO (sse_regno));
9840 if (n == 8
9841 && regclass[0] == X86_64_SSE_CLASS
9842 && regclass[1] == X86_64_SSEUP_CLASS
9843 && regclass[2] == X86_64_SSEUP_CLASS
9844 && regclass[3] == X86_64_SSEUP_CLASS
9845 && regclass[4] == X86_64_SSEUP_CLASS
9846 && regclass[5] == X86_64_SSEUP_CLASS
9847 && regclass[6] == X86_64_SSEUP_CLASS
9848 && regclass[7] == X86_64_SSEUP_CLASS
9849 && mode != BLKmode)
9850 return gen_reg_or_parallel (mode, orig_mode,
9851 SSE_REGNO (sse_regno));
9852 if (n == 2
9853 && regclass[0] == X86_64_X87_CLASS
9854 && regclass[1] == X86_64_X87UP_CLASS)
9855 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
9857 if (n == 2
9858 && regclass[0] == X86_64_INTEGER_CLASS
9859 && regclass[1] == X86_64_INTEGER_CLASS
9860 && (mode == CDImode || mode == TImode)
9861 && intreg[0] + 1 == intreg[1])
9862 return gen_rtx_REG (mode, intreg[0]);
9864 /* Otherwise figure out the entries of the PARALLEL. */
9865 for (i = 0; i < n; i++)
9867 int pos;
9869 switch (regclass[i])
9871 case X86_64_NO_CLASS:
9872 break;
9873 case X86_64_INTEGER_CLASS:
9874 case X86_64_INTEGERSI_CLASS:
9875 /* Merge TImodes on aligned occasions here too. */
9876 if (i * 8 + 8 > bytes)
9877 tmpmode
9878 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
9879 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
9880 tmpmode = SImode;
9881 else
9882 tmpmode = DImode;
9883 /* We've requested 24 bytes for which we
9884 don't have a mode. Use DImode. */
9885 if (tmpmode == BLKmode)
9886 tmpmode = DImode;
9887 exp [nexps++]
9888 = gen_rtx_EXPR_LIST (VOIDmode,
9889 gen_rtx_REG (tmpmode, *intreg),
9890 GEN_INT (i*8));
9891 intreg++;
9892 break;
9893 case X86_64_SSESF_CLASS:
9894 exp [nexps++]
9895 = gen_rtx_EXPR_LIST (VOIDmode,
9896 gen_rtx_REG (SFmode,
9897 SSE_REGNO (sse_regno)),
9898 GEN_INT (i*8));
9899 sse_regno++;
9900 break;
9901 case X86_64_SSEDF_CLASS:
9902 exp [nexps++]
9903 = gen_rtx_EXPR_LIST (VOIDmode,
9904 gen_rtx_REG (DFmode,
9905 SSE_REGNO (sse_regno)),
9906 GEN_INT (i*8));
9907 sse_regno++;
9908 break;
9909 case X86_64_SSE_CLASS:
9910 pos = i;
9911 switch (n)
9913 case 1:
9914 tmpmode = DImode;
9915 break;
9916 case 2:
9917 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
9919 tmpmode = TImode;
9920 i++;
9922 else
9923 tmpmode = DImode;
9924 break;
9925 case 4:
9926 gcc_assert (i == 0
9927 && regclass[1] == X86_64_SSEUP_CLASS
9928 && regclass[2] == X86_64_SSEUP_CLASS
9929 && regclass[3] == X86_64_SSEUP_CLASS);
9930 tmpmode = OImode;
9931 i += 3;
9932 break;
9933 case 8:
9934 gcc_assert (i == 0
9935 && regclass[1] == X86_64_SSEUP_CLASS
9936 && regclass[2] == X86_64_SSEUP_CLASS
9937 && regclass[3] == X86_64_SSEUP_CLASS
9938 && regclass[4] == X86_64_SSEUP_CLASS
9939 && regclass[5] == X86_64_SSEUP_CLASS
9940 && regclass[6] == X86_64_SSEUP_CLASS
9941 && regclass[7] == X86_64_SSEUP_CLASS);
9942 tmpmode = XImode;
9943 i += 7;
9944 break;
9945 default:
9946 gcc_unreachable ();
9948 exp [nexps++]
9949 = gen_rtx_EXPR_LIST (VOIDmode,
9950 gen_rtx_REG (tmpmode,
9951 SSE_REGNO (sse_regno)),
9952 GEN_INT (pos*8));
9953 sse_regno++;
9954 break;
9955 default:
9956 gcc_unreachable ();
9960 /* Empty aligned struct, union or class. */
9961 if (nexps == 0)
9962 return NULL;
9964 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
9965 for (i = 0; i < nexps; i++)
9966 XVECEXP (ret, 0, i) = exp [i];
9967 return ret;
9970 /* Update the data in CUM to advance over an argument of mode MODE
9971 and data type TYPE. (TYPE is null for libcalls where that information
9972 may not be available.)
9974 Return the number of integer registers advanced over. */
9976 static int
9977 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
9978 const_tree type, HOST_WIDE_INT bytes,
9979 HOST_WIDE_INT words)
9981 int res = 0;
9982 bool error_p = false;
9984 if (TARGET_IAMCU)
9986 /* Intel MCU psABI passes scalars and aggregates no larger than 8
9987 bytes in registers. */
9988 if (!VECTOR_MODE_P (mode) && bytes <= 8)
9989 goto pass_in_reg;
9990 return res;
9993 switch (mode)
9995 default:
9996 break;
9998 case E_BLKmode:
9999 if (bytes < 0)
10000 break;
10001 /* FALLTHRU */
10003 case E_DImode:
10004 case E_SImode:
10005 case E_HImode:
10006 case E_QImode:
10007 pass_in_reg:
10008 cum->words += words;
10009 cum->nregs -= words;
10010 cum->regno += words;
10011 if (cum->nregs >= 0)
10012 res = words;
10013 if (cum->nregs <= 0)
10015 cum->nregs = 0;
10016 cfun->machine->arg_reg_available = false;
10017 cum->regno = 0;
10019 break;
10021 case E_OImode:
10022 /* OImode shouldn't be used directly. */
10023 gcc_unreachable ();
10025 case E_DFmode:
10026 if (cum->float_in_sse == -1)
10027 error_p = true;
10028 if (cum->float_in_sse < 2)
10029 break;
10030 /* FALLTHRU */
10031 case E_SFmode:
10032 if (cum->float_in_sse == -1)
10033 error_p = true;
10034 if (cum->float_in_sse < 1)
10035 break;
10036 /* FALLTHRU */
10038 case E_V8SFmode:
10039 case E_V8SImode:
10040 case E_V64QImode:
10041 case E_V32HImode:
10042 case E_V16SImode:
10043 case E_V8DImode:
10044 case E_V16SFmode:
10045 case E_V8DFmode:
10046 case E_V32QImode:
10047 case E_V16HImode:
10048 case E_V4DFmode:
10049 case E_V4DImode:
10050 case E_TImode:
10051 case E_V16QImode:
10052 case E_V8HImode:
10053 case E_V4SImode:
10054 case E_V2DImode:
10055 case E_V4SFmode:
10056 case E_V2DFmode:
10057 if (!type || !AGGREGATE_TYPE_P (type))
10059 cum->sse_words += words;
10060 cum->sse_nregs -= 1;
10061 cum->sse_regno += 1;
10062 if (cum->sse_nregs <= 0)
10064 cum->sse_nregs = 0;
10065 cum->sse_regno = 0;
10068 break;
10070 case E_V8QImode:
10071 case E_V4HImode:
10072 case E_V2SImode:
10073 case E_V2SFmode:
10074 case E_V1TImode:
10075 case E_V1DImode:
10076 if (!type || !AGGREGATE_TYPE_P (type))
10078 cum->mmx_words += words;
10079 cum->mmx_nregs -= 1;
10080 cum->mmx_regno += 1;
10081 if (cum->mmx_nregs <= 0)
10083 cum->mmx_nregs = 0;
10084 cum->mmx_regno = 0;
10087 break;
10089 if (error_p)
10091 cum->float_in_sse = 0;
10092 error ("calling %qD with SSE calling convention without "
10093 "SSE/SSE2 enabled", cum->decl);
10094 sorry ("this is a GCC bug that can be worked around by adding "
10095 "attribute used to function called");
10098 return res;
10101 static int
10102 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
10103 const_tree type, HOST_WIDE_INT words, bool named)
10105 int int_nregs, sse_nregs;
10107 /* Unnamed 512- and 256-bit vector mode parameters are passed on the stack. */
10108 if (!named && (VALID_AVX512F_REG_MODE (mode)
10109 || VALID_AVX256_REG_MODE (mode)))
10110 return 0;
10112 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
10113 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
10115 cum->nregs -= int_nregs;
10116 cum->sse_nregs -= sse_nregs;
10117 cum->regno += int_nregs;
10118 cum->sse_regno += sse_nregs;
10119 return int_nregs;
10121 else
10123 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
10124 cum->words = ROUND_UP (cum->words, align);
10125 cum->words += words;
10126 return 0;
10130 static int
10131 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
10132 HOST_WIDE_INT words)
10134 /* Otherwise, this should be passed indirectly. */
10135 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
10137 cum->words += words;
10138 if (cum->nregs > 0)
10140 cum->nregs -= 1;
10141 cum->regno += 1;
10142 return 1;
10144 return 0;
10147 /* Update the data in CUM to advance over an argument of mode MODE and
10148 data type TYPE. (TYPE is null for libcalls where that information
10149 may not be available.) */
10151 static void
10152 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
10153 const_tree type, bool named)
10155 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10156 HOST_WIDE_INT bytes, words;
10157 int nregs;
10159 /* The argument of interrupt handler is a special case and is
10160 handled in ix86_function_arg. */
10161 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
10162 return;
10164 if (mode == BLKmode)
10165 bytes = int_size_in_bytes (type);
10166 else
10167 bytes = GET_MODE_SIZE (mode);
10168 words = CEIL (bytes, UNITS_PER_WORD);
10170 if (type)
10171 mode = type_natural_mode (type, NULL, false);
10173 if ((type && POINTER_BOUNDS_TYPE_P (type))
10174 || POINTER_BOUNDS_MODE_P (mode))
10176 /* If we pass bounds in the BT then just update the remaining bounds count. */
10177 if (cum->bnds_in_bt)
10179 cum->bnds_in_bt--;
10180 return;
10183 /* Update the remaining number of bounds to force. */
10184 if (cum->force_bnd_pass)
10185 cum->force_bnd_pass--;
10187 cum->bnd_regno++;
10189 return;
10192 /* The first arg not going to Bounds Tables resets this counter. */
10193 cum->bnds_in_bt = 0;
10194 /* For unnamed args we always pass bounds to avoid a bounds mess when
10195 the passed and received types do not match. If bounds do not follow an
10196 unnamed arg, still pretend the required number of bounds were passed. */
10197 if (cum->force_bnd_pass)
10199 cum->bnd_regno += cum->force_bnd_pass;
10200 cum->force_bnd_pass = 0;
10203 if (TARGET_64BIT)
10205 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10207 if (call_abi == MS_ABI)
10208 nregs = function_arg_advance_ms_64 (cum, bytes, words);
10209 else
10210 nregs = function_arg_advance_64 (cum, mode, type, words, named);
10212 else
10213 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
10215 /* For stdarg we expect bounds to be passed for each value passed
10216 in a register. */
10217 if (cum->stdarg)
10218 cum->force_bnd_pass = nregs;
10219 /* For pointers passed in memory we expect bounds passed in Bounds
10220 Table. */
10221 if (!nregs)
10223 /* Track if there are outgoing arguments on stack. */
10224 if (cum->caller)
10225 cfun->machine->outgoing_args_on_stack = true;
10227 cum->bnds_in_bt = chkp_type_bounds_count (type);
10231 /* Define where to put the arguments to a function.
10232 Value is zero to push the argument on the stack,
10233 or a hard register in which to store the argument.
10235 MODE is the argument's machine mode.
10236 TYPE is the data type of the argument (as a tree).
10237 This is null for libcalls where that information may
10238 not be available.
10239 CUM is a variable of type CUMULATIVE_ARGS which gives info about
10240 the preceding args and about the function being called.
10241 NAMED is nonzero if this argument is a named parameter
10242 (otherwise it is an extra parameter matching an ellipsis). */
10244 static rtx
10245 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
10246 machine_mode orig_mode, const_tree type,
10247 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
10249 bool error_p = false;
10251 /* Avoid the AL settings for the Unix64 ABI. */
10252 if (mode == VOIDmode)
10253 return constm1_rtx;
10255 if (TARGET_IAMCU)
10257 /* Intel MCU psABI passes scalars and aggregates no larger than 8
10258 bytes in registers. */
10259 if (!VECTOR_MODE_P (mode) && bytes <= 8)
10260 goto pass_in_reg;
10261 return NULL_RTX;
10264 switch (mode)
10266 default:
10267 break;
10269 case E_BLKmode:
10270 if (bytes < 0)
10271 break;
10272 /* FALLTHRU */
10273 case E_DImode:
10274 case E_SImode:
10275 case E_HImode:
10276 case E_QImode:
10277 pass_in_reg:
10278 if (words <= cum->nregs)
10280 int regno = cum->regno;
10282 /* Fastcall allocates the first two DWORD (SImode) or
10283 smaller arguments to ECX and EDX if they aren't
10284 aggregate types. */
10285 if (cum->fastcall)
10287 if (mode == BLKmode
10288 || mode == DImode
10289 || (type && AGGREGATE_TYPE_P (type)))
10290 break;
10292 /* ECX not EAX is the first allocated register. */
10293 if (regno == AX_REG)
10294 regno = CX_REG;
10296 return gen_rtx_REG (mode, regno);
10298 break;
10300 case E_DFmode:
10301 if (cum->float_in_sse == -1)
10302 error_p = true;
10303 if (cum->float_in_sse < 2)
10304 break;
10305 /* FALLTHRU */
10306 case E_SFmode:
10307 if (cum->float_in_sse == -1)
10308 error_p = true;
10309 if (cum->float_in_sse < 1)
10310 break;
10311 /* FALLTHRU */
10312 case E_TImode:
10313 /* In 32bit, we pass TImode in xmm registers. */
10314 case E_V16QImode:
10315 case E_V8HImode:
10316 case E_V4SImode:
10317 case E_V2DImode:
10318 case E_V4SFmode:
10319 case E_V2DFmode:
10320 if (!type || !AGGREGATE_TYPE_P (type))
10322 if (cum->sse_nregs)
10323 return gen_reg_or_parallel (mode, orig_mode,
10324 cum->sse_regno + FIRST_SSE_REG);
10326 break;
10328 case E_OImode:
10329 case E_XImode:
10330 /* OImode and XImode shouldn't be used directly. */
10331 gcc_unreachable ();
10333 case E_V64QImode:
10334 case E_V32HImode:
10335 case E_V16SImode:
10336 case E_V8DImode:
10337 case E_V16SFmode:
10338 case E_V8DFmode:
10339 case E_V8SFmode:
10340 case E_V8SImode:
10341 case E_V32QImode:
10342 case E_V16HImode:
10343 case E_V4DFmode:
10344 case E_V4DImode:
10345 if (!type || !AGGREGATE_TYPE_P (type))
10347 if (cum->sse_nregs)
10348 return gen_reg_or_parallel (mode, orig_mode,
10349 cum->sse_regno + FIRST_SSE_REG);
10351 break;
10353 case E_V8QImode:
10354 case E_V4HImode:
10355 case E_V2SImode:
10356 case E_V2SFmode:
10357 case E_V1TImode:
10358 case E_V1DImode:
10359 if (!type || !AGGREGATE_TYPE_P (type))
10361 if (cum->mmx_nregs)
10362 return gen_reg_or_parallel (mode, orig_mode,
10363 cum->mmx_regno + FIRST_MMX_REG);
10365 break;
10367 if (error_p)
10369 cum->float_in_sse = 0;
10370 error ("calling %qD with SSE calling convention without "
10371 "SSE/SSE2 enabled", cum->decl);
10372 sorry ("this is a GCC bug that can be worked around by adding "
10373 "attribute used to function called");
10376 return NULL_RTX;
10379 static rtx
10380 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
10381 machine_mode orig_mode, const_tree type, bool named)
10383 /* Handle a hidden AL argument containing number of registers
10384 for varargs x86-64 functions. */
10385 if (mode == VOIDmode)
10386 return GEN_INT (cum->maybe_vaarg
10387 ? (cum->sse_nregs < 0
10388 ? X86_64_SSE_REGPARM_MAX
10389 : cum->sse_regno)
10390 : -1);
10392 switch (mode)
10394 default:
10395 break;
10397 case E_V8SFmode:
10398 case E_V8SImode:
10399 case E_V32QImode:
10400 case E_V16HImode:
10401 case E_V4DFmode:
10402 case E_V4DImode:
10403 case E_V16SFmode:
10404 case E_V16SImode:
10405 case E_V64QImode:
10406 case E_V32HImode:
10407 case E_V8DFmode:
10408 case E_V8DImode:
10409 /* Unnamed 256- and 512-bit vector mode parameters are passed on the stack. */
10410 if (!named)
10411 return NULL;
10412 break;
10415 return construct_container (mode, orig_mode, type, 0, cum->nregs,
10416 cum->sse_nregs,
10417 &x86_64_int_parameter_registers [cum->regno],
10418 cum->sse_regno);
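/* The VOIDmode case above is what lets a variadic SysV call such as

       printf ("%f\n", 3.14);

   load AL with the number of vector registers actually used (here 1, for
   the double in XMM0), so the callee's va_start prologue can skip saving
   the unused XMM registers.  */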
10421 static rtx
10422 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
10423 machine_mode orig_mode, bool named,
10424 HOST_WIDE_INT bytes)
10426 unsigned int regno;
10428 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
10429 We use the value -2 to specify that the current function call is MSABI. */
10430 if (mode == VOIDmode)
10431 return GEN_INT (-2);
10433 /* If we've run out of registers, it goes on the stack. */
10434 if (cum->nregs == 0)
10435 return NULL_RTX;
10437 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
10439 /* Only floating point modes are passed in anything but integer regs. */
10440 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
10442 if (named)
10443 regno = cum->regno + FIRST_SSE_REG;
10444 else
10446 rtx t1, t2;
10448 /* Unnamed floating parameters are passed in both the
10449 SSE and integer registers. */
10450 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
10451 t2 = gen_rtx_REG (mode, regno);
10452 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
10453 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
10454 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
10457 /* Handle aggregated types passed in register. */
10458 if (orig_mode == BLKmode)
10460 if (bytes > 0 && bytes <= 8)
10461 mode = (bytes > 4 ? DImode : SImode);
10462 if (mode == BLKmode)
10463 mode = DImode;
10466 return gen_reg_or_parallel (mode, orig_mode, regno);
10469 /* Return where to put the arguments to a function.
10470 Return zero to push the argument on the stack, or a hard register in which to store the argument.
10472 MODE is the argument's machine mode. TYPE is the data type of the
10473 argument. It is null for libcalls where that information may not be
10474 available. CUM gives information about the preceding args and about
10475 the function being called. NAMED is nonzero if this argument is a
10476 named parameter (otherwise it is an extra parameter matching an
10477 ellipsis). */
10479 static rtx
10480 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
10481 const_tree type, bool named)
10483 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10484 machine_mode mode = omode;
10485 HOST_WIDE_INT bytes, words;
10486 rtx arg;
10488 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
10490 gcc_assert (type != NULL_TREE);
10491 if (POINTER_TYPE_P (type))
10493 /* This is the pointer argument. */
10494 gcc_assert (TYPE_MODE (type) == Pmode);
10495 /* It is at -WORD(AP) in the current frame in interrupt and
10496 exception handlers. */
10497 arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD);
10499 else
10501 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
10502 && TREE_CODE (type) == INTEGER_TYPE
10503 && TYPE_MODE (type) == word_mode);
10504 /* The error code is the word-mode integer argument at
10505 -2 * WORD(AP) in the current frame of the exception
10506 handler. */
10507 arg = gen_rtx_MEM (word_mode,
10508 plus_constant (Pmode,
10509 arg_pointer_rtx,
10510 -2 * UNITS_PER_WORD));
10512 return arg;
10515 /* All pointer bounds arguments are handled separately here. */
10516 if ((type && POINTER_BOUNDS_TYPE_P (type))
10517 || POINTER_BOUNDS_MODE_P (mode))
10519 /* Return NULL if bounds are forced to go in Bounds Table. */
10520 if (cum->bnds_in_bt)
10521 arg = NULL;
10522 /* Return the next available bound reg if any. */
10523 else if (cum->bnd_regno <= LAST_BND_REG)
10524 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
10525 /* Return the next special slot number otherwise. */
10526 else
10527 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
10529 return arg;
10532 if (mode == BLKmode)
10533 bytes = int_size_in_bytes (type);
10534 else
10535 bytes = GET_MODE_SIZE (mode);
10536 words = CEIL (bytes, UNITS_PER_WORD);
10538 /* To simplify the code below, represent vector types with a vector mode
10539 even if MMX/SSE are not active. */
10540 if (type && TREE_CODE (type) == VECTOR_TYPE)
10541 mode = type_natural_mode (type, cum, false);
10543 if (TARGET_64BIT)
10545 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10547 if (call_abi == MS_ABI)
10548 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
10549 else
10550 arg = function_arg_64 (cum, mode, omode, type, named);
10552 else
10553 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
10555 /* Track if there are outgoing arguments on stack. */
10556 if (arg == NULL_RTX && cum->caller)
10557 cfun->machine->outgoing_args_on_stack = true;
10559 return arg;
10562 /* A C expression that indicates when an argument must be passed by
10563 reference. If nonzero for an argument, a copy of that argument is
10564 made in memory and a pointer to the argument is passed instead of
10565 the argument itself. The pointer is passed in whatever way is
10566 appropriate for passing a pointer to that type. */
10568 static bool
10569 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
10570 const_tree type, bool)
10572 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10574 /* Bounds are never passed by reference. */
10575 if ((type && POINTER_BOUNDS_TYPE_P (type))
10576 || POINTER_BOUNDS_MODE_P (mode))
10577 return false;
10579 if (TARGET_64BIT)
10581 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
10583 /* See Windows x64 Software Convention. */
10584 if (call_abi == MS_ABI)
10586 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
10588 if (type)
10590 /* Arrays are passed by reference. */
10591 if (TREE_CODE (type) == ARRAY_TYPE)
10592 return true;
10594 if (RECORD_OR_UNION_TYPE_P (type))
10596 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
10597 are passed by reference. */
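/* Illustrative examples of the checks below: a 12-byte struct is passed
   by reference under the MS ABI, while an 8-byte struct travels by
   value in a single integer register; arrays and variable-sized types
   (handled above and below) are always passed by reference. */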
10598 msize = int_size_in_bytes (type);
10602 /* __m128 is passed by reference. */
10603 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
10605 else if (type && int_size_in_bytes (type) == -1)
10606 return true;
10609 return false;
10612 /* Return true when TYPE should be 128bit aligned for 32bit argument
10613 passing ABI. XXX: This function is obsolete and is only used for
10614 checking psABI compatibility with previous versions of GCC. */
10616 static bool
10617 ix86_compat_aligned_value_p (const_tree type)
10619 machine_mode mode = TYPE_MODE (type);
10620 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
10621 || mode == TDmode
10622 || mode == TFmode
10623 || mode == TCmode)
10624 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
10625 return true;
10626 if (TYPE_ALIGN (type) < 128)
10627 return false;
10629 if (AGGREGATE_TYPE_P (type))
10631 /* Walk the aggregates recursively. */
10632 switch (TREE_CODE (type))
10634 case RECORD_TYPE:
10635 case UNION_TYPE:
10636 case QUAL_UNION_TYPE:
10638 tree field;
10640 /* Walk all the structure fields. */
10641 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
10643 if (TREE_CODE (field) == FIELD_DECL
10644 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
10645 return true;
10647 break;
10650 case ARRAY_TYPE:
10651 /* Just for use if some languages pass arrays by value. */
10652 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
10653 return true;
10654 break;
10656 default:
10657 gcc_unreachable ();
10660 return false;
10663 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
10664 XXX: This function is obsolete and is only used for checking psABI
10665 compatibility with previous versions of GCC. */
10667 static unsigned int
10668 ix86_compat_function_arg_boundary (machine_mode mode,
10669 const_tree type, unsigned int align)
10671 /* In 32bit, only _Decimal128 and __float128 are aligned to their
10672 natural boundaries. */
10673 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
10675 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
10676 make an exception for SSE modes since these require 128bit
10677 alignment.
10679 The handling here differs from field_alignment. ICC aligns MMX
10680 arguments to 4 byte boundaries, while structure fields are aligned
10681 to 8 byte boundaries. */
10682 if (!type)
10684 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
10685 align = PARM_BOUNDARY;
10687 else
10689 if (!ix86_compat_aligned_value_p (type))
10690 align = PARM_BOUNDARY;
10693 if (align > BIGGEST_ALIGNMENT)
10694 align = BIGGEST_ALIGNMENT;
10695 return align;
10698 /* Return true when TYPE should be 128bit aligned for 32bit argument
10699 passing ABI. */
10701 static bool
10702 ix86_contains_aligned_value_p (const_tree type)
10704 machine_mode mode = TYPE_MODE (type);
10706 if (mode == XFmode || mode == XCmode)
10707 return false;
10709 if (TYPE_ALIGN (type) < 128)
10710 return false;
10712 if (AGGREGATE_TYPE_P (type))
10714 /* Walk the aggregates recursively. */
10715 switch (TREE_CODE (type))
10717 case RECORD_TYPE:
10718 case UNION_TYPE:
10719 case QUAL_UNION_TYPE:
10721 tree field;
10723 /* Walk all the structure fields. */
10724 for (field = TYPE_FIELDS (type);
10725 field;
10726 field = DECL_CHAIN (field))
10728 if (TREE_CODE (field) == FIELD_DECL
10729 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
10730 return true;
10732 break;
10735 case ARRAY_TYPE:
10736 /* Just for use if some languages pass arrays by value. */
10737 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
10738 return true;
10739 break;
10741 default:
10742 gcc_unreachable ();
10745 else
10746 return TYPE_ALIGN (type) >= 128;
10748 return false;
10751 /* Gives the alignment boundary, in bits, of an argument with the
10752 specified mode and type. */
10754 static unsigned int
10755 ix86_function_arg_boundary (machine_mode mode, const_tree type)
10757 unsigned int align;
10758 if (type)
10760 /* Since the main variant type is used for the call, convert the type
10761 to its main variant. */
10762 type = TYPE_MAIN_VARIANT (type);
10763 align = TYPE_ALIGN (type);
10765 else
10766 align = GET_MODE_ALIGNMENT (mode);
10767 if (align < PARM_BOUNDARY)
10768 align = PARM_BOUNDARY;
10769 else
10771 static bool warned;
10772 unsigned int saved_align = align;
10774 if (!TARGET_64BIT)
10776 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
10777 if (!type)
10779 if (mode == XFmode || mode == XCmode)
10780 align = PARM_BOUNDARY;
10782 else if (!ix86_contains_aligned_value_p (type))
10783 align = PARM_BOUNDARY;
10785 if (align < 128)
10786 align = PARM_BOUNDARY;
10789 if (warn_psabi
10790 && !warned
10791 && align != ix86_compat_function_arg_boundary (mode, type,
10792 saved_align))
10794 warned = true;
10795 inform (input_location,
10796 "The ABI for passing parameters with %d-byte"
10797 " alignment has changed in GCC 4.6",
10798 align / BITS_PER_UNIT);
10802 return align;
10805 /* Return true if N is a possible register number of function value. */
10807 static bool
10808 ix86_function_value_regno_p (const unsigned int regno)
10810 switch (regno)
10812 case AX_REG:
10813 return true;
10814 case DX_REG:
10815 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
10816 case DI_REG:
10817 case SI_REG:
10818 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
10820 case BND0_REG:
10821 case BND1_REG:
10822 return chkp_function_instrumented_p (current_function_decl);
10824 /* Complex values are returned in %st(0)/%st(1) pair. */
10825 case ST0_REG:
10826 case ST1_REG:
10827 /* TODO: The function should depend on current function ABI but
10828 builtins.c would need updating then. Therefore we use the
10829 default ABI. */
10830 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
10831 return false;
10832 return TARGET_FLOAT_RETURNS_IN_80387;
10834 /* Complex values are returned in %xmm0/%xmm1 pair. */
10835 case XMM0_REG:
10836 case XMM1_REG:
10837 return TARGET_SSE;
10839 case MM0_REG:
10840 if (TARGET_MACHO || TARGET_64BIT)
10841 return false;
10842 return TARGET_MMX;
10845 return false;
10848 /* Define how to find the value returned by a function.
10849 VALTYPE is the data type of the value (as a tree).
10850 If the precise function being called is known, FUNC is its FUNCTION_DECL;
10851 otherwise, FUNC is 0. */
10853 static rtx
10854 function_value_32 (machine_mode orig_mode, machine_mode mode,
10855 const_tree fntype, const_tree fn)
10857 unsigned int regno;
10859 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
10860 we normally prevent this case when mmx is not available. However
10861 some ABIs may require the result to be returned like DImode. */
10862 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
10863 regno = FIRST_MMX_REG;
10865 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
10866 we prevent this case when sse is not available. However some ABIs
10867 may require the result to be returned like integer TImode. */
10868 else if (mode == TImode
10869 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
10870 regno = FIRST_SSE_REG;
10872 /* 32-byte vector modes in %ymm0. */
10873 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
10874 regno = FIRST_SSE_REG;
10876 /* 64-byte vector modes in %zmm0. */
10877 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
10878 regno = FIRST_SSE_REG;
10880 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
10881 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
10882 regno = FIRST_FLOAT_REG;
10883 else
10884 /* Most things go in %eax. */
10885 regno = AX_REG;
10887 /* Override FP return register with %xmm0 for local functions when
10888 SSE math is enabled or for functions with sseregparm attribute. */
10889 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
10891 int sse_level = ix86_function_sseregparm (fntype, fn, false);
10892 if (sse_level == -1)
10894 error ("calling %qD with SSE calling convention without "
10895 "SSE/SSE2 enabled", fn);
10896 sorry ("this is a GCC bug that can be worked around by adding "
10897 "attribute used to function called");
10899 else if ((sse_level >= 1 && mode == SFmode)
10900 || (sse_level == 2 && mode == DFmode))
10901 regno = FIRST_SSE_REG;
10904 /* OImode shouldn't be used directly. */
10905 gcc_assert (mode != OImode);
10907 return gen_rtx_REG (orig_mode, regno);
10910 static rtx
10911 function_value_64 (machine_mode orig_mode, machine_mode mode,
10912 const_tree valtype)
10914 rtx ret;
10916 /* Handle libcalls, which don't provide a type node. */
10917 if (valtype == NULL)
10919 unsigned int regno;
10921 switch (mode)
10923 case E_SFmode:
10924 case E_SCmode:
10925 case E_DFmode:
10926 case E_DCmode:
10927 case E_TFmode:
10928 case E_SDmode:
10929 case E_DDmode:
10930 case E_TDmode:
10931 regno = FIRST_SSE_REG;
10932 break;
10933 case E_XFmode:
10934 case E_XCmode:
10935 regno = FIRST_FLOAT_REG;
10936 break;
10937 case E_TCmode:
10938 return NULL;
10939 default:
10940 regno = AX_REG;
10943 return gen_rtx_REG (mode, regno);
10945 else if (POINTER_TYPE_P (valtype))
10947 /* Pointers are always returned in word_mode. */
10948 mode = word_mode;
10951 ret = construct_container (mode, orig_mode, valtype, 1,
10952 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
10953 x86_64_int_return_registers, 0);
10955 /* For zero-sized structures, construct_container returns NULL, but we
10956 need to keep the rest of the compiler happy by returning a meaningful value. */
10957 if (!ret)
10958 ret = gen_rtx_REG (orig_mode, AX_REG);
10960 return ret;
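/* Return values under the Microsoft x64 calling convention. A sketch of
   the rules implemented below: 16-byte vector or integer values and
   scalar SFmode/DFmode values come back in %xmm0 when SSE is enabled;
   everything else defaults to %rax. */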
10963 static rtx
10964 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
10965 const_tree valtype)
10967 unsigned int regno = AX_REG;
10969 if (TARGET_SSE)
10971 switch (GET_MODE_SIZE (mode))
10973 case 16:
10974 if (valtype != NULL_TREE
10975 && !VECTOR_INTEGER_TYPE_P (valtype)
10977 && !INTEGRAL_TYPE_P (valtype)
10978 && !VECTOR_FLOAT_TYPE_P (valtype))
10979 break;
10980 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
10981 && !COMPLEX_MODE_P (mode))
10982 regno = FIRST_SSE_REG;
10983 break;
10984 case 8:
10985 case 4:
10986 if (mode == SFmode || mode == DFmode)
10987 regno = FIRST_SSE_REG;
10988 break;
10989 default:
10990 break;
10993 return gen_rtx_REG (orig_mode, regno);
10996 static rtx
10997 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
10998 machine_mode orig_mode, machine_mode mode)
11000 const_tree fn, fntype;
11002 fn = NULL_TREE;
11003 if (fntype_or_decl && DECL_P (fntype_or_decl))
11004 fn = fntype_or_decl;
11005 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
11007 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
11008 || POINTER_BOUNDS_MODE_P (mode))
11009 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
11010 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
11011 return function_value_ms_64 (orig_mode, mode, valtype);
11012 else if (TARGET_64BIT)
11013 return function_value_64 (orig_mode, mode, valtype);
11014 else
11015 return function_value_32 (orig_mode, mode, fntype, fn);
11018 static rtx
11019 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
11021 machine_mode mode, orig_mode;
11023 orig_mode = TYPE_MODE (valtype);
11024 mode = type_natural_mode (valtype, NULL, true);
11025 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
11028 /* Return an RTX representing a place where a function returns
11029 or receives pointer bounds or NULL if no bounds are returned.
11031 VALTYPE is a data type of a value returned by the function.
11033 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
11034 or FUNCTION_TYPE of the function.
11036 If OUTGOING is false, return a place in which the caller will
11037 see the return value. Otherwise, return a place where a
11038 function returns a value. */
11040 static rtx
11041 ix86_function_value_bounds (const_tree valtype,
11042 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
11043 bool outgoing ATTRIBUTE_UNUSED)
11045 rtx res = NULL_RTX;
11047 if (BOUNDED_TYPE_P (valtype))
11048 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
11049 else if (chkp_type_has_pointer (valtype))
11051 bitmap slots;
11052 rtx bounds[2];
11053 bitmap_iterator bi;
11054 unsigned i, bnd_no = 0;
11056 bitmap_obstack_initialize (NULL);
11057 slots = BITMAP_ALLOC (NULL);
11058 chkp_find_bound_slots (valtype, slots);
11060 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
11062 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
11063 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
11064 gcc_assert (bnd_no < 2);
11065 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
11068 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
11070 BITMAP_FREE (slots);
11071 bitmap_obstack_release (NULL);
11073 else
11074 res = NULL_RTX;
11076 return res;
11079 /* Pointer function arguments and return values are promoted to
11080 word_mode for normal functions. */
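/* For example, with the x32 ABI pointers are 32 bits (SImode) while
   word_mode is DImode, so pointer arguments and return values of normal
   functions are widened to 64 bits with zero extension, as
   POINTERS_EXTEND_UNSIGNED below indicates. This is a sketch of the
   effect, not an exhaustive description. */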
11082 static machine_mode
11083 ix86_promote_function_mode (const_tree type, machine_mode mode,
11084 int *punsignedp, const_tree fntype,
11085 int for_return)
11087 if (cfun->machine->func_type == TYPE_NORMAL
11088 && type != NULL_TREE
11089 && POINTER_TYPE_P (type))
11091 *punsignedp = POINTERS_EXTEND_UNSIGNED;
11092 return word_mode;
11094 return default_promote_function_mode (type, mode, punsignedp, fntype,
11095 for_return);
11098 /* Return true if a structure, union or array with MODE containing FIELD
11099 should be accessed using BLKmode. */
11101 static bool
11102 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
11104 /* Union with XFmode must be in BLKmode. */
11105 return (mode == XFmode
11106 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
11107 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
11110 static rtx
11111 ix86_libcall_value (machine_mode mode)
11113 return ix86_function_value_1 (NULL, NULL, mode, mode);
11116 /* Return true iff type is returned in memory. */
11118 static bool
11119 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
11121 #ifdef SUBTARGET_RETURN_IN_MEMORY
11122 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
11123 #else
11124 const machine_mode mode = type_natural_mode (type, NULL, true);
11125 HOST_WIDE_INT size;
11127 if (POINTER_BOUNDS_TYPE_P (type))
11128 return false;
11130 if (TARGET_64BIT)
11132 if (ix86_function_type_abi (fntype) == MS_ABI)
11134 size = int_size_in_bytes (type);
11136 /* __m128 is returned in xmm0. */
11137 if ((!type || VECTOR_INTEGER_TYPE_P (type)
11138 || INTEGRAL_TYPE_P (type)
11139 || VECTOR_FLOAT_TYPE_P (type))
11140 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
11141 && !COMPLEX_MODE_P (mode)
11142 && (GET_MODE_SIZE (mode) == 16 || size == 16))
11143 return false;
11145 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
11146 return size != 1 && size != 2 && size != 4 && size != 8;
11148 else
11150 int needed_intregs, needed_sseregs;
11152 return examine_argument (mode, type, 1,
11153 &needed_intregs, &needed_sseregs);
11156 else
11158 size = int_size_in_bytes (type);
11160 /* Intel MCU psABI returns scalars and aggregates no larger than 8
11161 bytes in registers. */
11162 if (TARGET_IAMCU)
11163 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
11165 if (mode == BLKmode)
11166 return true;
11168 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
11169 return false;
11171 if (VECTOR_MODE_P (mode) || mode == TImode)
11173 /* User-created vectors small enough to fit in EAX. */
11174 if (size < 8)
11175 return false;
11177 /* Unless the ABI prescribes otherwise,
11178 MMX/3dNow values are returned in MM0 if available. */
11180 if (size == 8)
11181 return TARGET_VECT8_RETURNS || !TARGET_MMX;
11183 /* SSE values are returned in XMM0 if available. */
11184 if (size == 16)
11185 return !TARGET_SSE;
11187 /* AVX values are returned in YMM0 if available. */
11188 if (size == 32)
11189 return !TARGET_AVX;
11191 /* AVX512F values are returned in ZMM0 if available. */
11192 if (size == 64)
11193 return !TARGET_AVX512F;
11196 if (mode == XFmode)
11197 return false;
11199 if (size > 12)
11200 return true;
11202 /* OImode shouldn't be used directly. */
11203 gcc_assert (mode != OImode);
11205 return false;
11207 #endif
11211 /* Create the va_list data type. */
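/* The record built below corresponds roughly to the following C
   declaration (a sketch of the SysV AMD64 va_list layout, shown for
   reference only):

     typedef struct {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __va_list_tag;
     typedef __va_list_tag __builtin_va_list[1];
*/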
11213 static tree
11214 ix86_build_builtin_va_list_64 (void)
11216 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
11218 record = lang_hooks.types.make_type (RECORD_TYPE);
11219 type_decl = build_decl (BUILTINS_LOCATION,
11220 TYPE_DECL, get_identifier ("__va_list_tag"), record);
11222 f_gpr = build_decl (BUILTINS_LOCATION,
11223 FIELD_DECL, get_identifier ("gp_offset"),
11224 unsigned_type_node);
11225 f_fpr = build_decl (BUILTINS_LOCATION,
11226 FIELD_DECL, get_identifier ("fp_offset"),
11227 unsigned_type_node);
11228 f_ovf = build_decl (BUILTINS_LOCATION,
11229 FIELD_DECL, get_identifier ("overflow_arg_area"),
11230 ptr_type_node);
11231 f_sav = build_decl (BUILTINS_LOCATION,
11232 FIELD_DECL, get_identifier ("reg_save_area"),
11233 ptr_type_node);
11235 va_list_gpr_counter_field = f_gpr;
11236 va_list_fpr_counter_field = f_fpr;
11238 DECL_FIELD_CONTEXT (f_gpr) = record;
11239 DECL_FIELD_CONTEXT (f_fpr) = record;
11240 DECL_FIELD_CONTEXT (f_ovf) = record;
11241 DECL_FIELD_CONTEXT (f_sav) = record;
11243 TYPE_STUB_DECL (record) = type_decl;
11244 TYPE_NAME (record) = type_decl;
11245 TYPE_FIELDS (record) = f_gpr;
11246 DECL_CHAIN (f_gpr) = f_fpr;
11247 DECL_CHAIN (f_fpr) = f_ovf;
11248 DECL_CHAIN (f_ovf) = f_sav;
11250 layout_type (record);
11252 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
11253 NULL_TREE, TYPE_ATTRIBUTES (record));
11255 /* The correct type is an array type of one element. */
11256 return build_array_type (record, build_index_type (size_zero_node));
11259 /* Setup the builtin va_list data type and for 64-bit the additional
11260 calling convention specific va_list data types. */
11262 static tree
11263 ix86_build_builtin_va_list (void)
11265 if (TARGET_64BIT)
11267 /* Initialize ABI specific va_list builtin types.
11269 In lto1, we can encounter two va_list types:
11270 - one as a result of the type-merge across TUs, and
11271 - the one constructed here.
11272 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
11273 a type identity check in canonical_va_list_type based on
11274 TYPE_MAIN_VARIANT (which we used to have) will not work.
11275 Instead, we tag each va_list_type_node with its unique attribute, and
11276 look for the attribute in the type identity check in
11277 canonical_va_list_type.
11279 Tagging sysv_va_list_type_node directly with the attribute is
11280 problematic since it's an array of one record, which will degrade into a
11281 pointer to record when used as a parameter (see build_va_arg comments for
11282 an example), dropping the attribute in the process. So we tag the
11283 record instead. */
11285 /* For SYSV_ABI we use an array of one record. */
11286 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
11288 /* For MS_ABI we use plain pointer to argument area. */
11289 tree char_ptr_type = build_pointer_type (char_type_node);
11290 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
11291 TYPE_ATTRIBUTES (char_ptr_type));
11292 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
11294 return ((ix86_abi == MS_ABI)
11295 ? ms_va_list_type_node
11296 : sysv_va_list_type_node);
11298 else
11300 /* For i386 we use plain pointer to argument area. */
11301 return build_pointer_type (char_type_node);
11305 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
11307 static void
11308 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
11310 rtx save_area, mem;
11311 alias_set_type set;
11312 int i, max;
11314 /* GPR size of varargs save area. */
11315 if (cfun->va_list_gpr_size)
11316 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
11317 else
11318 ix86_varargs_gpr_size = 0;
11320 /* FPR size of varargs save area. We don't need it if we don't pass
11321 anything in SSE registers. */
11322 if (TARGET_SSE && cfun->va_list_fpr_size)
11323 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
11324 else
11325 ix86_varargs_fpr_size = 0;
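/* Illustrative layout of the save area built below, assuming the default
   6 integer and 8 SSE argument registers: bytes 0..47 hold %rdi, %rsi,
   %rdx, %rcx, %r8 and %r9 (8 bytes each), and bytes 48..175 hold
   %xmm0..%xmm7 (16 bytes each). */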
11327 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
11328 return;
11330 save_area = frame_pointer_rtx;
11331 set = get_varargs_alias_set ();
11333 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
11334 if (max > X86_64_REGPARM_MAX)
11335 max = X86_64_REGPARM_MAX;
11337 for (i = cum->regno; i < max; i++)
11339 mem = gen_rtx_MEM (word_mode,
11340 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
11341 MEM_NOTRAP_P (mem) = 1;
11342 set_mem_alias_set (mem, set);
11343 emit_move_insn (mem,
11344 gen_rtx_REG (word_mode,
11345 x86_64_int_parameter_registers[i]));
11348 if (ix86_varargs_fpr_size)
11350 machine_mode smode;
11351 rtx_code_label *label;
11352 rtx test;
11354 /* Now emit code to save SSE registers. The AX parameter contains number
11355 of SSE parameter registers used to call this function, though all we
11356 actually check here is the zero/non-zero status. */
11358 label = gen_label_rtx ();
11359 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
11360 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
11361 label));
11363 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
11364 we used movdqa (i.e. TImode) instead? Perhaps even better would
11365 be if we could determine the real mode of the data, via a hook
11366 into pass_stdarg. Ignore all that for now. */
11367 smode = V4SFmode;
11368 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
11369 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
11371 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
11372 if (max > X86_64_SSE_REGPARM_MAX)
11373 max = X86_64_SSE_REGPARM_MAX;
11375 for (i = cum->sse_regno; i < max; ++i)
11377 mem = plus_constant (Pmode, save_area,
11378 i * 16 + ix86_varargs_gpr_size);
11379 mem = gen_rtx_MEM (smode, mem);
11380 MEM_NOTRAP_P (mem) = 1;
11381 set_mem_alias_set (mem, set);
11382 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
11384 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
11387 emit_label (label);
11391 static void
11392 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
11394 alias_set_type set = get_varargs_alias_set ();
11395 int i;
11397 /* Reset to zero, as there might be a sysv vaarg used
11398 before. */
11399 ix86_varargs_gpr_size = 0;
11400 ix86_varargs_fpr_size = 0;
11402 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
11404 rtx reg, mem;
11406 mem = gen_rtx_MEM (Pmode,
11407 plus_constant (Pmode, virtual_incoming_args_rtx,
11408 i * UNITS_PER_WORD));
11409 MEM_NOTRAP_P (mem) = 1;
11410 set_mem_alias_set (mem, set);
11412 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
11413 emit_move_insn (mem, reg);
11417 static void
11418 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
11419 tree type, int *, int no_rtl)
11421 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
11422 CUMULATIVE_ARGS next_cum;
11423 tree fntype;
11425 /* This argument doesn't appear to be used anymore. Which is good,
11426 because the old code here didn't suppress rtl generation. */
11427 gcc_assert (!no_rtl);
11429 if (!TARGET_64BIT)
11430 return;
11432 fntype = TREE_TYPE (current_function_decl);
11434 /* For varargs, we do not want to skip the dummy va_dcl argument.
11435 For stdargs, we do want to skip the last named argument. */
11436 next_cum = *cum;
11437 if (stdarg_p (fntype))
11438 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
11439 true);
11441 if (cum->call_abi == MS_ABI)
11442 setup_incoming_varargs_ms_64 (&next_cum);
11443 else
11444 setup_incoming_varargs_64 (&next_cum);
11447 static void
11448 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
11449 machine_mode mode,
11450 tree type,
11451 int *pretend_size ATTRIBUTE_UNUSED,
11452 int no_rtl)
11454 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
11455 CUMULATIVE_ARGS next_cum;
11456 tree fntype;
11457 rtx save_area;
11458 int bnd_reg, i, max;
11460 gcc_assert (!no_rtl);
11462 /* Do nothing if we use plain pointer to argument area. */
11463 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
11464 return;
11466 fntype = TREE_TYPE (current_function_decl);
11468 /* For varargs, we do not want to skip the dummy va_dcl argument.
11469 For stdargs, we do want to skip the last named argument. */
11470 next_cum = *cum;
11471 if (stdarg_p (fntype))
11472 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
11473 true);
11474 save_area = frame_pointer_rtx;
11476 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
11477 if (max > X86_64_REGPARM_MAX)
11478 max = X86_64_REGPARM_MAX;
11480 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
11481 if (chkp_function_instrumented_p (current_function_decl))
11482 for (i = cum->regno; i < max; i++)
11484 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
11485 rtx ptr = gen_rtx_REG (Pmode,
11486 x86_64_int_parameter_registers[i]);
11487 rtx bounds;
11489 if (bnd_reg <= LAST_BND_REG)
11490 bounds = gen_rtx_REG (BNDmode, bnd_reg);
11491 else
11493 rtx ldx_addr =
11494 plus_constant (Pmode, arg_pointer_rtx,
11495 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
11496 bounds = gen_reg_rtx (BNDmode);
11497 emit_insn (BNDmode == BND64mode
11498 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
11499 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
11502 emit_insn (BNDmode == BND64mode
11503 ? gen_bnd64_stx (addr, ptr, bounds)
11504 : gen_bnd32_stx (addr, ptr, bounds));
11506 bnd_reg++;
11511 /* Checks if TYPE is of kind va_list char *. */
11513 static bool
11514 is_va_list_char_pointer (tree type)
11516 tree canonic;
11518 /* For 32-bit it is always true. */
11519 if (!TARGET_64BIT)
11520 return true;
11521 canonic = ix86_canonical_va_list_type (type);
11522 return (canonic == ms_va_list_type_node
11523 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
11526 /* Implement va_start. */
11528 static void
11529 ix86_va_start (tree valist, rtx nextarg)
11531 HOST_WIDE_INT words, n_gpr, n_fpr;
11532 tree f_gpr, f_fpr, f_ovf, f_sav;
11533 tree gpr, fpr, ovf, sav, t;
11534 tree type;
11535 rtx ovf_rtx;
11537 if (flag_split_stack
11538 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11540 unsigned int scratch_regno;
11542 /* When we are splitting the stack, we can't refer to the stack
11543 arguments using internal_arg_pointer, because they may be on
11544 the old stack. The split stack prologue will arrange to
11545 leave a pointer to the old stack arguments in a scratch
11546 register, which we here copy to a pseudo-register. The split
11547 stack prologue can't set the pseudo-register directly because
11548 it (the prologue) runs before any registers have been saved. */
11550 scratch_regno = split_stack_prologue_scratch_regno ();
11551 if (scratch_regno != INVALID_REGNUM)
11553 rtx reg;
11554 rtx_insn *seq;
11556 reg = gen_reg_rtx (Pmode);
11557 cfun->machine->split_stack_varargs_pointer = reg;
11559 start_sequence ();
11560 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
11561 seq = get_insns ();
11562 end_sequence ();
11564 push_topmost_sequence ();
11565 emit_insn_after (seq, entry_of_function ());
11566 pop_topmost_sequence ();
11570 /* Only 64bit target needs something special. */
11571 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11573 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11574 std_expand_builtin_va_start (valist, nextarg);
11575 else
11577 rtx va_r, next;
11579 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
11580 next = expand_binop (ptr_mode, add_optab,
11581 cfun->machine->split_stack_varargs_pointer,
11582 crtl->args.arg_offset_rtx,
11583 NULL_RTX, 0, OPTAB_LIB_WIDEN);
11584 convert_move (va_r, next, 0);
11586 /* Store zero bounds for va_list. */
11587 if (chkp_function_instrumented_p (current_function_decl))
11588 chkp_expand_bounds_reset_for_mem (valist,
11589 make_tree (TREE_TYPE (valist),
11590 next));
11593 return;
11596 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11597 f_fpr = DECL_CHAIN (f_gpr);
11598 f_ovf = DECL_CHAIN (f_fpr);
11599 f_sav = DECL_CHAIN (f_ovf);
11601 valist = build_simple_mem_ref (valist);
11602 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
11603 /* The following should be folded into the MEM_REF offset. */
11604 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
11605 f_gpr, NULL_TREE);
11606 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
11607 f_fpr, NULL_TREE);
11608 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
11609 f_ovf, NULL_TREE);
11610 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
11611 f_sav, NULL_TREE);
11613 /* Count number of gp and fp argument registers used. */
11614 words = crtl->args.info.words;
11615 n_gpr = crtl->args.info.regno;
11616 n_fpr = crtl->args.info.sse_regno;
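/* Example, assuming the default 6 integer and 8 SSE argument registers:
   for void f (int a, double b, ...) one GP and one SSE register are
   consumed by the named arguments, so the code below initializes
   gp_offset to 1 * 8 = 8 and fp_offset to 1 * 16 + 48 = 64. */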
11618 if (cfun->va_list_gpr_size)
11620 type = TREE_TYPE (gpr);
11621 t = build2 (MODIFY_EXPR, type,
11622 gpr, build_int_cst (type, n_gpr * 8));
11623 TREE_SIDE_EFFECTS (t) = 1;
11624 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11627 if (TARGET_SSE && cfun->va_list_fpr_size)
11629 type = TREE_TYPE (fpr);
11630 t = build2 (MODIFY_EXPR, type, fpr,
11631 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
11632 TREE_SIDE_EFFECTS (t) = 1;
11633 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11636 /* Find the overflow area. */
11637 type = TREE_TYPE (ovf);
11638 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
11639 ovf_rtx = crtl->args.internal_arg_pointer;
11640 else
11641 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
11642 t = make_tree (type, ovf_rtx);
11643 if (words != 0)
11644 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
11646 /* Store zero bounds for overflow area pointer. */
11647 if (chkp_function_instrumented_p (current_function_decl))
11648 chkp_expand_bounds_reset_for_mem (ovf, t);
11650 t = build2 (MODIFY_EXPR, type, ovf, t);
11651 TREE_SIDE_EFFECTS (t) = 1;
11652 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11654 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
11656 /* Find the register save area.
11657 The function prologue saves it right above the stack frame. */
11658 type = TREE_TYPE (sav);
11659 t = make_tree (type, frame_pointer_rtx);
11660 if (!ix86_varargs_gpr_size)
11661 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
11663 /* Store zero bounds for save area pointer. */
11664 if (chkp_function_instrumented_p (current_function_decl))
11665 chkp_expand_bounds_reset_for_mem (sav, t);
11667 t = build2 (MODIFY_EXPR, type, sav, t);
11668 TREE_SIDE_EFFECTS (t) = 1;
11669 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11673 /* Implement va_arg. */
11675 static tree
11676 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
11677 gimple_seq *post_p)
11679 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
11680 tree f_gpr, f_fpr, f_ovf, f_sav;
11681 tree gpr, fpr, ovf, sav, t;
11682 int size, rsize;
11683 tree lab_false, lab_over = NULL_TREE;
11684 tree addr, t2;
11685 rtx container;
11686 int indirect_p = 0;
11687 tree ptrtype;
11688 machine_mode nat_mode;
11689 unsigned int arg_boundary;
11691 /* Only 64bit target needs something special. */
11692 if (is_va_list_char_pointer (TREE_TYPE (valist)))
11693 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
11695 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
11696 f_fpr = DECL_CHAIN (f_gpr);
11697 f_ovf = DECL_CHAIN (f_fpr);
11698 f_sav = DECL_CHAIN (f_ovf);
11700 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
11701 valist, f_gpr, NULL_TREE);
11703 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
11704 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
11705 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
11707 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
11708 if (indirect_p)
11709 type = build_pointer_type (type);
11710 size = int_size_in_bytes (type);
11711 rsize = CEIL (size, UNITS_PER_WORD);
11713 nat_mode = type_natural_mode (type, NULL, false);
11714 switch (nat_mode)
11716 case E_V8SFmode:
11717 case E_V8SImode:
11718 case E_V32QImode:
11719 case E_V16HImode:
11720 case E_V4DFmode:
11721 case E_V4DImode:
11722 case E_V16SFmode:
11723 case E_V16SImode:
11724 case E_V64QImode:
11725 case E_V32HImode:
11726 case E_V8DFmode:
11727 case E_V8DImode:
11728 /* Unnamed 256-bit and 512-bit vector mode parameters are passed on the stack. */
11729 if (!TARGET_64BIT_MS_ABI)
11731 container = NULL;
11732 break;
11734 /* FALLTHRU */
11736 default:
11737 container = construct_container (nat_mode, TYPE_MODE (type),
11738 type, 0, X86_64_REGPARM_MAX,
11739 X86_64_SSE_REGPARM_MAX, intreg,
11740 0);
11741 break;
11744 /* Pull the value out of the saved registers. */
11746 addr = create_tmp_var (ptr_type_node, "addr");
11748 if (container)
11750 int needed_intregs, needed_sseregs;
11751 bool need_temp;
11752 tree int_addr, sse_addr;
11754 lab_false = create_artificial_label (UNKNOWN_LOCATION);
11755 lab_over = create_artificial_label (UNKNOWN_LOCATION);
11757 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
11759 need_temp = (!REG_P (container)
11760 && ((needed_intregs && TYPE_ALIGN (type) > 64)
11761 || TYPE_ALIGN (type) > 128));
11763 /* In case we are passing structure, verify that it is consecutive block
11764 on the register save area. If not we need to do moves. */
11765 if (!need_temp && !REG_P (container))
11767 /* Verify that all registers are strictly consecutive */
11768 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
11770 int i;
11772 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11774 rtx slot = XVECEXP (container, 0, i);
11775 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
11776 || INTVAL (XEXP (slot, 1)) != i * 16)
11777 need_temp = true;
11780 else
11782 int i;
11784 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
11786 rtx slot = XVECEXP (container, 0, i);
11787 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
11788 || INTVAL (XEXP (slot, 1)) != i * 8)
11789 need_temp = true;
11793 if (!need_temp)
11795 int_addr = addr;
11796 sse_addr = addr;
11798 else
11800 int_addr = create_tmp_var (ptr_type_node, "int_addr");
11801 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
11804 /* First ensure that we fit completely in registers. */
11805 if (needed_intregs)
11807 t = build_int_cst (TREE_TYPE (gpr),
11808 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
11809 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
11810 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11811 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11812 gimplify_and_add (t, pre_p);
11814 if (needed_sseregs)
11816 t = build_int_cst (TREE_TYPE (fpr),
11817 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
11818 + X86_64_REGPARM_MAX * 8);
11819 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
11820 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
11821 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
11822 gimplify_and_add (t, pre_p);
11825 /* Compute index to start of area used for integer regs. */
11826 if (needed_intregs)
11828 /* int_addr = gpr + sav; */
11829 t = fold_build_pointer_plus (sav, gpr);
11830 gimplify_assign (int_addr, t, pre_p);
11832 if (needed_sseregs)
11834 /* sse_addr = fpr + sav; */
11835 t = fold_build_pointer_plus (sav, fpr);
11836 gimplify_assign (sse_addr, t, pre_p);
11838 if (need_temp)
11840 int i, prev_size = 0;
11841 tree temp = create_tmp_var (type, "va_arg_tmp");
11843 /* addr = &temp; */
11844 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
11845 gimplify_assign (addr, t, pre_p);
11847 for (i = 0; i < XVECLEN (container, 0); i++)
11849 rtx slot = XVECEXP (container, 0, i);
11850 rtx reg = XEXP (slot, 0);
11851 machine_mode mode = GET_MODE (reg);
11852 tree piece_type;
11853 tree addr_type;
11854 tree daddr_type;
11855 tree src_addr, src;
11856 int src_offset;
11857 tree dest_addr, dest;
11858 int cur_size = GET_MODE_SIZE (mode);
11860 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
11861 prev_size = INTVAL (XEXP (slot, 1));
11862 if (prev_size + cur_size > size)
11864 cur_size = size - prev_size;
11865 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
11866 if (mode == BLKmode)
11867 mode = QImode;
11869 piece_type = lang_hooks.types.type_for_mode (mode, 1);
11870 if (mode == GET_MODE (reg))
11871 addr_type = build_pointer_type (piece_type);
11872 else
11873 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11874 true);
11875 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
11876 true);
11878 if (SSE_REGNO_P (REGNO (reg)))
11880 src_addr = sse_addr;
11881 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
11883 else
11885 src_addr = int_addr;
11886 src_offset = REGNO (reg) * 8;
11888 src_addr = fold_convert (addr_type, src_addr);
11889 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
11891 dest_addr = fold_convert (daddr_type, addr);
11892 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
11893 if (cur_size == GET_MODE_SIZE (mode))
11895 src = build_va_arg_indirect_ref (src_addr);
11896 dest = build_va_arg_indirect_ref (dest_addr);
11898 gimplify_assign (dest, src, pre_p);
11900 else
11902 tree copy
11903 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
11904 3, dest_addr, src_addr,
11905 size_int (cur_size));
11906 gimplify_and_add (copy, pre_p);
11908 prev_size += cur_size;
11912 if (needed_intregs)
11914 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
11915 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
11916 gimplify_assign (gpr, t, pre_p);
11919 if (needed_sseregs)
11921 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
11922 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
11923 gimplify_assign (unshare_expr (fpr), t, pre_p);
11926 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
11928 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
11931 /* ... otherwise out of the overflow area. */
11933 /* When we align a parameter on the stack for the caller, if the parameter
11934 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
11935 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee
11936 here with the caller. */
11937 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
11938 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
11939 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
11941 /* Care for on-stack alignment if needed. */
11942 if (arg_boundary <= 64 || size == 0)
11943 t = ovf;
11944 else
11946 HOST_WIDE_INT align = arg_boundary / 8;
11947 t = fold_build_pointer_plus_hwi (ovf, align - 1);
11948 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
11949 build_int_cst (TREE_TYPE (t), -align));
11952 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
11953 gimplify_assign (addr, t, pre_p);
11955 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
11956 gimplify_assign (unshare_expr (ovf), t, pre_p);
11958 if (container)
11959 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
11961 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
11962 addr = fold_convert (ptrtype, addr);
11964 if (indirect_p)
11965 addr = build_va_arg_indirect_ref (addr);
11966 return build_va_arg_indirect_ref (addr);
11969 /* Return true if OPNUM's MEM should be matched
11970 in movabs* patterns. */
11972 bool
11973 ix86_check_movabs (rtx insn, int opnum)
11975 rtx set, mem;
11977 set = PATTERN (insn);
11978 if (GET_CODE (set) == PARALLEL)
11979 set = XVECEXP (set, 0, 0);
11980 gcc_assert (GET_CODE (set) == SET);
11981 mem = XEXP (set, opnum);
11982 while (SUBREG_P (mem))
11983 mem = SUBREG_REG (mem);
11984 gcc_assert (MEM_P (mem));
11985 return volatile_ok || !MEM_VOLATILE_P (mem);
11988 /* Return false if INSN contains a MEM with a non-default address space. */
11989 bool
11990 ix86_check_no_addr_space (rtx insn)
11992 subrtx_var_iterator::array_type array;
11993 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
11995 rtx x = *iter;
11996 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
11997 return false;
11999 return true;
12002 /* Initialize the table of extra 80387 mathematical constants. */
12004 static void
12005 init_ext_80387_constants (void)
12007 static const char * cst[5] =
12009 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
12010 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
12011 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
12012 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
12013 "3.1415926535897932385128089594061862044", /* 4: fldpi */
12015 int i;
12017 for (i = 0; i < 5; i++)
12019 real_from_string (&ext_80387_constants_table[i], cst[i]);
12020 /* Ensure each constant is rounded to XFmode precision. */
12021 real_convert (&ext_80387_constants_table[i],
12022 XFmode, &ext_80387_constants_table[i]);
12025 ext_80387_constants_init = 1;
12028 /* Return non-zero if the constant is something that
12029 can be loaded with a special instruction. */
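/* Return value encoding, matching standard_80387_constant_opcode below:
   -1 means not an x87 constant, 0 no special instruction, 1 fldz (0.0),
   2 fld1 (1.0), 3..7 fldlg2/fldln2/fldl2e/fldl2t/fldpi, and 8/9 for
   -0.0 and -1.0, which are split into fldz;fchs / fld1;fchs. */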
12031 int
12032 standard_80387_constant_p (rtx x)
12034 machine_mode mode = GET_MODE (x);
12036 const REAL_VALUE_TYPE *r;
12038 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
12039 return -1;
12041 if (x == CONST0_RTX (mode))
12042 return 1;
12043 if (x == CONST1_RTX (mode))
12044 return 2;
12046 r = CONST_DOUBLE_REAL_VALUE (x);
12048 /* For XFmode constants, try to find a special 80387 instruction when
12049 optimizing for size or on those CPUs that benefit from them. */
12050 if (mode == XFmode
12051 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
12053 int i;
12055 if (! ext_80387_constants_init)
12056 init_ext_80387_constants ();
12058 for (i = 0; i < 5; i++)
12059 if (real_identical (r, &ext_80387_constants_table[i]))
12060 return i + 3;
12063 /* Load of the constant -0.0 or -1.0 will be split as
12064 fldz;fchs or fld1;fchs sequence. */
12065 if (real_isnegzero (r))
12066 return 8;
12067 if (real_identical (r, &dconstm1))
12068 return 9;
12070 return 0;
12073 /* Return the opcode of the special instruction to be used to load
12074 the constant X. */
12076 const char *
12077 standard_80387_constant_opcode (rtx x)
12079 switch (standard_80387_constant_p (x))
12081 case 1:
12082 return "fldz";
12083 case 2:
12084 return "fld1";
12085 case 3:
12086 return "fldlg2";
12087 case 4:
12088 return "fldln2";
12089 case 5:
12090 return "fldl2e";
12091 case 6:
12092 return "fldl2t";
12093 case 7:
12094 return "fldpi";
12095 case 8:
12096 case 9:
12097 return "#";
12098 default:
12099 gcc_unreachable ();
12103 /* Return the CONST_DOUBLE representing the 80387 constant that is
12104 loaded by the specified special instruction. The argument IDX
12105 matches the return value from standard_80387_constant_p. */
12107 rtx
12108 standard_80387_constant_rtx (int idx)
12110 int i;
12112 if (! ext_80387_constants_init)
12113 init_ext_80387_constants ();
12115 switch (idx)
12117 case 3:
12118 case 4:
12119 case 5:
12120 case 6:
12121 case 7:
12122 i = idx - 3;
12123 break;
12125 default:
12126 gcc_unreachable ();
12129 return const_double_from_real_value (ext_80387_constants_table[i],
12130 XFmode);
12133 /* Return 1 if X is all bits 0 and 2 if X is all bits 1
12134 in a supported SSE/AVX vector mode. */
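/* For example, a V4SImode vector of all zero bits yields 1 (loadable
   with pxor/xorps) and a vector of all one bits yields 2 (loadable with
   pcmpeqd), as emitted by standard_sse_constant_opcode below. */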
12136 int
12137 standard_sse_constant_p (rtx x, machine_mode pred_mode)
12139 machine_mode mode;
12141 if (!TARGET_SSE)
12142 return 0;
12144 mode = GET_MODE (x);
12146 if (x == const0_rtx || const0_operand (x, mode))
12147 return 1;
12149 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
12151 /* VOIDmode integer constant, get mode from the predicate. */
12152 if (mode == VOIDmode)
12153 mode = pred_mode;
12155 switch (GET_MODE_SIZE (mode))
12157 case 64:
12158 if (TARGET_AVX512F)
12159 return 2;
12160 break;
12161 case 32:
12162 if (TARGET_AVX2)
12163 return 2;
12164 break;
12165 case 16:
12166 if (TARGET_SSE2)
12167 return 2;
12168 break;
12169 case 0:
12170 /* VOIDmode */
12171 gcc_unreachable ();
12172 default:
12173 break;
12177 return 0;
12180 /* Return the opcode of the special instruction to be used to load
12181 the constant X. */
12183 const char *
12184 standard_sse_constant_opcode (rtx_insn *insn, rtx x)
12186 machine_mode mode;
12188 gcc_assert (TARGET_SSE);
12190 mode = GET_MODE (x);
12192 if (x == const0_rtx || const0_operand (x, mode))
12194 switch (get_attr_mode (insn))
12196 case MODE_XI:
12197 return "vpxord\t%g0, %g0, %g0";
12198 case MODE_OI:
12199 return (TARGET_AVX512VL
12200 ? "vpxord\t%x0, %x0, %x0"
12201 : "vpxor\t%x0, %x0, %x0");
12202 case MODE_TI:
12203 return (TARGET_AVX512VL
12204 ? "vpxord\t%t0, %t0, %t0"
12205 : "%vpxor\t%0, %d0");
12207 case MODE_V8DF:
12208 return (TARGET_AVX512DQ
12209 ? "vxorpd\t%g0, %g0, %g0"
12210 : "vpxorq\t%g0, %g0, %g0");
12211 case MODE_V4DF:
12212 return "vxorpd\t%x0, %x0, %x0";
12213 case MODE_V2DF:
12214 return "%vxorpd\t%0, %d0";
12216 case MODE_V16SF:
12217 return (TARGET_AVX512DQ
12218 ? "vxorps\t%g0, %g0, %g0"
12219 : "vpxord\t%g0, %g0, %g0");
12220 case MODE_V8SF:
12221 return "vxorps\t%x0, %x0, %x0";
12222 case MODE_V4SF:
12223 return "%vxorps\t%0, %d0";
12225 default:
12226 gcc_unreachable ();
12229 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
12231 enum attr_mode insn_mode = get_attr_mode (insn);
12233 switch (insn_mode)
12235 case MODE_XI:
12236 case MODE_V8DF:
12237 case MODE_V16SF:
12238 gcc_assert (TARGET_AVX512F);
12239 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
12241 case MODE_OI:
12242 case MODE_V4DF:
12243 case MODE_V8SF:
12244 gcc_assert (TARGET_AVX2);
12245 /* FALLTHRU */
12246 case MODE_TI:
12247 case MODE_V2DF:
12248 case MODE_V4SF:
12249 gcc_assert (TARGET_SSE2);
12250 return (TARGET_AVX
12251 ? "vpcmpeqd\t%0, %0, %0"
12252 : "pcmpeqd\t%0, %0");
12254 default:
12255 gcc_unreachable ();
12259 gcc_unreachable ();
12262 /* Returns true if INSN can be transformed from a memory load
12263 to a supported FP constant load. */
12265 bool
12266 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
12268 rtx src = find_constant_src (insn);
12270 gcc_assert (REG_P (dst));
12272 if (src == NULL
12273 || (SSE_REGNO_P (REGNO (dst))
12274 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
12275 || (STACK_REGNO_P (REGNO (dst))
12276 && standard_80387_constant_p (src) < 1))
12277 return false;
12279 return true;
12282 /* Returns true if OP contains a symbol reference */
12284 bool
12285 symbolic_reference_mentioned_p (rtx op)
12287 const char *fmt;
12288 int i;
12290 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
12291 return true;
12293 fmt = GET_RTX_FORMAT (GET_CODE (op));
12294 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
12296 if (fmt[i] == 'E')
12298 int j;
12300 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
12301 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
12302 return true;
12305 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
12306 return true;
12309 return false;
12312 /* Return true if it is appropriate to emit `ret' instructions in the
12313 body of a function. Do this only if the epilogue is simple, needing a
12314 couple of insns. Prior to reloading, we can't tell how many registers
12315 must be saved, so return false then. Return false if there is no frame
12316 marker to de-allocate. */
12318 bool
12319 ix86_can_use_return_insn_p (void)
12321 struct ix86_frame frame;
12323 if (ix86_function_naked (current_function_decl))
12324 return false;
12326 /* Don't use `ret' instruction in interrupt handler. */
12327 if (! reload_completed
12328 || frame_pointer_needed
12329 || cfun->machine->func_type != TYPE_NORMAL)
12330 return 0;
12332 /* Don't allow more than 32k pop, since that's all we can do
12333 with one instruction. */
12334 if (crtl->args.pops_args && crtl->args.size >= 32768)
12335 return 0;
12337 frame = cfun->machine->frame;
12338 return (frame.stack_pointer_offset == UNITS_PER_WORD
12339 && (frame.nregs + frame.nsseregs) == 0);
12342 /* Value should be nonzero if functions must have frame pointers.
12343 Zero means the frame pointer need not be set up (and parms may
12344 be accessed via the stack pointer) in functions that seem suitable. */
12346 static bool
12347 ix86_frame_pointer_required (void)
12349 /* If we accessed previous frames, then the generated code expects
12350 to be able to access the saved ebp value in our frame. */
12351 if (cfun->machine->accesses_prev_frame)
12352 return true;
12354 /* Several x86 OSes need a frame pointer for other reasons,
12355 usually pertaining to setjmp. */
12356 if (SUBTARGET_FRAME_POINTER_REQUIRED)
12357 return true;
12359 /* For older 32-bit runtimes, setjmp requires a valid frame pointer. */
12360 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
12361 return true;
12363 /* Win64 SEH: very large frames need a frame pointer, as the maximum
12364 stack allocation is 4GB. */
12365 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
12366 return true;
12368 /* SSE saves require a frame pointer when the stack is misaligned. */
12369 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
12370 return true;
12372 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
12373 turns off the frame pointer by default. Turn it back on now if
12374 we've not got a leaf function. */
12375 if (TARGET_OMIT_LEAF_FRAME_POINTER
12376 && (!crtl->is_leaf
12377 || ix86_current_function_calls_tls_descriptor))
12378 return true;
12380 if (crtl->profile && !flag_fentry)
12381 return true;
12383 return false;
12386 /* Record that the current function accesses previous call frames. */
12388 void
12389 ix86_setup_frame_addresses (void)
12391 cfun->machine->accesses_prev_frame = 1;
12394 #ifndef USE_HIDDEN_LINKONCE
12395 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
12396 # define USE_HIDDEN_LINKONCE 1
12397 # else
12398 # define USE_HIDDEN_LINKONCE 0
12399 # endif
12400 #endif
12402 static int pic_labels_used;
12404 /* Fills in the label name that should be used for a pc thunk for
12405 the given register. */
12407 static void
12408 get_pc_thunk_name (char name[32], unsigned int regno)
12410 gcc_assert (!TARGET_64BIT);
12412 if (USE_HIDDEN_LINKONCE)
12413 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
12414 else
12415 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
12419 /* This function generates the pc thunks used for -fpic: each thunk loads
12420 its register with the return address of the caller and then returns. */
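/* For instance, when %ebx is the PIC register the emitted thunk looks
   roughly like this (illustrative):

       __x86.get_pc_thunk.bx:
               movl    (%esp), %ebx
               ret
*/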
12422 static void
12423 ix86_code_end (void)
12425 rtx xops[2];
12426 int regno;
12428 for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++)
12430 char name[32];
12431 tree decl;
12433 if (!(pic_labels_used & (1 << regno)))
12434 continue;
12436 get_pc_thunk_name (name, regno);
12438 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
12439 get_identifier (name),
12440 build_function_type_list (void_type_node, NULL_TREE));
12441 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
12442 NULL_TREE, void_type_node);
12443 TREE_PUBLIC (decl) = 1;
12444 TREE_STATIC (decl) = 1;
12445 DECL_IGNORED_P (decl) = 1;
12447 #if TARGET_MACHO
12448 if (TARGET_MACHO)
12450 switch_to_section (darwin_sections[picbase_thunk_section]);
12451 fputs ("\t.weak_definition\t", asm_out_file);
12452 assemble_name (asm_out_file, name);
12453 fputs ("\n\t.private_extern\t", asm_out_file);
12454 assemble_name (asm_out_file, name);
12455 putc ('\n', asm_out_file);
12456 ASM_OUTPUT_LABEL (asm_out_file, name);
12457 DECL_WEAK (decl) = 1;
12459 else
12460 #endif
12461 if (USE_HIDDEN_LINKONCE)
12463 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
12465 targetm.asm_out.unique_section (decl, 0);
12466 switch_to_section (get_named_section (decl, NULL, 0));
12468 targetm.asm_out.globalize_label (asm_out_file, name);
12469 fputs ("\t.hidden\t", asm_out_file);
12470 assemble_name (asm_out_file, name);
12471 putc ('\n', asm_out_file);
12472 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
12474 else
12476 switch_to_section (text_section);
12477 ASM_OUTPUT_LABEL (asm_out_file, name);
12480 DECL_INITIAL (decl) = make_node (BLOCK);
12481 current_function_decl = decl;
12482 allocate_struct_function (decl, false);
12483 init_function_start (decl);
12484 /* We're about to hide the function body from callees of final_* by
12485 emitting it directly; tell them we're a thunk, if they care. */
12486 cfun->is_thunk = true;
12487 first_function_block_is_cold = false;
12488 /* Make sure unwind info is emitted for the thunk if needed. */
12489 final_start_function (emit_barrier (), asm_out_file, 1);
12491 /* Pad stack IP move with 4 instructions (two NOPs count
12492 as one instruction). */
12493 if (TARGET_PAD_SHORT_FUNCTION)
12495 int i = 8;
12497 while (i--)
12498 fputs ("\tnop\n", asm_out_file);
12501 xops[0] = gen_rtx_REG (Pmode, regno);
12502 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
12503 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
12504 output_asm_insn ("%!ret", NULL);
12505 final_end_function ();
12506 init_insn_lengths ();
12507 free_after_compilation (cfun);
12508 set_cfun (NULL);
12509 current_function_decl = NULL;
12512 if (flag_split_stack)
12513 file_end_indicate_split_stack ();
12516 /* Emit code for the SET_GOT patterns. */
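/* For example, with -fpic on a non-Mach-O 32-bit target the sequence emitted
   below for %ebx is roughly:

	call	__x86.get_pc_thunk.bx
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx

   (the VxWorks RTP and Mach-O paths handled below differ).  */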
12518 const char *
12519 output_set_got (rtx dest, rtx label)
12521 rtx xops[3];
12523 xops[0] = dest;
12525 if (TARGET_VXWORKS_RTP && flag_pic)
12527 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
12528 xops[2] = gen_rtx_MEM (Pmode,
12529 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
12530 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
12532 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
12533 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
12534 an unadorned address. */
12535 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
12536 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
12537 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
12538 return "";
12541 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
12543 if (flag_pic)
12545 char name[32];
12546 get_pc_thunk_name (name, REGNO (dest));
12547 pic_labels_used |= 1 << REGNO (dest);
12549 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
12550 xops[2] = gen_rtx_MEM (QImode, xops[2]);
12551 output_asm_insn ("%!call\t%X2", xops);
12553 #if TARGET_MACHO
12554 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
12555 This is what will be referenced by the Mach-O PIC subsystem. */
12556 if (machopic_should_output_picbase_label () || !label)
12557 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
12559 /* When we are restoring the pic base at the site of a nonlocal label,
12560 and we decided to emit the pic base above, we will still output a
12561 local label used for calculating the correction offset (even though
12562 the offset will be 0 in that case). */
12563 if (label)
12564 targetm.asm_out.internal_label (asm_out_file, "L",
12565 CODE_LABEL_NUMBER (label));
12566 #endif
12568 else
12570 if (TARGET_MACHO)
12571 /* We don't need a pic base, we're not producing pic. */
12572 gcc_unreachable ();
12574 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
12575 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
12576 targetm.asm_out.internal_label (asm_out_file, "L",
12577 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
12580 if (!TARGET_MACHO)
12581 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
12583 return "";
12586 /* Generate a "push" pattern for input ARG. */
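/* The result is a plain SET; e.g. in 32-bit mode it is roughly
	(set (mem:SI (pre_dec:SI (reg:SI sp))) (reg:SI arg))
   so that it matches the push patterns in the machine description.  */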
12588 static rtx
12589 gen_push (rtx arg)
12591 struct machine_function *m = cfun->machine;
12593 if (m->fs.cfa_reg == stack_pointer_rtx)
12594 m->fs.cfa_offset += UNITS_PER_WORD;
12595 m->fs.sp_offset += UNITS_PER_WORD;
12597 if (REG_P (arg) && GET_MODE (arg) != word_mode)
12598 arg = gen_rtx_REG (word_mode, REGNO (arg));
12600 return gen_rtx_SET (gen_rtx_MEM (word_mode,
12601 gen_rtx_PRE_DEC (Pmode,
12602 stack_pointer_rtx)),
12603 arg);
12606 /* Generate a "pop" pattern for input ARG. */
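/* Symmetrically to gen_push, in 32-bit mode this returns roughly
	(set (reg:SI arg) (mem:SI (post_inc:SI (reg:SI sp)))).  */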
12608 static rtx
12609 gen_pop (rtx arg)
12611 if (REG_P (arg) && GET_MODE (arg) != word_mode)
12612 arg = gen_rtx_REG (word_mode, REGNO (arg));
12614 return gen_rtx_SET (arg,
12615 gen_rtx_MEM (word_mode,
12616 gen_rtx_POST_INC (Pmode,
12617 stack_pointer_rtx)));
12620 /* Return the number of an unused call-clobbered register available
12621 for the entire function, or INVALID_REGNUM if there is none. */
12623 static unsigned int
12624 ix86_select_alt_pic_regnum (void)
12626 if (ix86_use_pseudo_pic_reg ())
12627 return INVALID_REGNUM;
12629 if (crtl->is_leaf
12630 && !crtl->profile
12631 && !ix86_current_function_calls_tls_descriptor)
12633 int i, drap;
12634 /* Can't use the same register for both PIC and DRAP. */
12635 if (crtl->drap_reg)
12636 drap = REGNO (crtl->drap_reg);
12637 else
12638 drap = -1;
12639 for (i = 2; i >= 0; --i)
12640 if (i != drap && !df_regs_ever_live_p (i))
12641 return i;
12644 return INVALID_REGNUM;
12647 /* Return true if REGNO is used by the epilogue. */
12649 bool
12650 ix86_epilogue_uses (int regno)
12652 /* If there are no caller-saved registers, we preserve all registers,
12653 except for MMX and x87 registers which aren't supported when saving
12654 and restoring registers. Don't explicitly save SP register since
12655 it is always preserved. */
12656 return (epilogue_completed
12657 && cfun->machine->no_caller_saved_registers
12658 && !fixed_regs[regno]
12659 && !STACK_REGNO_P (regno)
12660 && !MMX_REGNO_P (regno));
12663 /* Return nonzero if register REGNO can be used as a scratch register
12664 in peephole2. */
12666 static bool
12667 ix86_hard_regno_scratch_ok (unsigned int regno)
12669 /* If there are no caller-saved registers, we can't use any register
12670 as a scratch register after the epilogue, and we use REGNO as a scratch
12671 register only if it has been used before, to avoid saving and
12672 restoring it. */
12673 return (!cfun->machine->no_caller_saved_registers
12674 || (!epilogue_completed
12675 && df_regs_ever_live_p (regno)));
12678 /* Return true if register class CL should be an additional allocno
12679 class. */
12681 static bool
12682 ix86_additional_allocno_class_p (reg_class_t cl)
12684 return cl == MOD4_SSE_REGS;
12687 /* Return TRUE if we need to save REGNO. */
12689 static bool
12690 ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
12692 /* If there are no caller-saved registers, we preserve all registers,
12693 except for MMX and x87 registers which aren't supported when saving
12694 and restoring registers. Don't explicitly save SP register since
12695 it is always preserved. */
12696 if (cfun->machine->no_caller_saved_registers)
12698 /* Don't preserve registers used for function return value. */
12699 rtx reg = crtl->return_rtx;
12700 if (reg)
12702 unsigned int i = REGNO (reg);
12703 unsigned int nregs = hard_regno_nregs[i][GET_MODE (reg)];
12704 while (nregs-- > 0)
12705 if ((i + nregs) == regno)
12706 return false;
12708 reg = crtl->return_bnd;
12709 if (reg)
12711 i = REGNO (reg);
12712 nregs = hard_regno_nregs[i][GET_MODE (reg)];
12713 while (nregs-- > 0)
12714 if ((i + nregs) == regno)
12715 return false;
12719 return (df_regs_ever_live_p (regno)
12720 && !fixed_regs[regno]
12721 && !STACK_REGNO_P (regno)
12722 && !MMX_REGNO_P (regno)
12723 && (regno != HARD_FRAME_POINTER_REGNUM
12724 || !frame_pointer_needed));
12727 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
12728 && pic_offset_table_rtx)
12730 if (ix86_use_pseudo_pic_reg ())
12732 /* REAL_PIC_OFFSET_TABLE_REGNUM is used by the call to
12733 _mcount in the prologue. */
12734 if (!TARGET_64BIT && flag_pic && crtl->profile)
12735 return true;
12737 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
12738 || crtl->profile
12739 || crtl->calls_eh_return
12740 || crtl->uses_const_pool
12741 || cfun->has_nonlocal_label)
12742 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
12745 if (crtl->calls_eh_return && maybe_eh_return)
12747 unsigned i;
12748 for (i = 0; ; i++)
12750 unsigned test = EH_RETURN_DATA_REGNO (i);
12751 if (test == INVALID_REGNUM)
12752 break;
12753 if (test == regno)
12754 return true;
12758 if (ignore_outlined && cfun->machine->call_ms2sysv)
12760 unsigned count = cfun->machine->call_ms2sysv_extra_regs
12761 + xlogue_layout::MIN_REGS;
12762 if (xlogue_layout::is_stub_managed_reg (regno, count))
12763 return false;
12766 if (crtl->drap_reg
12767 && regno == REGNO (crtl->drap_reg)
12768 && !cfun->machine->no_drap_save_restore)
12769 return true;
12771 return (df_regs_ever_live_p (regno)
12772 && !call_used_regs[regno]
12773 && !fixed_regs[regno]
12774 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
12777 /* Return the number of saved general purpose registers. */
12779 static int
12780 ix86_nsaved_regs (void)
12782 int nregs = 0;
12783 int regno;
12785 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12786 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12787 nregs ++;
12788 return nregs;
12791 /* Return number of saved SSE registers. */
12793 static int
12794 ix86_nsaved_sseregs (void)
12796 int nregs = 0;
12797 int regno;
12799 if (!TARGET_64BIT_MS_ABI)
12800 return 0;
12801 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12802 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12803 nregs ++;
12804 return nregs;
12807 /* Given FROM and TO register numbers, say whether this elimination is
12808 allowed. If stack alignment is needed, we can only replace argument
12809 pointer with hard frame pointer, or replace frame pointer with stack
12810 pointer. Otherwise, frame pointer elimination is automatically
12811 handled and all other eliminations are valid. */
12813 static bool
12814 ix86_can_eliminate (const int from, const int to)
12816 if (stack_realign_fp)
12817 return ((from == ARG_POINTER_REGNUM
12818 && to == HARD_FRAME_POINTER_REGNUM)
12819 || (from == FRAME_POINTER_REGNUM
12820 && to == STACK_POINTER_REGNUM));
12821 else
12822 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
12825 /* Return the offset between two registers, one to be eliminated, and the other
12826 its replacement, at the start of a routine. */
12828 HOST_WIDE_INT
12829 ix86_initial_elimination_offset (int from, int to)
12831 struct ix86_frame frame = cfun->machine->frame;
12833 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
12834 return frame.hard_frame_pointer_offset;
12835 else if (from == FRAME_POINTER_REGNUM
12836 && to == HARD_FRAME_POINTER_REGNUM)
12837 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
12838 else
12840 gcc_assert (to == STACK_POINTER_REGNUM);
12842 if (from == ARG_POINTER_REGNUM)
12843 return frame.stack_pointer_offset;
12845 gcc_assert (from == FRAME_POINTER_REGNUM);
12846 return frame.stack_pointer_offset - frame.frame_pointer_offset;
12850 /* In a dynamically-aligned function, we can't know the offset from
12851 stack pointer to frame pointer, so we must ensure that setjmp
12852 eliminates fp against the hard fp (%ebp) rather than trying to
12853 index from %esp up to the top of the frame across a gap that is
12854 of unknown (at compile-time) size. */
12855 static rtx
12856 ix86_builtin_setjmp_frame_value (void)
12858 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
12861 /* Emit a warning for unsupported ms_abi to sysv prologues/epilogues. */
12862 static void warn_once_call_ms2sysv_xlogues (const char *feature)
12864 static bool warned_once = false;
12865 if (!warned_once)
12867 warning (0, "-mcall-ms2sysv-xlogues is not compatible with %s",
12868 feature);
12869 warned_once = true;
12873 /* When using -fsplit-stack, the allocation routines set a field in
12874 the TCB to the bottom of the stack plus this much space, measured
12875 in bytes. */
12877 #define SPLIT_STACK_AVAILABLE 256
12879 /* Fill the ix86_frame structure describing the frame of the function currently being compiled. */
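/* A rough sketch of the layout computed below, from the CFA downwards;
   the exact picture varies with SEH, -mcall-ms2sysv-xlogues and stack
   realignment:

	return address (+ error code for exception handlers)
	pushed static chain (if any)
	saved frame pointer		<- hard_frame_pointer_offset
	GP register save area		<- reg_save_offset
	SSE register save area		<- sse_reg_save_offset
	va_arg register save area
	local variables			<- frame_pointer_offset
	outgoing arguments
					<- stack_pointer_offset  */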
12881 static void
12882 ix86_compute_frame_layout (void)
12884 struct ix86_frame *frame = &cfun->machine->frame;
12885 struct machine_function *m = cfun->machine;
12886 unsigned HOST_WIDE_INT stack_alignment_needed;
12887 HOST_WIDE_INT offset;
12888 unsigned HOST_WIDE_INT preferred_alignment;
12889 HOST_WIDE_INT size = get_frame_size ();
12890 HOST_WIDE_INT to_allocate;
12892 /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit
12893 * ms_abi functions that call a sysv function. We now need to prune away
12894 * cases where it should be disabled. */
12895 if (TARGET_64BIT && m->call_ms2sysv)
12897 gcc_assert (TARGET_64BIT_MS_ABI);
12898 gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES);
12899 gcc_assert (!TARGET_SEH);
12900 gcc_assert (TARGET_SSE);
12901 gcc_assert (!ix86_using_red_zone ());
12903 if (crtl->calls_eh_return)
12905 gcc_assert (!reload_completed);
12906 m->call_ms2sysv = false;
12907 warn_once_call_ms2sysv_xlogues ("__builtin_eh_return");
12910 else if (ix86_static_chain_on_stack)
12912 gcc_assert (!reload_completed);
12913 m->call_ms2sysv = false;
12914 warn_once_call_ms2sysv_xlogues ("static call chains");
12917 /* Finally, compute which registers the stub will manage. */
12918 else
12920 unsigned count = xlogue_layout::count_stub_managed_regs ();
12921 m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS;
12922 m->call_ms2sysv_pad_in = 0;
12926 frame->nregs = ix86_nsaved_regs ();
12927 frame->nsseregs = ix86_nsaved_sseregs ();
12929 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
12930 except for function prologues, leaf functions and when the default
12931 incoming stack boundary is overridden at the command line or via the
12932 force_align_arg_pointer attribute. */
12933 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
12934 && (!crtl->is_leaf || cfun->calls_alloca != 0
12935 || ix86_current_function_calls_tls_descriptor
12936 || ix86_incoming_stack_boundary < 128))
12938 crtl->preferred_stack_boundary = 128;
12939 crtl->stack_alignment_needed = 128;
12942 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
12943 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
12945 gcc_assert (!size || stack_alignment_needed);
12946 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
12947 gcc_assert (preferred_alignment <= stack_alignment_needed);
12949 /* The only ABI saving SSE regs should be 64-bit ms_abi. */
12950 gcc_assert (TARGET_64BIT || !frame->nsseregs);
12951 if (TARGET_64BIT && m->call_ms2sysv)
12953 gcc_assert (stack_alignment_needed >= 16);
12954 gcc_assert (!frame->nsseregs);
12957 /* For SEH we have to limit the amount of code movement into the prologue.
12958 At present we do this via a BLOCKAGE, at which point there's very little
12959 scheduling that can be done, which means that there's very little point
12960 in doing anything except PUSHs. */
12961 if (TARGET_SEH)
12962 m->use_fast_prologue_epilogue = false;
12963 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)))
12965 int count = frame->nregs;
12966 struct cgraph_node *node = cgraph_node::get (current_function_decl);
12968 /* The fast prologue uses move instead of push to save registers. This
12969 is significantly longer, but also executes faster as modern hardware
12970 can execute the moves in parallel, but can't do that for push/pop.
12972 Be careful about choosing which prologue to emit: when the function takes
12973 many instructions to execute, we may use the slow version, as well as when
12974 the function is known to be outside a hot spot (this is known with
12975 feedback only). Weight the size of the function by the number of registers
12976 to save, as it is cheap to use one or two push instructions but very
12977 slow to use many of them. */
12978 if (count)
12979 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
12980 if (node->frequency < NODE_FREQUENCY_NORMAL
12981 || (flag_branch_probabilities
12982 && node->frequency < NODE_FREQUENCY_HOT))
12983 m->use_fast_prologue_epilogue = false;
12984 else
12985 m->use_fast_prologue_epilogue
12986 = !expensive_function_p (count);
12989 frame->save_regs_using_mov
12990 = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
12991 /* If static stack checking is enabled and done with probes,
12992 the registers need to be saved before allocating the frame. */
12993 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
12995 /* Skip return address and error code in exception handler. */
12996 offset = INCOMING_FRAME_SP_OFFSET;
12998 /* Skip pushed static chain. */
12999 if (ix86_static_chain_on_stack)
13000 offset += UNITS_PER_WORD;
13002 /* Skip saved base pointer. */
13003 if (frame_pointer_needed)
13004 offset += UNITS_PER_WORD;
13005 frame->hfp_save_offset = offset;
13007 /* The traditional frame pointer location is at the top of the frame. */
13008 frame->hard_frame_pointer_offset = offset;
13010 /* Register save area */
13011 offset += frame->nregs * UNITS_PER_WORD;
13012 frame->reg_save_offset = offset;
13014 /* On SEH targets, registers are pushed just before the frame pointer
13015 location. */
13016 if (TARGET_SEH)
13017 frame->hard_frame_pointer_offset = offset;
13019 /* Calculate the size of the va-arg area (not including padding, if any). */
13020 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
13022 if (stack_realign_fp)
13024 /* We may need a 16-byte aligned stack for the remainder of the
13025 register save area, but the stack frame for the local function
13026 may require a greater alignment if using AVX/AVX2/AVX-512. In order
13027 to avoid wasting space, we first calculate the space needed for
13028 the rest of the register saves, add that to the stack pointer,
13029 and then realign the stack to the boundary of the start of the
13030 frame for the local function. */
13031 HOST_WIDE_INT space_needed = 0;
13032 HOST_WIDE_INT sse_reg_space_needed = 0;
13034 if (TARGET_64BIT)
13036 if (m->call_ms2sysv)
13038 m->call_ms2sysv_pad_in = 0;
13039 space_needed = xlogue_layout::get_instance ().get_stack_space_used ();
13042 else if (frame->nsseregs)
13043 /* The only ABI that has saved SSE registers (Win64) also has a
13044 16-byte aligned default stack. However, many programs violate
13045 the ABI, and Wine64 forces stack realignment to compensate. */
13046 space_needed = frame->nsseregs * 16;
13048 sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16);
13050 /* 64-bit frame->va_arg_size should always be a multiple of 16, but
13051 rounding to be pedantic. */
13052 space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16);
13054 else
13055 space_needed = frame->va_arg_size;
13057 /* Record the allocation size required prior to the realignment AND instruction. */
13058 frame->stack_realign_allocate = space_needed;
13060 /* The re-aligned stack starts at frame->stack_realign_offset. Values
13061 before this point are not directly comparable with values below
13062 this point. Use sp_valid_at to determine if the stack pointer is
13063 valid for a given offset, fp_valid_at for the frame pointer, or
13064 choose_baseaddr to have a base register chosen for you.
13066 Note that the result of (frame->stack_realign_offset
13067 & (stack_alignment_needed - 1)) may not equal zero. */
13068 offset = ROUND_UP (offset + space_needed, stack_alignment_needed);
13069 frame->stack_realign_offset = offset - space_needed;
13070 frame->sse_reg_save_offset = frame->stack_realign_offset
13071 + sse_reg_space_needed;
13073 else
13075 frame->stack_realign_offset = offset;
13077 if (TARGET_64BIT && m->call_ms2sysv)
13079 m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD);
13080 offset += xlogue_layout::get_instance ().get_stack_space_used ();
13083 /* Align and set SSE register save area. */
13084 else if (frame->nsseregs)
13086 /* If the incoming stack boundary is at least 16 bytes, or DRAP is
13087 required and the DRAP re-alignment boundary is at least 16 bytes,
13088 then we want the SSE register save area properly aligned. */
13089 if (ix86_incoming_stack_boundary >= 128
13090 || (stack_realign_drap && stack_alignment_needed >= 16))
13091 offset = ROUND_UP (offset, 16);
13092 offset += frame->nsseregs * 16;
13094 frame->sse_reg_save_offset = offset;
13095 offset += frame->va_arg_size;
13098 /* Align start of frame for local function. */
13099 if (m->call_ms2sysv
13100 || frame->va_arg_size != 0
13101 || size != 0
13102 || !crtl->is_leaf
13103 || cfun->calls_alloca
13104 || ix86_current_function_calls_tls_descriptor)
13105 offset = ROUND_UP (offset, stack_alignment_needed);
13107 /* Frame pointer points here. */
13108 frame->frame_pointer_offset = offset;
13110 offset += size;
13112 /* Add outgoing arguments area. Can be skipped if we eliminated
13113 all the function calls as dead code.
13114 Skipping is however impossible when function calls alloca. Alloca
13115 expander assumes that last crtl->outgoing_args_size
13116 of stack frame are unused. */
13117 if (ACCUMULATE_OUTGOING_ARGS
13118 && (!crtl->is_leaf || cfun->calls_alloca
13119 || ix86_current_function_calls_tls_descriptor))
13121 offset += crtl->outgoing_args_size;
13122 frame->outgoing_arguments_size = crtl->outgoing_args_size;
13124 else
13125 frame->outgoing_arguments_size = 0;
13127 /* Align stack boundary. Only needed if we're calling another function
13128 or using alloca. */
13129 if (!crtl->is_leaf || cfun->calls_alloca
13130 || ix86_current_function_calls_tls_descriptor)
13131 offset = ROUND_UP (offset, preferred_alignment);
13133 /* We've reached end of stack frame. */
13134 frame->stack_pointer_offset = offset;
13136 /* Size prologue needs to allocate. */
13137 to_allocate = offset - frame->sse_reg_save_offset;
13139 if ((!to_allocate && frame->nregs <= 1)
13140 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)))
13141 frame->save_regs_using_mov = false;
13143 if (ix86_using_red_zone ()
13144 && crtl->sp_is_unchanging
13145 && crtl->is_leaf
13146 && !ix86_pc_thunk_call_expanded
13147 && !ix86_current_function_calls_tls_descriptor)
13149 frame->red_zone_size = to_allocate;
13150 if (frame->save_regs_using_mov)
13151 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
13152 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
13153 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
13155 else
13156 frame->red_zone_size = 0;
13157 frame->stack_pointer_offset -= frame->red_zone_size;
13159 /* The SEH frame pointer location is near the bottom of the frame.
13160 This is enforced by the fact that the difference between the
13161 stack pointer and the frame pointer is limited to 240 bytes in
13162 the unwind data structure. */
13163 if (TARGET_SEH)
13165 HOST_WIDE_INT diff;
13167 /* If we can leave the frame pointer where it is, do so. Also, returns
13168 the establisher frame for __builtin_frame_address (0). */
13169 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
13170 if (diff <= SEH_MAX_FRAME_SIZE
13171 && (diff > 240 || (diff & 15) != 0)
13172 && !crtl->accesses_prior_frames)
13174 /* Ideally we'd determine what portion of the local stack frame
13175 (within the constraint of the lowest 240) is most heavily used.
13176 But without that complication, simply bias the frame pointer
13177 by 128 bytes so as to maximize the amount of the local stack
13178 frame that is addressable with 8-bit offsets. */
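/* With this bias, the signed 8-bit displacement range [-128, +127] from
   the frame pointer covers the 256 bytes of frame just above the final
   stack pointer.  */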
13179 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
13184 /* This is semi-inlined memory_address_length, but simplified
13185 since we know that we're always dealing with reg+offset, and
13186 to avoid having to create and discard all that rtl. */
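/* As an illustration of the lengths computed below: (%ebx) costs 0 extra
   bytes, (%ebp) needs a zero disp8 so costs 1, -8(%esp) needs a SIB byte
   plus a disp8 so costs 2, and 1024(%ebx) needs a disp32 so costs 4.  */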
13188 static inline int
13189 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
13191 int len = 4;
13193 if (offset == 0)
13195 /* EBP and R13 cannot be encoded without an offset. */
13196 len = (regno == BP_REG || regno == R13_REG);
13198 else if (IN_RANGE (offset, -128, 127))
13199 len = 1;
13201 /* ESP and R12 must be encoded with a SIB byte. */
13202 if (regno == SP_REG || regno == R12_REG)
13203 len++;
13205 return len;
13208 /* Determine if the stack pointer is valid for accessing the CFA_OFFSET in
13209 the frame save area. The register is saved at CFA - CFA_OFFSET. */
13211 static bool
13212 sp_valid_at (HOST_WIDE_INT cfa_offset)
13214 const struct machine_frame_state &fs = cfun->machine->fs;
13215 if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset)
13217 /* Validate that the cfa_offset isn't in a "no-man's land". */
13218 gcc_assert (cfa_offset <= fs.sp_realigned_fp_last);
13219 return false;
13221 return fs.sp_valid;
13224 /* Determine if the frame pointer is valid for accessing the CFA_OFFSET in
13225 the frame save area. The register is saved at CFA - CFA_OFFSET. */
13227 static inline bool
13228 fp_valid_at (HOST_WIDE_INT cfa_offset)
13230 const struct machine_frame_state &fs = cfun->machine->fs;
13231 if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last)
13233 /* Validate that the cfa_offset isn't in a "no-man's land". */
13234 gcc_assert (cfa_offset >= fs.sp_realigned_offset);
13235 return false;
13237 return fs.fp_valid;
13240 /* Choose a base register based upon alignment requested, speed and/or
13241 size. */
13243 static void
13244 choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg,
13245 HOST_WIDE_INT &base_offset,
13246 unsigned int align_requested, unsigned int *align)
13248 const struct machine_function *m = cfun->machine;
13249 unsigned int hfp_align;
13250 unsigned int drap_align;
13251 unsigned int sp_align;
13252 bool hfp_ok = fp_valid_at (cfa_offset);
13253 bool drap_ok = m->fs.drap_valid;
13254 bool sp_ok = sp_valid_at (cfa_offset);
13256 hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY;
13258 /* Filter out any registers that don't meet the requested alignment
13259 criteria. */
13260 if (align_requested)
13262 if (m->fs.realigned)
13263 hfp_align = drap_align = sp_align = crtl->stack_alignment_needed;
13264 /* SEH unwind code does not currently support REG_CFA_EXPRESSION
13265 notes (which we would need in order to use a realigned stack pointer),
13266 so disable on SEH targets. */
13267 else if (m->fs.sp_realigned)
13268 sp_align = crtl->stack_alignment_needed;
13270 hfp_ok = hfp_ok && hfp_align >= align_requested;
13271 drap_ok = drap_ok && drap_align >= align_requested;
13272 sp_ok = sp_ok && sp_align >= align_requested;
13275 if (m->use_fast_prologue_epilogue)
13277 /* Choose the base register most likely to allow the most scheduling
13278 opportunities. Generally FP is valid throughout the function,
13279 while DRAP must be reloaded within the epilogue. But choose either
13280 over the SP, which has an increased encoding size. */
13282 if (hfp_ok)
13284 base_reg = hard_frame_pointer_rtx;
13285 base_offset = m->fs.fp_offset - cfa_offset;
13287 else if (drap_ok)
13289 base_reg = crtl->drap_reg;
13290 base_offset = 0 - cfa_offset;
13292 else if (sp_ok)
13294 base_reg = stack_pointer_rtx;
13295 base_offset = m->fs.sp_offset - cfa_offset;
13298 else
13300 HOST_WIDE_INT toffset;
13301 int len = 16, tlen;
13303 /* Choose the base register with the smallest address encoding.
13304 With a tie, choose FP > DRAP > SP. */
13305 if (sp_ok)
13307 base_reg = stack_pointer_rtx;
13308 base_offset = m->fs.sp_offset - cfa_offset;
13309 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
13311 if (drap_ok)
13313 toffset = 0 - cfa_offset;
13314 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
13315 if (tlen <= len)
13317 base_reg = crtl->drap_reg;
13318 base_offset = toffset;
13319 len = tlen;
13322 if (hfp_ok)
13324 toffset = m->fs.fp_offset - cfa_offset;
13325 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
13326 if (tlen <= len)
13328 base_reg = hard_frame_pointer_rtx;
13329 base_offset = toffset;
13330 len = tlen;
13335 /* Set the align return value. */
13336 if (align)
13338 if (base_reg == stack_pointer_rtx)
13339 *align = sp_align;
13340 else if (base_reg == crtl->drap_reg)
13341 *align = drap_align;
13342 else if (base_reg == hard_frame_pointer_rtx)
13343 *align = hfp_align;
13347 /* Return an RTX that points to CFA_OFFSET within the stack frame and
13348 the alignment of address. If ALIGN is non-null, it should point to
13349 an alignment value (in bits) that is preferred or zero and will
13350 receive the alignment of the base register that was selected,
13351 irrespective of whether or not CFA_OFFSET is a multiple of that
13352 alignment value.
13354 The valid base registers are taken from CFUN->MACHINE->FS. */
13356 static rtx
13357 choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align)
13359 rtx base_reg = NULL;
13360 HOST_WIDE_INT base_offset = 0;
13362 /* If a specific alignment is requested, try to get a base register
13363 with that alignment first. */
13364 if (align && *align)
13365 choose_basereg (cfa_offset, base_reg, base_offset, *align, align);
13367 if (!base_reg)
13368 choose_basereg (cfa_offset, base_reg, base_offset, 0, align);
13370 gcc_assert (base_reg != NULL);
13371 return plus_constant (Pmode, base_reg, base_offset);
13374 /* Emit code to save registers in the prologue. */
13376 static void
13377 ix86_emit_save_regs (void)
13379 unsigned int regno;
13380 rtx_insn *insn;
13382 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
13383 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
13385 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
13386 RTX_FRAME_RELATED_P (insn) = 1;
13390 /* Emit a single register save at CFA - CFA_OFFSET. */
13392 static void
13393 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
13394 HOST_WIDE_INT cfa_offset)
13396 struct machine_function *m = cfun->machine;
13397 rtx reg = gen_rtx_REG (mode, regno);
13398 rtx mem, addr, base, insn;
13399 unsigned int align = GET_MODE_ALIGNMENT (mode);
13401 addr = choose_baseaddr (cfa_offset, &align);
13402 mem = gen_frame_mem (mode, addr);
13404 /* The location alignment depends upon the base register. */
13405 align = MIN (GET_MODE_ALIGNMENT (mode), align);
13406 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
13407 set_mem_align (mem, align);
13409 insn = emit_insn (gen_rtx_SET (mem, reg));
13410 RTX_FRAME_RELATED_P (insn) = 1;
13412 base = addr;
13413 if (GET_CODE (base) == PLUS)
13414 base = XEXP (base, 0);
13415 gcc_checking_assert (REG_P (base));
13417 /* When saving registers into a re-aligned local stack frame, avoid
13418 any tricky guessing by dwarf2out. */
13419 if (m->fs.realigned)
13421 gcc_checking_assert (stack_realign_drap);
13423 if (regno == REGNO (crtl->drap_reg))
13425 /* A bit of a hack. We force the DRAP register to be saved in
13426 the re-aligned stack frame, which provides us with a copy
13427 of the CFA that will last past the prologue. Install it. */
13428 gcc_checking_assert (cfun->machine->fs.fp_valid);
13429 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
13430 cfun->machine->fs.fp_offset - cfa_offset);
13431 mem = gen_rtx_MEM (mode, addr);
13432 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
13434 else
13436 /* The frame pointer is a stable reference within the
13437 aligned frame. Use it. */
13438 gcc_checking_assert (cfun->machine->fs.fp_valid);
13439 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
13440 cfun->machine->fs.fp_offset - cfa_offset);
13441 mem = gen_rtx_MEM (mode, addr);
13442 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
13446 else if (base == stack_pointer_rtx && m->fs.sp_realigned
13447 && cfa_offset >= m->fs.sp_realigned_offset)
13449 gcc_checking_assert (stack_realign_fp);
13450 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
13453 /* The memory may not be relative to the current CFA register,
13454 which means that we may need to generate a new pattern for
13455 use by the unwind info. */
13456 else if (base != m->fs.cfa_reg)
13458 addr = plus_constant (Pmode, m->fs.cfa_reg,
13459 m->fs.cfa_offset - cfa_offset);
13460 mem = gen_rtx_MEM (mode, addr);
13461 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
13465 /* Emit code to save registers using MOV insns.
13466 First register is stored at CFA - CFA_OFFSET. */
13467 static void
13468 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
13470 unsigned int regno;
13472 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13473 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
13475 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
13476 cfa_offset -= UNITS_PER_WORD;
13480 /* Emit code to save SSE registers using MOV insns.
13481 First register is stored at CFA - CFA_OFFSET. */
13482 static void
13483 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
13485 unsigned int regno;
13487 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13488 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
13490 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
13491 cfa_offset -= GET_MODE_SIZE (V4SFmode);
13495 static GTY(()) rtx queued_cfa_restores;
13497 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
13498 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
13499 Don't add the note if the previously saved value will be left untouched
13500 within the stack red-zone until return, as unwinders can find the same value
13501 in the register and on the stack. */
13503 static void
13504 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
13506 if (!crtl->shrink_wrapped
13507 && cfa_offset <= cfun->machine->fs.red_zone_offset)
13508 return;
13510 if (insn)
13512 add_reg_note (insn, REG_CFA_RESTORE, reg);
13513 RTX_FRAME_RELATED_P (insn) = 1;
13515 else
13516 queued_cfa_restores
13517 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
13520 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
13522 static void
13523 ix86_add_queued_cfa_restore_notes (rtx insn)
13525 rtx last;
13526 if (!queued_cfa_restores)
13527 return;
13528 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
13530 XEXP (last, 1) = REG_NOTES (insn);
13531 REG_NOTES (insn) = queued_cfa_restores;
13532 queued_cfa_restores = NULL_RTX;
13533 RTX_FRAME_RELATED_P (insn) = 1;
13536 /* Expand prologue or epilogue stack adjustment.
13537 The pattern exists to put a dependency on all ebp-based memory accesses.
13538 STYLE should be negative if instructions should be marked as frame related,
13539 zero if %r11 register is live and cannot be freely used and positive
13540 otherwise. */
13542 static void
13543 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
13544 int style, bool set_cfa)
13546 struct machine_function *m = cfun->machine;
13547 rtx insn;
13548 bool add_frame_related_expr = false;
13550 if (Pmode == SImode)
13551 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
13552 else if (x86_64_immediate_operand (offset, DImode))
13553 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
13554 else
13556 rtx tmp;
13557 /* r11 is used by indirect sibcall return as well, set before the
13558 epilogue and used after the epilogue. */
13559 if (style)
13560 tmp = gen_rtx_REG (DImode, R11_REG);
13561 else
13563 gcc_assert (src != hard_frame_pointer_rtx
13564 && dest != hard_frame_pointer_rtx);
13565 tmp = hard_frame_pointer_rtx;
13567 insn = emit_insn (gen_rtx_SET (tmp, offset));
13568 if (style < 0)
13569 add_frame_related_expr = true;
13571 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
13574 insn = emit_insn (insn);
13575 if (style >= 0)
13576 ix86_add_queued_cfa_restore_notes (insn);
13578 if (set_cfa)
13580 rtx r;
13582 gcc_assert (m->fs.cfa_reg == src);
13583 m->fs.cfa_offset += INTVAL (offset);
13584 m->fs.cfa_reg = dest;
13586 r = gen_rtx_PLUS (Pmode, src, offset);
13587 r = gen_rtx_SET (dest, r);
13588 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
13589 RTX_FRAME_RELATED_P (insn) = 1;
13591 else if (style < 0)
13593 RTX_FRAME_RELATED_P (insn) = 1;
13594 if (add_frame_related_expr)
13596 rtx r = gen_rtx_PLUS (Pmode, src, offset);
13597 r = gen_rtx_SET (dest, r);
13598 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
13602 if (dest == stack_pointer_rtx)
13604 HOST_WIDE_INT ooffset = m->fs.sp_offset;
13605 bool valid = m->fs.sp_valid;
13606 bool realigned = m->fs.sp_realigned;
13608 if (src == hard_frame_pointer_rtx)
13610 valid = m->fs.fp_valid;
13611 realigned = false;
13612 ooffset = m->fs.fp_offset;
13614 else if (src == crtl->drap_reg)
13616 valid = m->fs.drap_valid;
13617 realigned = false;
13618 ooffset = 0;
13620 else
13622 /* Else there are two possibilities: SP itself, which we set
13623 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
13624 taken care of by hand along the eh_return path. */
13625 gcc_checking_assert (src == stack_pointer_rtx
13626 || offset == const0_rtx);
13629 m->fs.sp_offset = ooffset - INTVAL (offset);
13630 m->fs.sp_valid = valid;
13631 m->fs.sp_realigned = realigned;
13635 /* Find an available register to be used as dynamic realign argument
13636 pointer register. Such a register will be written in the prologue and
13637 used at the beginning of the body, so it must not be
13638 1. parameter passing register.
13639 2. GOT pointer.
13640 We reuse static-chain register if it is available. Otherwise, we
13641 use DI for i386 and R13 for x86-64. We chose R13 since it has
13642 shorter encoding.
13644 Return: the regno of chosen register. */
13646 static unsigned int
13647 find_drap_reg (void)
13649 tree decl = cfun->decl;
13651 /* Always use callee-saved register if there are no caller-saved
13652 registers. */
13653 if (TARGET_64BIT)
13655 /* Use R13 for a nested function or a function that needs a static chain.
13656 Since a function with a tail call may use any caller-saved
13657 register in the epilogue, DRAP must not use a caller-saved
13658 register in such a case. */
13659 if (DECL_STATIC_CHAIN (decl)
13660 || cfun->machine->no_caller_saved_registers
13661 || crtl->tail_call_emit)
13662 return R13_REG;
13664 return R10_REG;
13666 else
13668 /* Use DI for a nested function or a function that needs a static chain.
13669 Since a function with a tail call may use any caller-saved
13670 register in the epilogue, DRAP must not use a caller-saved
13671 register in such a case. */
13672 if (DECL_STATIC_CHAIN (decl)
13673 || cfun->machine->no_caller_saved_registers
13674 || crtl->tail_call_emit)
13675 return DI_REG;
13677 /* Reuse static chain register if it isn't used for parameter
13678 passing. */
13679 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
13681 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
13682 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
13683 return CX_REG;
13685 return DI_REG;
13689 /* Handle a "force_align_arg_pointer" attribute. */
13691 static tree
13692 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
13693 tree, int, bool *no_add_attrs)
13695 if (TREE_CODE (*node) != FUNCTION_TYPE
13696 && TREE_CODE (*node) != METHOD_TYPE
13697 && TREE_CODE (*node) != FIELD_DECL
13698 && TREE_CODE (*node) != TYPE_DECL)
13700 warning (OPT_Wattributes, "%qE attribute only applies to functions",
13701 name);
13702 *no_add_attrs = true;
13705 return NULL_TREE;
13708 /* Return minimum incoming stack alignment. */
13710 static unsigned int
13711 ix86_minimum_incoming_stack_boundary (bool sibcall)
13713 unsigned int incoming_stack_boundary;
13715 /* The stack of an interrupt handler is aligned to 128 bits in 64-bit mode. */
13716 if (cfun->machine->func_type != TYPE_NORMAL)
13717 incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY;
13718 /* Prefer the one specified at command line. */
13719 else if (ix86_user_incoming_stack_boundary)
13720 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
13721 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
13722 if -mstackrealign is used, this isn't a sibcall check, and the
13723 estimated stack alignment is 128 bits. */
13724 else if (!sibcall
13725 && ix86_force_align_arg_pointer
13726 && crtl->stack_alignment_estimated == 128)
13727 incoming_stack_boundary = MIN_STACK_BOUNDARY;
13728 else
13729 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
13731 /* Incoming stack alignment can be changed on individual functions
13732 via force_align_arg_pointer attribute. We use the smallest
13733 incoming stack boundary. */
13734 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
13735 && lookup_attribute (ix86_force_align_arg_pointer_string,
13736 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
13737 incoming_stack_boundary = MIN_STACK_BOUNDARY;
13739 /* The incoming stack frame has to be aligned at least at
13740 parm_stack_boundary. */
13741 if (incoming_stack_boundary < crtl->parm_stack_boundary)
13742 incoming_stack_boundary = crtl->parm_stack_boundary;
13744 /* The stack at the entry to main is aligned by the runtime. We use the
13745 smallest incoming stack boundary. */
13746 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
13747 && DECL_NAME (current_function_decl)
13748 && MAIN_NAME_P (DECL_NAME (current_function_decl))
13749 && DECL_FILE_SCOPE_P (current_function_decl))
13750 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
13752 return incoming_stack_boundary;
13755 /* Update incoming stack boundary and estimated stack alignment. */
13757 static void
13758 ix86_update_stack_boundary (void)
13760 ix86_incoming_stack_boundary
13761 = ix86_minimum_incoming_stack_boundary (false);
13763 /* x86_64 varargs needs 16-byte stack alignment for the register save
13764 area. */
13765 if (TARGET_64BIT
13766 && cfun->stdarg
13767 && crtl->stack_alignment_estimated < 128)
13768 crtl->stack_alignment_estimated = 128;
13770 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
13771 if (ix86_tls_descriptor_calls_expanded_in_cfun
13772 && crtl->preferred_stack_boundary < 128)
13773 crtl->preferred_stack_boundary = 128;
13776 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
13777 needed or an rtx for DRAP otherwise. */
13779 static rtx
13780 ix86_get_drap_rtx (void)
13782 /* We must use DRAP if there are outgoing arguments on stack and
13783 ACCUMULATE_OUTGOING_ARGS is false. */
13784 if (ix86_force_drap
13785 || (cfun->machine->outgoing_args_on_stack
13786 && !ACCUMULATE_OUTGOING_ARGS))
13787 crtl->need_drap = true;
13789 if (stack_realign_drap)
13791 /* Assign DRAP to vDRAP and return vDRAP. */
13792 unsigned int regno = find_drap_reg ();
13793 rtx drap_vreg;
13794 rtx arg_ptr;
13795 rtx_insn *seq, *insn;
13797 arg_ptr = gen_rtx_REG (Pmode, regno);
13798 crtl->drap_reg = arg_ptr;
13800 start_sequence ();
13801 drap_vreg = copy_to_reg (arg_ptr);
13802 seq = get_insns ();
13803 end_sequence ();
13805 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
13806 if (!optimize)
13808 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
13809 RTX_FRAME_RELATED_P (insn) = 1;
13811 return drap_vreg;
13813 else
13814 return NULL;
13817 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
13819 static rtx
13820 ix86_internal_arg_pointer (void)
13822 return virtual_incoming_args_rtx;
13825 struct scratch_reg {
13826 rtx reg;
13827 bool saved;
13830 /* Return a short-lived scratch register for use on function entry.
13831 In 32-bit mode, it is valid only after the registers are saved
13832 in the prologue. This register must be released by means of
13833 release_scratch_register_on_entry once it is dead. */
13835 static void
13836 get_scratch_register_on_entry (struct scratch_reg *sr)
13838 int regno;
13840 sr->saved = false;
13842 if (TARGET_64BIT)
13844 /* We always use R11 in 64-bit mode. */
13845 regno = R11_REG;
13847 else
13849 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
13850 bool fastcall_p
13851 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13852 bool thiscall_p
13853 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
13854 bool static_chain_p = DECL_STATIC_CHAIN (decl);
13855 int regparm = ix86_function_regparm (fntype, decl);
13856 int drap_regno
13857 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
13859 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
13860 for the static chain register. */
13861 if ((regparm < 1 || (fastcall_p && !static_chain_p))
13862 && drap_regno != AX_REG)
13863 regno = AX_REG;
13864 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
13865 for the static chain register. */
13866 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
13867 regno = AX_REG;
13868 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
13869 regno = DX_REG;
13870 /* ecx is the static chain register. */
13871 else if (regparm < 3 && !fastcall_p && !thiscall_p
13872 && !static_chain_p
13873 && drap_regno != CX_REG)
13874 regno = CX_REG;
13875 else if (ix86_save_reg (BX_REG, true, false))
13876 regno = BX_REG;
13877 /* esi is the static chain register. */
13878 else if (!(regparm == 3 && static_chain_p)
13879 && ix86_save_reg (SI_REG, true, false))
13880 regno = SI_REG;
13881 else if (ix86_save_reg (DI_REG, true, false))
13882 regno = DI_REG;
13883 else
13885 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
13886 sr->saved = true;
13890 sr->reg = gen_rtx_REG (Pmode, regno);
13891 if (sr->saved)
13893 rtx_insn *insn = emit_insn (gen_push (sr->reg));
13894 RTX_FRAME_RELATED_P (insn) = 1;
13898 /* Release a scratch register obtained from the preceding function. */
13900 static void
13901 release_scratch_register_on_entry (struct scratch_reg *sr)
13903 if (sr->saved)
13905 struct machine_function *m = cfun->machine;
13906 rtx x, insn = emit_insn (gen_pop (sr->reg));
13908 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
13909 RTX_FRAME_RELATED_P (insn) = 1;
13910 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
13911 x = gen_rtx_SET (stack_pointer_rtx, x);
13912 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
13913 m->fs.sp_offset -= UNITS_PER_WORD;
13917 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
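/* With the usual STACK_CHECK_PROBE_INTERVAL_EXP of 12 this is 4096 bytes,
   i.e. one probe per page.  */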
13919 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
13921 static void
13922 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
13924 /* We skip the probe for the first interval + a small dope of 4 words and
13925 probe that many bytes past the specified size to maintain a protection
13926 area at the bottom of the stack. */
13927 const int dope = 4 * UNITS_PER_WORD;
13928 rtx size_rtx = GEN_INT (size), last;
13930 /* See if we have a constant small number of probes to generate. If so,
13931 that's the easy case. The run-time loop is made up of 9 insns in the
13932 generic case while the compile-time loop is made up of 3+2*(n-1) insns
13933 for n # of intervals. */
13934 if (size <= 4 * PROBE_INTERVAL)
13936 HOST_WIDE_INT i, adjust;
13937 bool first_probe = true;
13939 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
13940 values of N from 1 until it exceeds SIZE. If only one probe is
13941 needed, this will not generate any code. Then adjust and probe
13942 to PROBE_INTERVAL + SIZE. */
13943 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
13945 if (first_probe)
13947 adjust = 2 * PROBE_INTERVAL + dope;
13948 first_probe = false;
13950 else
13951 adjust = PROBE_INTERVAL;
13953 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13954 plus_constant (Pmode, stack_pointer_rtx,
13955 -adjust)));
13956 emit_stack_probe (stack_pointer_rtx);
13959 if (first_probe)
13960 adjust = size + PROBE_INTERVAL + dope;
13961 else
13962 adjust = size + PROBE_INTERVAL - i;
13964 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13965 plus_constant (Pmode, stack_pointer_rtx,
13966 -adjust)));
13967 emit_stack_probe (stack_pointer_rtx);
13969 /* Adjust back to account for the additional first interval. */
13970 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
13971 plus_constant (Pmode, stack_pointer_rtx,
13972 PROBE_INTERVAL + dope)));
13975 /* Otherwise, do the same as above, but in a loop. Note that we must be
13976 extra careful with variables wrapping around because we might be at
13977 the very top (or the very bottom) of the address space and we have
13978 to be able to handle this case properly; in particular, we use an
13979 equality test for the loop condition. */
13980 else
13982 HOST_WIDE_INT rounded_size;
13983 struct scratch_reg sr;
13985 get_scratch_register_on_entry (&sr);
13988 /* Step 1: round SIZE to the previous multiple of the interval. */
13990 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
13993 /* Step 2: compute initial and final value of the loop counter. */
13995 /* SP = SP_0 + PROBE_INTERVAL. */
13996 emit_insn (gen_rtx_SET (stack_pointer_rtx,
13997 plus_constant (Pmode, stack_pointer_rtx,
13998 - (PROBE_INTERVAL + dope))));
14000 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
14001 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
14002 emit_insn (gen_rtx_SET (sr.reg,
14003 plus_constant (Pmode, stack_pointer_rtx,
14004 -rounded_size)));
14005 else
14007 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
14008 emit_insn (gen_rtx_SET (sr.reg,
14009 gen_rtx_PLUS (Pmode, sr.reg,
14010 stack_pointer_rtx)));
14014 /* Step 3: the loop
14018 SP = SP + PROBE_INTERVAL
14019 probe at SP
14021 while (SP != LAST_ADDR)
14023 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
14024 values of N from 1 until it is equal to ROUNDED_SIZE. */
14026 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
14029 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
14030 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
14032 if (size != rounded_size)
14034 emit_insn (gen_rtx_SET (stack_pointer_rtx,
14035 plus_constant (Pmode, stack_pointer_rtx,
14036 rounded_size - size)));
14037 emit_stack_probe (stack_pointer_rtx);
14040 /* Adjust back to account for the additional first interval. */
14041 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
14042 plus_constant (Pmode, stack_pointer_rtx,
14043 PROBE_INTERVAL + dope)));
14045 release_scratch_register_on_entry (&sr);
14048 /* Even if the stack pointer isn't the CFA register, we need to correctly
14049 describe the adjustments made to it, in particular differentiate the
14050 frame-related ones from the frame-unrelated ones. */
14051 if (size > 0)
14053 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
14054 XVECEXP (expr, 0, 0)
14055 = gen_rtx_SET (stack_pointer_rtx,
14056 plus_constant (Pmode, stack_pointer_rtx, -size));
14057 XVECEXP (expr, 0, 1)
14058 = gen_rtx_SET (stack_pointer_rtx,
14059 plus_constant (Pmode, stack_pointer_rtx,
14060 PROBE_INTERVAL + dope + size));
14061 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
14062 RTX_FRAME_RELATED_P (last) = 1;
14064 cfun->machine->fs.sp_offset += size;
14067 /* Make sure nothing is scheduled before we are done. */
14068 emit_insn (gen_blockage ());
14071 /* Adjust the stack pointer up to REG while probing it. */
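/* The emitted loop is roughly (assuming a 4096-byte probe interval, 32-bit
   code and %eax holding LAST_ADDR):

	.LPSRL0:
		subl	$4096, %esp
		orl	$0, (%esp)
		cmpl	%eax, %esp
		jne	.LPSRL0					*/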
14073 const char *
14074 output_adjust_stack_and_probe (rtx reg)
14076 static int labelno = 0;
14077 char loop_lab[32];
14078 rtx xops[2];
14080 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
14082 /* Loop. */
14083 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
14085 /* SP = SP + PROBE_INTERVAL. */
14086 xops[0] = stack_pointer_rtx;
14087 xops[1] = GEN_INT (PROBE_INTERVAL);
14088 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
14090 /* Probe at SP. */
14091 xops[1] = const0_rtx;
14092 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
14094 /* Test if SP == LAST_ADDR. */
14095 xops[0] = stack_pointer_rtx;
14096 xops[1] = reg;
14097 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
14099 /* Branch. */
14100 fputs ("\tjne\t", asm_out_file);
14101 assemble_name_raw (asm_out_file, loop_lab);
14102 fputc ('\n', asm_out_file);
14104 return "";
14107 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
14108 inclusive. These are offsets from the current stack pointer. */
14110 static void
14111 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
14113 /* See if we have a constant small number of probes to generate. If so,
14114 that's the easy case. The run-time loop is made up of 6 insns in the
14115 generic case while the compile-time loop is made up of n insns for n #
14116 of intervals. */
14117 if (size <= 6 * PROBE_INTERVAL)
14119 HOST_WIDE_INT i;
14121 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
14122 it exceeds SIZE. If only one probe is needed, this will not
14123 generate any code. Then probe at FIRST + SIZE. */
14124 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
14125 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
14126 -(first + i)));
14128 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
14129 -(first + size)));
14132 /* Otherwise, do the same as above, but in a loop. Note that we must be
14133 extra careful with variables wrapping around because we might be at
14134 the very top (or the very bottom) of the address space and we have
14135 to be able to handle this case properly; in particular, we use an
14136 equality test for the loop condition. */
14137 else
14139 HOST_WIDE_INT rounded_size, last;
14140 struct scratch_reg sr;
14142 get_scratch_register_on_entry (&sr);
14145 /* Step 1: round SIZE to the previous multiple of the interval. */
14147 rounded_size = ROUND_DOWN (size, PROBE_INTERVAL);
14150 /* Step 2: compute initial and final value of the loop counter. */
14152 /* TEST_OFFSET = FIRST. */
14153 emit_move_insn (sr.reg, GEN_INT (-first));
14155 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
14156 last = first + rounded_size;
14159 /* Step 3: the loop
14163 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
14164 probe at TEST_ADDR
14166 while (TEST_ADDR != LAST_ADDR)
14168 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
14169 until it is equal to ROUNDED_SIZE. */
14171 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
14174 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
14175 that SIZE is equal to ROUNDED_SIZE. */
14177 if (size != rounded_size)
14178 emit_stack_probe (plus_constant (Pmode,
14179 gen_rtx_PLUS (Pmode,
14180 stack_pointer_rtx,
14181 sr.reg),
14182 rounded_size - size));
14184 release_scratch_register_on_entry (&sr);
14187 /* Make sure nothing is scheduled before we are done. */
14188 emit_insn (gen_blockage ());
14191 /* Probe a range of stack addresses from REG to END, inclusive. These are
14192 offsets from the current stack pointer. */
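/* The emitted loop is roughly (assuming a 4096-byte probe interval, 32-bit
   code, %eax as TEST_OFFSET and -(FIRST + ROUNDED_SIZE) being -16384):

	.LPSRL1:
		subl	$4096, %eax
		orl	$0, (%esp,%eax)
		cmpl	$-16384, %eax
		jne	.LPSRL1					*/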
14194 const char *
14195 output_probe_stack_range (rtx reg, rtx end)
14197 static int labelno = 0;
14198 char loop_lab[32];
14199 rtx xops[3];
14201 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
14203 /* Loop. */
14204 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
14206 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
14207 xops[0] = reg;
14208 xops[1] = GEN_INT (PROBE_INTERVAL);
14209 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
14211 /* Probe at TEST_ADDR. */
14212 xops[0] = stack_pointer_rtx;
14213 xops[1] = reg;
14214 xops[2] = const0_rtx;
14215 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
14217 /* Test if TEST_ADDR == LAST_ADDR. */
14218 xops[0] = reg;
14219 xops[1] = end;
14220 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
14222 /* Branch. */
14223 fputs ("\tjne\t", asm_out_file);
14224 assemble_name_raw (asm_out_file, loop_lab);
14225 fputc ('\n', asm_out_file);
14227 return "";
14230 /* Finalize stack_realign_needed and frame_pointer_needed flags, which
14231 will guide prologue/epilogue to be generated in correct form. */
14233 static void
14234 ix86_finalize_stack_frame_flags (void)
14236 /* Check if stack realignment is really needed after reload, and
14237 store the result in cfun. */
14238 unsigned int incoming_stack_boundary
14239 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
14240 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
14241 unsigned int stack_realign
14242 = (incoming_stack_boundary
14243 < (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
14244 ? crtl->max_used_stack_slot_alignment
14245 : crtl->stack_alignment_needed));
14246 bool recompute_frame_layout_p = false;
14248 if (crtl->stack_realign_finalized)
14250 /* After stack_realign_needed is finalized, we can no longer
14251 change it. */
14252 gcc_assert (crtl->stack_realign_needed == stack_realign);
14253 return;
14256 /* If the only reason for frame_pointer_needed is that we conservatively
14257 assumed stack realignment might be needed or -fno-omit-frame-pointer
14258 is used, but in the end nothing that needed the stack alignment had
14259 been spilled and there was no stack access, clear frame_pointer_needed
14260 and say we don't need stack realignment. */
14261 if ((stack_realign || !flag_omit_frame_pointer)
14262 && frame_pointer_needed
14263 && crtl->is_leaf
14264 && crtl->sp_is_unchanging
14265 && !ix86_current_function_calls_tls_descriptor
14266 && !crtl->accesses_prior_frames
14267 && !cfun->calls_alloca
14268 && !crtl->calls_eh_return
14269 /* See ira_setup_eliminable_regset for the rationale. */
14270 && !(STACK_CHECK_MOVING_SP
14271 && flag_stack_check
14272 && flag_exceptions
14273 && cfun->can_throw_non_call_exceptions)
14274 && !ix86_frame_pointer_required ()
14275 && get_frame_size () == 0
14276 && ix86_nsaved_sseregs () == 0
14277 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
14279 HARD_REG_SET set_up_by_prologue, prologue_used;
14280 basic_block bb;
14282 CLEAR_HARD_REG_SET (prologue_used);
14283 CLEAR_HARD_REG_SET (set_up_by_prologue);
14284 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
14285 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
14286 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
14287 HARD_FRAME_POINTER_REGNUM);
14288 FOR_EACH_BB_FN (bb, cfun)
14290 rtx_insn *insn;
14291 FOR_BB_INSNS (bb, insn)
14292 if (NONDEBUG_INSN_P (insn)
14293 && requires_stack_frame_p (insn, prologue_used,
14294 set_up_by_prologue))
14296 if (crtl->stack_realign_needed != stack_realign)
14297 recompute_frame_layout_p = true;
14298 crtl->stack_realign_needed = stack_realign;
14299 crtl->stack_realign_finalized = true;
14300 if (recompute_frame_layout_p)
14301 ix86_compute_frame_layout ();
14302 return;
14306 /* If drap has been set, but it actually isn't live at the start
14307 of the function, there is no reason to set it up. */
14308 if (crtl->drap_reg)
14310 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
14311 if (! REGNO_REG_SET_P (DF_LR_IN (bb), REGNO (crtl->drap_reg)))
14313 crtl->drap_reg = NULL_RTX;
14314 crtl->need_drap = false;
14317 else
14318 cfun->machine->no_drap_save_restore = true;
14320 frame_pointer_needed = false;
14321 stack_realign = false;
14322 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
14323 crtl->stack_alignment_needed = incoming_stack_boundary;
14324 crtl->stack_alignment_estimated = incoming_stack_boundary;
14325 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
14326 crtl->preferred_stack_boundary = incoming_stack_boundary;
14327 df_finish_pass (true);
14328 df_scan_alloc (NULL);
14329 df_scan_blocks ();
14330 df_compute_regs_ever_live (true);
14331 df_analyze ();
14333 if (flag_var_tracking)
14335 /* Since frame pointer is no longer available, replace it with
14336 stack pointer - UNITS_PER_WORD in debug insns. */
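/* E.g. on a 64-bit target a debug location that was (reg:DI bp) is
   rewritten below to (plus:DI (reg:DI sp) (const_int -8)), since
   UNITS_PER_WORD is 8 there.  */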
14337 df_ref ref, next;
14338 for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM);
14339 ref; ref = next)
14341 rtx_insn *insn = DF_REF_INSN (ref);
14342 /* Make sure the next ref is for a different instruction,
14343 so that we're not affected by the rescan. */
14344 next = DF_REF_NEXT_REG (ref);
14345 while (next && DF_REF_INSN (next) == insn)
14346 next = DF_REF_NEXT_REG (next);
14348 if (DEBUG_INSN_P (insn))
14350 bool changed = false;
14351 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
14353 rtx *loc = DF_REF_LOC (ref);
14354 if (*loc == hard_frame_pointer_rtx)
14356 *loc = plus_constant (Pmode,
14357 stack_pointer_rtx,
14358 -UNITS_PER_WORD);
14359 changed = true;
14362 if (changed)
14363 df_insn_rescan (insn);
14368 recompute_frame_layout_p = true;
14371 if (crtl->stack_realign_needed != stack_realign)
14372 recompute_frame_layout_p = true;
14373 crtl->stack_realign_needed = stack_realign;
14374 crtl->stack_realign_finalized = true;
14375 if (recompute_frame_layout_p)
14376 ix86_compute_frame_layout ();
14379 /* Delete SET_GOT right after entry block if it is allocated to reg. */
14381 static void
14382 ix86_elim_entry_set_got (rtx reg)
14384 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
14385 rtx_insn *c_insn = BB_HEAD (bb);
14386 if (!NONDEBUG_INSN_P (c_insn))
14387 c_insn = next_nonnote_nondebug_insn (c_insn);
14388 if (c_insn && NONJUMP_INSN_P (c_insn))
14390 rtx pat = PATTERN (c_insn);
14391 if (GET_CODE (pat) == PARALLEL)
14393 rtx vec = XVECEXP (pat, 0, 0);
14394 if (GET_CODE (vec) == SET
14395 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
14396 && REGNO (XEXP (vec, 0)) == REGNO (reg))
14397 delete_insn (c_insn);
14402 static rtx
14403 gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store)
14405 rtx addr, mem;
14407 if (offset)
14408 addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset));
14409 mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg);
14410 return gen_rtx_SET (store ? mem : reg, store ? reg : mem);
14413 static inline rtx
14414 gen_frame_load (rtx reg, rtx frame_reg, int offset)
14416 return gen_frame_set (reg, frame_reg, offset, false);
14419 static inline rtx
14420 gen_frame_store (rtx reg, rtx frame_reg, int offset)
14422 return gen_frame_set (reg, frame_reg, offset, true);
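/* Rough illustration (register, base and offset are arbitrary examples):
   a gen_frame_store of %rsi relative to %rax at offset -16 builds
	(set (mem:DI (plus:DI (reg:DI ax) (const_int -16))) (reg:DI si))
   and gen_frame_load builds the same pattern with source and destination
   swapped.  */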
14425 static void
14426 ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame)
14428 struct machine_function *m = cfun->machine;
14429 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
14430 + m->call_ms2sysv_extra_regs;
14431 rtvec v = rtvec_alloc (ncregs + 1);
14432 unsigned int align, i, vi = 0;
14433 rtx_insn *insn;
14434 rtx sym, addr;
14435 rtx rax = gen_rtx_REG (word_mode, AX_REG);
14436 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
14437 HOST_WIDE_INT allocate = frame.stack_pointer_offset - m->fs.sp_offset;
14439 /* AL should only be live with sysv_abi. */
14440 gcc_assert (!ix86_eax_live_at_start_p ());
14442 /* Setup RAX as the stub's base pointer. We use stack_realign_offset whether
14443 or not we've actually realigned the stack. */
14444 align = GET_MODE_ALIGNMENT (V4SFmode);
14445 addr = choose_baseaddr (frame.stack_realign_offset
14446 + xlogue.get_stub_ptr_offset (), &align);
14447 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
14448 emit_insn (gen_rtx_SET (rax, addr));
14450 /* Allocate stack if not already done. */
14451 if (allocate > 0)
14452 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14453 GEN_INT (-allocate), -1, false);
14455 /* Get the stub symbol. */
14456 sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP
14457 : XLOGUE_STUB_SAVE);
14458 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
14460 for (i = 0; i < ncregs; ++i)
14462 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
14463 rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode),
14464 r.regno);
14465 RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset);
14468 gcc_assert (vi == (unsigned)GET_NUM_ELEM (v));
14470 insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v));
14471 RTX_FRAME_RELATED_P (insn) = true;
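/* Sketch of the insn built above (stub name and registers illustrative):
	(parallel [(use (symbol_ref "__savms64_..."))
		   (set (mem (plus (reg ax) (const_int ...))) (reg si))
		   ...])
   The matching pattern is later output as a call to the corresponding
   out-of-line save stub in libgcc.  */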
14474 /* Expand the prologue into a bunch of separate insns. */
14476 void
14477 ix86_expand_prologue (void)
14479 struct machine_function *m = cfun->machine;
14480 rtx insn, t;
14481 struct ix86_frame frame;
14482 HOST_WIDE_INT allocate;
14483 bool int_registers_saved;
14484 bool sse_registers_saved;
14485 rtx static_chain = NULL_RTX;
14487 if (ix86_function_naked (current_function_decl))
14488 return;
14490 ix86_finalize_stack_frame_flags ();
14492 /* DRAP should not coexist with stack_realign_fp */
14493 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
14495 memset (&m->fs, 0, sizeof (m->fs));
14497 /* Initialize CFA state for before the prologue. */
14498 m->fs.cfa_reg = stack_pointer_rtx;
14499 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
14501 /* Track SP offset to the CFA. We continue tracking this after we've
14502 swapped the CFA register away from SP. In the case of re-alignment
14503 this is fudged; we're interested in offsets within the local frame. */
14504 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
14505 m->fs.sp_valid = true;
14506 m->fs.sp_realigned = false;
14508 frame = m->frame;
14510 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
14512 /* We should have already generated an error for any use of
14513 ms_hook on a nested function. */
14514 gcc_checking_assert (!ix86_static_chain_on_stack);
14516 /* Check if profiling is active and we shall use the profiling-before-
14517 prologue variant. If so, sorry. */
14518 if (crtl->profile && flag_fentry != 0)
14519 sorry ("ms_hook_prologue attribute isn%'t compatible "
14520 "with -mfentry for 32-bit");
14522 /* In ix86_asm_output_function_label we emitted:
14523 8b ff movl.s %edi,%edi
14524 55 push %ebp
14525 8b ec movl.s %esp,%ebp
14527 This matches the hookable function prologue in Win32 API
14528 functions in Microsoft Windows XP Service Pack 2 and newer.
14529 Wine uses this to enable Windows apps to hook the Win32 API
14530 functions provided by Wine.
14532 What that means is that we've already set up the frame pointer. */
14534 if (frame_pointer_needed
14535 && !(crtl->drap_reg && crtl->stack_realign_needed))
14537 rtx push, mov;
14539 /* We've decided to use the frame pointer already set up.
14540 Describe this to the unwinder by pretending that both
14541 push and mov insns happen right here.
14543 Putting the unwind info here at the end of the ms_hook
14544 is done so that we can make absolutely certain we get
14545 the required byte sequence at the start of the function,
14546 rather than relying on an assembler that can produce
14547 the exact encoding required.
14549 However it does mean (in the unpatched case) that we have
14550 a 1 insn window where the asynchronous unwind info is
14551 incorrect. However, if we placed the unwind info at
14552 its correct location we would have incorrect unwind info
14553 in the patched case. Which is probably all moot since
14554 I don't expect Wine generates dwarf2 unwind info for the
14555 system libraries that use this feature. */
14557 insn = emit_insn (gen_blockage ());
14559 push = gen_push (hard_frame_pointer_rtx);
14560 mov = gen_rtx_SET (hard_frame_pointer_rtx,
14561 stack_pointer_rtx);
14562 RTX_FRAME_RELATED_P (push) = 1;
14563 RTX_FRAME_RELATED_P (mov) = 1;
14565 RTX_FRAME_RELATED_P (insn) = 1;
14566 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14567 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
14569 /* Note that gen_push incremented m->fs.cfa_offset, even
14570 though we didn't emit the push insn here. */
14571 m->fs.cfa_reg = hard_frame_pointer_rtx;
14572 m->fs.fp_offset = m->fs.cfa_offset;
14573 m->fs.fp_valid = true;
14575 else
14577 /* The frame pointer is not needed so pop %ebp again.
14578 This leaves us with a pristine state. */
14579 emit_insn (gen_pop (hard_frame_pointer_rtx));
14583 /* The first insn of a function that accepts its static chain on the
14584 stack is to push the register that would be filled in by a direct
14585 call. This insn will be skipped by the trampoline. */
14586 else if (ix86_static_chain_on_stack)
14588 static_chain = ix86_static_chain (cfun->decl, false);
14589 insn = emit_insn (gen_push (static_chain));
14590 emit_insn (gen_blockage ());
14592 /* We don't want to interpret this push insn as a register save,
14593 only as a stack adjustment. The real copy of the register as
14594 a save will be done later, if needed. */
14595 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
14596 t = gen_rtx_SET (stack_pointer_rtx, t);
14597 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
14598 RTX_FRAME_RELATED_P (insn) = 1;
14601 /* Emit prologue code to adjust stack alignment and setup DRAP, in case
14602 DRAP is needed and stack realignment is really needed after reload. */
14603 if (stack_realign_drap)
14605 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
14607 /* Can't use DRAP in interrupt function. */
14608 if (cfun->machine->func_type != TYPE_NORMAL)
14609 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
14610 "in interrupt service routine. This may be worked "
14611 "around by avoiding functions with aggregate return.");
14613 /* Only need to push parameter pointer reg if it is caller saved. */
14614 if (!call_used_regs[REGNO (crtl->drap_reg)])
14616 /* Push arg pointer reg */
14617 insn = emit_insn (gen_push (crtl->drap_reg));
14618 RTX_FRAME_RELATED_P (insn) = 1;
14621 /* Grab the argument pointer. */
14622 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
14623 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14624 RTX_FRAME_RELATED_P (insn) = 1;
14625 m->fs.cfa_reg = crtl->drap_reg;
14626 m->fs.cfa_offset = 0;
14628 /* Align the stack. */
14629 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
14630 stack_pointer_rtx,
14631 GEN_INT (-align_bytes)));
14632 RTX_FRAME_RELATED_P (insn) = 1;
14634 /* Replicate the return address on the stack so that return
14635 address can be reached via (argp - 1) slot. This is needed
14636 to implement macro RETURN_ADDR_RTX and intrinsic function
14637 expand_builtin_return_addr etc. */
14638 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
14639 t = gen_frame_mem (word_mode, t);
14640 insn = emit_insn (gen_push (t));
14641 RTX_FRAME_RELATED_P (insn) = 1;
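/* Illustrative 64-bit sequence for this block, assuming no static chain
   push, a call-clobbered DRAP register (%r10 here) and a 32-byte
   alignment requirement:
	leaq	8(%rsp), %r10	# grab the argument pointer
	andq	$-32, %rsp	# align the stack
	pushq	-8(%r10)	# replicate the return address
   Registers and constants are examples only.  */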
14643 /* For the purposes of frame and register save area addressing,
14644 we've started over with a new frame. */
14645 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
14646 m->fs.realigned = true;
14648 if (static_chain)
14650 /* Replicate static chain on the stack so that static chain
14651 can be reached via (argp - 2) slot. This is needed for
14652 nested function with stack realignment. */
14653 insn = emit_insn (gen_push (static_chain));
14654 RTX_FRAME_RELATED_P (insn) = 1;
14658 int_registers_saved = (frame.nregs == 0);
14659 sse_registers_saved = (frame.nsseregs == 0);
14661 if (frame_pointer_needed && !m->fs.fp_valid)
14663 /* Note: AT&T enter does NOT have reversed args. Enter is probably
14664 slower on all targets. Also sdb doesn't like it. */
14665 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
14666 RTX_FRAME_RELATED_P (insn) = 1;
14668 /* Push registers now, before setting the frame pointer
14669 on SEH target. */
14670 if (!int_registers_saved
14671 && TARGET_SEH
14672 && !frame.save_regs_using_mov)
14674 ix86_emit_save_regs ();
14675 int_registers_saved = true;
14676 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
14679 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
14681 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
14682 RTX_FRAME_RELATED_P (insn) = 1;
14684 if (m->fs.cfa_reg == stack_pointer_rtx)
14685 m->fs.cfa_reg = hard_frame_pointer_rtx;
14686 m->fs.fp_offset = m->fs.sp_offset;
14687 m->fs.fp_valid = true;
14691 if (!int_registers_saved)
14693 /* If saving registers via PUSH, do so now. */
14694 if (!frame.save_regs_using_mov)
14696 ix86_emit_save_regs ();
14697 int_registers_saved = true;
14698 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
14701 /* When using the red zone we may start register saving before allocating
14702 the stack frame, saving one cycle of the prologue. However, avoid
14703 doing this if we have to probe the stack; at least on x86_64 the
14704 stack probe can turn into a call that clobbers a red zone location. */
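/* For illustration (offset is an example): on x86-64 the 128-byte red zone
   below %rsp is not clobbered asynchronously, so a save such as
	movq	%rbx, -16(%rsp)
   emitted here is safe even though the frame has not been allocated yet.  */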
14705 else if (ix86_using_red_zone ()
14706 && (! TARGET_STACK_PROBE
14707 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
14709 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
14710 int_registers_saved = true;
14714 if (stack_realign_fp)
14716 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
14717 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
14719 /* Record last valid frame pointer offset. */
14720 m->fs.sp_realigned_fp_last = frame.reg_save_offset;
14722 /* The computation of the size of the re-aligned stack frame means
14723 that we must allocate the size of the register save area before
14724 performing the actual alignment. Otherwise we cannot guarantee
14725 that there's enough storage above the realignment point. */
14726 allocate = frame.reg_save_offset - m->fs.sp_offset
14727 + frame.stack_realign_allocate;
14728 if (allocate)
14729 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14730 GEN_INT (-allocate), -1, false);
14732 /* Align the stack. */
14733 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
14734 stack_pointer_rtx,
14735 GEN_INT (-align_bytes)));
14736 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
14737 m->fs.sp_realigned_offset = m->fs.sp_offset
14738 - frame.stack_realign_allocate;
14739 /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset.
14740 Beyond this point, stack access should be done via choose_baseaddr or
14741 by using sp_valid_at and fp_valid_at to determine the correct base
14742 register. Henceforth, any CFA offset should be thought of as logical
14743 and not physical. */
14744 gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last);
14745 gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset);
14746 m->fs.sp_realigned = true;
14748 /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which
14749 is needed to describe where a register is saved using a realigned
14750 stack pointer, so we need to invalidate the stack pointer for that
14751 target. */
14752 if (TARGET_SEH)
14753 m->fs.sp_valid = false;
14756 if (m->call_ms2sysv)
14757 ix86_emit_outlined_ms2sysv_save (frame);
14759 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
14761 if (flag_stack_usage_info)
14763 /* We start to count from ARG_POINTER. */
14764 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
14766 /* If it was realigned, take into account the fake frame. */
14767 if (stack_realign_drap)
14769 if (ix86_static_chain_on_stack)
14770 stack_size += UNITS_PER_WORD;
14772 if (!call_used_regs[REGNO (crtl->drap_reg)])
14773 stack_size += UNITS_PER_WORD;
14775 /* This over-estimates by 1 minimal-stack-alignment-unit but
14776 mitigates that by counting in the new return address slot. */
14777 current_function_dynamic_stack_size
14778 += crtl->stack_alignment_needed / BITS_PER_UNIT;
14781 current_function_static_stack_size = stack_size;
14784 /* On SEH target with very large frame size, allocate an area to save
14785 SSE registers (as the very large allocation won't be described). */
14786 if (TARGET_SEH
14787 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
14788 && !sse_registers_saved)
14790 HOST_WIDE_INT sse_size =
14791 frame.sse_reg_save_offset - frame.reg_save_offset;
14793 gcc_assert (int_registers_saved);
14795 /* No need to do stack checking as the area will be immediately
14796 written. */
14797 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14798 GEN_INT (-sse_size), -1,
14799 m->fs.cfa_reg == stack_pointer_rtx);
14800 allocate -= sse_size;
14801 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
14802 sse_registers_saved = true;
14805 /* The stack has already been decremented by the instruction calling us
14806 so probe if the size is non-negative to preserve the protection area. */
14807 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
14809 /* We expect the GP registers to be saved when probes are used. */
14810 gcc_assert (int_registers_saved);
14812 if (STACK_CHECK_MOVING_SP)
14814 if (!(crtl->is_leaf && !cfun->calls_alloca
14815 && allocate <= PROBE_INTERVAL))
14817 ix86_adjust_stack_and_probe (allocate);
14818 allocate = 0;
14821 else
14823 HOST_WIDE_INT size = allocate;
14825 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
14826 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
14828 if (TARGET_STACK_PROBE)
14830 if (crtl->is_leaf && !cfun->calls_alloca)
14832 if (size > PROBE_INTERVAL)
14833 ix86_emit_probe_stack_range (0, size);
14835 else
14836 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
14838 else
14840 if (crtl->is_leaf && !cfun->calls_alloca)
14842 if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT)
14843 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT,
14844 size - STACK_CHECK_PROTECT);
14846 else
14847 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
14852 if (allocate == 0)
14854 else if (!ix86_target_stack_probe ()
14855 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
14857 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14858 GEN_INT (-allocate), -1,
14859 m->fs.cfa_reg == stack_pointer_rtx);
14861 else
14863 rtx eax = gen_rtx_REG (Pmode, AX_REG);
14864 rtx r10 = NULL;
14865 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
14866 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
14867 bool eax_live = ix86_eax_live_at_start_p ();
14868 bool r10_live = false;
14870 if (TARGET_64BIT)
14871 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
14873 if (eax_live)
14875 insn = emit_insn (gen_push (eax));
14876 allocate -= UNITS_PER_WORD;
14877 /* Note that SEH directives need to continue tracking the stack
14878 pointer even after the frame pointer has been set up. */
14879 if (sp_is_cfa_reg || TARGET_SEH)
14881 if (sp_is_cfa_reg)
14882 m->fs.cfa_offset += UNITS_PER_WORD;
14883 RTX_FRAME_RELATED_P (insn) = 1;
14884 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14885 gen_rtx_SET (stack_pointer_rtx,
14886 plus_constant (Pmode, stack_pointer_rtx,
14887 -UNITS_PER_WORD)));
14891 if (r10_live)
14893 r10 = gen_rtx_REG (Pmode, R10_REG);
14894 insn = emit_insn (gen_push (r10));
14895 allocate -= UNITS_PER_WORD;
14896 if (sp_is_cfa_reg || TARGET_SEH)
14898 if (sp_is_cfa_reg)
14899 m->fs.cfa_offset += UNITS_PER_WORD;
14900 RTX_FRAME_RELATED_P (insn) = 1;
14901 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14902 gen_rtx_SET (stack_pointer_rtx,
14903 plus_constant (Pmode, stack_pointer_rtx,
14904 -UNITS_PER_WORD)));
14908 emit_move_insn (eax, GEN_INT (allocate));
14909 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
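/* On targets taking this path the worker insn is typically output as a call
   to a stack-probing helper (e.g. ___chkstk_ms on MinGW; the name depends on
   the target and is illustrative here).  The helper only probes and leaves
   the size in %eax/%rax, which is why the explicit subtraction from the
   stack pointer follows below.  */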
14911 /* Use the fact that AX still contains ALLOCATE. */
14912 adjust_stack_insn = (Pmode == DImode
14913 ? gen_pro_epilogue_adjust_stack_di_sub
14914 : gen_pro_epilogue_adjust_stack_si_sub);
14916 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
14917 stack_pointer_rtx, eax));
14919 if (sp_is_cfa_reg || TARGET_SEH)
14921 if (sp_is_cfa_reg)
14922 m->fs.cfa_offset += allocate;
14923 RTX_FRAME_RELATED_P (insn) = 1;
14924 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
14925 gen_rtx_SET (stack_pointer_rtx,
14926 plus_constant (Pmode, stack_pointer_rtx,
14927 -allocate)));
14929 m->fs.sp_offset += allocate;
14931 /* Use stack_pointer_rtx for relative addressing so that code
14932 works for realigned stack, too. */
14933 if (r10_live && eax_live)
14935 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14936 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
14937 gen_frame_mem (word_mode, t));
14938 t = plus_constant (Pmode, t, UNITS_PER_WORD);
14939 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
14940 gen_frame_mem (word_mode, t));
14942 else if (eax_live || r10_live)
14944 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
14945 emit_move_insn (gen_rtx_REG (word_mode,
14946 (eax_live ? AX_REG : R10_REG)),
14947 gen_frame_mem (word_mode, t));
14950 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
14952 /* If we haven't already set up the frame pointer, do so now. */
14953 if (frame_pointer_needed && !m->fs.fp_valid)
14955 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
14956 GEN_INT (frame.stack_pointer_offset
14957 - frame.hard_frame_pointer_offset));
14958 insn = emit_insn (insn);
14959 RTX_FRAME_RELATED_P (insn) = 1;
14960 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
14962 if (m->fs.cfa_reg == stack_pointer_rtx)
14963 m->fs.cfa_reg = hard_frame_pointer_rtx;
14964 m->fs.fp_offset = frame.hard_frame_pointer_offset;
14965 m->fs.fp_valid = true;
14968 if (!int_registers_saved)
14969 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
14970 if (!sse_registers_saved)
14971 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
14973 /* For mcount profiling in 32-bit PIC mode we need to emit SET_GOT
14974 in the prologue. */
14975 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
14977 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
14978 insn = emit_insn (gen_set_got (pic));
14979 RTX_FRAME_RELATED_P (insn) = 1;
14980 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
14981 emit_insn (gen_prologue_use (pic));
14982 /* Delete the already emitted SET_GOT if it exists and is allocated to
14983 REAL_PIC_OFFSET_TABLE_REGNUM. */
14984 ix86_elim_entry_set_got (pic);
14987 if (crtl->drap_reg && !crtl->stack_realign_needed)
14989 /* vDRAP is set up, but after reload it turns out stack realignment
14990 isn't necessary; here we emit prologue code to set up DRAP
14991 without the stack realignment adjustment. */
14992 t = choose_baseaddr (0, NULL);
14993 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
14996 /* Prevent instructions from being scheduled into register save push
14997 sequence when access to the redzone area is done through frame pointer.
14998 The offset between the frame pointer and the stack pointer is calculated
14999 relative to the value of the stack pointer at the end of the function
15000 prologue, and moving instructions that access redzone area via frame
15001 pointer inside push sequence violates this assumption. */
15002 if (frame_pointer_needed && frame.red_zone_size)
15003 emit_insn (gen_memory_blockage ());
15005 /* SEH requires that the prologue end within 256 bytes of the start of
15006 the function. Prevent instruction schedules that would extend that.
15007 Further, prevent alloca modifications to the stack pointer from being
15008 combined with prologue modifications. */
15009 if (TARGET_SEH)
15010 emit_insn (gen_prologue_use (stack_pointer_rtx));
15013 /* Emit code to restore REG using a POP insn. */
15015 static void
15016 ix86_emit_restore_reg_using_pop (rtx reg)
15018 struct machine_function *m = cfun->machine;
15019 rtx_insn *insn = emit_insn (gen_pop (reg));
15021 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
15022 m->fs.sp_offset -= UNITS_PER_WORD;
15024 if (m->fs.cfa_reg == crtl->drap_reg
15025 && REGNO (reg) == REGNO (crtl->drap_reg))
15027 /* Previously we'd represented the CFA as an expression
15028 like *(%ebp - 8). We've just popped that value from
15029 the stack, which means we need to reset the CFA to
15030 the drap register. This will remain until we restore
15031 the stack pointer. */
15032 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
15033 RTX_FRAME_RELATED_P (insn) = 1;
15035 /* This means that the DRAP register is valid for addressing too. */
15036 m->fs.drap_valid = true;
15037 return;
15040 if (m->fs.cfa_reg == stack_pointer_rtx)
15042 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
15043 x = gen_rtx_SET (stack_pointer_rtx, x);
15044 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
15045 RTX_FRAME_RELATED_P (insn) = 1;
15047 m->fs.cfa_offset -= UNITS_PER_WORD;
15050 /* When the frame pointer is the CFA, and we pop it, we are
15051 swapping back to the stack pointer as the CFA. This happens
15052 for stack frames that don't allocate other data, so we assume
15053 the stack pointer is now pointing at the return address, i.e.
15054 the function entry state, which makes the offset be 1 word. */
15055 if (reg == hard_frame_pointer_rtx)
15057 m->fs.fp_valid = false;
15058 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
15060 m->fs.cfa_reg = stack_pointer_rtx;
15061 m->fs.cfa_offset -= UNITS_PER_WORD;
15063 add_reg_note (insn, REG_CFA_DEF_CFA,
15064 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15065 GEN_INT (m->fs.cfa_offset)));
15066 RTX_FRAME_RELATED_P (insn) = 1;
15071 /* Emit code to restore saved registers using POP insns. */
15073 static void
15074 ix86_emit_restore_regs_using_pop (void)
15076 unsigned int regno;
15078 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
15079 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
15080 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
15083 /* Emit code and notes for the LEAVE instruction. If INSN is non-null,
15084 the emit is omitted and only the notes are attached. */
15086 static void
15087 ix86_emit_leave (rtx_insn *insn)
15089 struct machine_function *m = cfun->machine;
15090 if (!insn)
15091 insn = emit_insn (ix86_gen_leave ());
15093 ix86_add_queued_cfa_restore_notes (insn);
15095 gcc_assert (m->fs.fp_valid);
15096 m->fs.sp_valid = true;
15097 m->fs.sp_realigned = false;
15098 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
15099 m->fs.fp_valid = false;
15101 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
15103 m->fs.cfa_reg = stack_pointer_rtx;
15104 m->fs.cfa_offset = m->fs.sp_offset;
15106 add_reg_note (insn, REG_CFA_DEF_CFA,
15107 plus_constant (Pmode, stack_pointer_rtx,
15108 m->fs.sp_offset));
15109 RTX_FRAME_RELATED_P (insn) = 1;
15111 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
15112 m->fs.fp_offset);
15115 /* Emit code to restore saved registers using MOV insns.
15116 First register is restored from CFA - CFA_OFFSET. */
15117 static void
15118 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
15119 bool maybe_eh_return)
15121 struct machine_function *m = cfun->machine;
15122 unsigned int regno;
15124 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
15125 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
15127 rtx reg = gen_rtx_REG (word_mode, regno);
15128 rtx mem;
15129 rtx_insn *insn;
15131 mem = choose_baseaddr (cfa_offset, NULL);
15132 mem = gen_frame_mem (word_mode, mem);
15133 insn = emit_move_insn (reg, mem);
15135 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
15137 /* Previously we'd represented the CFA as an expression
15138 like *(%ebp - 8). We've just popped that value from
15139 the stack, which means we need to reset the CFA to
15140 the drap register. This will remain until we restore
15141 the stack pointer. */
15142 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
15143 RTX_FRAME_RELATED_P (insn) = 1;
15145 /* This means that the DRAP register is valid for addressing. */
15146 m->fs.drap_valid = true;
15148 else
15149 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
15151 cfa_offset -= UNITS_PER_WORD;
15155 /* Emit code to restore saved registers using MOV insns.
15156 First register is restored from CFA - CFA_OFFSET. */
15157 static void
15158 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
15159 bool maybe_eh_return)
15161 unsigned int regno;
15163 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
15164 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
15166 rtx reg = gen_rtx_REG (V4SFmode, regno);
15167 rtx mem;
15168 unsigned int align = GET_MODE_ALIGNMENT (V4SFmode);
15170 mem = choose_baseaddr (cfa_offset, &align);
15171 mem = gen_rtx_MEM (V4SFmode, mem);
15173 /* The location alignment depends upon the base register. */
15174 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align);
15175 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
15176 set_mem_align (mem, align);
15177 emit_insn (gen_rtx_SET (reg, mem));
15179 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
15181 cfa_offset -= GET_MODE_SIZE (V4SFmode);
15185 static void
15186 ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame,
15187 bool use_call, int style)
15189 struct machine_function *m = cfun->machine;
15190 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
15191 + m->call_ms2sysv_extra_regs;
15192 rtvec v;
15193 unsigned int elems_needed, align, i, vi = 0;
15194 rtx_insn *insn;
15195 rtx sym, tmp;
15196 rtx rsi = gen_rtx_REG (word_mode, SI_REG);
15197 rtx r10 = NULL_RTX;
15198 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
15199 HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset ();
15200 HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset;
15201 rtx rsi_frame_load = NULL_RTX;
15202 HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1;
15203 enum xlogue_stub stub;
15205 gcc_assert (!m->fs.fp_valid || frame_pointer_needed);
15207 /* If using a realigned stack, we should never start with padding. */
15208 gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ());
15210 /* Setup RSI as the stub's base pointer. */
15211 align = GET_MODE_ALIGNMENT (V4SFmode);
15212 tmp = choose_baseaddr (rsi_offset, &align);
15213 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
15214 emit_insn (gen_rtx_SET (rsi, tmp));
15216 /* Get a symbol for the stub. */
15217 if (frame_pointer_needed)
15218 stub = use_call ? XLOGUE_STUB_RESTORE_HFP
15219 : XLOGUE_STUB_RESTORE_HFP_TAIL;
15220 else
15221 stub = use_call ? XLOGUE_STUB_RESTORE
15222 : XLOGUE_STUB_RESTORE_TAIL;
15223 sym = xlogue.get_stub_rtx (stub);
15225 elems_needed = ncregs;
15226 if (use_call)
15227 elems_needed += 1;
15228 else
15229 elems_needed += frame_pointer_needed ? 5 : 3;
15230 v = rtvec_alloc (elems_needed);
15232 /* We call the epilogue stub when we need to pop incoming args or when a
15233 sibling call will be the tail call. Otherwise, we emit a jmp to the
15234 epilogue stub and the stub itself is the tail call. */
15235 if (use_call)
15236 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
15237 else
15239 RTVEC_ELT (v, vi++) = ret_rtx;
15240 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
15241 if (frame_pointer_needed)
15243 rtx rbp = gen_rtx_REG (DImode, BP_REG);
15244 gcc_assert (m->fs.fp_valid);
15245 gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx);
15247 tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8));
15248 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp);
15249 RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp));
15250 tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
15251 RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp);
15253 else
15255 /* If no hard frame pointer, we set R10 to the SP restore value. */
15256 gcc_assert (!m->fs.fp_valid);
15257 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
15258 gcc_assert (m->fs.sp_valid);
15260 r10 = gen_rtx_REG (DImode, R10_REG);
15261 tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset));
15262 emit_insn (gen_rtx_SET (r10, tmp));
15264 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10);
15268 /* Generate frame load insns and restore notes. */
15269 for (i = 0; i < ncregs; ++i)
15271 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
15272 machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode;
15273 rtx reg, frame_load;
15275 reg = gen_rtx_REG (mode, r.regno);
15276 frame_load = gen_frame_load (reg, rsi, r.offset);
15278 /* Save RSI frame load insn & note to add last. */
15279 if (r.regno == SI_REG)
15281 gcc_assert (!rsi_frame_load);
15282 rsi_frame_load = frame_load;
15283 rsi_restore_offset = r.offset;
15285 else
15287 RTVEC_ELT (v, vi++) = frame_load;
15288 ix86_add_cfa_restore_note (NULL, reg, r.offset);
15292 /* Add RSI frame load & restore note at the end. */
15293 gcc_assert (rsi_frame_load);
15294 gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1);
15295 RTVEC_ELT (v, vi++) = rsi_frame_load;
15296 ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG),
15297 rsi_restore_offset);
15299 /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */
15300 if (!use_call && !frame_pointer_needed)
15302 gcc_assert (m->fs.sp_valid);
15303 gcc_assert (!m->fs.sp_realigned);
15305 /* At this point, R10 should point to frame.stack_realign_offset. */
15306 if (m->fs.cfa_reg == stack_pointer_rtx)
15307 m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset;
15308 m->fs.sp_offset = frame.stack_realign_offset;
15311 gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v));
15312 tmp = gen_rtx_PARALLEL (VOIDmode, v);
15313 if (use_call)
15314 insn = emit_insn (tmp);
15315 else
15317 insn = emit_jump_insn (tmp);
15318 JUMP_LABEL (insn) = ret_rtx;
15320 if (frame_pointer_needed)
15321 ix86_emit_leave (insn);
15322 else
15324 /* Need CFA adjust note. */
15325 tmp = gen_rtx_SET (stack_pointer_rtx, r10);
15326 add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp);
15330 RTX_FRAME_RELATED_P (insn) = true;
15331 ix86_add_queued_cfa_restore_notes (insn);
15333 /* If we're not doing a tail-call, we need to adjust the stack. */
15334 if (use_call && m->fs.sp_valid)
15336 HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset;
15337 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15338 GEN_INT (dealloc), style,
15339 m->fs.cfa_reg == stack_pointer_rtx);
15343 /* Restore function stack, frame, and registers. */
15345 void
15346 ix86_expand_epilogue (int style)
15348 struct machine_function *m = cfun->machine;
15349 struct machine_frame_state frame_state_save = m->fs;
15350 struct ix86_frame frame;
15351 bool restore_regs_via_mov;
15352 bool using_drap;
15353 bool restore_stub_is_tail = false;
15355 if (ix86_function_naked (current_function_decl))
15357 /* The program should not reach this point. */
15358 emit_insn (gen_ud2 ());
15359 return;
15362 ix86_finalize_stack_frame_flags ();
15363 frame = m->frame;
15365 m->fs.sp_realigned = stack_realign_fp;
15366 m->fs.sp_valid = stack_realign_fp
15367 || !frame_pointer_needed
15368 || crtl->sp_is_unchanging;
15369 gcc_assert (!m->fs.sp_valid
15370 || m->fs.sp_offset == frame.stack_pointer_offset);
15372 /* The FP must be valid if the frame pointer is present. */
15373 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
15374 gcc_assert (!m->fs.fp_valid
15375 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
15377 /* We must have *some* valid pointer to the stack frame. */
15378 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
15380 /* The DRAP is never valid at this point. */
15381 gcc_assert (!m->fs.drap_valid);
15383 /* See the comment about red zone and frame
15384 pointer usage in ix86_expand_prologue. */
15385 if (frame_pointer_needed && frame.red_zone_size)
15386 emit_insn (gen_memory_blockage ());
15388 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
15389 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
15391 /* Determine the CFA offset of the end of the red-zone. */
15392 m->fs.red_zone_offset = 0;
15393 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
15395 /* The red-zone begins below return address and error code in
15396 exception handler. */
15397 m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET;
15399 /* When the register save area is in the aligned portion of
15400 the stack, determine the maximum runtime displacement that
15401 matches up with the aligned frame. */
15402 if (stack_realign_drap)
15403 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
15404 + UNITS_PER_WORD);
15407 /* Special care must be taken for the normal return case of a function
15408 using eh_return: the eax and edx registers are marked as saved, but
15409 not restored along this path. Adjust the save location to match. */
15410 if (crtl->calls_eh_return && style != 2)
15411 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
15413 /* EH_RETURN requires the use of moves to function properly. */
15414 if (crtl->calls_eh_return)
15415 restore_regs_via_mov = true;
15416 /* SEH requires the use of pops to identify the epilogue. */
15417 else if (TARGET_SEH)
15418 restore_regs_via_mov = false;
15419 /* If we're only restoring one register and sp cannot be used then
15420 use a move instruction to restore the register since it's
15421 less work than reloading sp and popping the register. */
15422 else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
15423 restore_regs_via_mov = true;
15424 else if (TARGET_EPILOGUE_USING_MOVE
15425 && cfun->machine->use_fast_prologue_epilogue
15426 && (frame.nregs > 1
15427 || m->fs.sp_offset != frame.reg_save_offset))
15428 restore_regs_via_mov = true;
15429 else if (frame_pointer_needed
15430 && !frame.nregs
15431 && m->fs.sp_offset != frame.reg_save_offset)
15432 restore_regs_via_mov = true;
15433 else if (frame_pointer_needed
15434 && TARGET_USE_LEAVE
15435 && cfun->machine->use_fast_prologue_epilogue
15436 && frame.nregs == 1)
15437 restore_regs_via_mov = true;
15438 else
15439 restore_regs_via_mov = false;
15441 if (restore_regs_via_mov || frame.nsseregs)
15443 /* Ensure that the entire register save area is addressable via
15444 the stack pointer, if we will restore SSE regs via sp. */
15445 if (TARGET_64BIT
15446 && m->fs.sp_offset > 0x7fffffff
15447 && sp_valid_at (frame.stack_realign_offset)
15448 && (frame.nsseregs + frame.nregs) != 0)
15450 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15451 GEN_INT (m->fs.sp_offset
15452 - frame.sse_reg_save_offset),
15453 style,
15454 m->fs.cfa_reg == stack_pointer_rtx);
15458 /* If there are any SSE registers to restore, then we have to do it
15459 via moves, since there's obviously no pop for SSE regs. */
15460 if (frame.nsseregs)
15461 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
15462 style == 2);
15464 if (m->call_ms2sysv)
15466 int pop_incoming_args = crtl->args.pops_args && crtl->args.size;
15468 /* We cannot use a tail-call for the stub if:
15469 1. We have to pop incoming args,
15470 2. We have additional int regs to restore, or
15471 3. A sibling call will be the tail-call, or
15472 4. We are emitting an eh_return_internal epilogue.
15474 TODO: Item 4 has not yet been tested!
15476 If any of the above are true, we will call the stub rather than
15477 jump to it. */
15478 restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1);
15479 ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style);
15482 /* If using out-of-line stub that is a tail-call, then...*/
15483 if (m->call_ms2sysv && restore_stub_is_tail)
15485 /* TODO: paranoid tests. (remove eventually) */
15486 gcc_assert (m->fs.sp_valid);
15487 gcc_assert (!m->fs.sp_realigned);
15488 gcc_assert (!m->fs.fp_valid);
15489 gcc_assert (!m->fs.realigned);
15490 gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
15491 gcc_assert (!crtl->drap_reg);
15492 gcc_assert (!frame.nregs);
15494 else if (restore_regs_via_mov)
15496 rtx t;
15498 if (frame.nregs)
15499 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
15501 /* eh_return epilogues need %ecx added to the stack pointer. */
15502 if (style == 2)
15504 rtx sa = EH_RETURN_STACKADJ_RTX;
15505 rtx_insn *insn;
15507 /* %ecx can't be used for both DRAP register and eh_return. */
15508 if (crtl->drap_reg)
15509 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
15511 /* regparm nested functions don't work with eh_return. */
15512 gcc_assert (!ix86_static_chain_on_stack);
15514 if (frame_pointer_needed)
15516 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
15517 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
15518 emit_insn (gen_rtx_SET (sa, t));
15520 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
15521 insn = emit_move_insn (hard_frame_pointer_rtx, t);
15523 /* Note that we use SA as a temporary CFA, as the return
15524 address is at the proper place relative to it. We
15525 pretend this happens at the FP restore insn because
15526 prior to this insn the FP would be stored at the wrong
15527 offset relative to SA, and after this insn we have no
15528 other reasonable register to use for the CFA. We don't
15529 bother resetting the CFA to the SP for the duration of
15530 the return insn. */
15531 add_reg_note (insn, REG_CFA_DEF_CFA,
15532 plus_constant (Pmode, sa, UNITS_PER_WORD));
15533 ix86_add_queued_cfa_restore_notes (insn);
15534 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
15535 RTX_FRAME_RELATED_P (insn) = 1;
15537 m->fs.cfa_reg = sa;
15538 m->fs.cfa_offset = UNITS_PER_WORD;
15539 m->fs.fp_valid = false;
15541 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
15542 const0_rtx, style, false);
15544 else
15546 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
15547 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
15548 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
15549 ix86_add_queued_cfa_restore_notes (insn);
15551 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
15552 if (m->fs.cfa_offset != UNITS_PER_WORD)
15554 m->fs.cfa_offset = UNITS_PER_WORD;
15555 add_reg_note (insn, REG_CFA_DEF_CFA,
15556 plus_constant (Pmode, stack_pointer_rtx,
15557 UNITS_PER_WORD));
15558 RTX_FRAME_RELATED_P (insn) = 1;
15561 m->fs.sp_offset = UNITS_PER_WORD;
15562 m->fs.sp_valid = true;
15563 m->fs.sp_realigned = false;
15566 else
15568 /* SEH requires that the function end with (1) a stack adjustment
15569 if necessary, (2) a sequence of pops, and (3) a return or
15570 jump instruction. Prevent insns from the function body from
15571 being scheduled into this sequence. */
15572 if (TARGET_SEH)
15574 /* Prevent a catch region from being adjacent to the standard
15575 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
15576 several other flags that would be interesting to test are
15577 yet set up. */
15578 if (flag_non_call_exceptions)
15579 emit_insn (gen_nops (const1_rtx));
15580 else
15581 emit_insn (gen_blockage ());
15584 /* First step is to deallocate the stack frame so that we can
15585 pop the registers. If the stack pointer was realigned, it needs
15586 to be restored now. Also do it on SEH target for very large
15587 frame as the emitted instructions aren't allowed by the ABI
15588 in epilogues. */
15589 if (!m->fs.sp_valid || m->fs.sp_realigned
15590 || (TARGET_SEH
15591 && (m->fs.sp_offset - frame.reg_save_offset
15592 >= SEH_MAX_FRAME_SIZE)))
15594 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
15595 GEN_INT (m->fs.fp_offset
15596 - frame.reg_save_offset),
15597 style, false);
15599 else if (m->fs.sp_offset != frame.reg_save_offset)
15601 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15602 GEN_INT (m->fs.sp_offset
15603 - frame.reg_save_offset),
15604 style,
15605 m->fs.cfa_reg == stack_pointer_rtx);
15608 ix86_emit_restore_regs_using_pop ();
15611 /* If we used a frame pointer and haven't already got rid of it,
15612 then do so now. */
15613 if (m->fs.fp_valid)
15615 /* If the stack pointer is valid and pointing at the frame
15616 pointer store address, then we only need a pop. */
15617 if (sp_valid_at (frame.hfp_save_offset)
15618 && m->fs.sp_offset == frame.hfp_save_offset)
15619 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
15620 /* Leave results in shorter dependency chains on CPUs that are
15621 able to grok it fast. */
15622 else if (TARGET_USE_LEAVE
15623 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
15624 || !cfun->machine->use_fast_prologue_epilogue)
15625 ix86_emit_leave (NULL);
15626 else
15628 pro_epilogue_adjust_stack (stack_pointer_rtx,
15629 hard_frame_pointer_rtx,
15630 const0_rtx, style, !using_drap);
15631 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
15635 if (using_drap)
15637 int param_ptr_offset = UNITS_PER_WORD;
15638 rtx_insn *insn;
15640 gcc_assert (stack_realign_drap);
15642 if (ix86_static_chain_on_stack)
15643 param_ptr_offset += UNITS_PER_WORD;
15644 if (!call_used_regs[REGNO (crtl->drap_reg)])
15645 param_ptr_offset += UNITS_PER_WORD;
15647 insn = emit_insn (gen_rtx_SET
15648 (stack_pointer_rtx,
15649 gen_rtx_PLUS (Pmode,
15650 crtl->drap_reg,
15651 GEN_INT (-param_ptr_offset))));
15652 m->fs.cfa_reg = stack_pointer_rtx;
15653 m->fs.cfa_offset = param_ptr_offset;
15654 m->fs.sp_offset = param_ptr_offset;
15655 m->fs.realigned = false;
15657 add_reg_note (insn, REG_CFA_DEF_CFA,
15658 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15659 GEN_INT (param_ptr_offset)));
15660 RTX_FRAME_RELATED_P (insn) = 1;
15662 if (!call_used_regs[REGNO (crtl->drap_reg)])
15663 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
15666 /* At this point the stack pointer must be valid, and we must have
15667 restored all of the registers. We may not have deallocated the
15668 entire stack frame. We've delayed this until now because it may
15669 be possible to merge the local stack deallocation with the
15670 deallocation forced by ix86_static_chain_on_stack. */
15671 gcc_assert (m->fs.sp_valid);
15672 gcc_assert (!m->fs.sp_realigned);
15673 gcc_assert (!m->fs.fp_valid);
15674 gcc_assert (!m->fs.realigned);
15675 if (m->fs.sp_offset != UNITS_PER_WORD)
15677 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15678 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
15679 style, true);
15681 else
15682 ix86_add_queued_cfa_restore_notes (get_last_insn ());
15684 /* Sibcall epilogues don't want a return instruction. */
15685 if (style == 0)
15687 m->fs = frame_state_save;
15688 return;
15691 if (cfun->machine->func_type != TYPE_NORMAL)
15692 emit_jump_insn (gen_interrupt_return ());
15693 else if (crtl->args.pops_args && crtl->args.size)
15695 rtx popc = GEN_INT (crtl->args.pops_args);
15697 /* i386 can only pop 64K bytes. If asked to pop more, pop return
15698 address, do explicit add, and jump indirectly to the caller. */
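/* Illustrative 32-bit sequence (N stands for crtl->args.pops_args):
	popl	%ecx		# pop the return address
	addl	$N, %esp	# release the argument area
	jmp	*%ecx		# return to the caller  */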
15700 if (crtl->args.pops_args >= 65536)
15702 rtx ecx = gen_rtx_REG (SImode, CX_REG);
15703 rtx_insn *insn;
15705 /* There is no "pascal" calling convention in any 64bit ABI. */
15706 gcc_assert (!TARGET_64BIT);
15708 insn = emit_insn (gen_pop (ecx));
15709 m->fs.cfa_offset -= UNITS_PER_WORD;
15710 m->fs.sp_offset -= UNITS_PER_WORD;
15712 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
15713 x = gen_rtx_SET (stack_pointer_rtx, x);
15714 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
15715 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
15716 RTX_FRAME_RELATED_P (insn) = 1;
15718 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
15719 popc, -1, true);
15720 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
15722 else
15723 emit_jump_insn (gen_simple_return_pop_internal (popc));
15725 else if (!m->call_ms2sysv || !restore_stub_is_tail)
15726 emit_jump_insn (gen_simple_return_internal ());
15728 /* Restore the state back to the state from the prologue,
15729 so that it's correct for the next epilogue. */
15730 m->fs = frame_state_save;
15733 /* Reset from the function's potential modifications. */
15735 static void
15736 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED)
15738 if (pic_offset_table_rtx
15739 && !ix86_use_pseudo_pic_reg ())
15740 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
15742 if (TARGET_MACHO)
15744 rtx_insn *insn = get_last_insn ();
15745 rtx_insn *deleted_debug_label = NULL;
15747 /* Mach-O doesn't support labels at the end of objects, so if
15748 it looks like we might want one, take special action.
15749 First, collect any sequence of deleted debug labels. */
15750 while (insn
15751 && NOTE_P (insn)
15752 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
15754 /* For NOTE_INSN_DELETED_DEBUG_LABEL notes only, don't insert
15755 a nop; instead set their CODE_LABEL_NUMBER to -1,
15756 otherwise there would be code generation differences
15757 between -g and -g0. */
15758 if (NOTE_P (insn) && NOTE_KIND (insn)
15759 == NOTE_INSN_DELETED_DEBUG_LABEL)
15760 deleted_debug_label = insn;
15761 insn = PREV_INSN (insn);
15764 /* If we have:
15765 label:
15766 barrier
15767 then this needs to be detected, so skip past the barrier. */
15769 if (insn && BARRIER_P (insn))
15770 insn = PREV_INSN (insn);
15772 /* Up to now we've only seen notes or barriers. */
15773 if (insn)
15775 if (LABEL_P (insn)
15776 || (NOTE_P (insn)
15777 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
15778 /* Trailing label. */
15779 fputs ("\tnop\n", file);
15780 else if (cfun && ! cfun->is_thunk)
15782 /* See if we have a completely empty function body, skipping
15783 the special case of the picbase thunk emitted as asm. */
15784 while (insn && ! INSN_P (insn))
15785 insn = PREV_INSN (insn);
15786 /* If we don't find any insns, we've got an empty function body;
15787 i.e. completely empty, without a return or branch. This is
15788 taken as the case where a function body has been removed
15789 because it contains an inline __builtin_unreachable(). GCC
15790 declares that reaching __builtin_unreachable() means UB so
15791 we're not obliged to do anything special; however, we want
15792 non-zero-sized function bodies. To meet this, and help the
15793 user out, let's trap the case. */
15794 if (insn == NULL)
15795 fputs ("\tud2\n", file);
15798 else if (deleted_debug_label)
15799 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
15800 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
15801 CODE_LABEL_NUMBER (insn) = -1;
15805 /* Return a scratch register to use in the split stack prologue. The
15806 split stack prologue is used for -fsplit-stack. It contains the first
15807 instructions in the function, even before the regular prologue.
15808 The scratch register can be any caller-saved register which is not
15809 used for parameters or for the static chain. */
15811 static unsigned int
15812 split_stack_prologue_scratch_regno (void)
15814 if (TARGET_64BIT)
15815 return R11_REG;
15816 else
15818 bool is_fastcall, is_thiscall;
15819 int regparm;
15821 is_fastcall = (lookup_attribute ("fastcall",
15822 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
15823 != NULL);
15824 is_thiscall = (lookup_attribute ("thiscall",
15825 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
15826 != NULL);
15827 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
15829 if (is_fastcall)
15831 if (DECL_STATIC_CHAIN (cfun->decl))
15833 sorry ("-fsplit-stack does not support fastcall with "
15834 "nested function");
15835 return INVALID_REGNUM;
15837 return AX_REG;
15839 else if (is_thiscall)
15841 if (!DECL_STATIC_CHAIN (cfun->decl))
15842 return DX_REG;
15843 return AX_REG;
15845 else if (regparm < 3)
15847 if (!DECL_STATIC_CHAIN (cfun->decl))
15848 return CX_REG;
15849 else
15851 if (regparm >= 2)
15853 sorry ("-fsplit-stack does not support 2 register "
15854 "parameters for a nested function");
15855 return INVALID_REGNUM;
15857 return DX_REG;
15860 else
15862 /* FIXME: We could make this work by pushing a register
15863 around the addition and comparison. */
15864 sorry ("-fsplit-stack does not support 3 register parameters");
15865 return INVALID_REGNUM;
15870 /* A SYMBOL_REF for the function which allocates new stack space for
15871 -fsplit-stack. */
15873 static GTY(()) rtx split_stack_fn;
15875 /* A SYMBOL_REF for the more stack function when using the large
15876 model. */
15878 static GTY(()) rtx split_stack_fn_large;
15880 /* Return location of the stack guard value in the TLS block. */
15882 rtx
15883 ix86_split_stack_guard (void)
15885 int offset;
15886 addr_space_t as = DEFAULT_TLS_SEG_REG;
15887 rtx r;
15889 gcc_assert (flag_split_stack);
15891 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
15892 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
15893 #else
15894 gcc_unreachable ();
15895 #endif
15897 r = GEN_INT (offset);
15898 r = gen_const_mem (Pmode, r);
15899 set_mem_addr_space (r, as);
15901 return r;
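/* For example (offsets are the usual glibc ones and may differ elsewhere):
   on x86-64 this yields the %fs-relative reference %fs:0x70, and on 32-bit
   the %gs-relative %gs:0x30, which the split-stack prologue compares
   against the (adjusted) stack pointer.  */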
15904 /* Handle -fsplit-stack. These are the first instructions in the
15905 function, even before the regular prologue. */
15907 void
15908 ix86_expand_split_stack_prologue (void)
15910 struct ix86_frame frame;
15911 HOST_WIDE_INT allocate;
15912 unsigned HOST_WIDE_INT args_size;
15913 rtx_code_label *label;
15914 rtx limit, current, allocate_rtx, call_insn, call_fusage;
15915 rtx scratch_reg = NULL_RTX;
15916 rtx_code_label *varargs_label = NULL;
15917 rtx fn;
15919 gcc_assert (flag_split_stack && reload_completed);
15921 ix86_finalize_stack_frame_flags ();
15922 frame = cfun->machine->frame;
15923 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
15925 /* This is the label we will branch to if we have enough stack
15926 space. We expect the basic block reordering pass to reverse this
15927 branch if optimizing, so that we branch in the unlikely case. */
15928 label = gen_label_rtx ();
15930 /* We need to compare the stack pointer minus the frame size with
15931 the stack boundary in the TCB. The stack boundary always gives
15932 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
15933 can compare directly. Otherwise we need to do an addition. */
15935 limit = ix86_split_stack_guard ();
15937 if (allocate < SPLIT_STACK_AVAILABLE)
15938 current = stack_pointer_rtx;
15939 else
15941 unsigned int scratch_regno;
15942 rtx offset;
15944 /* We need a scratch register to hold the stack pointer minus
15945 the required frame size. Since this is the very start of the
15946 function, the scratch register can be any caller-saved
15947 register which is not used for parameters. */
15948 offset = GEN_INT (- allocate);
15949 scratch_regno = split_stack_prologue_scratch_regno ();
15950 if (scratch_regno == INVALID_REGNUM)
15951 return;
15952 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
15953 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
15955 /* We don't use ix86_gen_add3 in this case because it will
15956 want to split to lea, but when not optimizing the insn
15957 will not be split after this point. */
15958 emit_insn (gen_rtx_SET (scratch_reg,
15959 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15960 offset)));
15962 else
15964 emit_move_insn (scratch_reg, offset);
15965 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
15966 stack_pointer_rtx));
15968 current = scratch_reg;
15971 ix86_expand_branch (GEU, current, limit, label);
15972 rtx_insn *jump_insn = get_last_insn ();
15973 JUMP_LABEL (jump_insn) = label;
15975 /* Mark the jump as very likely to be taken. */
15976 add_reg_br_prob_note (jump_insn, profile_probability::very_likely ());
15978 if (split_stack_fn == NULL_RTX)
15980 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
15981 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
15983 fn = split_stack_fn;
15985 /* Get more stack space. We pass in the desired stack space and the
15986 size of the arguments to copy to the new stack. In 32-bit mode
15987 we push the parameters; __morestack will return on a new stack
15988 anyhow. In 64-bit mode we pass the parameters in r10 and
15989 r11. */
15990 allocate_rtx = GEN_INT (allocate);
15991 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
15992 call_fusage = NULL_RTX;
15993 rtx pop = NULL_RTX;
15994 if (TARGET_64BIT)
15996 rtx reg10, reg11;
15998 reg10 = gen_rtx_REG (Pmode, R10_REG);
15999 reg11 = gen_rtx_REG (Pmode, R11_REG);
16001 /* If this function uses a static chain, it will be in %r10.
16002 Preserve it across the call to __morestack. */
16003 if (DECL_STATIC_CHAIN (cfun->decl))
16005 rtx rax;
16007 rax = gen_rtx_REG (word_mode, AX_REG);
16008 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
16009 use_reg (&call_fusage, rax);
16012 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
16013 && !TARGET_PECOFF)
16015 HOST_WIDE_INT argval;
16017 gcc_assert (Pmode == DImode);
16018 /* When using the large model we need to load the address
16019 into a register, and we've run out of registers. So we
16020 switch to a different calling convention, and we call a
16021 different function: __morestack_large_model. We pass the
16022 argument size in the upper 32 bits of r10 and pass the
16023 frame size in the lower 32 bits. */
16024 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
16025 gcc_assert ((args_size & 0xffffffff) == args_size);
16027 if (split_stack_fn_large == NULL_RTX)
16029 split_stack_fn_large =
16030 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
16031 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
16033 if (ix86_cmodel == CM_LARGE_PIC)
16035 rtx_code_label *label;
16036 rtx x;
16038 label = gen_label_rtx ();
16039 emit_label (label);
16040 LABEL_PRESERVE_P (label) = 1;
16041 emit_insn (gen_set_rip_rex64 (reg10, label));
16042 emit_insn (gen_set_got_offset_rex64 (reg11, label));
16043 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
16044 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
16045 UNSPEC_GOT);
16046 x = gen_rtx_CONST (Pmode, x);
16047 emit_move_insn (reg11, x);
16048 x = gen_rtx_PLUS (Pmode, reg10, reg11);
16049 x = gen_const_mem (Pmode, x);
16050 emit_move_insn (reg11, x);
16052 else
16053 emit_move_insn (reg11, split_stack_fn_large);
16055 fn = reg11;
16057 argval = ((args_size << 16) << 16) + allocate;
16058 emit_move_insn (reg10, GEN_INT (argval));
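/* For illustration: given the asserts above, the packing is equivalent to

     argval = (args_size << 32) + allocate;

   so the callee can recover the two fields as

     args_size  = r10 >> 32;
     frame_size = r10 & 0xffffffff;

   matching the convention described in the comment above.  */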
16060 else
16062 emit_move_insn (reg10, allocate_rtx);
16063 emit_move_insn (reg11, GEN_INT (args_size));
16064 use_reg (&call_fusage, reg11);
16067 use_reg (&call_fusage, reg10);
16069 else
16071 rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
16072 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
16073 insn = emit_insn (gen_push (allocate_rtx));
16074 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
16075 pop = GEN_INT (2 * UNITS_PER_WORD);
16077 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
16078 GEN_INT (UNITS_PER_WORD), constm1_rtx,
16079 pop, false);
16080 add_function_usage_to (call_insn, call_fusage);
16081 if (!TARGET_64BIT)
16082 add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
16083 /* Indicate that this function can't jump to non-local gotos. */
16084 make_reg_eh_region_note_nothrow_nononlocal (as_a <rtx_insn *> (call_insn));
16086 /* In order to make call/return prediction work right, we now need
16087 to execute a return instruction. See
16088 libgcc/config/i386/morestack.S for the details on how this works.
16090 For flow purposes gcc must not see this as a return
16091 instruction--we need control flow to continue at the subsequent
16092 label. Therefore, we use an unspec. */
16093 gcc_assert (crtl->args.pops_args < 65536);
16094 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
16096 /* If we are in 64-bit mode and this function uses a static chain,
16097 we saved %r10 in %rax before calling __morestack. */
16098 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
16099 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
16100 gen_rtx_REG (word_mode, AX_REG));
16102 /* If this function calls va_start, we need to store a pointer to
16103 the arguments on the old stack, because they may not have been
16104 all copied to the new stack. At this point the old stack can be
16105 found at the frame pointer value used by __morestack, because
16106 __morestack has set that up before calling back to us. Here we
16107 store that pointer in a scratch register, and in
16108 ix86_expand_prologue we store the scratch register in a stack
16109 slot. */
16110 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
16112 unsigned int scratch_regno;
16113 rtx frame_reg;
16114 int words;
16116 scratch_regno = split_stack_prologue_scratch_regno ();
16117 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
16118 frame_reg = gen_rtx_REG (Pmode, BP_REG);
16120 /* 64-bit:
16121 fp -> old fp value
16122 return address within this function
16123 return address of caller of this function
16124 stack arguments
16125 So we add three words to get to the stack arguments.
16127 32-bit:
16128 fp -> old fp value
16129 return address within this function
16130 first argument to __morestack
16131 second argument to __morestack
16132 return address of caller of this function
16133 stack arguments
16134 So we add five words to get to the stack arguments.
16136 words = TARGET_64BIT ? 3 : 5;
16137 emit_insn (gen_rtx_SET (scratch_reg,
16138 gen_rtx_PLUS (Pmode, frame_reg,
16139 GEN_INT (words * UNITS_PER_WORD))));
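/* In concrete terms (illustrative only): on 64-bit targets the scratch
   register ends up holding fp + 3 * 8 = fp + 24, and on 32-bit targets
   fp + 5 * 4 = fp + 20, i.e. the address of the first stack argument in
   the layouts diagrammed above.  */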
16141 varargs_label = gen_label_rtx ();
16142 emit_jump_insn (gen_jump (varargs_label));
16143 JUMP_LABEL (get_last_insn ()) = varargs_label;
16145 emit_barrier ();
16148 emit_label (label);
16149 LABEL_NUSES (label) = 1;
16151 /* If this function calls va_start, we now have to set the scratch
16152 register for the case where we do not call __morestack. In this
16153 case we need to set it based on the stack pointer. */
16154 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
16156 emit_insn (gen_rtx_SET (scratch_reg,
16157 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
16158 GEN_INT (UNITS_PER_WORD))));
16160 emit_label (varargs_label);
16161 LABEL_NUSES (varargs_label) = 1;
16165 /* We may have to tell the dataflow pass that the split stack prologue
16166 is initializing a scratch register. */
16168 static void
16169 ix86_live_on_entry (bitmap regs)
16171 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
16173 gcc_assert (flag_split_stack);
16174 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
16178 /* Extract the parts of an RTL expression that is a valid memory address
16179 for an instruction. Return 0 if the structure of the address is
16180 grossly off. Return -1 if the address contains ASHIFT, so it is not
16181 strictly valid, but is still used for computing the length of the lea instruction. */
16183 int
16184 ix86_decompose_address (rtx addr, struct ix86_address *out)
16186 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
16187 rtx base_reg, index_reg;
16188 HOST_WIDE_INT scale = 1;
16189 rtx scale_rtx = NULL_RTX;
16190 rtx tmp;
16191 int retval = 1;
16192 addr_space_t seg = ADDR_SPACE_GENERIC;
16194 /* Allow zero-extended SImode addresses; they will be
16195 emitted with the addr32 prefix. */
16196 if (TARGET_64BIT && GET_MODE (addr) == DImode)
16198 if (GET_CODE (addr) == ZERO_EXTEND
16199 && GET_MODE (XEXP (addr, 0)) == SImode)
16201 addr = XEXP (addr, 0);
16202 if (CONST_INT_P (addr))
16203 return 0;
16205 else if (GET_CODE (addr) == AND
16206 && const_32bit_mask (XEXP (addr, 1), DImode))
16208 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
16209 if (addr == NULL_RTX)
16210 return 0;
16212 if (CONST_INT_P (addr))
16213 return 0;
16217 /* Allow SImode subregs of DImode addresses; they will be
16218 emitted with the addr32 prefix. */
16219 if (TARGET_64BIT && GET_MODE (addr) == SImode)
16221 if (SUBREG_P (addr)
16222 && GET_MODE (SUBREG_REG (addr)) == DImode)
16224 addr = SUBREG_REG (addr);
16225 if (CONST_INT_P (addr))
16226 return 0;
16230 if (REG_P (addr))
16231 base = addr;
16232 else if (SUBREG_P (addr))
16234 if (REG_P (SUBREG_REG (addr)))
16235 base = addr;
16236 else
16237 return 0;
16239 else if (GET_CODE (addr) == PLUS)
16241 rtx addends[4], op;
16242 int n = 0, i;
16244 op = addr;
16247 if (n >= 4)
16248 return 0;
16249 addends[n++] = XEXP (op, 1);
16250 op = XEXP (op, 0);
16252 while (GET_CODE (op) == PLUS);
16253 if (n >= 4)
16254 return 0;
16255 addends[n] = op;
16257 for (i = n; i >= 0; --i)
16259 op = addends[i];
16260 switch (GET_CODE (op))
16262 case MULT:
16263 if (index)
16264 return 0;
16265 index = XEXP (op, 0);
16266 scale_rtx = XEXP (op, 1);
16267 break;
16269 case ASHIFT:
16270 if (index)
16271 return 0;
16272 index = XEXP (op, 0);
16273 tmp = XEXP (op, 1);
16274 if (!CONST_INT_P (tmp))
16275 return 0;
16276 scale = INTVAL (tmp);
16277 if ((unsigned HOST_WIDE_INT) scale > 3)
16278 return 0;
16279 scale = 1 << scale;
16280 break;
16282 case ZERO_EXTEND:
16283 op = XEXP (op, 0);
16284 if (GET_CODE (op) != UNSPEC)
16285 return 0;
16286 /* FALLTHRU */
16288 case UNSPEC:
16289 if (XINT (op, 1) == UNSPEC_TP
16290 && TARGET_TLS_DIRECT_SEG_REFS
16291 && seg == ADDR_SPACE_GENERIC)
16292 seg = DEFAULT_TLS_SEG_REG;
16293 else
16294 return 0;
16295 break;
16297 case SUBREG:
16298 if (!REG_P (SUBREG_REG (op)))
16299 return 0;
16300 /* FALLTHRU */
16302 case REG:
16303 if (!base)
16304 base = op;
16305 else if (!index)
16306 index = op;
16307 else
16308 return 0;
16309 break;
16311 case CONST:
16312 case CONST_INT:
16313 case SYMBOL_REF:
16314 case LABEL_REF:
16315 if (disp)
16316 return 0;
16317 disp = op;
16318 break;
16320 default:
16321 return 0;
16325 else if (GET_CODE (addr) == MULT)
16327 index = XEXP (addr, 0); /* index*scale */
16328 scale_rtx = XEXP (addr, 1);
16330 else if (GET_CODE (addr) == ASHIFT)
16332 /* We're called for lea too, which implements ashift on occasion. */
16333 index = XEXP (addr, 0);
16334 tmp = XEXP (addr, 1);
16335 if (!CONST_INT_P (tmp))
16336 return 0;
16337 scale = INTVAL (tmp);
16338 if ((unsigned HOST_WIDE_INT) scale > 3)
16339 return 0;
16340 scale = 1 << scale;
16341 retval = -1;
16343 else
16344 disp = addr; /* displacement */
16346 if (index)
16348 if (REG_P (index))
16350 else if (SUBREG_P (index)
16351 && REG_P (SUBREG_REG (index)))
16353 else
16354 return 0;
16357 /* Extract the integral value of scale. */
16358 if (scale_rtx)
16360 if (!CONST_INT_P (scale_rtx))
16361 return 0;
16362 scale = INTVAL (scale_rtx);
16365 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
16366 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
16368 /* Avoid useless 0 displacement. */
16369 if (disp == const0_rtx && (base || index))
16370 disp = NULL_RTX;
16372 /* Allow arg pointer and stack pointer as index if there is no scaling. */
16373 if (base_reg && index_reg && scale == 1
16374 && (REGNO (index_reg) == ARG_POINTER_REGNUM
16375 || REGNO (index_reg) == FRAME_POINTER_REGNUM
16376 || REGNO (index_reg) == SP_REG))
16378 std::swap (base, index);
16379 std::swap (base_reg, index_reg);
16382 /* Special case: %ebp cannot be encoded as a base without a displacement.
16383 Similarly %r13. */
16384 if (!disp && base_reg
16385 && (REGNO (base_reg) == ARG_POINTER_REGNUM
16386 || REGNO (base_reg) == FRAME_POINTER_REGNUM
16387 || REGNO (base_reg) == BP_REG
16388 || REGNO (base_reg) == R13_REG))
16389 disp = const0_rtx;
16391 /* Special case: on K6, [%esi] causes the instruction to be vector decoded.
16392 Avoid this by transforming it to [%esi+0].
16393 Reload calls address legitimization without cfun defined, so we need
16394 to test that cfun is non-NULL. */
16395 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
16396 && base_reg && !index_reg && !disp
16397 && REGNO (base_reg) == SI_REG)
16398 disp = const0_rtx;
16400 /* Special case: encode reg+reg instead of reg*2. */
16401 if (!base && index && scale == 2)
16402 base = index, base_reg = index_reg, scale = 1;
16404 /* Special case: scaling cannot be encoded without base or displacement. */
16405 if (!base && !disp && index && scale != 1)
16406 disp = const0_rtx;
16408 out->base = base;
16409 out->index = index;
16410 out->disp = disp;
16411 out->scale = scale;
16412 out->seg = seg;
16414 return retval;
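/* Worked example for ix86_decompose_address above (illustrative only,
   with the RTL abbreviated): the canonical address 12(%ebx,%ecx,4), i.e.
     (plus (plus (mult (reg %ecx) (const_int 4)) (reg %ebx)) (const_int 12))
   fills OUT with base = %ebx, index = %ecx, scale = 4,
   disp = (const_int 12), seg = ADDR_SPACE_GENERIC and returns 1.  If the
   whole address is just (ashift (reg %ecx) (const_int 3)) -- a form only
   lea accepts -- scale becomes 8 and the return value is -1, as
   documented above.  */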
16417 /* Return the cost of the memory address X.
16418 For i386, it is better to use a complex address than let gcc copy
16419 the address into a reg and make a new pseudo. But not if the address
16420 requires two regs - that would mean more pseudos with longer
16421 lifetimes. */
16422 static int
16423 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
16425 struct ix86_address parts;
16426 int cost = 1;
16427 int ok = ix86_decompose_address (x, &parts);
16429 gcc_assert (ok);
16431 if (parts.base && SUBREG_P (parts.base))
16432 parts.base = SUBREG_REG (parts.base);
16433 if (parts.index && SUBREG_P (parts.index))
16434 parts.index = SUBREG_REG (parts.index);
16436 /* Attempt to minimize the number of registers in the address by increasing
16437 the address cost for each register used. We don't increase the address
16438 cost for "pic_offset_table_rtx". When a memory operand containing
16439 "pic_offset_table_rtx" is not invariant itself, it most likely means that
16440 the base or index is not invariant. Therefore only "pic_offset_table_rtx"
16441 could be hoisted out, which is not profitable for x86. */
16442 if (parts.base
16443 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
16444 && (current_pass->type == GIMPLE_PASS
16445 || !pic_offset_table_rtx
16446 || !REG_P (parts.base)
16447 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
16448 cost++;
16450 if (parts.index
16451 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
16452 && (current_pass->type == GIMPLE_PASS
16453 || !pic_offset_table_rtx
16454 || !REG_P (parts.index)
16455 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
16456 cost++;
16458 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
16459 since its predecode logic can't detect the length of such instructions
16460 and they degenerate to vector decoding. Increase the cost of such
16461 addresses here. The penalty is at least 2 cycles. It may be worthwhile
16462 to split such addresses or even refuse them entirely.
16464 The following addressing modes are affected:
16465 [base+scale*index]
16466 [scale*index+disp]
16467 [base+index]
16469 The first and last cases may be avoidable by explicitly coding a zero
16470 displacement in the address, but I don't have an AMD-K6 machine handy
16471 to check this theory. */
16473 if (TARGET_K6
16474 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
16475 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
16476 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
16477 cost += 10;
16479 return cost;
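/* A standalone sketch of how the cost above accumulates, assuming both
   base and index are pseudo registers (hard registers, and the PIC
   register during RTL passes, do not add to the cost).  The function
   below is illustrative only and is not used by the compiler.  */

static int
demo_ix86_address_cost (int has_base, int has_index, int has_disp,
			int scale, int penalize_k6)
{
  int cost = 1;
  if (has_base)
    cost++;
  if (has_index)
    cost++;
  /* e.g. (%reg1,%reg2,4) with no displacement on AMD-K6.  */
  if (penalize_k6
      && ((!has_disp && has_base && has_index && scale != 1)
	  || (has_disp && !has_base && has_index && scale != 1)
	  || (!has_disp && has_base && has_index && scale == 1)))
    cost += 10;
  return cost;	/* 1 + 1 + 1 + 10 = 13 for the K6 example above.  */
}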
16482 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O, as
16483 this is used to form addresses to local data when -fPIC is in
16484 use. */
16486 static bool
16487 darwin_local_data_pic (rtx disp)
16489 return (GET_CODE (disp) == UNSPEC
16490 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
16493 /* True if operand X should be loaded from GOT. */
16495 bool
16496 ix86_force_load_from_GOT_p (rtx x)
16498 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
16499 && !TARGET_PECOFF && !TARGET_MACHO
16500 && !flag_plt && !flag_pic
16501 && ix86_cmodel != CM_LARGE
16502 && GET_CODE (x) == SYMBOL_REF
16503 && SYMBOL_REF_FUNCTION_P (x)
16504 && !SYMBOL_REF_LOCAL_P (x));
16507 /* Determine if a given RTX is a valid constant. We already know this
16508 satisfies CONSTANT_P. */
16510 static bool
16511 ix86_legitimate_constant_p (machine_mode mode, rtx x)
16513 /* Pointer bounds constants are not valid. */
16514 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
16515 return false;
16517 switch (GET_CODE (x))
16519 case CONST:
16520 x = XEXP (x, 0);
16522 if (GET_CODE (x) == PLUS)
16524 if (!CONST_INT_P (XEXP (x, 1)))
16525 return false;
16526 x = XEXP (x, 0);
16529 if (TARGET_MACHO && darwin_local_data_pic (x))
16530 return true;
16532 /* Only some unspecs are valid as "constants". */
16533 if (GET_CODE (x) == UNSPEC)
16534 switch (XINT (x, 1))
16536 case UNSPEC_GOT:
16537 case UNSPEC_GOTOFF:
16538 case UNSPEC_PLTOFF:
16539 return TARGET_64BIT;
16540 case UNSPEC_TPOFF:
16541 case UNSPEC_NTPOFF:
16542 x = XVECEXP (x, 0, 0);
16543 return (GET_CODE (x) == SYMBOL_REF
16544 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
16545 case UNSPEC_DTPOFF:
16546 x = XVECEXP (x, 0, 0);
16547 return (GET_CODE (x) == SYMBOL_REF
16548 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
16549 default:
16550 return false;
16553 /* We must have drilled down to a symbol. */
16554 if (GET_CODE (x) == LABEL_REF)
16555 return true;
16556 if (GET_CODE (x) != SYMBOL_REF)
16557 return false;
16558 /* FALLTHRU */
16560 case SYMBOL_REF:
16561 /* TLS symbols are never valid. */
16562 if (SYMBOL_REF_TLS_MODEL (x))
16563 return false;
16565 /* DLLIMPORT symbols are never valid. */
16566 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
16567 && SYMBOL_REF_DLLIMPORT_P (x))
16568 return false;
16570 #if TARGET_MACHO
16571 /* mdynamic-no-pic */
16572 if (MACHO_DYNAMIC_NO_PIC_P)
16573 return machopic_symbol_defined_p (x);
16574 #endif
16576 /* An external function address should be loaded
16577 via the GOT slot to avoid the PLT. */
16578 if (ix86_force_load_from_GOT_p (x))
16579 return false;
16581 break;
16583 CASE_CONST_SCALAR_INT:
16584 switch (mode)
16586 case E_TImode:
16587 if (TARGET_64BIT)
16588 return true;
16589 /* FALLTHRU */
16590 case E_OImode:
16591 case E_XImode:
16592 if (!standard_sse_constant_p (x, mode))
16593 return false;
16594 default:
16595 break;
16597 break;
16599 case CONST_VECTOR:
16600 if (!standard_sse_constant_p (x, mode))
16601 return false;
16603 default:
16604 break;
16607 /* Otherwise we handle everything else in the move patterns. */
16608 return true;
16611 /* Determine if it's legal to put X into the constant pool. This
16612 is not possible for the address of thread-local symbols, which
16613 is checked above. */
16615 static bool
16616 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
16618 /* We can put any immediate constant in memory. */
16619 switch (GET_CODE (x))
16621 CASE_CONST_ANY:
16622 return false;
16624 default:
16625 break;
16628 return !ix86_legitimate_constant_p (mode, x);
16631 /* True if the symbol is marked as dllimport or as a stub-variable,
16632 false otherwise. */
16634 static bool
16635 is_imported_p (rtx x)
16637 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
16638 || GET_CODE (x) != SYMBOL_REF)
16639 return false;
16641 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
16645 /* Nonzero if the constant value X is a legitimate general operand
16646 when generating PIC code. It is given that flag_pic is on and
16647 that X satisfies CONSTANT_P. */
16649 bool
16650 legitimate_pic_operand_p (rtx x)
16652 rtx inner;
16654 switch (GET_CODE (x))
16656 case CONST:
16657 inner = XEXP (x, 0);
16658 if (GET_CODE (inner) == PLUS
16659 && CONST_INT_P (XEXP (inner, 1)))
16660 inner = XEXP (inner, 0);
16662 /* Only some unspecs are valid as "constants". */
16663 if (GET_CODE (inner) == UNSPEC)
16664 switch (XINT (inner, 1))
16666 case UNSPEC_GOT:
16667 case UNSPEC_GOTOFF:
16668 case UNSPEC_PLTOFF:
16669 return TARGET_64BIT;
16670 case UNSPEC_TPOFF:
16671 x = XVECEXP (inner, 0, 0);
16672 return (GET_CODE (x) == SYMBOL_REF
16673 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
16674 case UNSPEC_MACHOPIC_OFFSET:
16675 return legitimate_pic_address_disp_p (x);
16676 default:
16677 return false;
16679 /* FALLTHRU */
16681 case SYMBOL_REF:
16682 case LABEL_REF:
16683 return legitimate_pic_address_disp_p (x);
16685 default:
16686 return true;
16690 /* Determine if a given CONST RTX is a valid memory displacement
16691 in PIC mode. */
16693 bool
16694 legitimate_pic_address_disp_p (rtx disp)
16696 bool saw_plus;
16698 /* In 64bit mode we can allow direct addresses of symbols and labels
16699 when they are not dynamic symbols. */
16700 if (TARGET_64BIT)
16702 rtx op0 = disp, op1;
16704 switch (GET_CODE (disp))
16706 case LABEL_REF:
16707 return true;
16709 case CONST:
16710 if (GET_CODE (XEXP (disp, 0)) != PLUS)
16711 break;
16712 op0 = XEXP (XEXP (disp, 0), 0);
16713 op1 = XEXP (XEXP (disp, 0), 1);
16714 if (!CONST_INT_P (op1)
16715 || INTVAL (op1) >= 16*1024*1024
16716 || INTVAL (op1) < -16*1024*1024)
16717 break;
16718 if (GET_CODE (op0) == LABEL_REF)
16719 return true;
16720 if (GET_CODE (op0) == CONST
16721 && GET_CODE (XEXP (op0, 0)) == UNSPEC
16722 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
16723 return true;
16724 if (GET_CODE (op0) == UNSPEC
16725 && XINT (op0, 1) == UNSPEC_PCREL)
16726 return true;
16727 if (GET_CODE (op0) != SYMBOL_REF)
16728 break;
16729 /* FALLTHRU */
16731 case SYMBOL_REF:
16732 /* TLS references should always be enclosed in an UNSPEC.
16733 A dllimported symbol always needs to be resolved. */
16734 if (SYMBOL_REF_TLS_MODEL (op0)
16735 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
16736 return false;
16738 if (TARGET_PECOFF)
16740 if (is_imported_p (op0))
16741 return true;
16743 if (SYMBOL_REF_FAR_ADDR_P (op0)
16744 || !SYMBOL_REF_LOCAL_P (op0))
16745 break;
16747 /* Function symbols need to be resolved only for
16748 the large model.
16749 For the small model we don't need to resolve anything
16750 here. */
16751 if ((ix86_cmodel != CM_LARGE_PIC
16752 && SYMBOL_REF_FUNCTION_P (op0))
16753 || ix86_cmodel == CM_SMALL_PIC)
16754 return true;
16755 /* Non-external symbols don't need to be resolved for
16756 the large and medium models. */
16757 if ((ix86_cmodel == CM_LARGE_PIC
16758 || ix86_cmodel == CM_MEDIUM_PIC)
16759 && !SYMBOL_REF_EXTERNAL_P (op0))
16760 return true;
16762 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
16763 && (SYMBOL_REF_LOCAL_P (op0)
16764 || (HAVE_LD_PIE_COPYRELOC
16765 && flag_pie
16766 && !SYMBOL_REF_WEAK (op0)
16767 && !SYMBOL_REF_FUNCTION_P (op0)))
16768 && ix86_cmodel != CM_LARGE_PIC)
16769 return true;
16770 break;
16772 default:
16773 break;
16776 if (GET_CODE (disp) != CONST)
16777 return false;
16778 disp = XEXP (disp, 0);
16780 if (TARGET_64BIT)
16782 /* It is unsafe to allow PLUS expressions. This limits the allowed
16783 distance of GOT references. We should not need these anyway. */
16784 if (GET_CODE (disp) != UNSPEC
16785 || (XINT (disp, 1) != UNSPEC_GOTPCREL
16786 && XINT (disp, 1) != UNSPEC_GOTOFF
16787 && XINT (disp, 1) != UNSPEC_PCREL
16788 && XINT (disp, 1) != UNSPEC_PLTOFF))
16789 return false;
16791 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
16792 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
16793 return false;
16794 return true;
16797 saw_plus = false;
16798 if (GET_CODE (disp) == PLUS)
16800 if (!CONST_INT_P (XEXP (disp, 1)))
16801 return false;
16802 disp = XEXP (disp, 0);
16803 saw_plus = true;
16806 if (TARGET_MACHO && darwin_local_data_pic (disp))
16807 return true;
16809 if (GET_CODE (disp) != UNSPEC)
16810 return false;
16812 switch (XINT (disp, 1))
16814 case UNSPEC_GOT:
16815 if (saw_plus)
16816 return false;
16817 /* We need to check for both symbols and labels because VxWorks loads
16818 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
16819 details. */
16820 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
16821 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
16822 case UNSPEC_GOTOFF:
16823 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
16824 While the ABI also specifies a 32bit relocation, we don't produce it
16825 in the small PIC model at all. */
16826 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
16827 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
16828 && !TARGET_64BIT)
16829 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
16830 return false;
16831 case UNSPEC_GOTTPOFF:
16832 case UNSPEC_GOTNTPOFF:
16833 case UNSPEC_INDNTPOFF:
16834 if (saw_plus)
16835 return false;
16836 disp = XVECEXP (disp, 0, 0);
16837 return (GET_CODE (disp) == SYMBOL_REF
16838 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
16839 case UNSPEC_NTPOFF:
16840 disp = XVECEXP (disp, 0, 0);
16841 return (GET_CODE (disp) == SYMBOL_REF
16842 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
16843 case UNSPEC_DTPOFF:
16844 disp = XVECEXP (disp, 0, 0);
16845 return (GET_CODE (disp) == SYMBOL_REF
16846 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
16849 return false;
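/* The symbol+offset form accepted in the 64-bit branch above keeps the
   constant offset within a +-16MB window, conservatively small so that
   symbol + offset still resolves to the intended object.  A standalone
   sketch of that bound (the name is illustrative only):  */

static int
demo_small_model_offset_ok (long long offset)
{
  /* Mirrors the INTVAL (op1) test above: sym+0xffffff is accepted,
     sym+0x1000000 (16MB) is not.  */
  return offset >= -16 * 1024 * 1024 && offset < 16 * 1024 * 1024;
}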
16852 /* Determine if OP is a suitable RTX for an address register.
16853 Return the naked register if a register or a register subreg is
16854 found, otherwise return NULL_RTX. */
16856 static rtx
16857 ix86_validate_address_register (rtx op)
16859 machine_mode mode = GET_MODE (op);
16861 /* Only SImode or DImode registers can form the address. */
16862 if (mode != SImode && mode != DImode)
16863 return NULL_RTX;
16865 if (REG_P (op))
16866 return op;
16867 else if (SUBREG_P (op))
16869 rtx reg = SUBREG_REG (op);
16871 if (!REG_P (reg))
16872 return NULL_RTX;
16874 mode = GET_MODE (reg);
16876 /* Don't allow SUBREGs that span more than a word. It can
16877 lead to spill failures when the register is one word out
16878 of a two word structure. */
16879 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
16880 return NULL_RTX;
16882 /* Allow only SUBREGs of non-eliminable hard registers. */
16883 if (register_no_elim_operand (reg, mode))
16884 return reg;
16887 /* Op is not a register. */
16888 return NULL_RTX;
16891 /* Recognizes RTL expressions that are valid memory addresses for an
16892 instruction. The MODE argument is the machine mode for the MEM
16893 expression that wants to use this address.
16895 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
16896 convert common non-canonical forms to canonical form so that they will
16897 be recognized. */
16899 static bool
16900 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
16902 struct ix86_address parts;
16903 rtx base, index, disp;
16904 HOST_WIDE_INT scale;
16905 addr_space_t seg;
16907 if (ix86_decompose_address (addr, &parts) <= 0)
16908 /* Decomposition failed. */
16909 return false;
16911 base = parts.base;
16912 index = parts.index;
16913 disp = parts.disp;
16914 scale = parts.scale;
16915 seg = parts.seg;
16917 /* Validate base register. */
16918 if (base)
16920 rtx reg = ix86_validate_address_register (base);
16922 if (reg == NULL_RTX)
16923 return false;
16925 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
16926 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
16927 /* Base is not valid. */
16928 return false;
16931 /* Validate index register. */
16932 if (index)
16934 rtx reg = ix86_validate_address_register (index);
16936 if (reg == NULL_RTX)
16937 return false;
16939 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
16940 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
16941 /* Index is not valid. */
16942 return false;
16945 /* Index and base should have the same mode. */
16946 if (base && index
16947 && GET_MODE (base) != GET_MODE (index))
16948 return false;
16950 /* Address override works only on the (%reg) part of %fs:(%reg). */
16951 if (seg != ADDR_SPACE_GENERIC
16952 && ((base && GET_MODE (base) != word_mode)
16953 || (index && GET_MODE (index) != word_mode)))
16954 return false;
16956 /* Validate scale factor. */
16957 if (scale != 1)
16959 if (!index)
16960 /* Scale without index. */
16961 return false;
16963 if (scale != 2 && scale != 4 && scale != 8)
16964 /* Scale is not a valid multiplier. */
16965 return false;
16968 /* Validate displacement. */
16969 if (disp)
16971 if (GET_CODE (disp) == CONST
16972 && GET_CODE (XEXP (disp, 0)) == UNSPEC
16973 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
16974 switch (XINT (XEXP (disp, 0), 1))
16976 /* Refuse GOTOFF and GOT in 64bit mode since they are always 64bit
16977 when used. While the ABI also specifies 32bit relocations, we
16978 don't produce them at all and use IP-relative addressing instead.
16979 Allow GOT in 32bit mode for both PIC and non-PIC if the symbol
16980 should be loaded via the GOT. */
16981 case UNSPEC_GOT:
16982 if (!TARGET_64BIT
16983 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
16984 goto is_legitimate_pic;
16985 /* FALLTHRU */
16986 case UNSPEC_GOTOFF:
16987 gcc_assert (flag_pic);
16988 if (!TARGET_64BIT)
16989 goto is_legitimate_pic;
16991 /* 64bit address unspec. */
16992 return false;
16994 case UNSPEC_GOTPCREL:
16995 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
16996 goto is_legitimate_pic;
16997 /* FALLTHRU */
16998 case UNSPEC_PCREL:
16999 gcc_assert (flag_pic);
17000 goto is_legitimate_pic;
17002 case UNSPEC_GOTTPOFF:
17003 case UNSPEC_GOTNTPOFF:
17004 case UNSPEC_INDNTPOFF:
17005 case UNSPEC_NTPOFF:
17006 case UNSPEC_DTPOFF:
17007 break;
17009 default:
17010 /* Invalid address unspec. */
17011 return false;
17014 else if (SYMBOLIC_CONST (disp)
17015 && (flag_pic
17016 || (TARGET_MACHO
17017 #if TARGET_MACHO
17018 && MACHOPIC_INDIRECT
17019 && !machopic_operand_p (disp)
17020 #endif
17024 is_legitimate_pic:
17025 if (TARGET_64BIT && (index || base))
17027 /* foo@dtpoff(%rX) is ok. */
17028 if (GET_CODE (disp) != CONST
17029 || GET_CODE (XEXP (disp, 0)) != PLUS
17030 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
17031 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
17032 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
17033 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
17034 /* Non-constant pic memory reference. */
17035 return false;
17037 else if ((!TARGET_MACHO || flag_pic)
17038 && ! legitimate_pic_address_disp_p (disp))
17039 /* Displacement is an invalid pic construct. */
17040 return false;
17041 #if TARGET_MACHO
17042 else if (MACHO_DYNAMIC_NO_PIC_P
17043 && !ix86_legitimate_constant_p (Pmode, disp))
17044 /* Displacement must be referenced via a non_lazy_pointer. */
17045 return false;
17046 #endif
17048 /* This code used to verify that a symbolic pic displacement
17049 includes the pic_offset_table_rtx register.
17051 While this is a good idea, these constructs may unfortunately
17052 be created by the "adds using lea" optimization for incorrect
17053 code like:
17055 int a;
17056 int foo(int i)
17058 return *(&a+i);
17061 This code is nonsensical, but results in addressing the
17062 GOT table with pic_offset_table_rtx as the base. We can't
17063 easily refuse it, since it gets matched by the
17064 "addsi3" pattern, which later gets split to lea when the
17065 output register differs from the input. While this
17066 could be handled by a separate addsi pattern for this case
17067 that never results in lea, disabling this test seems to be
17068 the easier and correct fix for the crash. */
17070 else if (GET_CODE (disp) != LABEL_REF
17071 && !CONST_INT_P (disp)
17072 && (GET_CODE (disp) != CONST
17073 || !ix86_legitimate_constant_p (Pmode, disp))
17074 && (GET_CODE (disp) != SYMBOL_REF
17075 || !ix86_legitimate_constant_p (Pmode, disp)))
17076 /* Displacement is not constant. */
17077 return false;
17078 else if (TARGET_64BIT
17079 && !x86_64_immediate_operand (disp, VOIDmode))
17080 /* Displacement is out of range. */
17081 return false;
17082 /* In x32 mode, constant addresses are sign extended to 64bit, so
17083 we have to reject addresses in the range 0x80000000 to 0xffffffff. */
17084 else if (TARGET_X32 && !(index || base)
17085 && CONST_INT_P (disp)
17086 && val_signbit_known_set_p (SImode, INTVAL (disp)))
17087 return false;
17090 /* Everything looks valid. */
17091 return true;
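/* A standalone illustration of the x32 restriction near the end of the
   function above; the helper below is illustrative only and is not used
   by the compiler.  */

static int
demo_x32_const_address_rejected (unsigned int disp32)
{
  /* Sign-extend the 32-bit constant the way the hardware does when it is
     used as a 64-bit address: 0x80000000 becomes 0xffffffff80000000,
     far outside the low 4GB that x32 can address, hence the rejection
     when there is no base or index register.  */
  long long extended = (int) disp32;
  return extended < 0;
}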
17094 /* Determine if a given RTX is a valid constant address. */
17096 bool
17097 constant_address_p (rtx x)
17099 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
17102 /* Return a unique alias set for the GOT. */
17104 static alias_set_type
17105 ix86_GOT_alias_set (void)
17107 static alias_set_type set = -1;
17108 if (set == -1)
17109 set = new_alias_set ();
17110 return set;
17113 /* Return a legitimate reference for ORIG (an address) using the
17114 register REG. If REG is 0, a new pseudo is generated.
17116 There are two types of references that must be handled:
17118 1. Global data references must load the address from the GOT, via
17119 the PIC reg. An insn is emitted to do this load, and the reg is
17120 returned.
17122 2. Static data references, constant pool addresses, and code labels
17123 compute the address as an offset from the GOT, whose base is in
17124 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
17125 differentiate them from global data objects. The returned
17126 address is the PIC reg + an unspec constant.
17128 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
17129 reg also appears in the address. */
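/* For illustration (typical ia32 AT&T output, with %ebx holding the PIC
   register; the exact form depends on the target and code model):
   case 1 loads the address of a global object from the GOT,
       movl  foo@GOT(%ebx), %eax
   while case 2 forms the address of local/static data as an offset from
   the GOT base,
       leal  bar@GOTOFF(%ebx), %eax  */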
17131 static rtx
17132 legitimize_pic_address (rtx orig, rtx reg)
17134 rtx addr = orig;
17135 rtx new_rtx = orig;
17137 #if TARGET_MACHO
17138 if (TARGET_MACHO && !TARGET_64BIT)
17140 if (reg == 0)
17141 reg = gen_reg_rtx (Pmode);
17142 /* Use the generic Mach-O PIC machinery. */
17143 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
17145 #endif
17147 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
17149 rtx tmp = legitimize_pe_coff_symbol (addr, true);
17150 if (tmp)
17151 return tmp;
17154 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
17155 new_rtx = addr;
17156 else if ((!TARGET_64BIT
17157 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
17158 && !TARGET_PECOFF
17159 && gotoff_operand (addr, Pmode))
17161 /* This symbol may be referenced via a displacement
17162 from the PIC base address (@GOTOFF). */
17163 if (GET_CODE (addr) == CONST)
17164 addr = XEXP (addr, 0);
17166 if (GET_CODE (addr) == PLUS)
17168 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
17169 UNSPEC_GOTOFF);
17170 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
17172 else
17173 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
17175 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17177 if (TARGET_64BIT)
17178 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
17180 if (reg != 0)
17182 gcc_assert (REG_P (reg));
17183 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
17184 new_rtx, reg, 1, OPTAB_DIRECT);
17186 else
17187 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
17189 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
17190 /* We can't use @GOTOFF for text labels
17191 on VxWorks, see gotoff_operand. */
17192 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
17194 rtx tmp = legitimize_pe_coff_symbol (addr, true);
17195 if (tmp)
17196 return tmp;
17198 /* For x64 PE-COFF there is no GOT table,
17199 so we use the address directly. */
17200 if (TARGET_64BIT && TARGET_PECOFF)
17202 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
17203 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17205 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
17207 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
17208 UNSPEC_GOTPCREL);
17209 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17210 new_rtx = gen_const_mem (Pmode, new_rtx);
17211 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
17213 else
17215 /* This symbol must be referenced via a load
17216 from the Global Offset Table (@GOT). */
17217 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
17218 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17219 if (TARGET_64BIT)
17220 new_rtx = force_reg (Pmode, new_rtx);
17221 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
17222 new_rtx = gen_const_mem (Pmode, new_rtx);
17223 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
17226 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
17228 else
17230 if (CONST_INT_P (addr)
17231 && !x86_64_immediate_operand (addr, VOIDmode))
17232 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
17233 else if (GET_CODE (addr) == CONST)
17235 addr = XEXP (addr, 0);
17237 /* We must match stuff we generate before. Assume the only
17238 unspecs that can get here are ours. Not that we could do
17239 anything with them anyway.... */
17240 if (GET_CODE (addr) == UNSPEC
17241 || (GET_CODE (addr) == PLUS
17242 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
17243 return orig;
17244 gcc_assert (GET_CODE (addr) == PLUS);
17247 if (GET_CODE (addr) == PLUS)
17249 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
17251 /* Check first to see if this is a constant
17252 offset from a @GOTOFF symbol reference. */
17253 if (!TARGET_PECOFF
17254 && gotoff_operand (op0, Pmode)
17255 && CONST_INT_P (op1))
17257 if (!TARGET_64BIT)
17259 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
17260 UNSPEC_GOTOFF);
17261 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
17262 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
17264 if (reg != 0)
17266 gcc_assert (REG_P (reg));
17267 new_rtx = expand_simple_binop (Pmode, PLUS,
17268 pic_offset_table_rtx,
17269 new_rtx, reg, 1,
17270 OPTAB_DIRECT);
17272 else
17273 new_rtx
17274 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
17276 else
17278 if (INTVAL (op1) < -16*1024*1024
17279 || INTVAL (op1) >= 16*1024*1024)
17281 if (!x86_64_immediate_operand (op1, Pmode))
17282 op1 = force_reg (Pmode, op1);
17284 new_rtx
17285 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
17289 else
17291 rtx base = legitimize_pic_address (op0, reg);
17292 machine_mode mode = GET_MODE (base);
17293 new_rtx
17294 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
17296 if (CONST_INT_P (new_rtx))
17298 if (INTVAL (new_rtx) < -16*1024*1024
17299 || INTVAL (new_rtx) >= 16*1024*1024)
17301 if (!x86_64_immediate_operand (new_rtx, mode))
17302 new_rtx = force_reg (mode, new_rtx);
17304 new_rtx
17305 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
17307 else
17308 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
17310 else
17312 /* For %rip addressing, we have to use
17313 just disp32, with neither base nor index. */
17314 if (TARGET_64BIT
17315 && (GET_CODE (base) == SYMBOL_REF
17316 || GET_CODE (base) == LABEL_REF))
17317 base = force_reg (mode, base);
17318 if (GET_CODE (new_rtx) == PLUS
17319 && CONSTANT_P (XEXP (new_rtx, 1)))
17321 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
17322 new_rtx = XEXP (new_rtx, 1);
17324 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
17329 return new_rtx;
17332 /* Load the thread pointer. If TO_REG is true, force it into a register. */
17334 static rtx
17335 get_thread_pointer (machine_mode tp_mode, bool to_reg)
17337 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
17339 if (GET_MODE (tp) != tp_mode)
17341 gcc_assert (GET_MODE (tp) == SImode);
17342 gcc_assert (tp_mode == DImode);
17344 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
17347 if (to_reg)
17348 tp = copy_to_mode_reg (tp_mode, tp);
17350 return tp;
17353 /* Construct the SYMBOL_REF for the tls_get_addr function. */
17355 static GTY(()) rtx ix86_tls_symbol;
17357 static rtx
17358 ix86_tls_get_addr (void)
17360 if (!ix86_tls_symbol)
17362 const char *sym
17363 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
17364 ? "___tls_get_addr" : "__tls_get_addr");
17366 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
17369 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
17371 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
17372 UNSPEC_PLTOFF);
17373 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
17374 gen_rtx_CONST (Pmode, unspec));
17377 return ix86_tls_symbol;
17380 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
17382 static GTY(()) rtx ix86_tls_module_base_symbol;
17384 static rtx
17385 ix86_tls_module_base (void)
17387 if (!ix86_tls_module_base_symbol)
17389 ix86_tls_module_base_symbol
17390 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
17392 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
17393 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
17396 return ix86_tls_module_base_symbol;
17399 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
17400 false if we expect this to be used for a memory address and true if
17401 we expect to load the address into a register. */
17403 static rtx
17404 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
17406 rtx dest, base, off;
17407 rtx pic = NULL_RTX, tp = NULL_RTX;
17408 machine_mode tp_mode = Pmode;
17409 int type;
17411 /* Fall back to the global dynamic model if the toolchain cannot support
17412 local dynamic. */
17413 if (TARGET_SUN_TLS && !TARGET_64BIT
17414 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
17415 && model == TLS_MODEL_LOCAL_DYNAMIC)
17416 model = TLS_MODEL_GLOBAL_DYNAMIC;
17418 switch (model)
17420 case TLS_MODEL_GLOBAL_DYNAMIC:
17421 dest = gen_reg_rtx (Pmode);
17423 if (!TARGET_64BIT)
17425 if (flag_pic && !TARGET_PECOFF)
17426 pic = pic_offset_table_rtx;
17427 else
17429 pic = gen_reg_rtx (Pmode);
17430 emit_insn (gen_set_got (pic));
17434 if (TARGET_GNU2_TLS)
17436 if (TARGET_64BIT)
17437 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
17438 else
17439 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
17441 tp = get_thread_pointer (Pmode, true);
17442 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
17444 if (GET_MODE (x) != Pmode)
17445 x = gen_rtx_ZERO_EXTEND (Pmode, x);
17447 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
17449 else
17451 rtx caddr = ix86_tls_get_addr ();
17453 if (TARGET_64BIT)
17455 rtx rax = gen_rtx_REG (Pmode, AX_REG);
17456 rtx_insn *insns;
17458 start_sequence ();
17459 emit_call_insn
17460 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
17461 insns = get_insns ();
17462 end_sequence ();
17464 if (GET_MODE (x) != Pmode)
17465 x = gen_rtx_ZERO_EXTEND (Pmode, x);
17467 RTL_CONST_CALL_P (insns) = 1;
17468 emit_libcall_block (insns, dest, rax, x);
17470 else
17471 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
17473 break;
17475 case TLS_MODEL_LOCAL_DYNAMIC:
17476 base = gen_reg_rtx (Pmode);
17478 if (!TARGET_64BIT)
17480 if (flag_pic)
17481 pic = pic_offset_table_rtx;
17482 else
17484 pic = gen_reg_rtx (Pmode);
17485 emit_insn (gen_set_got (pic));
17489 if (TARGET_GNU2_TLS)
17491 rtx tmp = ix86_tls_module_base ();
17493 if (TARGET_64BIT)
17494 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
17495 else
17496 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
17498 tp = get_thread_pointer (Pmode, true);
17499 set_unique_reg_note (get_last_insn (), REG_EQUAL,
17500 gen_rtx_MINUS (Pmode, tmp, tp));
17502 else
17504 rtx caddr = ix86_tls_get_addr ();
17506 if (TARGET_64BIT)
17508 rtx rax = gen_rtx_REG (Pmode, AX_REG);
17509 rtx_insn *insns;
17510 rtx eqv;
17512 start_sequence ();
17513 emit_call_insn
17514 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
17515 insns = get_insns ();
17516 end_sequence ();
17518 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
17519 share the LD_BASE result with other LD model accesses. */
17520 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
17521 UNSPEC_TLS_LD_BASE);
17523 RTL_CONST_CALL_P (insns) = 1;
17524 emit_libcall_block (insns, base, rax, eqv);
17526 else
17527 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
17530 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
17531 off = gen_rtx_CONST (Pmode, off);
17533 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
17535 if (TARGET_GNU2_TLS)
17537 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
17539 if (GET_MODE (x) != Pmode)
17540 x = gen_rtx_ZERO_EXTEND (Pmode, x);
17542 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
17544 break;
17546 case TLS_MODEL_INITIAL_EXEC:
17547 if (TARGET_64BIT)
17549 if (TARGET_SUN_TLS && !TARGET_X32)
17551 /* The Sun linker took the AMD64 TLS spec literally
17552 and can only handle %rax as destination of the
17553 initial executable code sequence. */
17555 dest = gen_reg_rtx (DImode);
17556 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
17557 return dest;
17560 /* Generate DImode references to avoid %fs:(%reg32)
17561 problems and the linker IE->LE relaxation bug. */
17562 tp_mode = DImode;
17563 pic = NULL;
17564 type = UNSPEC_GOTNTPOFF;
17566 else if (flag_pic)
17568 pic = pic_offset_table_rtx;
17569 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
17571 else if (!TARGET_ANY_GNU_TLS)
17573 pic = gen_reg_rtx (Pmode);
17574 emit_insn (gen_set_got (pic));
17575 type = UNSPEC_GOTTPOFF;
17577 else
17579 pic = NULL;
17580 type = UNSPEC_INDNTPOFF;
17583 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
17584 off = gen_rtx_CONST (tp_mode, off);
17585 if (pic)
17586 off = gen_rtx_PLUS (tp_mode, pic, off);
17587 off = gen_const_mem (tp_mode, off);
17588 set_mem_alias_set (off, ix86_GOT_alias_set ());
17590 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
17592 base = get_thread_pointer (tp_mode,
17593 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
17594 off = force_reg (tp_mode, off);
17595 dest = gen_rtx_PLUS (tp_mode, base, off);
17596 if (tp_mode != Pmode)
17597 dest = convert_to_mode (Pmode, dest, 1);
17599 else
17601 base = get_thread_pointer (Pmode, true);
17602 dest = gen_reg_rtx (Pmode);
17603 emit_insn (ix86_gen_sub3 (dest, base, off));
17605 break;
17607 case TLS_MODEL_LOCAL_EXEC:
17608 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
17609 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
17610 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
17611 off = gen_rtx_CONST (Pmode, off);
17613 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
17615 base = get_thread_pointer (Pmode,
17616 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
17617 return gen_rtx_PLUS (Pmode, base, off);
17619 else
17621 base = get_thread_pointer (Pmode, true);
17622 dest = gen_reg_rtx (Pmode);
17623 emit_insn (ix86_gen_sub3 (dest, base, off));
17625 break;
17627 default:
17628 gcc_unreachable ();
17631 return dest;
17634 /* Create or return the unique __imp_DECL dllimport symbol corresponding
17635 to symbol DECL if BEIMPORT is true. Otherwise create or return the
17636 unique refptr-DECL symbol corresponding to symbol DECL. */
17638 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
17640 static inline hashval_t hash (tree_map *m) { return m->hash; }
17641 static inline bool
17642 equal (tree_map *a, tree_map *b)
17644 return a->base.from == b->base.from;
17647 static int
17648 keep_cache_entry (tree_map *&m)
17650 return ggc_marked_p (m->base.from);
17654 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
17656 static tree
17657 get_dllimport_decl (tree decl, bool beimport)
17659 struct tree_map *h, in;
17660 const char *name;
17661 const char *prefix;
17662 size_t namelen, prefixlen;
17663 char *imp_name;
17664 tree to;
17665 rtx rtl;
17667 if (!dllimport_map)
17668 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
17670 in.hash = htab_hash_pointer (decl);
17671 in.base.from = decl;
17672 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
17673 h = *loc;
17674 if (h)
17675 return h->to;
17677 *loc = h = ggc_alloc<tree_map> ();
17678 h->hash = in.hash;
17679 h->base.from = decl;
17680 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
17681 VAR_DECL, NULL, ptr_type_node);
17682 DECL_ARTIFICIAL (to) = 1;
17683 DECL_IGNORED_P (to) = 1;
17684 DECL_EXTERNAL (to) = 1;
17685 TREE_READONLY (to) = 1;
17687 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
17688 name = targetm.strip_name_encoding (name);
17689 if (beimport)
17690 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
17691 ? "*__imp_" : "*__imp__";
17692 else
17693 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
17694 namelen = strlen (name);
17695 prefixlen = strlen (prefix);
17696 imp_name = (char *) alloca (namelen + prefixlen + 1);
17697 memcpy (imp_name, prefix, prefixlen);
17698 memcpy (imp_name + prefixlen, name, namelen + 1);
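/* Worked example (illustrative only, ignoring the fastcall special
   case): with BEIMPORT set and a stripped name "foo", imp_name becomes
   "*__imp__foo" when user_label_prefix is "_" (32-bit MinGW) and
   "*__imp_foo" when it is empty (x86-64 MinGW).  The leading '*' marks
   the name as already fully decorated, so no further prefix is applied
   when it is assembled.  */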
17700 name = ggc_alloc_string (imp_name, namelen + prefixlen);
17701 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
17702 SET_SYMBOL_REF_DECL (rtl, to);
17703 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
17704 if (!beimport)
17706 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
17707 #ifdef SUB_TARGET_RECORD_STUB
17708 SUB_TARGET_RECORD_STUB (name);
17709 #endif
17712 rtl = gen_const_mem (Pmode, rtl);
17713 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
17715 SET_DECL_RTL (to, rtl);
17716 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
17718 return to;
17721 /* Expand SYMBOL into its corresponding far-address symbol.
17722 WANT_REG is true if we require the result be a register. */
17724 static rtx
17725 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
17727 tree imp_decl;
17728 rtx x;
17730 gcc_assert (SYMBOL_REF_DECL (symbol));
17731 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
17733 x = DECL_RTL (imp_decl);
17734 if (want_reg)
17735 x = force_reg (Pmode, x);
17736 return x;
17739 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
17740 true if we require the result be a register. */
17742 static rtx
17743 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
17745 tree imp_decl;
17746 rtx x;
17748 gcc_assert (SYMBOL_REF_DECL (symbol));
17749 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
17751 x = DECL_RTL (imp_decl);
17752 if (want_reg)
17753 x = force_reg (Pmode, x);
17754 return x;
17757 /* Expand ADDR into its corresponding dllimport or refptr symbol. INREG
17758 is true if we require the result be a register. */
17760 static rtx
17761 legitimize_pe_coff_symbol (rtx addr, bool inreg)
17763 if (!TARGET_PECOFF)
17764 return NULL_RTX;
17766 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
17768 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
17769 return legitimize_dllimport_symbol (addr, inreg);
17770 if (GET_CODE (addr) == CONST
17771 && GET_CODE (XEXP (addr, 0)) == PLUS
17772 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
17773 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
17775 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
17776 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
17780 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
17781 return NULL_RTX;
17782 if (GET_CODE (addr) == SYMBOL_REF
17783 && !is_imported_p (addr)
17784 && SYMBOL_REF_EXTERNAL_P (addr)
17785 && SYMBOL_REF_DECL (addr))
17786 return legitimize_pe_coff_extern_decl (addr, inreg);
17788 if (GET_CODE (addr) == CONST
17789 && GET_CODE (XEXP (addr, 0)) == PLUS
17790 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
17791 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
17792 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
17793 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
17795 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
17796 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
17798 return NULL_RTX;
17801 /* Try machine-dependent ways of modifying an illegitimate address
17802 to be legitimate. If we find one, return the new, valid address.
17803 This macro is used in only one place: `memory_address' in explow.c.
17805 OLDX is the address as it was before break_out_memory_refs was called.
17806 In some cases it is useful to look at this to decide what needs to be done.
17808 It is always safe for this macro to do nothing. It exists to recognize
17809 opportunities to optimize the output.
17811 For the 80386, we handle X+REG by loading X into a register R and
17812 using R+REG. R will go in a general reg and indexing will be used.
17813 However, if REG is a broken-out memory address or multiplication,
17814 nothing needs to be done because REG can certainly go in a general reg.
17816 When -fpic is used, special handling is needed for symbolic references.
17817 See comments by legitimize_pic_address in i386.c for details. */
17819 static rtx
17820 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
17822 bool changed = false;
17823 unsigned log;
17825 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
17826 if (log)
17827 return legitimize_tls_address (x, (enum tls_model) log, false);
17828 if (GET_CODE (x) == CONST
17829 && GET_CODE (XEXP (x, 0)) == PLUS
17830 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
17831 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
17833 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
17834 (enum tls_model) log, false);
17835 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
17838 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
17840 rtx tmp = legitimize_pe_coff_symbol (x, true);
17841 if (tmp)
17842 return tmp;
17845 if (flag_pic && SYMBOLIC_CONST (x))
17846 return legitimize_pic_address (x, 0);
17848 #if TARGET_MACHO
17849 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
17850 return machopic_indirect_data_reference (x, 0);
17851 #endif
17853 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
17854 if (GET_CODE (x) == ASHIFT
17855 && CONST_INT_P (XEXP (x, 1))
17856 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
17858 changed = true;
17859 log = INTVAL (XEXP (x, 1));
17860 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
17861 GEN_INT (1 << log));
17864 if (GET_CODE (x) == PLUS)
17866 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
17868 if (GET_CODE (XEXP (x, 0)) == ASHIFT
17869 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
17870 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
17872 changed = true;
17873 log = INTVAL (XEXP (XEXP (x, 0), 1));
17874 XEXP (x, 0) = gen_rtx_MULT (Pmode,
17875 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
17876 GEN_INT (1 << log));
17879 if (GET_CODE (XEXP (x, 1)) == ASHIFT
17880 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
17881 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
17883 changed = true;
17884 log = INTVAL (XEXP (XEXP (x, 1), 1));
17885 XEXP (x, 1) = gen_rtx_MULT (Pmode,
17886 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
17887 GEN_INT (1 << log));
17890 /* Put multiply first if it isn't already. */
17891 if (GET_CODE (XEXP (x, 1)) == MULT)
17893 std::swap (XEXP (x, 0), XEXP (x, 1));
17894 changed = true;
17897 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
17898 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
17899 created by virtual register instantiation, register elimination, and
17900 similar optimizations. */
17901 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
17903 changed = true;
17904 x = gen_rtx_PLUS (Pmode,
17905 gen_rtx_PLUS (Pmode, XEXP (x, 0),
17906 XEXP (XEXP (x, 1), 0)),
17907 XEXP (XEXP (x, 1), 1));
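/* For illustration: register elimination can leave an address such as
     (plus (mult (reg) (const_int 4)) (plus (reg) (const_int 16)))
   which is not in the canonical shape ix86_decompose_address accepts;
   the rewrite above turns it into
     (plus (plus (mult (reg) (const_int 4)) (reg)) (const_int 16))
   i.e. the canonical base + index*scale + disp form.  */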
17910 /* Canonicalize
17911 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
17912 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
17913 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
17914 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
17915 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
17916 && CONSTANT_P (XEXP (x, 1)))
17918 rtx constant;
17919 rtx other = NULL_RTX;
17921 if (CONST_INT_P (XEXP (x, 1)))
17923 constant = XEXP (x, 1);
17924 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
17926 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
17928 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
17929 other = XEXP (x, 1);
17931 else
17932 constant = 0;
17934 if (constant)
17936 changed = true;
17937 x = gen_rtx_PLUS (Pmode,
17938 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
17939 XEXP (XEXP (XEXP (x, 0), 1), 0)),
17940 plus_constant (Pmode, other,
17941 INTVAL (constant)));
17945 if (changed && ix86_legitimate_address_p (mode, x, false))
17946 return x;
17948 if (GET_CODE (XEXP (x, 0)) == MULT)
17950 changed = true;
17951 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
17954 if (GET_CODE (XEXP (x, 1)) == MULT)
17956 changed = true;
17957 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
17960 if (changed
17961 && REG_P (XEXP (x, 1))
17962 && REG_P (XEXP (x, 0)))
17963 return x;
17965 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
17967 changed = true;
17968 x = legitimize_pic_address (x, 0);
17971 if (changed && ix86_legitimate_address_p (mode, x, false))
17972 return x;
17974 if (REG_P (XEXP (x, 0)))
17976 rtx temp = gen_reg_rtx (Pmode);
17977 rtx val = force_operand (XEXP (x, 1), temp);
17978 if (val != temp)
17980 val = convert_to_mode (Pmode, val, 1);
17981 emit_move_insn (temp, val);
17984 XEXP (x, 1) = temp;
17985 return x;
17988 else if (REG_P (XEXP (x, 1)))
17990 rtx temp = gen_reg_rtx (Pmode);
17991 rtx val = force_operand (XEXP (x, 0), temp);
17992 if (val != temp)
17994 val = convert_to_mode (Pmode, val, 1);
17995 emit_move_insn (temp, val);
17998 XEXP (x, 0) = temp;
17999 return x;
18003 return x;
18006 /* Print an integer constant expression in assembler syntax. Addition
18007 and subtraction are the only arithmetic that may appear in these
18008 expressions. FILE is the stdio stream to write to, X is the rtx, and
18009 CODE is the operand print code from the output string. */
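/* For instance, with -fpic a non-local SYMBOL_REF printed with operand
   code 'P' comes out roughly as "foo@PLT", while the UNSPEC wrappers
   handled below add suffixes such as "@GOTOFF" or "@GOT" (a sketch of
   the common ELF cases; "foo" is a placeholder name).  */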
18011 static void
18012 output_pic_addr_const (FILE *file, rtx x, int code)
18014 char buf[256];
18016 switch (GET_CODE (x))
18018 case PC:
18019 gcc_assert (flag_pic);
18020 putc ('.', file);
18021 break;
18023 case SYMBOL_REF:
18024 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
18025 output_addr_const (file, x);
18026 else
18028 const char *name = XSTR (x, 0);
18030 /* Mark the decl as referenced so that cgraph will
18031 output the function. */
18032 if (SYMBOL_REF_DECL (x))
18033 mark_decl_referenced (SYMBOL_REF_DECL (x));
18035 #if TARGET_MACHO
18036 if (MACHOPIC_INDIRECT
18037 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
18038 name = machopic_indirection_name (x, /*stub_p=*/true);
18039 #endif
18040 assemble_name (file, name);
18042 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
18043 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
18044 fputs ("@PLT", file);
18045 break;
18047 case LABEL_REF:
18048 x = XEXP (x, 0);
18049 /* FALLTHRU */
18050 case CODE_LABEL:
18051 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
18052 assemble_name (asm_out_file, buf);
18053 break;
18055 case CONST_INT:
18056 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
18057 break;
18059 case CONST:
18060 /* This used to output parentheses around the expression,
18061 but that does not work on the 386 (either ATT or BSD assembler). */
18062 output_pic_addr_const (file, XEXP (x, 0), code);
18063 break;
18065 case CONST_DOUBLE:
18066 /* We can't handle floating point constants;
18067 TARGET_PRINT_OPERAND must handle them. */
18068 output_operand_lossage ("floating constant misused");
18069 break;
18071 case PLUS:
18072 /* Some assemblers need integer constants to appear first. */
18073 if (CONST_INT_P (XEXP (x, 0)))
18075 output_pic_addr_const (file, XEXP (x, 0), code);
18076 putc ('+', file);
18077 output_pic_addr_const (file, XEXP (x, 1), code);
18079 else
18081 gcc_assert (CONST_INT_P (XEXP (x, 1)));
18082 output_pic_addr_const (file, XEXP (x, 1), code);
18083 putc ('+', file);
18084 output_pic_addr_const (file, XEXP (x, 0), code);
18086 break;
18088 case MINUS:
18089 if (!TARGET_MACHO)
18090 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
18091 output_pic_addr_const (file, XEXP (x, 0), code);
18092 putc ('-', file);
18093 output_pic_addr_const (file, XEXP (x, 1), code);
18094 if (!TARGET_MACHO)
18095 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
18096 break;
18098 case UNSPEC:
18099 gcc_assert (XVECLEN (x, 0) == 1);
18100 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
18101 switch (XINT (x, 1))
18103 case UNSPEC_GOT:
18104 fputs ("@GOT", file);
18105 break;
18106 case UNSPEC_GOTOFF:
18107 fputs ("@GOTOFF", file);
18108 break;
18109 case UNSPEC_PLTOFF:
18110 fputs ("@PLTOFF", file);
18111 break;
18112 case UNSPEC_PCREL:
18113 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18114 "(%rip)" : "[rip]", file);
18115 break;
18116 case UNSPEC_GOTPCREL:
18117 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18118 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
18119 break;
18120 case UNSPEC_GOTTPOFF:
18121 /* FIXME: This might be @TPOFF in Sun ld too. */
18122 fputs ("@gottpoff", file);
18123 break;
18124 case UNSPEC_TPOFF:
18125 fputs ("@tpoff", file);
18126 break;
18127 case UNSPEC_NTPOFF:
18128 if (TARGET_64BIT)
18129 fputs ("@tpoff", file);
18130 else
18131 fputs ("@ntpoff", file);
18132 break;
18133 case UNSPEC_DTPOFF:
18134 fputs ("@dtpoff", file);
18135 break;
18136 case UNSPEC_GOTNTPOFF:
18137 if (TARGET_64BIT)
18138 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18139 "@gottpoff(%rip)": "@gottpoff[rip]", file);
18140 else
18141 fputs ("@gotntpoff", file);
18142 break;
18143 case UNSPEC_INDNTPOFF:
18144 fputs ("@indntpoff", file);
18145 break;
18146 #if TARGET_MACHO
18147 case UNSPEC_MACHOPIC_OFFSET:
18148 putc ('-', file);
18149 machopic_output_function_base_name (file);
18150 break;
18151 #endif
18152 default:
18153 output_operand_lossage ("invalid UNSPEC as operand");
18154 break;
18156 break;
18158 default:
18159 output_operand_lossage ("invalid expression as operand");
18163 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
18164 We need to emit DTP-relative relocations. */
18166 static void ATTRIBUTE_UNUSED
18167 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
18169 fputs (ASM_LONG, file);
18170 output_addr_const (file, x);
18171 fputs ("@dtpoff", file);
18172 switch (size)
18174 case 4:
18175 break;
18176 case 8:
18177 fputs (", 0", file);
18178 break;
18179 default:
18180 gcc_unreachable ();
18184 /* Return true if X is a representation of the PIC register. This copes
18185 with calls from ix86_find_base_term, where the register might have
18186 been replaced by a cselib value. */
18188 static bool
18189 ix86_pic_register_p (rtx x)
18191 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
18192 return (pic_offset_table_rtx
18193 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
18194 else if (!REG_P (x))
18195 return false;
18196 else if (pic_offset_table_rtx)
18198 if (REGNO (x) == REGNO (pic_offset_table_rtx))
18199 return true;
18200 if (HARD_REGISTER_P (x)
18201 && !HARD_REGISTER_P (pic_offset_table_rtx)
18202 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
18203 return true;
18204 return false;
18206 else
18207 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
18210 /* Helper function for ix86_delegitimize_address.
18211 Attempt to delegitimize TLS local-exec accesses. */
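/* Roughly, a local-exec TLS access whose address decomposes into the
   TLS segment register plus a displacement of the form
     (const (unspec [(symbol_ref "x")] UNSPEC_NTPOFF))
   is mapped back to an address built around the plain symbol "x", with
   any base/index of the original address re-applied (sketch only; "x"
   is a placeholder symbol).  */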
18213 static rtx
18214 ix86_delegitimize_tls_address (rtx orig_x)
18216 rtx x = orig_x, unspec;
18217 struct ix86_address addr;
18219 if (!TARGET_TLS_DIRECT_SEG_REFS)
18220 return orig_x;
18221 if (MEM_P (x))
18222 x = XEXP (x, 0);
18223 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
18224 return orig_x;
18225 if (ix86_decompose_address (x, &addr) == 0
18226 || addr.seg != DEFAULT_TLS_SEG_REG
18227 || addr.disp == NULL_RTX
18228 || GET_CODE (addr.disp) != CONST)
18229 return orig_x;
18230 unspec = XEXP (addr.disp, 0);
18231 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
18232 unspec = XEXP (unspec, 0);
18233 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
18234 return orig_x;
18235 x = XVECEXP (unspec, 0, 0);
18236 gcc_assert (GET_CODE (x) == SYMBOL_REF);
18237 if (unspec != XEXP (addr.disp, 0))
18238 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
18239 if (addr.index)
18241 rtx idx = addr.index;
18242 if (addr.scale != 1)
18243 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
18244 x = gen_rtx_PLUS (Pmode, idx, x);
18246 if (addr.base)
18247 x = gen_rtx_PLUS (Pmode, addr.base, x);
18248 if (MEM_P (orig_x))
18249 x = replace_equiv_address_nv (orig_x, x);
18250 return x;
18253 /* In the name of slightly smaller debug output, and to cater to
18254 general assembler lossage, recognize PIC+GOTOFF and turn it back
18255 into a direct symbol reference.
18257 On Darwin, this is necessary to avoid a crash, because Darwin
18258 has a different PIC label for each routine but the DWARF debugging
18259 information is not associated with any particular routine, so it's
18260 necessary to remove references to the PIC label from RTL stored by
18261 the DWARF output code.
18263 This helper is used in the normal ix86_delegitimize_address
18264 entrypoint (e.g. used in the target delegitimization hook) and
18265 in ix86_find_base_term. As a compile-time memory optimization, we
18266 avoid allocating rtxes that will not change the outcome for the
18267 callers (find_base_value and find_base_term). */
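/* A typical -m32 case (illustrative only): the address
     (plus (reg %ebx) (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)))
   with %ebx holding the PIC register is turned back into the plain
   (symbol_ref "foo"); any additional register or constant addend is
   re-applied around the recovered symbol.  */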
18269 static inline rtx
18270 ix86_delegitimize_address_1 (rtx x, bool base_term_p)
18272 rtx orig_x = delegitimize_mem_from_attrs (x);
18273 /* addend is NULL or some rtx if x is something+GOTOFF where
18274 something doesn't include the PIC register. */
18275 rtx addend = NULL_RTX;
18276 /* reg_addend is NULL or a multiple of some register. */
18277 rtx reg_addend = NULL_RTX;
18278 /* const_addend is NULL or a const_int. */
18279 rtx const_addend = NULL_RTX;
18280 /* This is the result, or NULL. */
18281 rtx result = NULL_RTX;
18283 x = orig_x;
18285 if (MEM_P (x))
18286 x = XEXP (x, 0);
18288 if (TARGET_64BIT)
18290 if (GET_CODE (x) == CONST
18291 && GET_CODE (XEXP (x, 0)) == PLUS
18292 && GET_MODE (XEXP (x, 0)) == Pmode
18293 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
18294 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
18295 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
18297 /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
18298 base. A CONST can't be arg_pointer_rtx based. */
18299 if (base_term_p && MEM_P (orig_x))
18300 return orig_x;
18301 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
18302 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
18303 if (MEM_P (orig_x))
18304 x = replace_equiv_address_nv (orig_x, x);
18305 return x;
18308 if (GET_CODE (x) == CONST
18309 && GET_CODE (XEXP (x, 0)) == UNSPEC
18310 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
18311 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
18312 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
18314 x = XVECEXP (XEXP (x, 0), 0, 0);
18315 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
18317 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
18318 if (x == NULL_RTX)
18319 return orig_x;
18321 return x;
18324 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
18325 return ix86_delegitimize_tls_address (orig_x);
18327 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
18328 and -mcmodel=medium -fpic. */
18331 if (GET_CODE (x) != PLUS
18332 || GET_CODE (XEXP (x, 1)) != CONST)
18333 return ix86_delegitimize_tls_address (orig_x);
18335 if (ix86_pic_register_p (XEXP (x, 0)))
18336 /* %ebx + GOT/GOTOFF */
18338 else if (GET_CODE (XEXP (x, 0)) == PLUS)
18340 /* %ebx + %reg * scale + GOT/GOTOFF */
18341 reg_addend = XEXP (x, 0);
18342 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
18343 reg_addend = XEXP (reg_addend, 1);
18344 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
18345 reg_addend = XEXP (reg_addend, 0);
18346 else
18348 reg_addend = NULL_RTX;
18349 addend = XEXP (x, 0);
18352 else
18353 addend = XEXP (x, 0);
18355 x = XEXP (XEXP (x, 1), 0);
18356 if (GET_CODE (x) == PLUS
18357 && CONST_INT_P (XEXP (x, 1)))
18359 const_addend = XEXP (x, 1);
18360 x = XEXP (x, 0);
18363 if (GET_CODE (x) == UNSPEC
18364 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
18365 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
18366 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
18367 && !MEM_P (orig_x) && !addend)))
18368 result = XVECEXP (x, 0, 0);
18370 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
18371 && !MEM_P (orig_x))
18372 result = XVECEXP (x, 0, 0);
18374 if (! result)
18375 return ix86_delegitimize_tls_address (orig_x);
18377 /* For (PLUS something CONST_INT) both find_base_{value,term} just
18378 recurse on the first operand. */
18379 if (const_addend && !base_term_p)
18380 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
18381 if (reg_addend)
18382 result = gen_rtx_PLUS (Pmode, reg_addend, result);
18383 if (addend)
18385 /* If the rest of original X doesn't involve the PIC register, add
18386 addend and subtract pic_offset_table_rtx. This can happen e.g.
18387 for code like:
18388 leal (%ebx, %ecx, 4), %ecx
18390 movl foo@GOTOFF(%ecx), %edx
18391 in which case we return (%ecx - %ebx) + foo
18392 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
18393 and reload has completed. */
18394 if (pic_offset_table_rtx
18395 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
18396 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
18397 pic_offset_table_rtx),
18398 result);
18399 else if (pic_offset_table_rtx && !TARGET_MACHO && !TARGET_VXWORKS_RTP)
18401 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
18402 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
18403 result = gen_rtx_PLUS (Pmode, tmp, result);
18405 else
18406 return orig_x;
18408 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
18410 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
18411 if (result == NULL_RTX)
18412 return orig_x;
18414 return result;
18417 /* The normal instantiation of the above template. */
18419 static rtx
18420 ix86_delegitimize_address (rtx x)
18422 return ix86_delegitimize_address_1 (x, false);
18425 /* If X is a machine specific address (i.e. a symbol or label being
18426 referenced as a displacement from the GOT implemented using an
18427 UNSPEC), then return the base term. Otherwise return X. */
18430 ix86_find_base_term (rtx x)
18432 rtx term;
18434 if (TARGET_64BIT)
18436 if (GET_CODE (x) != CONST)
18437 return x;
18438 term = XEXP (x, 0);
18439 if (GET_CODE (term) == PLUS
18440 && CONST_INT_P (XEXP (term, 1)))
18441 term = XEXP (term, 0);
18442 if (GET_CODE (term) != UNSPEC
18443 || (XINT (term, 1) != UNSPEC_GOTPCREL
18444 && XINT (term, 1) != UNSPEC_PCREL))
18445 return x;
18447 return XVECEXP (term, 0, 0);
18450 return ix86_delegitimize_address_1 (x, true);
18453 static void
18454 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
18455 bool fp, FILE *file)
18457 const char *suffix;
18459 if (mode == CCFPmode || mode == CCFPUmode)
18461 code = ix86_fp_compare_code_to_integer (code);
18462 mode = CCmode;
18464 if (reverse)
18465 code = reverse_condition (code);
18467 switch (code)
18469 case EQ:
18470 switch (mode)
18472 case E_CCAmode:
18473 suffix = "a";
18474 break;
18475 case E_CCCmode:
18476 suffix = "c";
18477 break;
18478 case E_CCOmode:
18479 suffix = "o";
18480 break;
18481 case E_CCPmode:
18482 suffix = "p";
18483 break;
18484 case E_CCSmode:
18485 suffix = "s";
18486 break;
18487 default:
18488 suffix = "e";
18489 break;
18491 break;
18492 case NE:
18493 switch (mode)
18495 case E_CCAmode:
18496 suffix = "na";
18497 break;
18498 case E_CCCmode:
18499 suffix = "nc";
18500 break;
18501 case E_CCOmode:
18502 suffix = "no";
18503 break;
18504 case E_CCPmode:
18505 suffix = "np";
18506 break;
18507 case E_CCSmode:
18508 suffix = "ns";
18509 break;
18510 default:
18511 suffix = "ne";
18512 break;
18514 break;
18515 case GT:
18516 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
18517 suffix = "g";
18518 break;
18519 case GTU:
18520 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
18521 Those same assemblers have the same but opposite lossage on cmov. */
18522 if (mode == CCmode)
18523 suffix = fp ? "nbe" : "a";
18524 else
18525 gcc_unreachable ();
18526 break;
18527 case LT:
18528 switch (mode)
18530 case E_CCNOmode:
18531 case E_CCGOCmode:
18532 suffix = "s";
18533 break;
18535 case E_CCmode:
18536 case E_CCGCmode:
18537 suffix = "l";
18538 break;
18540 default:
18541 gcc_unreachable ();
18543 break;
18544 case LTU:
18545 if (mode == CCmode)
18546 suffix = "b";
18547 else if (mode == CCCmode)
18548 suffix = fp ? "b" : "c";
18549 else
18550 gcc_unreachable ();
18551 break;
18552 case GE:
18553 switch (mode)
18555 case E_CCNOmode:
18556 case E_CCGOCmode:
18557 suffix = "ns";
18558 break;
18560 case E_CCmode:
18561 case E_CCGCmode:
18562 suffix = "ge";
18563 break;
18565 default:
18566 gcc_unreachable ();
18568 break;
18569 case GEU:
18570 if (mode == CCmode)
18571 suffix = "nb";
18572 else if (mode == CCCmode)
18573 suffix = fp ? "nb" : "nc";
18574 else
18575 gcc_unreachable ();
18576 break;
18577 case LE:
18578 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
18579 suffix = "le";
18580 break;
18581 case LEU:
18582 if (mode == CCmode)
18583 suffix = "be";
18584 else
18585 gcc_unreachable ();
18586 break;
18587 case UNORDERED:
18588 suffix = fp ? "u" : "p";
18589 break;
18590 case ORDERED:
18591 suffix = fp ? "nu" : "np";
18592 break;
18593 default:
18594 gcc_unreachable ();
18596 fputs (suffix, file);
18599 /* Print the name of register X to FILE based on its machine mode and number.
18600 If CODE is 'w', pretend the mode is HImode.
18601 If CODE is 'b', pretend the mode is QImode.
18602 If CODE is 'k', pretend the mode is SImode.
18603 If CODE is 'q', pretend the mode is DImode.
18604 If CODE is 'x', pretend the mode is V4SFmode.
18605 If CODE is 't', pretend the mode is V8SFmode.
18606 If CODE is 'g', pretend the mode is V16SFmode.
18607 If CODE is 'h', pretend the reg is the 'high' byte register.
18608 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
18609 If CODE is 'd', duplicate the operand for an AVX instruction.
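/* For example, in AT&T syntax the AX register prints as "%al" with
   code 'b', "%ax" with 'w', "%eax" with 'k', "%rax" with 'q' (on 64-bit
   targets) and "%ah" with 'h' (a sketch of the common cases handled
   below).  */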
18612 void
18613 print_reg (rtx x, int code, FILE *file)
18615 const char *reg;
18616 int msize;
18617 unsigned int regno;
18618 bool duplicated;
18620 if (ASSEMBLER_DIALECT == ASM_ATT)
18621 putc ('%', file);
18623 if (x == pc_rtx)
18625 gcc_assert (TARGET_64BIT);
18626 fputs ("rip", file);
18627 return;
18630 if (code == 'y' && STACK_TOP_P (x))
18632 fputs ("st(0)", file);
18633 return;
18636 if (code == 'w')
18637 msize = 2;
18638 else if (code == 'b')
18639 msize = 1;
18640 else if (code == 'k')
18641 msize = 4;
18642 else if (code == 'q')
18643 msize = 8;
18644 else if (code == 'h')
18645 msize = 0;
18646 else if (code == 'x')
18647 msize = 16;
18648 else if (code == 't')
18649 msize = 32;
18650 else if (code == 'g')
18651 msize = 64;
18652 else
18653 msize = GET_MODE_SIZE (GET_MODE (x));
18655 regno = REGNO (x);
18657 if (regno == ARG_POINTER_REGNUM
18658 || regno == FRAME_POINTER_REGNUM
18659 || regno == FPSR_REG
18660 || regno == FPCR_REG)
18662 output_operand_lossage
18663 ("invalid use of register '%s'", reg_names[regno]);
18664 return;
18666 else if (regno == FLAGS_REG)
18668 output_operand_lossage ("invalid use of asm flag output");
18669 return;
18672 duplicated = code == 'd' && TARGET_AVX;
18674 switch (msize)
18676 case 16:
18677 case 12:
18678 case 8:
18679 if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode))
18680 warning (0, "unsupported size for integer register");
18681 /* FALLTHRU */
18682 case 4:
18683 if (LEGACY_INT_REGNO_P (regno))
18684 putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file);
18685 /* FALLTHRU */
18686 case 2:
18687 normal:
18688 reg = hi_reg_name[regno];
18689 break;
18690 case 1:
18691 if (regno >= ARRAY_SIZE (qi_reg_name))
18692 goto normal;
18693 if (!ANY_QI_REGNO_P (regno))
18694 error ("unsupported size for integer register");
18695 reg = qi_reg_name[regno];
18696 break;
18697 case 0:
18698 if (regno >= ARRAY_SIZE (qi_high_reg_name))
18699 goto normal;
18700 reg = qi_high_reg_name[regno];
18701 break;
18702 case 32:
18703 case 64:
18704 if (SSE_REGNO_P (regno))
18706 gcc_assert (!duplicated);
18707 putc (msize == 32 ? 'y' : 'z', file);
18708 reg = hi_reg_name[regno] + 1;
18709 break;
18711 goto normal;
18712 default:
18713 gcc_unreachable ();
18716 fputs (reg, file);
18718 /* Irritatingly, AMD extended registers use a
18719 different naming convention: "r%d[bwd]" */
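/* E.g. r8 comes out as "%r8b", "%r8w", "%r8d" or "%r8" for the 1, 2, 4
   and 8 byte sizes respectively (AT&T syntax sketch).  */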
18720 if (REX_INT_REGNO_P (regno))
18722 gcc_assert (TARGET_64BIT);
18723 switch (msize)
18725 case 0:
18726 error ("extended registers have no high halves");
18727 break;
18728 case 1:
18729 putc ('b', file);
18730 break;
18731 case 2:
18732 putc ('w', file);
18733 break;
18734 case 4:
18735 putc ('d', file);
18736 break;
18737 case 8:
18738 /* no suffix */
18739 break;
18740 default:
18741 error ("unsupported operand size for extended register");
18742 break;
18744 return;
18747 if (duplicated)
18749 if (ASSEMBLER_DIALECT == ASM_ATT)
18750 fprintf (file, ", %%%s", reg);
18751 else
18752 fprintf (file, ", %s", reg);
18756 /* Meaning of CODE:
18757 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
18758 C -- print opcode suffix for set/cmov insn.
18759 c -- like C, but print reversed condition
18760 F,f -- likewise, but for floating-point.
18761 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
18762 otherwise nothing
18763 R -- print embedded rounding and sae.
18764 r -- print only sae.
18765 z -- print the opcode suffix for the size of the current operand.
18766 Z -- likewise, with special suffixes for x87 instructions.
18767 * -- print a star (in certain assembler syntax)
18768 A -- print an absolute memory reference.
18769 E -- print address with DImode register names if TARGET_64BIT.
18770 w -- print the operand as if it's a "word" (HImode) even if it isn't.
18771 s -- print a shift double count, followed by the assembler's argument
18772 delimiter.
18773 b -- print the QImode name of the register for the indicated operand.
18774 %b0 would print %al if operands[0] is reg 0.
18775 w -- likewise, print the HImode name of the register.
18776 k -- likewise, print the SImode name of the register.
18777 q -- likewise, print the DImode name of the register.
18778 x -- likewise, print the V4SFmode name of the register.
18779 t -- likewise, print the V8SFmode name of the register.
18780 g -- likewise, print the V16SFmode name of the register.
18781 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
18782 y -- print "st(0)" instead of "st" as a register.
18783 d -- print duplicated register operand for AVX instruction.
18784 D -- print condition for SSE cmp instruction.
18785 P -- if PIC, print an @PLT suffix.
18786 p -- print raw symbol name.
18787 X -- don't print any sort of PIC '@' suffix for a symbol.
18788 & -- print some in-use local-dynamic symbol name.
18789 H -- print a memory address offset by 8; used for sse high-parts
18790 Y -- print condition for XOP pcom* instruction.
18791 + -- print a branch hint as 'cs' or 'ds' prefix
18792 ; -- print a semicolon (after prefixes due to bug in older gas).
18793 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
18794 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
18795 ! -- print MPX prefix for jxx/call/ret instructions if required.
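/* As a rough illustration, an insn template such as
     "add%z0\t{%1, %0|%0, %1}"
   applied to SImode register operands prints something like
   "addl %ecx, %eax" under AT&T syntax and "add eax, ecx" under Intel
   syntax; the '%z0' supplies the "l" size suffix only for AT&T output
   (the template and registers here are illustrative, not taken from
   the machine description).  */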
18798 void
18799 ix86_print_operand (FILE *file, rtx x, int code)
18801 if (code)
18803 switch (code)
18805 case 'A':
18806 switch (ASSEMBLER_DIALECT)
18808 case ASM_ATT:
18809 putc ('*', file);
18810 break;
18812 case ASM_INTEL:
18813 /* Intel syntax. For absolute addresses, registers should not
18814 be surrounded by brackets. */
18815 if (!REG_P (x))
18817 putc ('[', file);
18818 ix86_print_operand (file, x, 0);
18819 putc (']', file);
18820 return;
18822 break;
18824 default:
18825 gcc_unreachable ();
18828 ix86_print_operand (file, x, 0);
18829 return;
18831 case 'E':
18832 /* Wrap address in an UNSPEC to declare special handling. */
18833 if (TARGET_64BIT)
18834 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
18836 output_address (VOIDmode, x);
18837 return;
18839 case 'L':
18840 if (ASSEMBLER_DIALECT == ASM_ATT)
18841 putc ('l', file);
18842 return;
18844 case 'W':
18845 if (ASSEMBLER_DIALECT == ASM_ATT)
18846 putc ('w', file);
18847 return;
18849 case 'B':
18850 if (ASSEMBLER_DIALECT == ASM_ATT)
18851 putc ('b', file);
18852 return;
18854 case 'Q':
18855 if (ASSEMBLER_DIALECT == ASM_ATT)
18856 putc ('l', file);
18857 return;
18859 case 'S':
18860 if (ASSEMBLER_DIALECT == ASM_ATT)
18861 putc ('s', file);
18862 return;
18864 case 'T':
18865 if (ASSEMBLER_DIALECT == ASM_ATT)
18866 putc ('t', file);
18867 return;
18869 case 'O':
18870 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
18871 if (ASSEMBLER_DIALECT != ASM_ATT)
18872 return;
18874 switch (GET_MODE_SIZE (GET_MODE (x)))
18876 case 2:
18877 putc ('w', file);
18878 break;
18880 case 4:
18881 putc ('l', file);
18882 break;
18884 case 8:
18885 putc ('q', file);
18886 break;
18888 default:
18889 output_operand_lossage ("invalid operand size for operand "
18890 "code 'O'");
18891 return;
18894 putc ('.', file);
18895 #endif
18896 return;
18898 case 'z':
18899 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18901 /* Opcodes don't get size suffixes if using Intel syntax. */
18902 if (ASSEMBLER_DIALECT == ASM_INTEL)
18903 return;
18905 switch (GET_MODE_SIZE (GET_MODE (x)))
18907 case 1:
18908 putc ('b', file);
18909 return;
18911 case 2:
18912 putc ('w', file);
18913 return;
18915 case 4:
18916 putc ('l', file);
18917 return;
18919 case 8:
18920 putc ('q', file);
18921 return;
18923 default:
18924 output_operand_lossage ("invalid operand size for operand "
18925 "code 'z'");
18926 return;
18930 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18931 warning (0, "non-integer operand used with operand code 'z'");
18932 /* FALLTHRU */
18934 case 'Z':
18935 /* 387 opcodes don't get size suffixes if using Intel syntax. */
18936 if (ASSEMBLER_DIALECT == ASM_INTEL)
18937 return;
18939 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18941 switch (GET_MODE_SIZE (GET_MODE (x)))
18943 case 2:
18944 #ifdef HAVE_AS_IX86_FILDS
18945 putc ('s', file);
18946 #endif
18947 return;
18949 case 4:
18950 putc ('l', file);
18951 return;
18953 case 8:
18954 #ifdef HAVE_AS_IX86_FILDQ
18955 putc ('q', file);
18956 #else
18957 fputs ("ll", file);
18958 #endif
18959 return;
18961 default:
18962 break;
18965 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18967 /* 387 opcodes don't get size suffixes
18968 if the operands are registers. */
18969 if (STACK_REG_P (x))
18970 return;
18972 switch (GET_MODE_SIZE (GET_MODE (x)))
18974 case 4:
18975 putc ('s', file);
18976 return;
18978 case 8:
18979 putc ('l', file);
18980 return;
18982 case 12:
18983 case 16:
18984 putc ('t', file);
18985 return;
18987 default:
18988 break;
18991 else
18993 output_operand_lossage ("invalid operand type used with "
18994 "operand code 'Z'");
18995 return;
18998 output_operand_lossage ("invalid operand size for operand code 'Z'");
18999 return;
19001 case 'd':
19002 case 'b':
19003 case 'w':
19004 case 'k':
19005 case 'q':
19006 case 'h':
19007 case 't':
19008 case 'g':
19009 case 'y':
19010 case 'x':
19011 case 'X':
19012 case 'P':
19013 case 'p':
19014 break;
19016 case 's':
19017 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
19019 ix86_print_operand (file, x, 0);
19020 fputs (", ", file);
19022 return;
19024 case 'Y':
19025 switch (GET_CODE (x))
19027 case NE:
19028 fputs ("neq", file);
19029 break;
19030 case EQ:
19031 fputs ("eq", file);
19032 break;
19033 case GE:
19034 case GEU:
19035 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
19036 break;
19037 case GT:
19038 case GTU:
19039 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
19040 break;
19041 case LE:
19042 case LEU:
19043 fputs ("le", file);
19044 break;
19045 case LT:
19046 case LTU:
19047 fputs ("lt", file);
19048 break;
19049 case UNORDERED:
19050 fputs ("unord", file);
19051 break;
19052 case ORDERED:
19053 fputs ("ord", file);
19054 break;
19055 case UNEQ:
19056 fputs ("ueq", file);
19057 break;
19058 case UNGE:
19059 fputs ("nlt", file);
19060 break;
19061 case UNGT:
19062 fputs ("nle", file);
19063 break;
19064 case UNLE:
19065 fputs ("ule", file);
19066 break;
19067 case UNLT:
19068 fputs ("ult", file);
19069 break;
19070 case LTGT:
19071 fputs ("une", file);
19072 break;
19073 default:
19074 output_operand_lossage ("operand is not a condition code, "
19075 "invalid operand code 'Y'");
19076 return;
19078 return;
19080 case 'D':
19081 /* Little bit of braindamage here. The SSE compare instructions
19082 use completely different names for the comparisons than the
19083 fp conditional moves do.
19084 switch (GET_CODE (x))
19086 case UNEQ:
19087 if (TARGET_AVX)
19089 fputs ("eq_us", file);
19090 break;
19092 /* FALLTHRU */
19093 case EQ:
19094 fputs ("eq", file);
19095 break;
19096 case UNLT:
19097 if (TARGET_AVX)
19099 fputs ("nge", file);
19100 break;
19102 /* FALLTHRU */
19103 case LT:
19104 fputs ("lt", file);
19105 break;
19106 case UNLE:
19107 if (TARGET_AVX)
19109 fputs ("ngt", file);
19110 break;
19112 /* FALLTHRU */
19113 case LE:
19114 fputs ("le", file);
19115 break;
19116 case UNORDERED:
19117 fputs ("unord", file);
19118 break;
19119 case LTGT:
19120 if (TARGET_AVX)
19122 fputs ("neq_oq", file);
19123 break;
19125 /* FALLTHRU */
19126 case NE:
19127 fputs ("neq", file);
19128 break;
19129 case GE:
19130 if (TARGET_AVX)
19132 fputs ("ge", file);
19133 break;
19135 /* FALLTHRU */
19136 case UNGE:
19137 fputs ("nlt", file);
19138 break;
19139 case GT:
19140 if (TARGET_AVX)
19142 fputs ("gt", file);
19143 break;
19145 /* FALLTHRU */
19146 case UNGT:
19147 fputs ("nle", file);
19148 break;
19149 case ORDERED:
19150 fputs ("ord", file);
19151 break;
19152 default:
19153 output_operand_lossage ("operand is not a condition code, "
19154 "invalid operand code 'D'");
19155 return;
19157 return;
19159 case 'F':
19160 case 'f':
19161 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
19162 if (ASSEMBLER_DIALECT == ASM_ATT)
19163 putc ('.', file);
19164 gcc_fallthrough ();
19165 #endif
19167 case 'C':
19168 case 'c':
19169 if (!COMPARISON_P (x))
19171 output_operand_lossage ("operand is not a condition code, "
19172 "invalid operand code '%c'", code);
19173 return;
19175 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
19176 code == 'c' || code == 'f',
19177 code == 'F' || code == 'f',
19178 file);
19179 return;
19181 case 'H':
19182 if (!offsettable_memref_p (x))
19184 output_operand_lossage ("operand is not an offsettable memory "
19185 "reference, invalid operand code 'H'");
19186 return;
19188 /* It doesn't actually matter what mode we use here, as we're
19189 only going to use this for printing. */
19190 x = adjust_address_nv (x, DImode, 8);
19191 /* Output 'qword ptr' for intel assembler dialect. */
19192 if (ASSEMBLER_DIALECT == ASM_INTEL)
19193 code = 'q';
19194 break;
19196 case 'K':
19197 if (!CONST_INT_P (x))
19199 output_operand_lossage ("operand is not an integer, invalid "
19200 "operand code 'K'");
19201 return;
19204 if (INTVAL (x) & IX86_HLE_ACQUIRE)
19205 #ifdef HAVE_AS_IX86_HLE
19206 fputs ("xacquire ", file);
19207 #else
19208 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
19209 #endif
19210 else if (INTVAL (x) & IX86_HLE_RELEASE)
19211 #ifdef HAVE_AS_IX86_HLE
19212 fputs ("xrelease ", file);
19213 #else
19214 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
19215 #endif
19216 /* We do not want to print the value of the operand. */
19217 return;
19219 case 'N':
19220 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
19221 fputs ("{z}", file);
19222 return;
19224 case 'r':
19225 if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
19227 output_operand_lossage ("operand is not a specific integer, "
19228 "invalid operand code 'r'");
19229 return;
19232 if (ASSEMBLER_DIALECT == ASM_INTEL)
19233 fputs (", ", file);
19235 fputs ("{sae}", file);
19237 if (ASSEMBLER_DIALECT == ASM_ATT)
19238 fputs (", ", file);
19240 return;
19242 case 'R':
19243 if (!CONST_INT_P (x))
19245 output_operand_lossage ("operand is not an integer, invalid "
19246 "operand code 'R'");
19247 return;
19250 if (ASSEMBLER_DIALECT == ASM_INTEL)
19251 fputs (", ", file);
19253 switch (INTVAL (x))
19255 case ROUND_NEAREST_INT | ROUND_SAE:
19256 fputs ("{rn-sae}", file);
19257 break;
19258 case ROUND_NEG_INF | ROUND_SAE:
19259 fputs ("{rd-sae}", file);
19260 break;
19261 case ROUND_POS_INF | ROUND_SAE:
19262 fputs ("{ru-sae}", file);
19263 break;
19264 case ROUND_ZERO | ROUND_SAE:
19265 fputs ("{rz-sae}", file);
19266 break;
19267 default:
19268 output_operand_lossage ("operand is not a specific integer, "
19269 "invalid operand code 'R'");
19272 if (ASSEMBLER_DIALECT == ASM_ATT)
19273 fputs (", ", file);
19275 return;
19277 case '*':
19278 if (ASSEMBLER_DIALECT == ASM_ATT)
19279 putc ('*', file);
19280 return;
19282 case '&':
19284 const char *name = get_some_local_dynamic_name ();
19285 if (name == NULL)
19286 output_operand_lossage ("'%%&' used without any "
19287 "local dynamic TLS references");
19288 else
19289 assemble_name (file, name);
19290 return;
19293 case '+':
19295 rtx x;
19297 if (!optimize
19298 || optimize_function_for_size_p (cfun)
19299 || !TARGET_BRANCH_PREDICTION_HINTS)
19300 return;
19302 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
19303 if (x)
19305 int pred_val = profile_probability::from_reg_br_prob_note
19306 (XINT (x, 0)).to_reg_br_prob_base ();
19308 if (pred_val < REG_BR_PROB_BASE * 45 / 100
19309 || pred_val > REG_BR_PROB_BASE * 55 / 100)
19311 bool taken = pred_val > REG_BR_PROB_BASE / 2;
19312 bool cputaken
19313 = final_forward_branch_p (current_output_insn) == 0;
19315 /* Emit hints only in case the default branch prediction
19316 heuristics would fail. */
19317 if (taken != cputaken)
19319 /* We use 3e (DS) prefix for taken branches and
19320 2e (CS) prefix for not taken branches. */
19321 if (taken)
19322 fputs ("ds ; ", file);
19323 else
19324 fputs ("cs ; ", file);
19328 return;
19331 case ';':
19332 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
19333 putc (';', file);
19334 #endif
19335 return;
19337 case '~':
19338 putc (TARGET_AVX2 ? 'i' : 'f', file);
19339 return;
19341 case '^':
19342 if (TARGET_64BIT && Pmode != word_mode)
19343 fputs ("addr32 ", file);
19344 return;
19346 case '!':
19347 if (ix86_bnd_prefixed_insn_p (current_output_insn))
19348 fputs ("bnd ", file);
19349 return;
19351 default:
19352 output_operand_lossage ("invalid operand code '%c'", code);
19356 if (REG_P (x))
19357 print_reg (x, code, file);
19359 else if (MEM_P (x))
19361 rtx addr = XEXP (x, 0);
19363 /* No `byte ptr' prefix for call instructions ... */
19364 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
19366 machine_mode mode = GET_MODE (x);
19367 const char *size;
19369 /* Check for explicit size override codes. */
19370 if (code == 'b')
19371 size = "BYTE";
19372 else if (code == 'w')
19373 size = "WORD";
19374 else if (code == 'k')
19375 size = "DWORD";
19376 else if (code == 'q')
19377 size = "QWORD";
19378 else if (code == 'x')
19379 size = "XMMWORD";
19380 else if (code == 't')
19381 size = "YMMWORD";
19382 else if (code == 'g')
19383 size = "ZMMWORD";
19384 else if (mode == BLKmode)
19385 /* ... or BLKmode operands, when not overridden. */
19386 size = NULL;
19387 else
19388 switch (GET_MODE_SIZE (mode))
19390 case 1: size = "BYTE"; break;
19391 case 2: size = "WORD"; break;
19392 case 4: size = "DWORD"; break;
19393 case 8: size = "QWORD"; break;
19394 case 12: size = "TBYTE"; break;
19395 case 16:
19396 if (mode == XFmode)
19397 size = "TBYTE";
19398 else
19399 size = "XMMWORD";
19400 break;
19401 case 32: size = "YMMWORD"; break;
19402 case 64: size = "ZMMWORD"; break;
19403 default:
19404 gcc_unreachable ();
19406 if (size)
19408 fputs (size, file);
19409 fputs (" PTR ", file);
19413 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
19414 output_operand_lossage ("invalid constraints for operand");
19415 else
19416 ix86_print_operand_address_as
19417 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
19420 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
19422 long l;
19424 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
19426 if (ASSEMBLER_DIALECT == ASM_ATT)
19427 putc ('$', file);
19428 /* Sign extend 32bit SFmode immediate to 8 bytes. */
19429 if (code == 'q')
19430 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
19431 (unsigned long long) (int) l);
19432 else
19433 fprintf (file, "0x%08x", (unsigned int) l);
19436 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
19438 long l[2];
19440 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
19442 if (ASSEMBLER_DIALECT == ASM_ATT)
19443 putc ('$', file);
19444 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
19447 /* These float cases don't actually occur as immediate operands. */
19448 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
19450 char dstr[30];
19452 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
19453 fputs (dstr, file);
19456 else
19458 /* We have patterns that allow zero sets of memory, for instance.
19459 In 64-bit mode, we should probably support all 8-byte vectors,
19460 since we can in fact encode that into an immediate. */
19461 if (GET_CODE (x) == CONST_VECTOR)
19463 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
19464 x = const0_rtx;
19467 if (code != 'P' && code != 'p')
19469 if (CONST_INT_P (x))
19471 if (ASSEMBLER_DIALECT == ASM_ATT)
19472 putc ('$', file);
19474 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
19475 || GET_CODE (x) == LABEL_REF)
19477 if (ASSEMBLER_DIALECT == ASM_ATT)
19478 putc ('$', file);
19479 else
19480 fputs ("OFFSET FLAT:", file);
19483 if (CONST_INT_P (x))
19484 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
19485 else if (flag_pic || MACHOPIC_INDIRECT)
19486 output_pic_addr_const (file, x, code);
19487 else
19488 output_addr_const (file, x);
19492 static bool
19493 ix86_print_operand_punct_valid_p (unsigned char code)
19495 return (code == '*' || code == '+' || code == '&' || code == ';'
19496 || code == '~' || code == '^' || code == '!');
19499 /* Print a memory operand whose address is ADDR. */
19501 static void
19502 ix86_print_operand_address_as (FILE *file, rtx addr,
19503 addr_space_t as, bool no_rip)
19505 struct ix86_address parts;
19506 rtx base, index, disp;
19507 int scale;
19508 int ok;
19509 bool vsib = false;
19510 int code = 0;
19512 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
19514 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
19515 gcc_assert (parts.index == NULL_RTX);
19516 parts.index = XVECEXP (addr, 0, 1);
19517 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
19518 addr = XVECEXP (addr, 0, 0);
19519 vsib = true;
19521 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
19523 gcc_assert (TARGET_64BIT);
19524 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
19525 code = 'q';
19527 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
19529 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
19530 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
19531 if (parts.base != NULL_RTX)
19533 parts.index = parts.base;
19534 parts.scale = 1;
19536 parts.base = XVECEXP (addr, 0, 0);
19537 addr = XVECEXP (addr, 0, 0);
19539 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
19541 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
19542 gcc_assert (parts.index == NULL_RTX);
19543 parts.index = XVECEXP (addr, 0, 1);
19544 addr = XVECEXP (addr, 0, 0);
19546 else
19547 ok = ix86_decompose_address (addr, &parts);
19549 gcc_assert (ok);
19551 base = parts.base;
19552 index = parts.index;
19553 disp = parts.disp;
19554 scale = parts.scale;
19556 if (ADDR_SPACE_GENERIC_P (as))
19557 as = parts.seg;
19558 else
19559 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
19561 if (!ADDR_SPACE_GENERIC_P (as))
19563 const char *string;
19565 if (as == ADDR_SPACE_SEG_FS)
19566 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
19567 else if (as == ADDR_SPACE_SEG_GS)
19568 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
19569 else
19570 gcc_unreachable ();
19571 fputs (string, file);
19574 /* Use one byte shorter RIP relative addressing for 64bit mode. */
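/* E.g. a bare symbol address is then printed as "foo(%rip)" in AT&T
   syntax (or "foo[rip]"-style in Intel syntax) instead of as an
   absolute "foo", by treating pc_rtx as the base register below
   ("foo" is a placeholder).  */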
19575 if (TARGET_64BIT && !base && !index && !no_rip)
19577 rtx symbol = disp;
19579 if (GET_CODE (disp) == CONST
19580 && GET_CODE (XEXP (disp, 0)) == PLUS
19581 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
19582 symbol = XEXP (XEXP (disp, 0), 0);
19584 if (GET_CODE (symbol) == LABEL_REF
19585 || (GET_CODE (symbol) == SYMBOL_REF
19586 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
19587 base = pc_rtx;
19590 if (!base && !index)
19592 /* A displacement-only address requires special attention. */
19593 if (CONST_INT_P (disp))
19595 if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as))
19596 fputs ("ds:", file);
19597 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
19599 /* Load the external function address via the GOT slot to avoid PLT. */
19600 else if (GET_CODE (disp) == CONST
19601 && GET_CODE (XEXP (disp, 0)) == UNSPEC
19602 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
19603 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
19604 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
19605 output_pic_addr_const (file, disp, 0);
19606 else if (flag_pic)
19607 output_pic_addr_const (file, disp, 0);
19608 else
19609 output_addr_const (file, disp);
19611 else
19613 /* Print SImode register names to force addr32 prefix. */
19614 if (SImode_address_operand (addr, VOIDmode))
19616 if (flag_checking)
19618 gcc_assert (TARGET_64BIT);
19619 switch (GET_CODE (addr))
19621 case SUBREG:
19622 gcc_assert (GET_MODE (addr) == SImode);
19623 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
19624 break;
19625 case ZERO_EXTEND:
19626 case AND:
19627 gcc_assert (GET_MODE (addr) == DImode);
19628 break;
19629 default:
19630 gcc_unreachable ();
19633 gcc_assert (!code);
19634 code = 'k';
19636 else if (code == 0
19637 && TARGET_X32
19638 && disp
19639 && CONST_INT_P (disp)
19640 && INTVAL (disp) < -16*1024*1024)
19642 /* X32 runs in 64-bit mode, where displacement, DISP, in
19643 address DISP(%r64), is encoded as 32-bit immediate sign-
19644 extended from 32-bit to 64-bit. For -0x40000300(%r64),
19645 address is %r64 + 0xffffffffbffffd00. When %r64 <
19646 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
19647 which is invalid for x32. The correct address is %r64
19648 - 0x40000300 == 0xf7ffdd64. To properly encode
19649 -0x40000300(%r64) for x32, we zero-extend negative
19650 displacement by forcing addr32 prefix which truncates
19651 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
19652 zero-extend all negative displacements, including -1(%rsp).
19653 However, for small negative displacements, sign-extension
19654 won't cause overflow. We only zero-extend negative
19655 displacements if they are < -16*1024*1024, which is also used
19656 to check legitimate address displacements for PIC. */
19657 code = 'k';
19660 if (ASSEMBLER_DIALECT == ASM_ATT)
19662 if (disp)
19664 if (flag_pic)
19665 output_pic_addr_const (file, disp, 0);
19666 else if (GET_CODE (disp) == LABEL_REF)
19667 output_asm_label (disp);
19668 else
19669 output_addr_const (file, disp);
19672 putc ('(', file);
19673 if (base)
19674 print_reg (base, code, file);
19675 if (index)
19677 putc (',', file);
19678 print_reg (index, vsib ? 0 : code, file);
19679 if (scale != 1 || vsib)
19680 fprintf (file, ",%d", scale);
19682 putc (')', file);
19684 else
19686 rtx offset = NULL_RTX;
19688 if (disp)
19690 /* Pull out the offset of a symbol; print any symbol itself. */
19691 if (GET_CODE (disp) == CONST
19692 && GET_CODE (XEXP (disp, 0)) == PLUS
19693 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
19695 offset = XEXP (XEXP (disp, 0), 1);
19696 disp = gen_rtx_CONST (VOIDmode,
19697 XEXP (XEXP (disp, 0), 0));
19700 if (flag_pic)
19701 output_pic_addr_const (file, disp, 0);
19702 else if (GET_CODE (disp) == LABEL_REF)
19703 output_asm_label (disp);
19704 else if (CONST_INT_P (disp))
19705 offset = disp;
19706 else
19707 output_addr_const (file, disp);
19710 putc ('[', file);
19711 if (base)
19713 print_reg (base, code, file);
19714 if (offset)
19716 if (INTVAL (offset) >= 0)
19717 putc ('+', file);
19718 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
19721 else if (offset)
19722 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
19723 else
19724 putc ('0', file);
19726 if (index)
19728 putc ('+', file);
19729 print_reg (index, vsib ? 0 : code, file);
19730 if (scale != 1 || vsib)
19731 fprintf (file, "*%d", scale);
19733 putc (']', file);
19738 static void
19739 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
19741 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
19744 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
19746 static bool
19747 i386_asm_output_addr_const_extra (FILE *file, rtx x)
19749 rtx op;
19751 if (GET_CODE (x) != UNSPEC)
19752 return false;
19754 op = XVECEXP (x, 0, 0);
19755 switch (XINT (x, 1))
19757 case UNSPEC_GOTTPOFF:
19758 output_addr_const (file, op);
19759 /* FIXME: This might be @TPOFF in Sun ld. */
19760 fputs ("@gottpoff", file);
19761 break;
19762 case UNSPEC_TPOFF:
19763 output_addr_const (file, op);
19764 fputs ("@tpoff", file);
19765 break;
19766 case UNSPEC_NTPOFF:
19767 output_addr_const (file, op);
19768 if (TARGET_64BIT)
19769 fputs ("@tpoff", file);
19770 else
19771 fputs ("@ntpoff", file);
19772 break;
19773 case UNSPEC_DTPOFF:
19774 output_addr_const (file, op);
19775 fputs ("@dtpoff", file);
19776 break;
19777 case UNSPEC_GOTNTPOFF:
19778 output_addr_const (file, op);
19779 if (TARGET_64BIT)
19780 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
19781 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
19782 else
19783 fputs ("@gotntpoff", file);
19784 break;
19785 case UNSPEC_INDNTPOFF:
19786 output_addr_const (file, op);
19787 fputs ("@indntpoff", file);
19788 break;
19789 #if TARGET_MACHO
19790 case UNSPEC_MACHOPIC_OFFSET:
19791 output_addr_const (file, op);
19792 putc ('-', file);
19793 machopic_output_function_base_name (file);
19794 break;
19795 #endif
19797 default:
19798 return false;
19801 return true;
19804 /* Split one or more double-mode RTL references into pairs of half-mode
19805 references. The RTL can be REG, offsettable MEM, integer constant, or
19806 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
19807 split and "num" is its length. lo_half and hi_half are output arrays
19808 that parallel "operands". */
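/* A minimal usage sketch: to split a single DImode value on a 32-bit
   target,

     rtx op = operands[0], lo, hi;
     split_double_mode (DImode, &op, 1, &lo, &hi);

   leaves the SImode low half in LO and the high half in HI; OP may be
   a REG, an offsettable MEM or a constant as described above.  */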
19810 void
19811 split_double_mode (machine_mode mode, rtx operands[],
19812 int num, rtx lo_half[], rtx hi_half[])
19814 machine_mode half_mode;
19815 unsigned int byte;
19817 switch (mode)
19819 case E_TImode:
19820 half_mode = DImode;
19821 break;
19822 case E_DImode:
19823 half_mode = SImode;
19824 break;
19825 default:
19826 gcc_unreachable ();
19829 byte = GET_MODE_SIZE (half_mode);
19831 while (num--)
19833 rtx op = operands[num];
19835 /* simplify_subreg refuses to split volatile memory addresses,
19836 but we still have to handle them. */
19837 if (MEM_P (op))
19839 lo_half[num] = adjust_address (op, half_mode, 0);
19840 hi_half[num] = adjust_address (op, half_mode, byte);
19842 else
19844 lo_half[num] = simplify_gen_subreg (half_mode, op,
19845 GET_MODE (op) == VOIDmode
19846 ? mode : GET_MODE (op), 0);
19847 hi_half[num] = simplify_gen_subreg (half_mode, op,
19848 GET_MODE (op) == VOIDmode
19849 ? mode : GET_MODE (op), byte);
19854 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
19855 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
19856 is the expression of the binary operation. The output may either be
19857 emitted here, or returned to the caller, like all output_* functions.
19859 There is no guarantee that the operands are the same mode, as they
19860 might be within FLOAT or FLOAT_EXTEND expressions. */
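/* Roughly, for an SSE DFmode addition this returns
   "vaddsd\t{%2, %1, %0|%0, %1, %2}" when AVX is enabled and
   "addsd\t{%2, %0|%0, %2}" otherwise, while for x87 stack operands one
   of the fadd/faddp forms below is chosen depending on which operand
   dies (illustrative of the PLUS case only).  */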
19862 #ifndef SYSV386_COMPAT
19863 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
19864 wants to fix the assemblers because that causes incompatibility
19865 with gcc. No-one wants to fix gcc because that causes
19866 incompatibility with assemblers... You can use the option of
19867 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
19868 #define SYSV386_COMPAT 1
19869 #endif
19871 const char *
19872 output_387_binary_op (rtx_insn *insn, rtx *operands)
19874 static char buf[40];
19875 const char *p;
19876 const char *ssep;
19877 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
19879 /* Even if we do not want to check the inputs, this documents input
19880 constraints, which helps in understanding the following code. */
19881 if (flag_checking)
19883 if (STACK_REG_P (operands[0])
19884 && ((REG_P (operands[1])
19885 && REGNO (operands[0]) == REGNO (operands[1])
19886 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
19887 || (REG_P (operands[2])
19888 && REGNO (operands[0]) == REGNO (operands[2])
19889 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
19890 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
19891 ; /* ok */
19892 else
19893 gcc_assert (is_sse);
19896 switch (GET_CODE (operands[3]))
19898 case PLUS:
19899 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19900 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19901 p = "fiadd";
19902 else
19903 p = "fadd";
19904 ssep = "vadd";
19905 break;
19907 case MINUS:
19908 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19909 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19910 p = "fisub";
19911 else
19912 p = "fsub";
19913 ssep = "vsub";
19914 break;
19916 case MULT:
19917 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19918 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19919 p = "fimul";
19920 else
19921 p = "fmul";
19922 ssep = "vmul";
19923 break;
19925 case DIV:
19926 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19927 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19928 p = "fidiv";
19929 else
19930 p = "fdiv";
19931 ssep = "vdiv";
19932 break;
19934 default:
19935 gcc_unreachable ();
19938 if (is_sse)
19940 if (TARGET_AVX)
19942 strcpy (buf, ssep);
19943 if (GET_MODE (operands[0]) == SFmode)
19944 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
19945 else
19946 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
19948 else
19950 strcpy (buf, ssep + 1);
19951 if (GET_MODE (operands[0]) == SFmode)
19952 strcat (buf, "ss\t{%2, %0|%0, %2}");
19953 else
19954 strcat (buf, "sd\t{%2, %0|%0, %2}");
19956 return buf;
19958 strcpy (buf, p);
19960 switch (GET_CODE (operands[3]))
19962 case MULT:
19963 case PLUS:
19964 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
19965 std::swap (operands[1], operands[2]);
19967 /* We know operands[0] == operands[1] at this point. */
19969 if (MEM_P (operands[2]))
19971 p = "%Z2\t%2";
19972 break;
19975 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19977 if (STACK_TOP_P (operands[0]))
19978 /* How is it that we are storing to a dead operand[2]?
19979 Well, presumably operands[1] is dead too. We can't
19980 store the result to st(0) as st(0) gets popped on this
19981 instruction. Instead store to operands[2] (which I
19982 think has to be st(1)). st(1) will be popped later.
19983 gcc <= 2.8.1 didn't have this check and generated
19984 assembly code that the Unixware assembler rejected. */
19985 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
19986 else
19987 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
19988 break;
19991 if (STACK_TOP_P (operands[0]))
19992 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
19993 else
19994 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
19995 break;
19997 case MINUS:
19998 case DIV:
19999 if (MEM_P (operands[1]))
20001 p = "r%Z1\t%1";
20002 break;
20005 if (MEM_P (operands[2]))
20007 p = "%Z2\t%2";
20008 break;
20011 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
20013 #if SYSV386_COMPAT
20014 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
20015 derived assemblers, confusingly reverse the direction of
20016 the operation for fsub{r} and fdiv{r} when the
20017 destination register is not st(0). The Intel assembler
20018 doesn't have this brain damage. Read !SYSV386_COMPAT to
20019 figure out what the hardware really does. */
20020 if (STACK_TOP_P (operands[0]))
20021 p = "{p\t%0, %2|rp\t%2, %0}";
20022 else
20023 p = "{rp\t%2, %0|p\t%0, %2}";
20024 #else
20025 if (STACK_TOP_P (operands[0]))
20026 /* As above for fmul/fadd, we can't store to st(0). */
20027 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
20028 else
20029 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
20030 #endif
20031 break;
20034 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
20036 #if SYSV386_COMPAT
20037 if (STACK_TOP_P (operands[0]))
20038 p = "{rp\t%0, %1|p\t%1, %0}";
20039 else
20040 p = "{p\t%1, %0|rp\t%0, %1}";
20041 #else
20042 if (STACK_TOP_P (operands[0]))
20043 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
20044 else
20045 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
20046 #endif
20047 break;
20050 if (STACK_TOP_P (operands[0]))
20052 if (STACK_TOP_P (operands[1]))
20053 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
20054 else
20055 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
20056 break;
20058 else if (STACK_TOP_P (operands[1]))
20060 #if SYSV386_COMPAT
20061 p = "{\t%1, %0|r\t%0, %1}";
20062 #else
20063 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
20064 #endif
20066 else
20068 #if SYSV386_COMPAT
20069 p = "{r\t%2, %0|\t%0, %2}";
20070 #else
20071 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
20072 #endif
20074 break;
20076 default:
20077 gcc_unreachable ();
20080 strcat (buf, p);
20081 return buf;
20084 /* Return needed mode for entity in optimize_mode_switching pass. */
20086 static int
20087 ix86_dirflag_mode_needed (rtx_insn *insn)
20089 if (CALL_P (insn))
20091 if (cfun->machine->func_type == TYPE_NORMAL)
20092 return X86_DIRFLAG_ANY;
20093 else
20094 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
20095 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
20098 if (recog_memoized (insn) < 0)
20099 return X86_DIRFLAG_ANY;
20101 if (get_attr_type (insn) == TYPE_STR)
20103 /* Emit cld instruction if stringops are used in the function. */
20104 if (cfun->machine->func_type == TYPE_NORMAL)
20105 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
20106 else
20107 return X86_DIRFLAG_RESET;
20110 return X86_DIRFLAG_ANY;
20113 /* Check if a 256bit AVX register is referenced inside of EXP. */
20115 static bool
20116 ix86_check_avx256_register (const_rtx exp)
20118 if (SUBREG_P (exp))
20119 exp = SUBREG_REG (exp);
20121 return (REG_P (exp)
20122 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)));
20125 /* Return needed mode for entity in optimize_mode_switching pass. */
20127 static int
20128 ix86_avx_u128_mode_needed (rtx_insn *insn)
20130 if (CALL_P (insn))
20132 rtx link;
20134 /* Needed mode is set to AVX_U128_CLEAN if there are
20135 no 256bit modes used in function arguments. */
20136 for (link = CALL_INSN_FUNCTION_USAGE (insn);
20137 link;
20138 link = XEXP (link, 1))
20140 if (GET_CODE (XEXP (link, 0)) == USE)
20142 rtx arg = XEXP (XEXP (link, 0), 0);
20144 if (ix86_check_avx256_register (arg))
20145 return AVX_U128_DIRTY;
20149 return AVX_U128_CLEAN;
20152 /* Require DIRTY mode if a 256bit AVX register is referenced. The hardware
20153 changes state only when a 256bit register is written to, but we need
20154 to prevent the compiler from moving the optimal insertion point above
20155 an eventual read from a 256bit register. */
20156 subrtx_iterator::array_type array;
20157 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
20158 if (ix86_check_avx256_register (*iter))
20159 return AVX_U128_DIRTY;
20161 return AVX_U128_ANY;
20164 /* Return mode that i387 must be switched into
20165 prior to the execution of insn. */
20167 static int
20168 ix86_i387_mode_needed (int entity, rtx_insn *insn)
20170 enum attr_i387_cw mode;
20172 /* The mode UNINITIALIZED is used to store the control word after a
20173 function call or ASM pattern. The mode ANY specifies that the function
20174 has no requirements on the control word and makes no changes in the
20175 bits we are interested in. */
20177 if (CALL_P (insn)
20178 || (NONJUMP_INSN_P (insn)
20179 && (asm_noperands (PATTERN (insn)) >= 0
20180 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
20181 return I387_CW_UNINITIALIZED;
20183 if (recog_memoized (insn) < 0)
20184 return I387_CW_ANY;
20186 mode = get_attr_i387_cw (insn);
20188 switch (entity)
20190 case I387_TRUNC:
20191 if (mode == I387_CW_TRUNC)
20192 return mode;
20193 break;
20195 case I387_FLOOR:
20196 if (mode == I387_CW_FLOOR)
20197 return mode;
20198 break;
20200 case I387_CEIL:
20201 if (mode == I387_CW_CEIL)
20202 return mode;
20203 break;
20205 case I387_MASK_PM:
20206 if (mode == I387_CW_MASK_PM)
20207 return mode;
20208 break;
20210 default:
20211 gcc_unreachable ();
20214 return I387_CW_ANY;
20217 /* Return mode that entity must be switched into
20218 prior to the execution of insn. */
20220 static int
20221 ix86_mode_needed (int entity, rtx_insn *insn)
20223 switch (entity)
20225 case X86_DIRFLAG:
20226 return ix86_dirflag_mode_needed (insn);
20227 case AVX_U128:
20228 return ix86_avx_u128_mode_needed (insn);
20229 case I387_TRUNC:
20230 case I387_FLOOR:
20231 case I387_CEIL:
20232 case I387_MASK_PM:
20233 return ix86_i387_mode_needed (entity, insn);
20234 default:
20235 gcc_unreachable ();
20237 return 0;
20240 /* Check if a 256bit AVX register is referenced in stores. */
20242 static void
20243 ix86_check_avx256_stores (rtx dest, const_rtx, void *data)
20245 if (ix86_check_avx256_register (dest))
20247 bool *used = (bool *) data;
20248 *used = true;
20252 /* Calculate mode of upper 128bit AVX registers after the insn. */
20254 static int
20255 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
20257 rtx pat = PATTERN (insn);
20259 if (vzeroupper_operation (pat, VOIDmode)
20260 || vzeroall_operation (pat, VOIDmode))
20261 return AVX_U128_CLEAN;
20263 /* We know that state is clean after CALL insn if there are no
20264 256bit registers used in the function return register. */
20265 if (CALL_P (insn))
20267 bool avx_reg256_found = false;
20268 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
20270 return avx_reg256_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
20273 /* Otherwise, return current mode. Remember that if insn
20274 references AVX 256bit registers, the mode was already changed
20275 to DIRTY from MODE_NEEDED. */
20276 return mode;
20279 /* Return the mode that an insn results in. */
20281 static int
20282 ix86_mode_after (int entity, int mode, rtx_insn *insn)
20284 switch (entity)
20286 case X86_DIRFLAG:
20287 return mode;
20288 case AVX_U128:
20289 return ix86_avx_u128_mode_after (mode, insn);
20290 case I387_TRUNC:
20291 case I387_FLOOR:
20292 case I387_CEIL:
20293 case I387_MASK_PM:
20294 return mode;
20295 default:
20296 gcc_unreachable ();
20300 static int
20301 ix86_dirflag_mode_entry (void)
20303 /* For TARGET_CLD or in the interrupt handler we can't assume
20304 direction flag state at function entry. */
20305 if (TARGET_CLD
20306 || cfun->machine->func_type != TYPE_NORMAL)
20307 return X86_DIRFLAG_ANY;
20309 return X86_DIRFLAG_RESET;
20312 static int
20313 ix86_avx_u128_mode_entry (void)
20315 tree arg;
20317 /* Entry mode is set to AVX_U128_DIRTY if there are
20318 256bit modes used in function arguments. */
20319 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
20320 arg = TREE_CHAIN (arg))
20322 rtx incoming = DECL_INCOMING_RTL (arg);
20324 if (incoming && ix86_check_avx256_register (incoming))
20325 return AVX_U128_DIRTY;
20328 return AVX_U128_CLEAN;
20331 /* Return a mode that ENTITY is assumed to be
20332 switched to at function entry. */
20334 static int
20335 ix86_mode_entry (int entity)
20337 switch (entity)
20339 case X86_DIRFLAG:
20340 return ix86_dirflag_mode_entry ();
20341 case AVX_U128:
20342 return ix86_avx_u128_mode_entry ();
20343 case I387_TRUNC:
20344 case I387_FLOOR:
20345 case I387_CEIL:
20346 case I387_MASK_PM:
20347 return I387_CW_ANY;
20348 default:
20349 gcc_unreachable ();
20353 static int
20354 ix86_avx_u128_mode_exit (void)
20356 rtx reg = crtl->return_rtx;
20358 /* Exit mode is set to AVX_U128_DIRTY if there are
20359 256bit modes used in the function return register. */
20360 if (reg && ix86_check_avx256_register (reg))
20361 return AVX_U128_DIRTY;
20363 return AVX_U128_CLEAN;
20366 /* Return a mode that ENTITY is assumed to be
20367 switched to at function exit. */
20369 static int
20370 ix86_mode_exit (int entity)
20372 switch (entity)
20374 case X86_DIRFLAG:
20375 return X86_DIRFLAG_ANY;
20376 case AVX_U128:
20377 return ix86_avx_u128_mode_exit ();
20378 case I387_TRUNC:
20379 case I387_FLOOR:
20380 case I387_CEIL:
20381 case I387_MASK_PM:
20382 return I387_CW_ANY;
20383 default:
20384 gcc_unreachable ();
20388 static int
20389 ix86_mode_priority (int, int n)
20391 return n;
20394 /* Output code to initialize control word copies used by trunc?f?i and
20395 rounding patterns. CURRENT_MODE is set to current control word,
20396 while NEW_MODE is set to new control word. */
20398 static void
20399 emit_i387_cw_initialization (int mode)
20401 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
20402 rtx new_mode;
20404 enum ix86_stack_slot slot;
20406 rtx reg = gen_reg_rtx (HImode);
20408 emit_insn (gen_x86_fnstcw_1 (stored_mode));
20409 emit_move_insn (reg, copy_rtx (stored_mode));
20411 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
20412 || optimize_insn_for_size_p ())
20414 switch (mode)
20416 case I387_CW_TRUNC:
20417 /* round toward zero (truncate) */
20418 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
20419 slot = SLOT_CW_TRUNC;
20420 break;
20422 case I387_CW_FLOOR:
20423 /* round down toward -oo */
20424 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
20425 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
20426 slot = SLOT_CW_FLOOR;
20427 break;
20429 case I387_CW_CEIL:
20430 /* round up toward +oo */
20431 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
20432 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
20433 slot = SLOT_CW_CEIL;
20434 break;
20436 case I387_CW_MASK_PM:
20437 /* mask precision exception for nearbyint() */
20438 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
20439 slot = SLOT_CW_MASK_PM;
20440 break;
20442 default:
20443 gcc_unreachable ();
20446 else
20448 switch (mode)
20450 case I387_CW_TRUNC:
20451 /* round toward zero (truncate) */
20452 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
20453 slot = SLOT_CW_TRUNC;
20454 break;
20456 case I387_CW_FLOOR:
20457 /* round down toward -oo */
20458 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
20459 slot = SLOT_CW_FLOOR;
20460 break;
20462 case I387_CW_CEIL:
20463 /* round up toward +oo */
20464 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
20465 slot = SLOT_CW_CEIL;
20466 break;
20468 case I387_CW_MASK_PM:
20469 /* mask precision exception for nearbyint() */
20470 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
20471 slot = SLOT_CW_MASK_PM;
20472 break;
20474 default:
20475 gcc_unreachable ();
20479 gcc_assert (slot < MAX_386_STACK_LOCALS);
20481 new_mode = assign_386_stack_local (HImode, slot);
20482 emit_move_insn (new_mode, reg);
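/* Illustrative sketch, not GCC internal code: the bit twiddling above targets
   the x87 control word's rounding-control field (bits 11:10, mask 0x0c00) and
   the precision-exception mask bit (bit 5, mask 0x0020).  These helpers mirror
   the four cases in plain C.  */
static unsigned short
x87_cw_trunc (unsigned short cw)
{
  return cw | 0x0c00;                 /* RC = 11: round toward zero */
}

static unsigned short
x87_cw_floor (unsigned short cw)
{
  return (cw & ~0x0c00) | 0x0400;     /* RC = 01: round toward -inf */
}

static unsigned short
x87_cw_ceil (unsigned short cw)
{
  return (cw & ~0x0c00) | 0x0800;     /* RC = 10: round toward +inf */
}

static unsigned short
x87_cw_mask_pm (unsigned short cw)
{
  return cw | 0x0020;                 /* PM = 1: mask the precision exception */
}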
20485 /* Emit vzeroupper. */
20487 void
20488 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
20490 int i;
20492 /* Cancel automatic vzeroupper insertion if there are
20493 live call-saved SSE registers at the insertion point. */
20495 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
20496 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
20497 return;
20499 if (TARGET_64BIT)
20500 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
20501 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
20502 return;
20504 emit_insn (gen_avx_vzeroupper ());
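/* Illustrative sketch, not GCC internal code: the source-level situation the
   AVX_U128 mode switching handles.  After a 256-bit operation the upper
   halves of the YMM registers are "dirty"; emitting vzeroupper before a call
   into possibly-legacy SSE code avoids the SSE/AVX transition penalty on
   affected microarchitectures.  */
#include <immintrin.h>

extern void legacy_sse_routine (float *p);

void
scale_then_call (float *p)
{
  __m256 v = _mm256_loadu_ps (p);
  v = _mm256_add_ps (v, v);
  _mm256_storeu_ps (p, v);
  legacy_sse_routine (p);   /* a vzeroupper is normally emitted before this call */
}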
20509 /* Generate one or more insns to set ENTITY to MODE. HARD_REG_LIVE
20510 is the set of hard registers live at the point where the insn(s)
20511 are to be inserted. */
20513 static void
20514 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
20515 HARD_REG_SET regs_live)
20517 switch (entity)
20519 case X86_DIRFLAG:
20520 if (mode == X86_DIRFLAG_RESET)
20521 emit_insn (gen_cld ());
20522 break;
20523 case AVX_U128:
20524 if (mode == AVX_U128_CLEAN)
20525 ix86_avx_emit_vzeroupper (regs_live);
20526 break;
20527 case I387_TRUNC:
20528 case I387_FLOOR:
20529 case I387_CEIL:
20530 case I387_MASK_PM:
20531 if (mode != I387_CW_ANY
20532 && mode != I387_CW_UNINITIALIZED)
20533 emit_i387_cw_initialization (mode);
20534 break;
20535 default:
20536 gcc_unreachable ();
20540 /* Output code for INSN to convert a float to a signed int. OPERANDS
20541 are the insn operands. The output may be [HSD]Imode and the input
20542 operand may be [SDX]Fmode. */
20544 const char *
20545 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
20547 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
20548 int dimode_p = GET_MODE (operands[0]) == DImode;
20549 int round_mode = get_attr_i387_cw (insn);
20551 /* Jump through a hoop or two for DImode, since the hardware has no
20552 non-popping instruction. We used to do this a different way, but
20553 that was somewhat fragile and broke with post-reload splitters. */
20554 if ((dimode_p || fisttp) && !stack_top_dies)
20555 output_asm_insn ("fld\t%y1", operands);
20557 gcc_assert (STACK_TOP_P (operands[1]));
20558 gcc_assert (MEM_P (operands[0]));
20559 gcc_assert (GET_MODE (operands[1]) != TFmode);
20561 if (fisttp)
20562 output_asm_insn ("fisttp%Z0\t%0", operands);
20563 else
20565 if (round_mode != I387_CW_ANY)
20566 output_asm_insn ("fldcw\t%3", operands);
20567 if (stack_top_dies || dimode_p)
20568 output_asm_insn ("fistp%Z0\t%0", operands);
20569 else
20570 output_asm_insn ("fist%Z0\t%0", operands);
20571 if (round_mode != I387_CW_ANY)
20572 output_asm_insn ("fldcw\t%2", operands);
20575 return "";
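/* Illustrative sketch (assembly is approximate, not taken from GCC output):
   the construct this output routine serves.  Without SSE3's fisttp, a
   float-to-integer cast must round toward zero, so the store is bracketed by
   control-word switches, roughly:

       fldcw   trunc_cw        # operand %3: RC = truncate
       fistpll dst             # pop and store as a 64-bit integer
       fldcw   saved_cw        # operand %2: restore the caller's control word  */
long long
double_to_ll (double x)
{
  return (long long) x;       /* C requires truncation toward zero */
}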
20578 /* Output code for x87 ffreep insn. The OPNO argument, which may only
20579 have the values zero or one, indicates the ffreep insn's operand
20580 from the OPERANDS array. */
20582 static const char *
20583 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
20585 if (TARGET_USE_FFREEP)
20586 #ifdef HAVE_AS_IX86_FFREEP
20587 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
20588 #else
20590 static char retval[32];
20591 int regno = REGNO (operands[opno]);
20593 gcc_assert (STACK_REGNO_P (regno));
20595 regno -= FIRST_STACK_REG;
20597 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
20598 return retval;
20600 #endif
20602 return opno ? "fstp\t%y1" : "fstp\t%y0";
20606 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
20607 should be used. UNORDERED_P is true when fucom should be used. */
20609 const char *
20610 output_fp_compare (rtx_insn *insn, rtx *operands, bool eflags_p, bool unordered_p)
20612 int stack_top_dies;
20613 rtx cmp_op0, cmp_op1;
20614 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
20616 if (eflags_p)
20618 cmp_op0 = operands[0];
20619 cmp_op1 = operands[1];
20621 else
20623 cmp_op0 = operands[1];
20624 cmp_op1 = operands[2];
20627 if (is_sse)
20629 if (GET_MODE (operands[0]) == SFmode)
20630 if (unordered_p)
20631 return "%vucomiss\t{%1, %0|%0, %1}";
20632 else
20633 return "%vcomiss\t{%1, %0|%0, %1}";
20634 else
20635 if (unordered_p)
20636 return "%vucomisd\t{%1, %0|%0, %1}";
20637 else
20638 return "%vcomisd\t{%1, %0|%0, %1}";
20641 gcc_assert (STACK_TOP_P (cmp_op0));
20643 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
20645 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
20647 if (stack_top_dies)
20649 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
20650 return output_387_ffreep (operands, 1);
20652 else
20653 return "ftst\n\tfnstsw\t%0";
20656 if (STACK_REG_P (cmp_op1)
20657 && stack_top_dies
20658 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
20659 && REGNO (cmp_op1) != FIRST_STACK_REG)
20661 /* If both the top of the 387 stack and the other operand (also a
20662 stack register) die in this insn, then this must be a
20663 `fcompp' float compare. */
20665 if (eflags_p)
20667 /* There is no double popping fcomi variant. Fortunately,
20668 eflags is immune from the fstp's cc clobbering. */
20669 if (unordered_p)
20670 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
20671 else
20672 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
20673 return output_387_ffreep (operands, 0);
20675 else
20677 if (unordered_p)
20678 return "fucompp\n\tfnstsw\t%0";
20679 else
20680 return "fcompp\n\tfnstsw\t%0";
20683 else
20685 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
20687 static const char * const alt[16] =
20689 "fcom%Z2\t%y2\n\tfnstsw\t%0",
20690 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
20691 "fucom%Z2\t%y2\n\tfnstsw\t%0",
20692 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
20694 "ficom%Z2\t%y2\n\tfnstsw\t%0",
20695 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
20696 NULL,
20697 NULL,
20699 "fcomi\t{%y1, %0|%0, %y1}",
20700 "fcomip\t{%y1, %0|%0, %y1}",
20701 "fucomi\t{%y1, %0|%0, %y1}",
20702 "fucomip\t{%y1, %0|%0, %y1}",
20704 NULL,
20705 NULL,
20706 NULL,
20707 NULL
20710 int mask;
20711 const char *ret;
20713 mask = eflags_p << 3;
20714 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
20715 mask |= unordered_p << 1;
20716 mask |= stack_top_dies;
20718 gcc_assert (mask < 16);
20719 ret = alt[mask];
20720 gcc_assert (ret);
20722 return ret;
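/* Illustrative sketch, not GCC internal code: how the alt[] index above is
   formed.  For instance "fucomip" -- eflags form, FP operand, unordered,
   top of stack dies -- yields (1<<3) | (0<<2) | (1<<1) | 1 == 11, which
   selects the "fucomip" template.  */
static int
fp_compare_alt_index (int eflags_p, int int_operand_p,
                      int unordered_p, int stack_top_dies)
{
  return (eflags_p << 3) | (int_operand_p << 2)
         | (unordered_p << 1) | stack_top_dies;
}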
20726 void
20727 ix86_output_addr_vec_elt (FILE *file, int value)
20729 const char *directive = ASM_LONG;
20731 #ifdef ASM_QUAD
20732 if (TARGET_LP64)
20733 directive = ASM_QUAD;
20734 #else
20735 gcc_assert (!TARGET_64BIT);
20736 #endif
20738 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
20741 void
20742 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
20744 const char *directive = ASM_LONG;
20746 #ifdef ASM_QUAD
20747 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
20748 directive = ASM_QUAD;
20749 #else
20750 gcc_assert (!TARGET_64BIT);
20751 #endif
20752 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
20753 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
20754 fprintf (file, "%s%s%d-%s%d\n",
20755 directive, LPREFIX, value, LPREFIX, rel);
20756 else if (HAVE_AS_GOTOFF_IN_DATA)
20757 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
20758 #if TARGET_MACHO
20759 else if (TARGET_MACHO)
20761 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
20762 machopic_output_function_base_name (file);
20763 putc ('\n', file);
20765 #endif
20766 else
20767 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
20768 GOT_SYMBOL_NAME, LPREFIX, value);
20771 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
20772 for the target. */
20774 void
20775 ix86_expand_clear (rtx dest)
20777 rtx tmp;
20779 /* We play register width games, which are only valid after reload. */
20780 gcc_assert (reload_completed);
20782 /* Avoid HImode and its attendant prefix byte. */
20783 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
20784 dest = gen_rtx_REG (SImode, REGNO (dest));
20785 tmp = gen_rtx_SET (dest, const0_rtx);
20787 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
20789 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20790 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
20793 emit_insn (tmp);
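/* Illustrative sketch (assembly is approximate): what ix86_expand_clear
   buys.  "xorl %eax, %eax" is 2 bytes but clobbers the flags, while
   "movl $0, %eax" is 5 bytes and leaves the flags alone; the parallel with
   the FLAGS_REG clobber emitted above models the xor form.  */
int
return_zero (void)
{
  return 0;       /* typically "xorl %eax, %eax; ret" */
}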
20796 /* X is an unchanging MEM. If it is a constant pool reference, return
20797 the constant pool rtx, else NULL. */
20799 rtx
20800 maybe_get_pool_constant (rtx x)
20802 x = ix86_delegitimize_address (XEXP (x, 0));
20804 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
20805 return get_pool_constant (x);
20807 return NULL_RTX;
20810 void
20811 ix86_expand_move (machine_mode mode, rtx operands[])
20813 rtx op0, op1;
20814 rtx tmp, addend = NULL_RTX;
20815 enum tls_model model;
20817 op0 = operands[0];
20818 op1 = operands[1];
20820 switch (GET_CODE (op1))
20822 case CONST:
20823 tmp = XEXP (op1, 0);
20825 if (GET_CODE (tmp) != PLUS
20826 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
20827 break;
20829 op1 = XEXP (tmp, 0);
20830 addend = XEXP (tmp, 1);
20831 /* FALLTHRU */
20833 case SYMBOL_REF:
20834 model = SYMBOL_REF_TLS_MODEL (op1);
20836 if (model)
20837 op1 = legitimize_tls_address (op1, model, true);
20838 else if (ix86_force_load_from_GOT_p (op1))
20840 /* Load the external function address via GOT slot to avoid PLT. */
20841 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
20842 (TARGET_64BIT
20843 ? UNSPEC_GOTPCREL
20844 : UNSPEC_GOT));
20845 op1 = gen_rtx_CONST (Pmode, op1);
20846 op1 = gen_const_mem (Pmode, op1);
20847 set_mem_alias_set (op1, ix86_GOT_alias_set ());
20849 else
20851 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
20852 if (tmp)
20854 op1 = tmp;
20855 if (!addend)
20856 break;
20858 else
20860 op1 = operands[1];
20861 break;
20865 if (addend)
20867 op1 = force_operand (op1, NULL_RTX);
20868 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
20869 op0, 1, OPTAB_DIRECT);
20871 else
20872 op1 = force_operand (op1, op0);
20874 if (op1 == op0)
20875 return;
20877 op1 = convert_to_mode (mode, op1, 1);
20879 default:
20880 break;
20883 if ((flag_pic || MACHOPIC_INDIRECT)
20884 && symbolic_operand (op1, mode))
20886 if (TARGET_MACHO && !TARGET_64BIT)
20888 #if TARGET_MACHO
20889 /* dynamic-no-pic */
20890 if (MACHOPIC_INDIRECT)
20892 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
20893 ? op0 : gen_reg_rtx (Pmode);
20894 op1 = machopic_indirect_data_reference (op1, temp);
20895 if (MACHOPIC_PURE)
20896 op1 = machopic_legitimize_pic_address (op1, mode,
20897 temp == op1 ? 0 : temp);
20899 if (op0 != op1 && GET_CODE (op0) != MEM)
20901 rtx insn = gen_rtx_SET (op0, op1);
20902 emit_insn (insn);
20903 return;
20905 if (GET_CODE (op0) == MEM)
20906 op1 = force_reg (Pmode, op1);
20907 else
20909 rtx temp = op0;
20910 if (GET_CODE (temp) != REG)
20911 temp = gen_reg_rtx (Pmode);
20912 temp = legitimize_pic_address (op1, temp);
20913 if (temp == op0)
20914 return;
20915 op1 = temp;
20917 /* dynamic-no-pic */
20918 #endif
20920 else
20922 if (MEM_P (op0))
20923 op1 = force_reg (mode, op1);
20924 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
20926 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
20927 op1 = legitimize_pic_address (op1, reg);
20928 if (op0 == op1)
20929 return;
20930 op1 = convert_to_mode (mode, op1, 1);
20934 else
20936 if (MEM_P (op0)
20937 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
20938 || !push_operand (op0, mode))
20939 && MEM_P (op1))
20940 op1 = force_reg (mode, op1);
20942 if (push_operand (op0, mode)
20943 && ! general_no_elim_operand (op1, mode))
20944 op1 = copy_to_mode_reg (mode, op1);
20946 /* Force large constants in 64bit compilation into register
20947 to get them CSEed. */
20948 if (can_create_pseudo_p ()
20949 && (mode == DImode) && TARGET_64BIT
20950 && immediate_operand (op1, mode)
20951 && !x86_64_zext_immediate_operand (op1, VOIDmode)
20952 && !register_operand (op0, mode)
20953 && optimize)
20954 op1 = copy_to_mode_reg (mode, op1);
20956 if (can_create_pseudo_p ()
20957 && CONST_DOUBLE_P (op1))
20959 /* If we are loading a floating point constant to a register,
20960 force the value to memory now, since we'll get better code
20961 out the back end. */
20963 op1 = validize_mem (force_const_mem (mode, op1));
20964 if (!register_operand (op0, mode))
20966 rtx temp = gen_reg_rtx (mode);
20967 emit_insn (gen_rtx_SET (temp, op1));
20968 emit_move_insn (op0, temp);
20969 return;
20974 emit_insn (gen_rtx_SET (op0, op1));
20977 void
20978 ix86_expand_vector_move (machine_mode mode, rtx operands[])
20980 rtx op0 = operands[0], op1 = operands[1];
20981 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
20982 psABI since the biggest alignment is 4 bytes for IA MCU psABI. */
20983 unsigned int align = (TARGET_IAMCU
20984 ? GET_MODE_BITSIZE (mode)
20985 : GET_MODE_ALIGNMENT (mode));
20987 if (push_operand (op0, VOIDmode))
20988 op0 = emit_move_resolve_push (mode, op0);
20990 /* Force constants other than zero into memory. We do not know how
20991 the instructions used to build constants modify the upper 64 bits
20992 of the register; once we have that information we may be able
20993 to handle some of them more efficiently. */
20994 if (can_create_pseudo_p ()
20995 && (CONSTANT_P (op1)
20996 || (SUBREG_P (op1)
20997 && CONSTANT_P (SUBREG_REG (op1))))
20998 && ((register_operand (op0, mode)
20999 && !standard_sse_constant_p (op1, mode))
21000 /* ix86_expand_vector_move_misalign() does not like constants. */
21001 || (SSE_REG_MODE_P (mode)
21002 && MEM_P (op0)
21003 && MEM_ALIGN (op0) < align)))
21005 if (SUBREG_P (op1))
21007 machine_mode imode = GET_MODE (SUBREG_REG (op1));
21008 rtx r = force_const_mem (imode, SUBREG_REG (op1));
21009 if (r)
21010 r = validize_mem (r);
21011 else
21012 r = force_reg (imode, SUBREG_REG (op1));
21013 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
21015 else
21016 op1 = validize_mem (force_const_mem (mode, op1));
21019 /* We need to check memory alignment for SSE mode since attribute
21020 can make operands unaligned. */
21021 if (can_create_pseudo_p ()
21022 && SSE_REG_MODE_P (mode)
21023 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
21024 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
21026 rtx tmp[2];
21028 /* ix86_expand_vector_move_misalign() does not like both
21029 arguments in memory. */
21030 if (!register_operand (op0, mode)
21031 && !register_operand (op1, mode))
21032 op1 = force_reg (mode, op1);
21034 tmp[0] = op0; tmp[1] = op1;
21035 ix86_expand_vector_move_misalign (mode, tmp);
21036 return;
21039 /* Make operand1 a register if it isn't already. */
21040 if (can_create_pseudo_p ()
21041 && !register_operand (op0, mode)
21042 && !register_operand (op1, mode))
21044 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
21045 return;
21048 emit_insn (gen_rtx_SET (op0, op1));
21051 /* Split 32-byte AVX unaligned load and store if needed. */
21053 static void
21054 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
21056 rtx m;
21057 rtx (*extract) (rtx, rtx, rtx);
21058 machine_mode mode;
21060 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
21061 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
21063 emit_insn (gen_rtx_SET (op0, op1));
21064 return;
21067 rtx orig_op0 = NULL_RTX;
21068 mode = GET_MODE (op0);
21069 switch (GET_MODE_CLASS (mode))
21071 case MODE_VECTOR_INT:
21072 case MODE_INT:
21073 if (mode != V32QImode)
21075 if (!MEM_P (op0))
21077 orig_op0 = op0;
21078 op0 = gen_reg_rtx (V32QImode);
21080 else
21081 op0 = gen_lowpart (V32QImode, op0);
21082 op1 = gen_lowpart (V32QImode, op1);
21083 mode = V32QImode;
21085 break;
21086 case MODE_VECTOR_FLOAT:
21087 break;
21088 default:
21089 gcc_unreachable ();
21092 switch (mode)
21094 default:
21095 gcc_unreachable ();
21096 case E_V32QImode:
21097 extract = gen_avx_vextractf128v32qi;
21098 mode = V16QImode;
21099 break;
21100 case E_V8SFmode:
21101 extract = gen_avx_vextractf128v8sf;
21102 mode = V4SFmode;
21103 break;
21104 case E_V4DFmode:
21105 extract = gen_avx_vextractf128v4df;
21106 mode = V2DFmode;
21107 break;
21110 if (MEM_P (op1))
21112 rtx r = gen_reg_rtx (mode);
21113 m = adjust_address (op1, mode, 0);
21114 emit_move_insn (r, m);
21115 m = adjust_address (op1, mode, 16);
21116 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
21117 emit_move_insn (op0, r);
21119 else if (MEM_P (op0))
21121 m = adjust_address (op0, mode, 0);
21122 emit_insn (extract (m, op1, const0_rtx));
21123 m = adjust_address (op0, mode, 16);
21124 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
21126 else
21127 gcc_unreachable ();
21129 if (orig_op0)
21130 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
21133 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
21134 straight to ix86_expand_vector_move. */
21135 /* Code generation for scalar reg-reg moves of single and double precision data:
21136 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
21137 movaps reg, reg
21138 else
21139 movss reg, reg
21140 if (x86_sse_partial_reg_dependency == true)
21141 movapd reg, reg
21142 else
21143 movsd reg, reg
21145 Code generation for scalar loads of double precision data:
21146 if (x86_sse_split_regs == true)
21147 movlpd mem, reg (gas syntax)
21148 else
21149 movsd mem, reg
21151 Code generation for unaligned packed loads of single precision data
21152 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
21153 if (x86_sse_unaligned_move_optimal)
21154 movups mem, reg
21156 if (x86_sse_partial_reg_dependency == true)
21158 xorps reg, reg
21159 movlps mem, reg
21160 movhps mem+8, reg
21162 else
21164 movlps mem, reg
21165 movhps mem+8, reg
21168 Code generation for unaligned packed loads of double precision data
21169 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
21170 if (x86_sse_unaligned_move_optimal)
21171 movupd mem, reg
21173 if (x86_sse_split_regs == true)
21175 movlpd mem, reg
21176 movhpd mem+8, reg
21178 else
21180 movsd mem, reg
21181 movhpd mem+8, reg
21185 void
21186 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
21188 rtx op0, op1, m;
21190 op0 = operands[0];
21191 op1 = operands[1];
21193 /* Use unaligned load/store for AVX512 or when optimizing for size. */
21194 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
21196 emit_insn (gen_rtx_SET (op0, op1));
21197 return;
21200 if (TARGET_AVX)
21202 if (GET_MODE_SIZE (mode) == 32)
21203 ix86_avx256_split_vector_move_misalign (op0, op1);
21204 else
21205 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
21206 emit_insn (gen_rtx_SET (op0, op1));
21207 return;
21210 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
21211 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
21213 emit_insn (gen_rtx_SET (op0, op1));
21214 return;
21217 /* ??? If we have typed data, then it would appear that using
21218 movdqu is the only way to get unaligned data loaded with
21219 integer type. */
21220 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
21222 emit_insn (gen_rtx_SET (op0, op1));
21223 return;
21226 if (MEM_P (op1))
21228 if (TARGET_SSE2 && mode == V2DFmode)
21230 rtx zero;
21232 /* When SSE registers are split into halves, we can avoid
21233 writing to the top half twice. */
21234 if (TARGET_SSE_SPLIT_REGS)
21236 emit_clobber (op0);
21237 zero = op0;
21239 else
21241 /* ??? Not sure about the best option for the Intel chips.
21242 The following would seem to satisfy; the register is
21243 entirely cleared, breaking the dependency chain. We
21244 then store to the upper half, with a dependency depth
21245 of one. A rumor has it that Intel recommends two movsd
21246 followed by an unpacklpd, but this is unconfirmed. And
21247 given that the dependency depth of the unpacklpd would
21248 still be one, I'm not sure why this would be better. */
21249 zero = CONST0_RTX (V2DFmode);
21252 m = adjust_address (op1, DFmode, 0);
21253 emit_insn (gen_sse2_loadlpd (op0, zero, m));
21254 m = adjust_address (op1, DFmode, 8);
21255 emit_insn (gen_sse2_loadhpd (op0, op0, m));
21257 else
21259 rtx t;
21261 if (mode != V4SFmode)
21262 t = gen_reg_rtx (V4SFmode);
21263 else
21264 t = op0;
21266 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
21267 emit_move_insn (t, CONST0_RTX (V4SFmode));
21268 else
21269 emit_clobber (t);
21271 m = adjust_address (op1, V2SFmode, 0);
21272 emit_insn (gen_sse_loadlps (t, t, m));
21273 m = adjust_address (op1, V2SFmode, 8);
21274 emit_insn (gen_sse_loadhps (t, t, m));
21275 if (mode != V4SFmode)
21276 emit_move_insn (op0, gen_lowpart (mode, t));
21279 else if (MEM_P (op0))
21281 if (TARGET_SSE2 && mode == V2DFmode)
21283 m = adjust_address (op0, DFmode, 0);
21284 emit_insn (gen_sse2_storelpd (m, op1));
21285 m = adjust_address (op0, DFmode, 8);
21286 emit_insn (gen_sse2_storehpd (m, op1));
21288 else
21290 if (mode != V4SFmode)
21291 op1 = gen_lowpart (V4SFmode, op1);
21293 m = adjust_address (op0, V2SFmode, 0);
21294 emit_insn (gen_sse_storelps (m, op1));
21295 m = adjust_address (op0, V2SFmode, 8);
21296 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
21299 else
21300 gcc_unreachable ();
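/* Illustrative sketch, not GCC internal code: the intrinsic-level equivalent
   of the 32-byte split that ix86_avx256_split_vector_move_misalign performs
   for unaligned loads when TARGET_AVX256_SPLIT_UNALIGNED_LOAD is in effect --
   two 16-byte loads recombined with vinsertf128.  */
#include <immintrin.h>

__m256
load_256_split (const float *p)   /* p may be unaligned */
{
  __m128 lo = _mm_loadu_ps (p);
  __m128 hi = _mm_loadu_ps (p + 4);
  return _mm256_insertf128_ps (_mm256_castps128_ps256 (lo), hi, 1);
}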
21303 /* Helper function of ix86_fixup_binary_operands to canonicalize
21304 operand order. Returns true if the operands should be swapped. */
21306 static bool
21307 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
21308 rtx operands[])
21310 rtx dst = operands[0];
21311 rtx src1 = operands[1];
21312 rtx src2 = operands[2];
21314 /* If the operation is not commutative, we can't do anything. */
21315 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
21316 return false;
21318 /* Highest priority is that src1 should match dst. */
21319 if (rtx_equal_p (dst, src1))
21320 return false;
21321 if (rtx_equal_p (dst, src2))
21322 return true;
21324 /* Next highest priority is that immediate constants come second. */
21325 if (immediate_operand (src2, mode))
21326 return false;
21327 if (immediate_operand (src1, mode))
21328 return true;
21330 /* Lowest priority is that memory references should come second. */
21331 if (MEM_P (src2))
21332 return false;
21333 if (MEM_P (src1))
21334 return true;
21336 return false;
21340 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
21341 destination to use for the operation. If different from the true
21342 destination in operands[0], a copy operation will be required. */
21344 rtx
21345 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
21346 rtx operands[])
21348 rtx dst = operands[0];
21349 rtx src1 = operands[1];
21350 rtx src2 = operands[2];
21352 /* Canonicalize operand order. */
21353 if (ix86_swap_binary_operands_p (code, mode, operands))
21355 /* It is invalid to swap operands of different modes. */
21356 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
21358 std::swap (src1, src2);
21361 /* Both source operands cannot be in memory. */
21362 if (MEM_P (src1) && MEM_P (src2))
21364 /* Optimization: Only read from memory once. */
21365 if (rtx_equal_p (src1, src2))
21367 src2 = force_reg (mode, src2);
21368 src1 = src2;
21370 else if (rtx_equal_p (dst, src1))
21371 src2 = force_reg (mode, src2);
21372 else
21373 src1 = force_reg (mode, src1);
21376 /* If the destination is memory, and we do not have matching source
21377 operands, do things in registers. */
21378 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
21379 dst = gen_reg_rtx (mode);
21381 /* Source 1 cannot be a constant. */
21382 if (CONSTANT_P (src1))
21383 src1 = force_reg (mode, src1);
21385 /* Source 1 cannot be a non-matching memory. */
21386 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
21387 src1 = force_reg (mode, src1);
21389 /* Improve address combine. */
21390 if (code == PLUS
21391 && GET_MODE_CLASS (mode) == MODE_INT
21392 && MEM_P (src2))
21393 src2 = force_reg (mode, src2);
21395 operands[1] = src1;
21396 operands[2] = src2;
21397 return dst;
21400 /* Similarly, but assume that the destination has already been
21401 set up properly. */
21403 void
21404 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
21405 machine_mode mode, rtx operands[])
21407 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
21408 gcc_assert (dst == operands[0]);
21411 /* Attempt to expand a binary operator. Make the expansion closer to the
21412 actual machine than just general_operand, which will allow 3 separate
21413 memory references (one output, two input) in a single insn. */
21415 void
21416 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
21417 rtx operands[])
21419 rtx src1, src2, dst, op, clob;
21421 dst = ix86_fixup_binary_operands (code, mode, operands);
21422 src1 = operands[1];
21423 src2 = operands[2];
21425 /* Emit the instruction. */
21427 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
21429 if (reload_completed
21430 && code == PLUS
21431 && !rtx_equal_p (dst, src1))
21433 /* This is going to be an LEA; avoid splitting it later. */
21434 emit_insn (op);
21436 else
21438 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21439 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21442 /* Fix up the destination if needed. */
21443 if (dst != operands[0])
21444 emit_move_insn (operands[0], dst);
21447 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
21448 the given OPERANDS. */
21450 void
21451 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
21452 rtx operands[])
21454 rtx op1 = NULL_RTX, op2 = NULL_RTX;
21455 if (SUBREG_P (operands[1]))
21457 op1 = operands[1];
21458 op2 = operands[2];
21460 else if (SUBREG_P (operands[2]))
21462 op1 = operands[2];
21463 op2 = operands[1];
21465 /* Optimize (__m128i) d | (__m128i) e and similar code
21466 when d and e are float vectors into float vector logical
21467 insn. In C/C++ without using intrinsics there is no other way
21468 to express vector logical operation on float vectors than
21469 to cast them temporarily to integer vectors. */
21470 if (op1
21471 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
21472 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
21473 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
21474 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
21475 && SUBREG_BYTE (op1) == 0
21476 && (GET_CODE (op2) == CONST_VECTOR
21477 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
21478 && SUBREG_BYTE (op2) == 0))
21479 && can_create_pseudo_p ())
21481 rtx dst;
21482 switch (GET_MODE (SUBREG_REG (op1)))
21484 case E_V4SFmode:
21485 case E_V8SFmode:
21486 case E_V16SFmode:
21487 case E_V2DFmode:
21488 case E_V4DFmode:
21489 case E_V8DFmode:
21490 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
21491 if (GET_CODE (op2) == CONST_VECTOR)
21493 op2 = gen_lowpart (GET_MODE (dst), op2);
21494 op2 = force_reg (GET_MODE (dst), op2);
21496 else
21498 op1 = operands[1];
21499 op2 = SUBREG_REG (operands[2]);
21500 if (!vector_operand (op2, GET_MODE (dst)))
21501 op2 = force_reg (GET_MODE (dst), op2);
21503 op1 = SUBREG_REG (op1);
21504 if (!vector_operand (op1, GET_MODE (dst)))
21505 op1 = force_reg (GET_MODE (dst), op1);
21506 emit_insn (gen_rtx_SET (dst,
21507 gen_rtx_fmt_ee (code, GET_MODE (dst),
21508 op1, op2)));
21509 emit_move_insn (operands[0], gen_lowpart (mode, dst));
21510 return;
21511 default:
21512 break;
21515 if (!vector_operand (operands[1], mode))
21516 operands[1] = force_reg (mode, operands[1]);
21517 if (!vector_operand (operands[2], mode))
21518 operands[2] = force_reg (mode, operands[2]);
21519 ix86_fixup_binary_operands_no_copy (code, mode, operands);
21520 emit_insn (gen_rtx_SET (operands[0],
21521 gen_rtx_fmt_ee (code, mode, operands[1],
21522 operands[2])));
21525 /* Return TRUE or FALSE depending on whether the binary operator meets the
21526 appropriate constraints. */
21528 bool
21529 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
21530 rtx operands[3])
21532 rtx dst = operands[0];
21533 rtx src1 = operands[1];
21534 rtx src2 = operands[2];
21536 /* Both source operands cannot be in memory. */
21537 if (MEM_P (src1) && MEM_P (src2))
21538 return false;
21540 /* Canonicalize operand order for commutative operators. */
21541 if (ix86_swap_binary_operands_p (code, mode, operands))
21542 std::swap (src1, src2);
21544 /* If the destination is memory, we must have a matching source operand. */
21545 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
21546 return false;
21548 /* Source 1 cannot be a constant. */
21549 if (CONSTANT_P (src1))
21550 return false;
21552 /* Source 1 cannot be a non-matching memory. */
21553 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
21554 /* Support "andhi/andsi/anddi" as a zero-extending move. */
21555 return (code == AND
21556 && (mode == HImode
21557 || mode == SImode
21558 || (TARGET_64BIT && mode == DImode))
21559 && satisfies_constraint_L (src2));
21561 return true;
21564 /* Attempt to expand a unary operator. Make the expansion closer to the
21565 actual machine than just general_operand, which will allow 2 separate
21566 memory references (one output, one input) in a single insn. */
21568 void
21569 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
21570 rtx operands[])
21572 bool matching_memory = false;
21573 rtx src, dst, op, clob;
21575 dst = operands[0];
21576 src = operands[1];
21578 /* If the destination is memory, and we do not have matching source
21579 operands, do things in registers. */
21580 if (MEM_P (dst))
21582 if (rtx_equal_p (dst, src))
21583 matching_memory = true;
21584 else
21585 dst = gen_reg_rtx (mode);
21588 /* When source operand is memory, destination must match. */
21589 if (MEM_P (src) && !matching_memory)
21590 src = force_reg (mode, src);
21592 /* Emit the instruction. */
21594 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
21596 if (code == NOT)
21597 emit_insn (op);
21598 else
21600 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21601 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21604 /* Fix up the destination if needed. */
21605 if (dst != operands[0])
21606 emit_move_insn (operands[0], dst);
21609 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
21610 divisor are within the range [0-255]. */
21612 void
21613 ix86_split_idivmod (machine_mode mode, rtx operands[],
21614 bool signed_p)
21616 rtx_code_label *end_label, *qimode_label;
21617 rtx div, mod;
21618 rtx_insn *insn;
21619 rtx scratch, tmp0, tmp1, tmp2;
21620 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
21621 rtx (*gen_zero_extend) (rtx, rtx);
21622 rtx (*gen_test_ccno_1) (rtx, rtx);
21624 switch (mode)
21626 case E_SImode:
21627 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
21628 gen_test_ccno_1 = gen_testsi_ccno_1;
21629 gen_zero_extend = gen_zero_extendqisi2;
21630 break;
21631 case E_DImode:
21632 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
21633 gen_test_ccno_1 = gen_testdi_ccno_1;
21634 gen_zero_extend = gen_zero_extendqidi2;
21635 break;
21636 default:
21637 gcc_unreachable ();
21640 end_label = gen_label_rtx ();
21641 qimode_label = gen_label_rtx ();
21643 scratch = gen_reg_rtx (mode);
21645 /* Use 8bit unsigned divmod if dividend and divisor are within
21646 the range [0-255]. */
21647 emit_move_insn (scratch, operands[2]);
21648 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
21649 scratch, 1, OPTAB_DIRECT);
21650 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
21651 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
21652 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
21653 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
21654 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
21655 pc_rtx);
21656 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
21657 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21658 JUMP_LABEL (insn) = qimode_label;
21660 /* Generate original signed/unsigned divmod. */
21661 div = gen_divmod4_1 (operands[0], operands[1],
21662 operands[2], operands[3]);
21663 emit_insn (div);
21665 /* Branch to the end. */
21666 emit_jump_insn (gen_jump (end_label));
21667 emit_barrier ();
21669 /* Generate 8bit unsigned divide. */
21670 emit_label (qimode_label);
21671 /* Don't use operands[0] for result of 8bit divide since not all
21672 registers support QImode ZERO_EXTRACT. */
21673 tmp0 = lowpart_subreg (HImode, scratch, mode);
21674 tmp1 = lowpart_subreg (HImode, operands[2], mode);
21675 tmp2 = lowpart_subreg (QImode, operands[3], mode);
21676 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
21678 if (signed_p)
21680 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
21681 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
21683 else
21685 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
21686 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
21689 /* Extract remainder from AH. */
21690 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
21691 if (REG_P (operands[1]))
21692 insn = emit_move_insn (operands[1], tmp1);
21693 else
21695 /* Need a new scratch register since the old one has result
21696 of 8bit divide. */
21697 scratch = gen_reg_rtx (mode);
21698 emit_move_insn (scratch, tmp1);
21699 insn = emit_move_insn (operands[1], scratch);
21701 set_unique_reg_note (insn, REG_EQUAL, mod);
21703 /* Zero extend quotient from AL. */
21704 tmp1 = gen_lowpart (QImode, tmp0);
21705 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
21706 set_unique_reg_note (insn, REG_EQUAL, div);
21708 emit_label (end_label);
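/* Illustrative sketch, not GCC internal code: the range test that guards the
   8-bit divide fast path above.  Both operands fit in [0, 255] exactly when
   no bit at or above bit 8 is set in either, which is what the emitted "or"
   followed by "test $-0x100" checks.  */
static int
fits_byte_divide (unsigned int dividend, unsigned int divisor)
{
  return ((dividend | divisor) & ~0xffu) == 0;
}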
21711 #define LEA_MAX_STALL (3)
21712 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
21714 /* Increase given DISTANCE in half-cycles according to
21715 dependencies between PREV and NEXT instructions.
21716 Add 1 half-cycle if there is no dependency and
21717 go to the next cycle if there is some dependency. */
21719 static unsigned int
21720 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
21722 df_ref def, use;
21724 if (!prev || !next)
21725 return distance + (distance & 1) + 2;
21727 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
21728 return distance + 1;
21730 FOR_EACH_INSN_USE (use, next)
21731 FOR_EACH_INSN_DEF (def, prev)
21732 if (!DF_REF_IS_ARTIFICIAL (def)
21733 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
21734 return distance + (distance & 1) + 2;
21736 return distance + 1;
21739 /* Function checks if instruction INSN defines register number
21740 REGNO1 or REGNO2. */
21742 static bool
21743 insn_defines_reg (unsigned int regno1, unsigned int regno2,
21744 rtx_insn *insn)
21746 df_ref def;
21748 FOR_EACH_INSN_DEF (def, insn)
21749 if (DF_REF_REG_DEF_P (def)
21750 && !DF_REF_IS_ARTIFICIAL (def)
21751 && (regno1 == DF_REF_REGNO (def)
21752 || regno2 == DF_REF_REGNO (def)))
21753 return true;
21755 return false;
21758 /* Function checks if instruction INSN uses register number
21759 REGNO as a part of address expression. */
21761 static bool
21762 insn_uses_reg_mem (unsigned int regno, rtx insn)
21764 df_ref use;
21766 FOR_EACH_INSN_USE (use, insn)
21767 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
21768 return true;
21770 return false;
21773 /* Search backward for non-agu definition of register number REGNO1
21774 or register number REGNO2 in basic block starting from instruction
21775 START up to head of basic block or instruction INSN.
21777 Function puts true value into *FOUND var if definition was found
21778 and false otherwise.
21780 Distance in half-cycles between START and found instruction or head
21781 of BB is added to DISTANCE and returned. */
21783 static int
21784 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
21785 rtx_insn *insn, int distance,
21786 rtx_insn *start, bool *found)
21788 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
21789 rtx_insn *prev = start;
21790 rtx_insn *next = NULL;
21792 *found = false;
21794 while (prev
21795 && prev != insn
21796 && distance < LEA_SEARCH_THRESHOLD)
21798 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
21800 distance = increase_distance (prev, next, distance);
21801 if (insn_defines_reg (regno1, regno2, prev))
21803 if (recog_memoized (prev) < 0
21804 || get_attr_type (prev) != TYPE_LEA)
21806 *found = true;
21807 return distance;
21811 next = prev;
21813 if (prev == BB_HEAD (bb))
21814 break;
21816 prev = PREV_INSN (prev);
21819 return distance;
21822 /* Search backward for non-agu definition of register number REGNO1
21823 or register number REGNO2 in INSN's basic block until
21824 1. Pass LEA_SEARCH_THRESHOLD instructions, or
21825 2. Reach neighbor BBs boundary, or
21826 3. Reach agu definition.
21827 Returns the distance between the non-agu definition point and INSN.
21828 If no definition point, returns -1. */
21830 static int
21831 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
21832 rtx_insn *insn)
21834 basic_block bb = BLOCK_FOR_INSN (insn);
21835 int distance = 0;
21836 bool found = false;
21838 if (insn != BB_HEAD (bb))
21839 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
21840 distance, PREV_INSN (insn),
21841 &found);
21843 if (!found && distance < LEA_SEARCH_THRESHOLD)
21845 edge e;
21846 edge_iterator ei;
21847 bool simple_loop = false;
21849 FOR_EACH_EDGE (e, ei, bb->preds)
21850 if (e->src == bb)
21852 simple_loop = true;
21853 break;
21856 if (simple_loop)
21857 distance = distance_non_agu_define_in_bb (regno1, regno2,
21858 insn, distance,
21859 BB_END (bb), &found);
21860 else
21862 int shortest_dist = -1;
21863 bool found_in_bb = false;
21865 FOR_EACH_EDGE (e, ei, bb->preds)
21867 int bb_dist
21868 = distance_non_agu_define_in_bb (regno1, regno2,
21869 insn, distance,
21870 BB_END (e->src),
21871 &found_in_bb);
21872 if (found_in_bb)
21874 if (shortest_dist < 0)
21875 shortest_dist = bb_dist;
21876 else if (bb_dist > 0)
21877 shortest_dist = MIN (bb_dist, shortest_dist);
21879 found = true;
21883 distance = shortest_dist;
21887 /* get_attr_type may modify recog data. We want to make sure
21888 that recog data is valid for instruction INSN, on which
21889 distance_non_agu_define is called. INSN is unchanged here. */
21890 extract_insn_cached (insn);
21892 if (!found)
21893 return -1;
21895 return distance >> 1;
21898 /* Return the distance in half-cycles between INSN and the next
21899 insn that uses register number REGNO in memory address added
21900 to DISTANCE. Return -1 if REGNO is set.
21902 Put true value into *FOUND if register usage was found and
21903 false otherwise.
21904 Put true value into *REDEFINED if register redefinition was
21905 found and false otherwise. */
21907 static int
21908 distance_agu_use_in_bb (unsigned int regno,
21909 rtx_insn *insn, int distance, rtx_insn *start,
21910 bool *found, bool *redefined)
21912 basic_block bb = NULL;
21913 rtx_insn *next = start;
21914 rtx_insn *prev = NULL;
21916 *found = false;
21917 *redefined = false;
21919 if (start != NULL_RTX)
21921 bb = BLOCK_FOR_INSN (start);
21922 if (start != BB_HEAD (bb))
21923 /* If insn and start belong to the same bb, set prev to insn,
21924 so the call to increase_distance will increase the distance
21925 between insns by 1. */
21926 prev = insn;
21929 while (next
21930 && next != insn
21931 && distance < LEA_SEARCH_THRESHOLD)
21933 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
21935 distance = increase_distance(prev, next, distance);
21936 if (insn_uses_reg_mem (regno, next))
21938 /* Return DISTANCE if OP0 is used in memory
21939 address in NEXT. */
21940 *found = true;
21941 return distance;
21944 if (insn_defines_reg (regno, INVALID_REGNUM, next))
21946 /* Return -1 if OP0 is set in NEXT. */
21947 *redefined = true;
21948 return -1;
21951 prev = next;
21954 if (next == BB_END (bb))
21955 break;
21957 next = NEXT_INSN (next);
21960 return distance;
21963 /* Return the distance between INSN and the next insn that uses
21964 register number REGNO0 in a memory address. Return -1 if no such
21965 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
21967 static int
21968 distance_agu_use (unsigned int regno0, rtx_insn *insn)
21970 basic_block bb = BLOCK_FOR_INSN (insn);
21971 int distance = 0;
21972 bool found = false;
21973 bool redefined = false;
21975 if (insn != BB_END (bb))
21976 distance = distance_agu_use_in_bb (regno0, insn, distance,
21977 NEXT_INSN (insn),
21978 &found, &redefined);
21980 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
21982 edge e;
21983 edge_iterator ei;
21984 bool simple_loop = false;
21986 FOR_EACH_EDGE (e, ei, bb->succs)
21987 if (e->dest == bb)
21989 simple_loop = true;
21990 break;
21993 if (simple_loop)
21994 distance = distance_agu_use_in_bb (regno0, insn,
21995 distance, BB_HEAD (bb),
21996 &found, &redefined);
21997 else
21999 int shortest_dist = -1;
22000 bool found_in_bb = false;
22001 bool redefined_in_bb = false;
22003 FOR_EACH_EDGE (e, ei, bb->succs)
22005 int bb_dist
22006 = distance_agu_use_in_bb (regno0, insn,
22007 distance, BB_HEAD (e->dest),
22008 &found_in_bb, &redefined_in_bb);
22009 if (found_in_bb)
22011 if (shortest_dist < 0)
22012 shortest_dist = bb_dist;
22013 else if (bb_dist > 0)
22014 shortest_dist = MIN (bb_dist, shortest_dist);
22016 found = true;
22020 distance = shortest_dist;
22024 if (!found || redefined)
22025 return -1;
22027 return distance >> 1;
22030 /* Define this macro to tune LEA priority vs ADD; it takes effect when
22031 there is a dilemma of choosing LEA or ADD.
22032 Negative value: ADD is preferred over LEA
22033 Zero: Neutral
22034 Positive value: LEA is preferred over ADD. */
22035 #define IX86_LEA_PRIORITY 0
22037 /* Return true if use of lea INSN has a performance advantage
22038 over a sequence of instructions. The instruction sequence has
22039 SPLIT_COST cycles higher latency than the lea itself. */
22041 static bool
22042 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
22043 unsigned int regno2, int split_cost, bool has_scale)
22045 int dist_define, dist_use;
22047 /* For Silvermont if using a 2-source or 3-source LEA for
22048 non-destructive destination purposes, or due to wanting
22049 ability to use SCALE, the use of LEA is justified. */
22050 if (TARGET_SILVERMONT || TARGET_INTEL)
22052 if (has_scale)
22053 return true;
22054 if (split_cost < 1)
22055 return false;
22056 if (regno0 == regno1 || regno0 == regno2)
22057 return false;
22058 return true;
22061 dist_define = distance_non_agu_define (regno1, regno2, insn);
22062 dist_use = distance_agu_use (regno0, insn);
22064 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
22066 /* If there is no non-AGU operand definition, no AGU
22067 operand use and the split cost is 0, then both the lea
22068 and non-lea variants have the same priority. Currently
22069 we prefer lea for 64-bit code and non-lea for 32-bit
22070 code. */
22071 if (dist_use < 0 && split_cost == 0)
22072 return TARGET_64BIT || IX86_LEA_PRIORITY;
22073 else
22074 return true;
22077 /* With a longer definition distance, lea is preferable.
22078 Here we adjust the distance to take the splitting cost and
22079 lea priority into account. */
22080 dist_define += split_cost + IX86_LEA_PRIORITY;
22082 /* If there is no use in a memory address then we just check
22083 that the split cost exceeds the AGU stall. */
22084 if (dist_use < 0)
22085 return dist_define > LEA_MAX_STALL;
22087 /* If this insn has both backward non-agu dependence and forward
22088 agu dependence, the one with short distance takes effect. */
22089 return dist_define >= dist_use;
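/* Illustrative sketch (assembly is approximate, not GCC internal code): the
   trade-off weighed above.  A three-operand add can be one lea, which on
   AGU-stall prone cores loses time when its inputs were just produced by ALU
   instructions, or a mov/add pair that stays on the ALU:

       leal  (%rdi,%rsi), %eax          # one insn, computed on the AGU
   vs.
       movl  %edi, %eax
       addl  %esi, %eax                 # two ALU insns, no AGU stall  */
int
three_operand_add (int a, int b)
{
  return a + b;
}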
22092 /* Return true if it is legal to clobber flags by INSN and
22093 false otherwise. */
22095 static bool
22096 ix86_ok_to_clobber_flags (rtx_insn *insn)
22098 basic_block bb = BLOCK_FOR_INSN (insn);
22099 df_ref use;
22100 bitmap live;
22102 while (insn)
22104 if (NONDEBUG_INSN_P (insn))
22106 FOR_EACH_INSN_USE (use, insn)
22107 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
22108 return false;
22110 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
22111 return true;
22114 if (insn == BB_END (bb))
22115 break;
22117 insn = NEXT_INSN (insn);
22120 live = df_get_live_out(bb);
22121 return !REGNO_REG_SET_P (live, FLAGS_REG);
22124 /* Return true if we need to split op0 = op1 + op2 into a sequence of
22125 move and add to avoid AGU stalls. */
22127 bool
22128 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
22130 unsigned int regno0, regno1, regno2;
22132 /* Check if we need to optimize. */
22133 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
22134 return false;
22136 /* Check it is correct to split here. */
22137 if (!ix86_ok_to_clobber_flags(insn))
22138 return false;
22140 regno0 = true_regnum (operands[0]);
22141 regno1 = true_regnum (operands[1]);
22142 regno2 = true_regnum (operands[2]);
22144 /* We need to split only adds with a non-destructive
22145 destination operand. */
22146 if (regno0 == regno1 || regno0 == regno2)
22147 return false;
22148 else
22149 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
22152 /* Return true if we should emit lea instruction instead of mov
22153 instruction. */
22155 bool
22156 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
22158 unsigned int regno0, regno1;
22160 /* Check if we need to optimize. */
22161 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
22162 return false;
22164 /* Use lea for reg to reg moves only. */
22165 if (!REG_P (operands[0]) || !REG_P (operands[1]))
22166 return false;
22168 regno0 = true_regnum (operands[0]);
22169 regno1 = true_regnum (operands[1]);
22171 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
22174 /* Return true if we need to split lea into a sequence of
22175 instructions to avoid AGU stalls. */
22177 bool
22178 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
22180 unsigned int regno0, regno1, regno2;
22181 int split_cost;
22182 struct ix86_address parts;
22183 int ok;
22185 /* Check we need to optimize. */
22186 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
22187 return false;
22189 /* The "at least two components" test below might not catch simple
22190 move or zero extension insns if parts.base is non-NULL and parts.disp
22191 is const0_rtx as the only components in the address, e.g. if the
22192 register is %rbp or %r13. As this test is much cheaper and moves or
22193 zero extensions are the common case, do this check first. */
22194 if (REG_P (operands[1])
22195 || (SImode_address_operand (operands[1], VOIDmode)
22196 && REG_P (XEXP (operands[1], 0))))
22197 return false;
22199 /* Check if it is OK to split here. */
22200 if (!ix86_ok_to_clobber_flags (insn))
22201 return false;
22203 ok = ix86_decompose_address (operands[1], &parts);
22204 gcc_assert (ok);
22206 /* There should be at least two components in the address. */
22207 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
22208 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
22209 return false;
22211 /* We should not split into add if a non-legitimate PIC
22212 operand is used as the displacement. */
22213 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
22214 return false;
22216 regno0 = true_regnum (operands[0]) ;
22217 regno1 = INVALID_REGNUM;
22218 regno2 = INVALID_REGNUM;
22220 if (parts.base)
22221 regno1 = true_regnum (parts.base);
22222 if (parts.index)
22223 regno2 = true_regnum (parts.index);
22225 split_cost = 0;
22227 /* Compute how many cycles we will add to execution time
22228 if we split the lea into a sequence of instructions. */
22229 if (parts.base || parts.index)
22231 /* Have to use a mov instruction if the non-destructive
22232 destination form is used. */
22233 if (regno1 != regno0 && regno2 != regno0)
22234 split_cost += 1;
22236 /* Have to add index to base if both exist. */
22237 if (parts.base && parts.index)
22238 split_cost += 1;
22240 /* Have to use shift and adds if scale is 2 or greater. */
22241 if (parts.scale > 1)
22243 if (regno0 != regno1)
22244 split_cost += 1;
22245 else if (regno2 == regno0)
22246 split_cost += 4;
22247 else
22248 split_cost += parts.scale;
22251 /* Have to use add instruction with immediate if
22252 disp is non zero. */
22253 if (parts.disp && parts.disp != const0_rtx)
22254 split_cost += 1;
22256 /* Subtract the price of lea. */
22257 split_cost -= 1;
22260 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
22261 parts.scale > 1);
22264 /* Emit x86 binary operand CODE in mode MODE, where the first operand
22265 matches destination. RTX includes clobber of FLAGS_REG. */
22267 static void
22268 ix86_emit_binop (enum rtx_code code, machine_mode mode,
22269 rtx dst, rtx src)
22271 rtx op, clob;
22273 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
22274 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
22276 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
22279 /* Return true if regno1 def is nearest to the insn. */
22281 static bool
22282 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
22284 rtx_insn *prev = insn;
22285 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
22287 if (insn == start)
22288 return false;
22289 while (prev && prev != start)
22291 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
22293 prev = PREV_INSN (prev);
22294 continue;
22296 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
22297 return true;
22298 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
22299 return false;
22300 prev = PREV_INSN (prev);
22303 /* None of the regs is defined in the bb. */
22304 return false;
22307 /* Split lea instructions into a sequence of instructions
22308 which are executed on ALU to avoid AGU stalls.
22309 It is assumed that it is allowed to clobber flags register
22310 at lea position. */
22312 void
22313 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
22315 unsigned int regno0, regno1, regno2;
22316 struct ix86_address parts;
22317 rtx target, tmp;
22318 int ok, adds;
22320 ok = ix86_decompose_address (operands[1], &parts);
22321 gcc_assert (ok);
22323 target = gen_lowpart (mode, operands[0]);
22325 regno0 = true_regnum (target);
22326 regno1 = INVALID_REGNUM;
22327 regno2 = INVALID_REGNUM;
22329 if (parts.base)
22331 parts.base = gen_lowpart (mode, parts.base);
22332 regno1 = true_regnum (parts.base);
22335 if (parts.index)
22337 parts.index = gen_lowpart (mode, parts.index);
22338 regno2 = true_regnum (parts.index);
22341 if (parts.disp)
22342 parts.disp = gen_lowpart (mode, parts.disp);
22344 if (parts.scale > 1)
22346 /* Case r1 = r1 + ... */
22347 if (regno1 == regno0)
22349 /* If we have a case r1 = r1 + C * r2 then we
22350 should use multiplication which is very
22351 expensive. Assume cost model is wrong if we
22352 have such case here. */
22353 gcc_assert (regno2 != regno0);
22355 for (adds = parts.scale; adds > 0; adds--)
22356 ix86_emit_binop (PLUS, mode, target, parts.index);
22358 else
22360 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
22361 if (regno0 != regno2)
22362 emit_insn (gen_rtx_SET (target, parts.index));
22364 /* Use shift for scaling. */
22365 ix86_emit_binop (ASHIFT, mode, target,
22366 GEN_INT (exact_log2 (parts.scale)));
22368 if (parts.base)
22369 ix86_emit_binop (PLUS, mode, target, parts.base);
22371 if (parts.disp && parts.disp != const0_rtx)
22372 ix86_emit_binop (PLUS, mode, target, parts.disp);
22375 else if (!parts.base && !parts.index)
22377 gcc_assert(parts.disp);
22378 emit_insn (gen_rtx_SET (target, parts.disp));
22380 else
22382 if (!parts.base)
22384 if (regno0 != regno2)
22385 emit_insn (gen_rtx_SET (target, parts.index));
22387 else if (!parts.index)
22389 if (regno0 != regno1)
22390 emit_insn (gen_rtx_SET (target, parts.base));
22392 else
22394 if (regno0 == regno1)
22395 tmp = parts.index;
22396 else if (regno0 == regno2)
22397 tmp = parts.base;
22398 else
22400 rtx tmp1;
22402 /* Find better operand for SET instruction, depending
22403 on which definition is farther from the insn. */
22404 if (find_nearest_reg_def (insn, regno1, regno2))
22405 tmp = parts.index, tmp1 = parts.base;
22406 else
22407 tmp = parts.base, tmp1 = parts.index;
22409 emit_insn (gen_rtx_SET (target, tmp));
22411 if (parts.disp && parts.disp != const0_rtx)
22412 ix86_emit_binop (PLUS, mode, target, parts.disp);
22414 ix86_emit_binop (PLUS, mode, target, tmp1);
22415 return;
22418 ix86_emit_binop (PLUS, mode, target, tmp);
22421 if (parts.disp && parts.disp != const0_rtx)
22422 ix86_emit_binop (PLUS, mode, target, parts.disp);
22426 /* Return true if it is ok to optimize an ADD operation to an LEA
22427 operation to avoid flag register consumption. For most processors,
22428 ADD is faster than LEA. For processors like BONNELL, if the
22429 destination register of the LEA holds an actual address which will be
22430 used soon, LEA is better; otherwise ADD is better. */
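/* A rough sketch of the motivating case (an illustration, not a quote from
   the optimization manuals): on an in-order core such as BONNELL the LEA
   executes on the AGU, so in a sequence like

	lea	(%eax,%ebx), %ecx
	mov	(%ecx), %edx

   the address feeding the load is ready earlier than it would be with an
   add, while a purely arithmetic consumer is usually better served by the
   plain add.  LEA also leaves the flags untouched, which is what the
   "flag register consumption" above refers to.  */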
22432 bool
22433 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
22435 unsigned int regno0 = true_regnum (operands[0]);
22436 unsigned int regno1 = true_regnum (operands[1]);
22437 unsigned int regno2 = true_regnum (operands[2]);
22439 /* If a = b + c, (a!=b && a!=c), must use lea form. */
22440 if (regno0 != regno1 && regno0 != regno2)
22441 return true;
22443 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
22444 return false;
22446 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
22449 /* Return true if destination reg of SET_BODY is shift count of
22450 USE_BODY. */
22452 static bool
22453 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
22455 rtx set_dest;
22456 rtx shift_rtx;
22457 int i;
22459 /* Retrieve destination of SET_BODY. */
22460 switch (GET_CODE (set_body))
22462 case SET:
22463 set_dest = SET_DEST (set_body);
22464 if (!set_dest || !REG_P (set_dest))
22465 return false;
22466 break;
22467 case PARALLEL:
22468 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
22469 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
22470 use_body))
22471 return true;
22472 /* FALLTHROUGH */
22473 default:
22474 return false;
22477 /* Retrieve shift count of USE_BODY. */
22478 switch (GET_CODE (use_body))
22480 case SET:
22481 shift_rtx = XEXP (use_body, 1);
22482 break;
22483 case PARALLEL:
22484 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
22485 if (ix86_dep_by_shift_count_body (set_body,
22486 XVECEXP (use_body, 0, i)))
22487 return true;
22488 /* FALLTHROUGH */
22489 default:
22490 return false;
22493 if (shift_rtx
22494 && (GET_CODE (shift_rtx) == ASHIFT
22495 || GET_CODE (shift_rtx) == LSHIFTRT
22496 || GET_CODE (shift_rtx) == ASHIFTRT
22497 || GET_CODE (shift_rtx) == ROTATE
22498 || GET_CODE (shift_rtx) == ROTATERT))
22500 rtx shift_count = XEXP (shift_rtx, 1);
22502 /* Return true if shift count is dest of SET_BODY. */
22503 if (REG_P (shift_count))
22505 /* Add this check since it can be invoked before register
22506 allocation by the pre-reload scheduler. */
22507 if (reload_completed
22508 && true_regnum (set_dest) == true_regnum (shift_count))
22509 return true;
22510 else if (REGNO(set_dest) == REGNO(shift_count))
22511 return true;
22515 return false;
22518 /* Return true if destination reg of SET_INSN is shift count of
22519 USE_INSN. */
22521 bool
22522 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
22524 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
22525 PATTERN (use_insn));
22528 /* Return TRUE or FALSE depending on whether the unary operator meets the
22529 appropriate constraints. */
22531 bool
22532 ix86_unary_operator_ok (enum rtx_code,
22533 machine_mode,
22534 rtx operands[2])
22536 /* If one of operands is memory, source and destination must match. */
22537 if ((MEM_P (operands[0])
22538 || MEM_P (operands[1]))
22539 && ! rtx_equal_p (operands[0], operands[1]))
22540 return false;
22541 return true;
22544 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
22545 are ok, keeping in mind the possible movddup alternative. */
22547 bool
22548 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
22550 if (MEM_P (operands[0]))
22551 return rtx_equal_p (operands[0], operands[1 + high]);
22552 if (MEM_P (operands[1]) && MEM_P (operands[2]))
22553 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
22554 return true;
22557 /* Post-reload splitter for converting an SF or DFmode value in an
22558 SSE register into an unsigned SImode. */
22560 void
22561 ix86_split_convert_uns_si_sse (rtx operands[])
22563 machine_mode vecmode;
22564 rtx value, large, zero_or_two31, input, two31, x;
22566 large = operands[1];
22567 zero_or_two31 = operands[2];
22568 input = operands[3];
22569 two31 = operands[4];
22570 vecmode = GET_MODE (large);
22571 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
22573 /* Load up the value into the low element. We must ensure that the other
22574 elements are valid floats -- zero is the easiest such value. */
22575 if (MEM_P (input))
22577 if (vecmode == V4SFmode)
22578 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
22579 else
22580 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
22582 else
22584 input = gen_rtx_REG (vecmode, REGNO (input));
22585 emit_move_insn (value, CONST0_RTX (vecmode));
22586 if (vecmode == V4SFmode)
22587 emit_insn (gen_sse_movss (value, value, input));
22588 else
22589 emit_insn (gen_sse2_movsd (value, value, input));
22592 emit_move_insn (large, two31);
22593 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
22595 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
22596 emit_insn (gen_rtx_SET (large, x));
22598 x = gen_rtx_AND (vecmode, zero_or_two31, large);
22599 emit_insn (gen_rtx_SET (zero_or_two31, x));
22601 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
22602 emit_insn (gen_rtx_SET (value, x));
22604 large = gen_rtx_REG (V4SImode, REGNO (large));
22605 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
22607 x = gen_rtx_REG (V4SImode, REGNO (value));
22608 if (vecmode == V4SFmode)
22609 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
22610 else
22611 emit_insn (gen_sse2_cvttpd2dq (x, value));
22612 value = x;
22614 emit_insn (gen_xorv4si3 (value, value, large));
22617 /* Convert an unsigned DImode value into a DFmode, using only SSE.
22618 Expects the 64-bit DImode to be supplied in a pair of integral
22619 registers. Requires SSE2; will use SSE3 if available. For x86_32,
22620 -mfpmath=sse, !optimize_size only. */
22622 void
22623 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
22625 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
22626 rtx int_xmm, fp_xmm;
22627 rtx biases, exponents;
22628 rtx x;
22630 int_xmm = gen_reg_rtx (V4SImode);
22631 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
22632 emit_insn (gen_movdi_to_sse (int_xmm, input));
22633 else if (TARGET_SSE_SPLIT_REGS)
22635 emit_clobber (int_xmm);
22636 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
22638 else
22640 x = gen_reg_rtx (V2DImode);
22641 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
22642 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
22645 x = gen_rtx_CONST_VECTOR (V4SImode,
22646 gen_rtvec (4, GEN_INT (0x43300000UL),
22647 GEN_INT (0x45300000UL),
22648 const0_rtx, const0_rtx));
22649 exponents = validize_mem (force_const_mem (V4SImode, x));
22651 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
22652 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
22654 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
22655 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
22656 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
22657 (0x1.0p84 + double(fp_value_hi_xmm)).
22658 Note these exponents differ by 32. */
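/* A small worked example (illustrative only): for the input
   0x0000000200000003 we have lo = 3 and hi = 2, so the constructed
   doubles are 0x1.0p52 + 3 and 0x1.0p84 + 2 * 0x1.0p32.  Subtracting
   the biases below leaves 3.0 and 8589934592.0, whose sum 8589934595.0
   is exactly the original unsigned value.  */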
22660 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
22662 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
22663 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
22664 real_ldexp (&bias_lo_rvt, &dconst1, 52);
22665 real_ldexp (&bias_hi_rvt, &dconst1, 84);
22666 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
22667 x = const_double_from_real_value (bias_hi_rvt, DFmode);
22668 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
22669 biases = validize_mem (force_const_mem (V2DFmode, biases));
22670 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
22672 /* Add the upper and lower DFmode values together. */
22673 if (TARGET_SSE3)
22674 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
22675 else
22677 x = copy_to_mode_reg (V2DFmode, fp_xmm);
22678 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
22679 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
22682 ix86_expand_vector_extract (false, target, fp_xmm, 0);
22685 /* Not used, but eases macroization of patterns. */
22686 void
22687 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
22689 gcc_unreachable ();
22692 /* Convert an unsigned SImode value into a DFmode. Only currently used
22693 for SSE, but applicable anywhere. */
22695 void
22696 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
22698 REAL_VALUE_TYPE TWO31r;
22699 rtx x, fp;
22701 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
22702 NULL, 1, OPTAB_DIRECT);
22704 fp = gen_reg_rtx (DFmode);
22705 emit_insn (gen_floatsidf2 (fp, x));
22707 real_ldexp (&TWO31r, &dconst1, 31);
22708 x = const_double_from_real_value (TWO31r, DFmode);
22710 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
22711 if (x != target)
22712 emit_move_insn (target, x);
22715 /* Convert a signed DImode value into a DFmode. Only used for SSE in
22716 32-bit mode; otherwise we have a direct convert instruction. */
22718 void
22719 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
22721 REAL_VALUE_TYPE TWO32r;
22722 rtx fp_lo, fp_hi, x;
22724 fp_lo = gen_reg_rtx (DFmode);
22725 fp_hi = gen_reg_rtx (DFmode);
22727 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
22729 real_ldexp (&TWO32r, &dconst1, 32);
22730 x = const_double_from_real_value (TWO32r, DFmode);
22731 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
22733 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
22735 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
22736 0, OPTAB_DIRECT);
22737 if (x != target)
22738 emit_move_insn (target, x);
22741 /* Convert an unsigned SImode value into a SFmode, using only SSE.
22742 For x86_32, -mfpmath=sse, !optimize_size only. */
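/* The code below computes, in effect,

     (float) (u >> 16) * 0x1p16 + (float) (u & 0xffff)

   as an expansion of the unsigned SImode -> SFmode conversion; each
   16-bit half is nonnegative and converts exactly through the signed
   SImode -> SFmode conversion.  (Illustrative restatement.)  */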
22743 void
22744 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
22746 REAL_VALUE_TYPE ONE16r;
22747 rtx fp_hi, fp_lo, int_hi, int_lo, x;
22749 real_ldexp (&ONE16r, &dconst1, 16);
22750 x = const_double_from_real_value (ONE16r, SFmode);
22751 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
22752 NULL, 0, OPTAB_DIRECT);
22753 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
22754 NULL, 0, OPTAB_DIRECT);
22755 fp_hi = gen_reg_rtx (SFmode);
22756 fp_lo = gen_reg_rtx (SFmode);
22757 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
22758 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
22759 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
22760 0, OPTAB_DIRECT);
22761 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
22762 0, OPTAB_DIRECT);
22763 if (!rtx_equal_p (target, fp_hi))
22764 emit_move_insn (target, fp_hi);
22767 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
22768 a vector of unsigned ints VAL to vector of floats TARGET. */
22770 void
22771 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
22773 rtx tmp[8];
22774 REAL_VALUE_TYPE TWO16r;
22775 machine_mode intmode = GET_MODE (val);
22776 machine_mode fltmode = GET_MODE (target);
22777 rtx (*cvt) (rtx, rtx);
22779 if (intmode == V4SImode)
22780 cvt = gen_floatv4siv4sf2;
22781 else
22782 cvt = gen_floatv8siv8sf2;
22783 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
22784 tmp[0] = force_reg (intmode, tmp[0]);
22785 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
22786 OPTAB_DIRECT);
22787 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
22788 NULL_RTX, 1, OPTAB_DIRECT);
22789 tmp[3] = gen_reg_rtx (fltmode);
22790 emit_insn (cvt (tmp[3], tmp[1]));
22791 tmp[4] = gen_reg_rtx (fltmode);
22792 emit_insn (cvt (tmp[4], tmp[2]));
22793 real_ldexp (&TWO16r, &dconst1, 16);
22794 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
22795 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
22796 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
22797 OPTAB_DIRECT);
22798 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
22799 OPTAB_DIRECT);
22800 if (tmp[7] != target)
22801 emit_move_insn (target, tmp[7]);
22804 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
22805 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
22806 This is done by doing just signed conversion if < 0x1p31, and otherwise by
22807 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
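/* In scalar terms the adjustment below is roughly

     if (x < 0x1p31)
       use x unchanged and a zero xor mask;
     else
       use x - 0x1p31 and a 0x80000000 xor mask;

   vectorized by building a per-element compare mask that selects whether
   the subtraction and the later xor apply.  (Illustrative sketch.)  */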
22810 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
22812 REAL_VALUE_TYPE TWO31r;
22813 rtx two31r, tmp[4];
22814 machine_mode mode = GET_MODE (val);
22815 machine_mode scalarmode = GET_MODE_INNER (mode);
22816 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
22817 rtx (*cmp) (rtx, rtx, rtx, rtx);
22818 int i;
22820 for (i = 0; i < 3; i++)
22821 tmp[i] = gen_reg_rtx (mode);
22822 real_ldexp (&TWO31r, &dconst1, 31);
22823 two31r = const_double_from_real_value (TWO31r, scalarmode);
22824 two31r = ix86_build_const_vector (mode, 1, two31r);
22825 two31r = force_reg (mode, two31r);
22826 switch (mode)
22828 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
22829 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
22830 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
22831 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
22832 default: gcc_unreachable ();
22834 tmp[3] = gen_rtx_LE (mode, two31r, val);
22835 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
22836 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
22837 0, OPTAB_DIRECT);
22838 if (intmode == V4SImode || TARGET_AVX2)
22839 *xorp = expand_simple_binop (intmode, ASHIFT,
22840 gen_lowpart (intmode, tmp[0]),
22841 GEN_INT (31), NULL_RTX, 0,
22842 OPTAB_DIRECT);
22843 else
22845 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
22846 two31 = ix86_build_const_vector (intmode, 1, two31);
22847 *xorp = expand_simple_binop (intmode, AND,
22848 gen_lowpart (intmode, tmp[0]),
22849 two31, NULL_RTX, 0,
22850 OPTAB_DIRECT);
22852 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
22853 0, OPTAB_DIRECT);
22856 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
22857 then replicate the value for all elements of the vector
22858 register. */
22861 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
22863 int i, n_elt;
22864 rtvec v;
22865 machine_mode scalar_mode;
22867 switch (mode)
22869 case E_V64QImode:
22870 case E_V32QImode:
22871 case E_V16QImode:
22872 case E_V32HImode:
22873 case E_V16HImode:
22874 case E_V8HImode:
22875 case E_V16SImode:
22876 case E_V8SImode:
22877 case E_V4SImode:
22878 case E_V8DImode:
22879 case E_V4DImode:
22880 case E_V2DImode:
22881 gcc_assert (vect);
22882 /* FALLTHRU */
22883 case E_V16SFmode:
22884 case E_V8SFmode:
22885 case E_V4SFmode:
22886 case E_V8DFmode:
22887 case E_V4DFmode:
22888 case E_V2DFmode:
22889 n_elt = GET_MODE_NUNITS (mode);
22890 v = rtvec_alloc (n_elt);
22891 scalar_mode = GET_MODE_INNER (mode);
22893 RTVEC_ELT (v, 0) = value;
22895 for (i = 1; i < n_elt; ++i)
22896 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
22898 return gen_rtx_CONST_VECTOR (mode, v);
22900 default:
22901 gcc_unreachable ();
22905 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
22906 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
22907 for an SSE register. If VECT is true, then replicate the mask for
22908 all elements of the vector register. If INVERT is true, then create
22909 a mask excluding the sign bit. */
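/* For example, the masks produced here are { 0x80000000, ... } for
   V4SFmode and { 0x8000000000000000, ... } for V2DFmode; with INVERT
   they become { 0x7fffffff, ... } and { 0x7fffffffffffffff, ... }.  */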
22912 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
22914 machine_mode vec_mode, imode;
22915 wide_int w;
22916 rtx mask, v;
22918 switch (mode)
22920 case E_V16SImode:
22921 case E_V16SFmode:
22922 case E_V8SImode:
22923 case E_V4SImode:
22924 case E_V8SFmode:
22925 case E_V4SFmode:
22926 vec_mode = mode;
22927 imode = SImode;
22928 break;
22930 case E_V8DImode:
22931 case E_V4DImode:
22932 case E_V2DImode:
22933 case E_V8DFmode:
22934 case E_V4DFmode:
22935 case E_V2DFmode:
22936 vec_mode = mode;
22937 imode = DImode;
22938 break;
22940 case E_TImode:
22941 case E_TFmode:
22942 vec_mode = VOIDmode;
22943 imode = TImode;
22944 break;
22946 default:
22947 gcc_unreachable ();
22950 machine_mode inner_mode = GET_MODE_INNER (mode);
22951 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
22952 GET_MODE_BITSIZE (inner_mode));
22953 if (invert)
22954 w = wi::bit_not (w);
22956 /* Force this value into the low part of a fp vector constant. */
22957 mask = immed_wide_int_const (w, imode);
22958 mask = gen_lowpart (inner_mode, mask);
22960 if (vec_mode == VOIDmode)
22961 return force_reg (inner_mode, mask);
22963 v = ix86_build_const_vector (vec_mode, vect, mask);
22964 return force_reg (vec_mode, v);
22967 /* Generate code for floating point ABS or NEG. */
22969 void
22970 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
22971 rtx operands[])
22973 rtx mask, set, dst, src;
22974 bool use_sse = false;
22975 bool vector_mode = VECTOR_MODE_P (mode);
22976 machine_mode vmode = mode;
22978 if (vector_mode)
22979 use_sse = true;
22980 else if (mode == TFmode)
22981 use_sse = true;
22982 else if (TARGET_SSE_MATH)
22984 use_sse = SSE_FLOAT_MODE_P (mode);
22985 if (mode == SFmode)
22986 vmode = V4SFmode;
22987 else if (mode == DFmode)
22988 vmode = V2DFmode;
22991 /* NEG and ABS performed with SSE use bitwise mask operations.
22992 Create the appropriate mask now. */
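/* Roughly speaking, NEG becomes an xorps/xorpd with the sign-bit mask and
   ABS an andps/andpd with the inverted mask, e.g. for DFmode

     neg: x ^ 0x8000000000000000
     abs: x & 0x7fffffffffffffff

   The actual logic insn is emitted later, when the pattern built below is
   split.  */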
22993 if (use_sse)
22994 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
22995 else
22996 mask = NULL_RTX;
22998 dst = operands[0];
22999 src = operands[1];
23001 set = gen_rtx_fmt_e (code, mode, src);
23002 set = gen_rtx_SET (dst, set);
23004 if (mask)
23006 rtx use, clob;
23007 rtvec par;
23009 use = gen_rtx_USE (VOIDmode, mask);
23010 if (vector_mode)
23011 par = gen_rtvec (2, set, use);
23012 else
23014 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
23015 par = gen_rtvec (3, set, use, clob);
23017 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
23019 else
23020 emit_insn (set);
23023 /* Expand a copysign operation. Special case operand 0 being a constant. */
23025 void
23026 ix86_expand_copysign (rtx operands[])
23028 machine_mode mode, vmode;
23029 rtx dest, op0, op1, mask, nmask;
23031 dest = operands[0];
23032 op0 = operands[1];
23033 op1 = operands[2];
23035 mode = GET_MODE (dest);
23037 if (mode == SFmode)
23038 vmode = V4SFmode;
23039 else if (mode == DFmode)
23040 vmode = V2DFmode;
23041 else
23042 vmode = mode;
23044 if (CONST_DOUBLE_P (op0))
23046 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
23048 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
23049 op0 = simplify_unary_operation (ABS, mode, op0, mode);
23051 if (mode == SFmode || mode == DFmode)
23053 if (op0 == CONST0_RTX (mode))
23054 op0 = CONST0_RTX (vmode);
23055 else
23057 rtx v = ix86_build_const_vector (vmode, false, op0);
23059 op0 = force_reg (vmode, v);
23062 else if (op0 != CONST0_RTX (mode))
23063 op0 = force_reg (mode, op0);
23065 mask = ix86_build_signbit_mask (vmode, 0, 0);
23067 if (mode == SFmode)
23068 copysign_insn = gen_copysignsf3_const;
23069 else if (mode == DFmode)
23070 copysign_insn = gen_copysigndf3_const;
23071 else
23072 copysign_insn = gen_copysigntf3_const;
23074 emit_insn (copysign_insn (dest, op0, op1, mask));
23076 else
23078 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
23080 nmask = ix86_build_signbit_mask (vmode, 0, 1);
23081 mask = ix86_build_signbit_mask (vmode, 0, 0);
23083 if (mode == SFmode)
23084 copysign_insn = gen_copysignsf3_var;
23085 else if (mode == DFmode)
23086 copysign_insn = gen_copysigndf3_var;
23087 else
23088 copysign_insn = gen_copysigntf3_var;
23090 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
23094 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
23095 be a constant, and so has already been expanded into a vector constant. */
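/* In effect (a restatement of the sequence emitted below): the destination,
   which at this point holds the sign-providing operand, is ANDed with the
   sign-bit MASK, and the already-nonnegative constant magnitude is then
   ORed in, giving

     result = (sign_source & sign_mask) | abs_constant  */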
23097 void
23098 ix86_split_copysign_const (rtx operands[])
23100 machine_mode mode, vmode;
23101 rtx dest, op0, mask, x;
23103 dest = operands[0];
23104 op0 = operands[1];
23105 mask = operands[3];
23107 mode = GET_MODE (dest);
23108 vmode = GET_MODE (mask);
23110 dest = lowpart_subreg (vmode, dest, mode);
23111 x = gen_rtx_AND (vmode, dest, mask);
23112 emit_insn (gen_rtx_SET (dest, x));
23114 if (op0 != CONST0_RTX (vmode))
23116 x = gen_rtx_IOR (vmode, dest, op0);
23117 emit_insn (gen_rtx_SET (dest, x));
23121 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
23122 so we have to do two masks. */
23124 void
23125 ix86_split_copysign_var (rtx operands[])
23127 machine_mode mode, vmode;
23128 rtx dest, scratch, op0, op1, mask, nmask, x;
23130 dest = operands[0];
23131 scratch = operands[1];
23132 op0 = operands[2];
23133 op1 = operands[3];
23134 nmask = operands[4];
23135 mask = operands[5];
23137 mode = GET_MODE (dest);
23138 vmode = GET_MODE (mask);
23140 if (rtx_equal_p (op0, op1))
23142 /* Shouldn't happen often (it's useless, obviously), but when it does
23143 we'd generate incorrect code if we continue below. */
23144 emit_move_insn (dest, op0);
23145 return;
23148 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
23150 gcc_assert (REGNO (op1) == REGNO (scratch));
23152 x = gen_rtx_AND (vmode, scratch, mask);
23153 emit_insn (gen_rtx_SET (scratch, x));
23155 dest = mask;
23156 op0 = lowpart_subreg (vmode, op0, mode);
23157 x = gen_rtx_NOT (vmode, dest);
23158 x = gen_rtx_AND (vmode, x, op0);
23159 emit_insn (gen_rtx_SET (dest, x));
23161 else
23163 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
23165 x = gen_rtx_AND (vmode, scratch, mask);
23167 else /* alternative 2,4 */
23169 gcc_assert (REGNO (mask) == REGNO (scratch));
23170 op1 = lowpart_subreg (vmode, op1, mode);
23171 x = gen_rtx_AND (vmode, scratch, op1);
23173 emit_insn (gen_rtx_SET (scratch, x));
23175 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
23177 dest = lowpart_subreg (vmode, op0, mode);
23178 x = gen_rtx_AND (vmode, dest, nmask);
23180 else /* alternative 3,4 */
23182 gcc_assert (REGNO (nmask) == REGNO (dest));
23183 dest = nmask;
23184 op0 = lowpart_subreg (vmode, op0, mode);
23185 x = gen_rtx_AND (vmode, dest, op0);
23187 emit_insn (gen_rtx_SET (dest, x));
23190 x = gen_rtx_IOR (vmode, dest, scratch);
23191 emit_insn (gen_rtx_SET (dest, x));
23194 /* Return TRUE or FALSE depending on whether the first SET in INSN
23195 has source and destination with matching CC modes, and that the
23196 CC mode is at least as constrained as REQ_MODE. */
23198 bool
23199 ix86_match_ccmode (rtx insn, machine_mode req_mode)
23201 rtx set;
23202 machine_mode set_mode;
23204 set = PATTERN (insn);
23205 if (GET_CODE (set) == PARALLEL)
23206 set = XVECEXP (set, 0, 0);
23207 gcc_assert (GET_CODE (set) == SET);
23208 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
23210 set_mode = GET_MODE (SET_DEST (set));
23211 switch (set_mode)
23213 case E_CCNOmode:
23214 if (req_mode != CCNOmode
23215 && (req_mode != CCmode
23216 || XEXP (SET_SRC (set), 1) != const0_rtx))
23217 return false;
23218 break;
23219 case E_CCmode:
23220 if (req_mode == CCGCmode)
23221 return false;
23222 /* FALLTHRU */
23223 case E_CCGCmode:
23224 if (req_mode == CCGOCmode || req_mode == CCNOmode)
23225 return false;
23226 /* FALLTHRU */
23227 case E_CCGOCmode:
23228 if (req_mode == CCZmode)
23229 return false;
23230 /* FALLTHRU */
23231 case E_CCZmode:
23232 break;
23234 case E_CCAmode:
23235 case E_CCCmode:
23236 case E_CCOmode:
23237 case E_CCPmode:
23238 case E_CCSmode:
23239 if (set_mode != req_mode)
23240 return false;
23241 break;
23243 default:
23244 gcc_unreachable ();
23247 return GET_MODE (SET_SRC (set)) == set_mode;
23250 /* Generate insn patterns to do an integer compare of OPERANDS. */
23252 static rtx
23253 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
23255 machine_mode cmpmode;
23256 rtx tmp, flags;
23258 cmpmode = SELECT_CC_MODE (code, op0, op1);
23259 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
23261 /* This is very simple, but making the interface the same as in the
23262 FP case makes the rest of the code easier. */
23263 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
23264 emit_insn (gen_rtx_SET (flags, tmp));
23266 /* Return the test that should be put into the flags user, i.e.
23267 the bcc, scc, or cmov instruction. */
23268 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
23271 /* Figure out whether to use ordered or unordered fp comparisons.
23272 Return the appropriate mode to use. */
23274 machine_mode
23275 ix86_fp_compare_mode (enum rtx_code)
23277 /* ??? In order to make all comparisons reversible, we do all comparisons
23278 non-trapping when compiling for IEEE. Once gcc is able to distinguish
23279 between all forms of trapping and non-trapping comparisons, we can make
23280 inequality comparisons trapping again, since that results in better code
23281 when using FCOM based compares. */
23282 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
23285 machine_mode
23286 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
23288 machine_mode mode = GET_MODE (op0);
23290 if (SCALAR_FLOAT_MODE_P (mode))
23292 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
23293 return ix86_fp_compare_mode (code);
23296 switch (code)
23298 /* Only zero flag is needed. */
23299 case EQ: /* ZF=0 */
23300 case NE: /* ZF!=0 */
23301 return CCZmode;
23302 /* Codes needing carry flag. */
23303 case GEU: /* CF=0 */
23304 case LTU: /* CF=1 */
23305 /* Detect overflow checks. They need just the carry flag. */
23306 if (GET_CODE (op0) == PLUS
23307 && (rtx_equal_p (op1, XEXP (op0, 0))
23308 || rtx_equal_p (op1, XEXP (op0, 1))))
23309 return CCCmode;
23310 else
23311 return CCmode;
23312 case GTU: /* CF=0 & ZF=0 */
23313 case LEU: /* CF=1 | ZF=1 */
23314 return CCmode;
23315 /* Codes possibly doable only with sign flag when
23316 comparing against zero. */
23317 case GE: /* SF=OF or SF=0 */
23318 case LT: /* SF<>OF or SF=1 */
23319 if (op1 == const0_rtx)
23320 return CCGOCmode;
23321 else
23322 /* For other cases Carry flag is not required. */
23323 return CCGCmode;
23324 /* Codes doable only with the sign flag when comparing
23325 against zero, but we lack a jump instruction for that,
23326 so we need to use relational tests against overflow,
23327 which thus needs to be zero. */
23328 case GT: /* ZF=0 & SF=OF */
23329 case LE: /* ZF=1 | SF<>OF */
23330 if (op1 == const0_rtx)
23331 return CCNOmode;
23332 else
23333 return CCGCmode;
23334 /* The strcmp pattern does (use flags) and combine may ask us for the
23335 proper mode. */
23336 case USE:
23337 return CCmode;
23338 default:
23339 gcc_unreachable ();
23343 /* Return the fixed registers used for condition codes. */
23345 static bool
23346 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
23348 *p1 = FLAGS_REG;
23349 *p2 = FPSR_REG;
23350 return true;
23353 /* If two condition code modes are compatible, return a condition code
23354 mode which is compatible with both. Otherwise, return
23355 VOIDmode. */
23357 static machine_mode
23358 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
23360 if (m1 == m2)
23361 return m1;
23363 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
23364 return VOIDmode;
23366 if ((m1 == CCGCmode && m2 == CCGOCmode)
23367 || (m1 == CCGOCmode && m2 == CCGCmode))
23368 return CCGCmode;
23370 if ((m1 == CCNOmode && m2 == CCGOCmode)
23371 || (m1 == CCGOCmode && m2 == CCNOmode))
23372 return CCNOmode;
23374 if (m1 == CCZmode
23375 && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode))
23376 return m2;
23377 else if (m2 == CCZmode
23378 && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode))
23379 return m1;
23381 switch (m1)
23383 default:
23384 gcc_unreachable ();
23386 case E_CCmode:
23387 case E_CCGCmode:
23388 case E_CCGOCmode:
23389 case E_CCNOmode:
23390 case E_CCAmode:
23391 case E_CCCmode:
23392 case E_CCOmode:
23393 case E_CCPmode:
23394 case E_CCSmode:
23395 case E_CCZmode:
23396 switch (m2)
23398 default:
23399 return VOIDmode;
23401 case E_CCmode:
23402 case E_CCGCmode:
23403 case E_CCGOCmode:
23404 case E_CCNOmode:
23405 case E_CCAmode:
23406 case E_CCCmode:
23407 case E_CCOmode:
23408 case E_CCPmode:
23409 case E_CCSmode:
23410 case E_CCZmode:
23411 return CCmode;
23414 case E_CCFPmode:
23415 case E_CCFPUmode:
23416 /* These are only compatible with themselves, which we already
23417 checked above. */
23418 return VOIDmode;
23423 /* Return a comparison we can do and that it is equivalent to
23424 swap_condition (code) apart possibly from orderedness.
23425 But, never change orderedness if TARGET_IEEE_FP, returning
23426 UNKNOWN in that case if necessary. */
23428 static enum rtx_code
23429 ix86_fp_swap_condition (enum rtx_code code)
23431 switch (code)
23433 case GT: /* GTU - CF=0 & ZF=0 */
23434 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
23435 case GE: /* GEU - CF=0 */
23436 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
23437 case UNLT: /* LTU - CF=1 */
23438 return TARGET_IEEE_FP ? UNKNOWN : GT;
23439 case UNLE: /* LEU - CF=1 | ZF=1 */
23440 return TARGET_IEEE_FP ? UNKNOWN : GE;
23441 default:
23442 return swap_condition (code);
23446 /* Return the cost of comparison CODE using the best strategy for performance.
23447 All following functions use the number of instructions as the cost metric.
23448 In the future this should be tweaked to compute bytes for optimize_size and
23449 to take into account the performance of various instructions on various CPUs. */
23451 static int
23452 ix86_fp_comparison_cost (enum rtx_code code)
23454 int arith_cost;
23456 /* The cost of code using bit-twiddling on %ah. */
23457 switch (code)
23459 case UNLE:
23460 case UNLT:
23461 case LTGT:
23462 case GT:
23463 case GE:
23464 case UNORDERED:
23465 case ORDERED:
23466 case UNEQ:
23467 arith_cost = 4;
23468 break;
23469 case LT:
23470 case NE:
23471 case EQ:
23472 case UNGE:
23473 arith_cost = TARGET_IEEE_FP ? 5 : 4;
23474 break;
23475 case LE:
23476 case UNGT:
23477 arith_cost = TARGET_IEEE_FP ? 6 : 4;
23478 break;
23479 default:
23480 gcc_unreachable ();
23483 switch (ix86_fp_comparison_strategy (code))
23485 case IX86_FPCMP_COMI:
23486 return arith_cost > 4 ? 3 : 2;
23487 case IX86_FPCMP_SAHF:
23488 return arith_cost > 4 ? 4 : 3;
23489 default:
23490 return arith_cost;
23494 /* Return the strategy to use for floating-point comparisons. We assume fcomi
23495 is always preferable where available, since that is also true when looking
23496 at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
23498 enum ix86_fpcmp_strategy
23499 ix86_fp_comparison_strategy (enum rtx_code)
23501 /* Do fcomi/sahf based test when profitable. */
23503 if (TARGET_CMOVE)
23504 return IX86_FPCMP_COMI;
23506 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
23507 return IX86_FPCMP_SAHF;
23509 return IX86_FPCMP_ARITH;
23512 /* Swap, force into registers, or otherwise massage the two operands
23513 to a fp comparison. The operands are updated in place; the new
23514 comparison code is returned. */
23516 static enum rtx_code
23517 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
23519 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
23520 rtx op0 = *pop0, op1 = *pop1;
23521 machine_mode op_mode = GET_MODE (op0);
23522 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
23524 /* All of the unordered compare instructions only work on registers.
23525 The same is true of the fcomi compare instructions. The XFmode
23526 compare instructions require registers except when comparing
23527 against zero or when converting operand 1 from fixed point to
23528 floating point. */
23530 if (!is_sse
23531 && (fpcmp_mode == CCFPUmode
23532 || (op_mode == XFmode
23533 && ! (standard_80387_constant_p (op0) == 1
23534 || standard_80387_constant_p (op1) == 1)
23535 && GET_CODE (op1) != FLOAT)
23536 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
23538 op0 = force_reg (op_mode, op0);
23539 op1 = force_reg (op_mode, op1);
23541 else
23543 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
23544 things around if they appear profitable, otherwise force op0
23545 into a register. */
23547 if (standard_80387_constant_p (op0) == 0
23548 || (MEM_P (op0)
23549 && ! (standard_80387_constant_p (op1) == 0
23550 || MEM_P (op1))))
23552 enum rtx_code new_code = ix86_fp_swap_condition (code);
23553 if (new_code != UNKNOWN)
23555 std::swap (op0, op1);
23556 code = new_code;
23560 if (!REG_P (op0))
23561 op0 = force_reg (op_mode, op0);
23563 if (CONSTANT_P (op1))
23565 int tmp = standard_80387_constant_p (op1);
23566 if (tmp == 0)
23567 op1 = validize_mem (force_const_mem (op_mode, op1));
23568 else if (tmp == 1)
23570 if (TARGET_CMOVE)
23571 op1 = force_reg (op_mode, op1);
23573 else
23574 op1 = force_reg (op_mode, op1);
23578 /* Try to rearrange the comparison to make it cheaper. */
23579 if (ix86_fp_comparison_cost (code)
23580 > ix86_fp_comparison_cost (swap_condition (code))
23581 && (REG_P (op1) || can_create_pseudo_p ()))
23583 std::swap (op0, op1);
23584 code = swap_condition (code);
23585 if (!REG_P (op0))
23586 op0 = force_reg (op_mode, op0);
23589 *pop0 = op0;
23590 *pop1 = op1;
23591 return code;
23594 /* Convert comparison codes we use to represent FP comparison to integer
23595 code that will result in proper branch. Return UNKNOWN if no such code
23596 is available. */
23598 enum rtx_code
23599 ix86_fp_compare_code_to_integer (enum rtx_code code)
23601 switch (code)
23603 case GT:
23604 return GTU;
23605 case GE:
23606 return GEU;
23607 case ORDERED:
23608 case UNORDERED:
23609 return code;
23610 case UNEQ:
23611 return EQ;
23612 case UNLT:
23613 return LTU;
23614 case UNLE:
23615 return LEU;
23616 case LTGT:
23617 return NE;
23618 default:
23619 return UNKNOWN;
23623 /* Generate insn patterns to do a floating point compare of OPERANDS. */
23625 static rtx
23626 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
23628 machine_mode fpcmp_mode, intcmp_mode;
23629 rtx tmp, tmp2;
23631 fpcmp_mode = ix86_fp_compare_mode (code);
23632 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
23634 /* Do fcomi/sahf based test when profitable. */
23635 switch (ix86_fp_comparison_strategy (code))
23637 case IX86_FPCMP_COMI:
23638 intcmp_mode = fpcmp_mode;
23639 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
23640 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
23641 emit_insn (tmp);
23642 break;
23644 case IX86_FPCMP_SAHF:
23645 intcmp_mode = fpcmp_mode;
23646 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
23647 tmp = gen_rtx_SET (gen_rtx_REG (fpcmp_mode, FLAGS_REG), tmp);
23649 if (!scratch)
23650 scratch = gen_reg_rtx (HImode);
23651 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
23652 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
23653 break;
23655 case IX86_FPCMP_ARITH:
23656 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
23657 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
23658 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
23659 if (!scratch)
23660 scratch = gen_reg_rtx (HImode);
23661 emit_insn (gen_rtx_SET (scratch, tmp2));
23663 /* In the unordered case, we have to check C2 for NaNs, which
23664 doesn't happen to work out to anything nice combination-wise.
23665 So do some bit twiddling on the value we've got in AH to come
23666 up with an appropriate set of condition codes. */
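/* For reference when reading the constants below: fnstsw leaves C0 in
   bit 0, C2 in bit 2 and C3 in bit 6 of AH, so 0x45 tests C0|C2|C3,
   0x05 tests C0|C2, 0x40 tests C3 and 0x04 tests C2.  An x87 compare
   sets C0 for "below", C3 for "equal" and all of C0, C2 and C3 for an
   unordered result; sahf then maps C0 to CF, C2 to PF and C3 to ZF.  */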
23668 intcmp_mode = CCNOmode;
23669 switch (code)
23671 case GT:
23672 case UNGT:
23673 if (code == GT || !TARGET_IEEE_FP)
23675 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
23676 code = EQ;
23678 else
23680 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23681 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
23682 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
23683 intcmp_mode = CCmode;
23684 code = GEU;
23686 break;
23687 case LT:
23688 case UNLT:
23689 if (code == LT && TARGET_IEEE_FP)
23691 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23692 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
23693 intcmp_mode = CCmode;
23694 code = EQ;
23696 else
23698 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
23699 code = NE;
23701 break;
23702 case GE:
23703 case UNGE:
23704 if (code == GE || !TARGET_IEEE_FP)
23706 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
23707 code = EQ;
23709 else
23711 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23712 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
23713 code = NE;
23715 break;
23716 case LE:
23717 case UNLE:
23718 if (code == LE && TARGET_IEEE_FP)
23720 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23721 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
23722 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
23723 intcmp_mode = CCmode;
23724 code = LTU;
23726 else
23728 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
23729 code = NE;
23731 break;
23732 case EQ:
23733 case UNEQ:
23734 if (code == EQ && TARGET_IEEE_FP)
23736 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23737 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
23738 intcmp_mode = CCmode;
23739 code = EQ;
23741 else
23743 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
23744 code = NE;
23746 break;
23747 case NE:
23748 case LTGT:
23749 if (code == NE && TARGET_IEEE_FP)
23751 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
23752 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
23753 GEN_INT (0x40)));
23754 code = NE;
23756 else
23758 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
23759 code = EQ;
23761 break;
23763 case UNORDERED:
23764 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
23765 code = NE;
23766 break;
23767 case ORDERED:
23768 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
23769 code = EQ;
23770 break;
23772 default:
23773 gcc_unreachable ();
23775 break;
23777 default:
23778 gcc_unreachable();
23781 /* Return the test that should be put into the flags user, i.e.
23782 the bcc, scc, or cmov instruction. */
23783 return gen_rtx_fmt_ee (code, VOIDmode,
23784 gen_rtx_REG (intcmp_mode, FLAGS_REG),
23785 const0_rtx);
23788 static rtx
23789 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
23791 rtx ret;
23793 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
23794 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
23796 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
23798 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
23799 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
23801 else
23802 ret = ix86_expand_int_compare (code, op0, op1);
23804 return ret;
23807 void
23808 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
23810 machine_mode mode = GET_MODE (op0);
23811 rtx tmp;
23813 /* Handle the special case of a vector comparison with a boolean result;
23814 transform it using the ptest instruction. */
23815 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
23817 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
23818 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
23820 gcc_assert (code == EQ || code == NE);
23821 /* Generate XOR since we can't check that one operand is a zero vector. */
23822 tmp = gen_reg_rtx (mode);
23823 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
23824 tmp = gen_lowpart (p_mode, tmp);
23825 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
23826 gen_rtx_UNSPEC (CCmode,
23827 gen_rtvec (2, tmp, tmp),
23828 UNSPEC_PTEST)));
23829 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
23830 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23831 gen_rtx_LABEL_REF (VOIDmode, label),
23832 pc_rtx);
23833 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
23834 return;
23837 switch (mode)
23839 case E_SFmode:
23840 case E_DFmode:
23841 case E_XFmode:
23842 case E_QImode:
23843 case E_HImode:
23844 case E_SImode:
23845 simple:
23846 tmp = ix86_expand_compare (code, op0, op1);
23847 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23848 gen_rtx_LABEL_REF (VOIDmode, label),
23849 pc_rtx);
23850 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
23851 return;
23853 case E_DImode:
23854 if (TARGET_64BIT)
23855 goto simple;
23856 /* For a 32-bit target a DImode comparison may be performed in
23857 SSE registers. To allow this we should avoid the split
23858 into SImode, which is achieved by doing the xor in DImode
23859 and then comparing against zero (which is recognized by the
23860 STV pass). We don't compare using xor when optimizing
23861 for size. */
23862 if (!optimize_insn_for_size_p ()
23863 && TARGET_STV
23864 && (code == EQ || code == NE))
23866 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
23867 op1 = const0_rtx;
23869 /* FALLTHRU */
23870 case E_TImode:
23871 /* Expand DImode branch into multiple compare+branch. */
23873 rtx lo[2], hi[2];
23874 rtx_code_label *label2;
23875 enum rtx_code code1, code2, code3;
23876 machine_mode submode;
23878 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
23880 std::swap (op0, op1);
23881 code = swap_condition (code);
23884 split_double_mode (mode, &op0, 1, lo+0, hi+0);
23885 split_double_mode (mode, &op1, 1, lo+1, hi+1);
23887 submode = mode == DImode ? SImode : DImode;
23889 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
23890 avoid two branches. This costs one extra insn, so disable when
23891 optimizing for size. */
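/* E.g. for a DImode equality test on ia32 this yields, roughly,

	xorl	hi1, hi0
	xorl	lo1, lo0
	orl	lo0, hi0
	jz/jnz	label

   instead of a separate compare and branch on each half.  */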
23893 if ((code == EQ || code == NE)
23894 && (!optimize_insn_for_size_p ()
23895 || hi[1] == const0_rtx || lo[1] == const0_rtx))
23897 rtx xor0, xor1;
23899 xor1 = hi[0];
23900 if (hi[1] != const0_rtx)
23901 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
23902 NULL_RTX, 0, OPTAB_WIDEN);
23904 xor0 = lo[0];
23905 if (lo[1] != const0_rtx)
23906 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
23907 NULL_RTX, 0, OPTAB_WIDEN);
23909 tmp = expand_binop (submode, ior_optab, xor1, xor0,
23910 NULL_RTX, 0, OPTAB_WIDEN);
23912 ix86_expand_branch (code, tmp, const0_rtx, label);
23913 return;
23916 /* Otherwise, if we are doing less-than or greater-or-equal-than,
23917 op1 is a constant and the low word is zero, then we can just
23918 examine the high word. Similarly for low word -1 and
23919 less-or-equal-than or greater-than. */
23921 if (CONST_INT_P (hi[1]))
23922 switch (code)
23924 case LT: case LTU: case GE: case GEU:
23925 if (lo[1] == const0_rtx)
23927 ix86_expand_branch (code, hi[0], hi[1], label);
23928 return;
23930 break;
23931 case LE: case LEU: case GT: case GTU:
23932 if (lo[1] == constm1_rtx)
23934 ix86_expand_branch (code, hi[0], hi[1], label);
23935 return;
23937 break;
23938 default:
23939 break;
23942 /* Otherwise, we need two or three jumps. */
23944 label2 = gen_label_rtx ();
23946 code1 = code;
23947 code2 = swap_condition (code);
23948 code3 = unsigned_condition (code);
23950 switch (code)
23952 case LT: case GT: case LTU: case GTU:
23953 break;
23955 case LE: code1 = LT; code2 = GT; break;
23956 case GE: code1 = GT; code2 = LT; break;
23957 case LEU: code1 = LTU; code2 = GTU; break;
23958 case GEU: code1 = GTU; code2 = LTU; break;
23960 case EQ: code1 = UNKNOWN; code2 = NE; break;
23961 case NE: code2 = UNKNOWN; break;
23963 default:
23964 gcc_unreachable ();
23968 * a < b =>
23969 * if (hi(a) < hi(b)) goto true;
23970 * if (hi(a) > hi(b)) goto false;
23971 * if (lo(a) < lo(b)) goto true;
23972 * false:
23975 if (code1 != UNKNOWN)
23976 ix86_expand_branch (code1, hi[0], hi[1], label);
23977 if (code2 != UNKNOWN)
23978 ix86_expand_branch (code2, hi[0], hi[1], label2);
23980 ix86_expand_branch (code3, lo[0], lo[1], label);
23982 if (code2 != UNKNOWN)
23983 emit_label (label2);
23984 return;
23987 default:
23988 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
23989 goto simple;
23993 /* Split branch based on floating point condition. */
23994 void
23995 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
23996 rtx target1, rtx target2, rtx tmp)
23998 rtx condition;
23999 rtx_insn *i;
24001 if (target2 != pc_rtx)
24003 std::swap (target1, target2);
24004 code = reverse_condition_maybe_unordered (code);
24007 condition = ix86_expand_fp_compare (code, op1, op2,
24008 tmp);
24010 i = emit_jump_insn (gen_rtx_SET
24011 (pc_rtx,
24012 gen_rtx_IF_THEN_ELSE (VOIDmode,
24013 condition, target1, target2)));
24014 if (split_branch_probability.initialized_p ())
24015 add_reg_br_prob_note (i, split_branch_probability);
24018 void
24019 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
24021 rtx ret;
24023 gcc_assert (GET_MODE (dest) == QImode);
24025 ret = ix86_expand_compare (code, op0, op1);
24026 PUT_MODE (ret, QImode);
24027 emit_insn (gen_rtx_SET (dest, ret));
24030 /* Expand comparison setting or clearing carry flag. Return true when
24031 successful and set pop for the operation. */
24032 static bool
24033 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
24035 machine_mode mode =
24036 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
24038 /* Do not handle double-mode compares that go through special path. */
24039 if (mode == (TARGET_64BIT ? TImode : DImode))
24040 return false;
24042 if (SCALAR_FLOAT_MODE_P (mode))
24044 rtx compare_op;
24045 rtx_insn *compare_seq;
24047 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
24049 /* Shortcut: the following common codes never translate
24050 into carry-flag compares. */
24051 if (code == EQ || code == NE || code == UNEQ || code == LTGT
24052 || code == ORDERED || code == UNORDERED)
24053 return false;
24055 /* These comparisons require zero flag; swap operands so they won't. */
24056 if ((code == GT || code == UNLE || code == LE || code == UNGT)
24057 && !TARGET_IEEE_FP)
24059 std::swap (op0, op1);
24060 code = swap_condition (code);
24063 /* Try to expand the comparison and verify that we end up with
24064 a carry-flag-based comparison. This fails to be true only when
24065 we decide to expand the comparison using arithmetic, which is not
24066 a common scenario. */
24067 start_sequence ();
24068 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
24069 compare_seq = get_insns ();
24070 end_sequence ();
24072 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
24073 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
24074 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
24075 else
24076 code = GET_CODE (compare_op);
24078 if (code != LTU && code != GEU)
24079 return false;
24081 emit_insn (compare_seq);
24082 *pop = compare_op;
24083 return true;
24086 if (!INTEGRAL_MODE_P (mode))
24087 return false;
24089 switch (code)
24091 case LTU:
24092 case GEU:
24093 break;
24095 /* Convert a==0 into (unsigned)a<1. */
24096 case EQ:
24097 case NE:
24098 if (op1 != const0_rtx)
24099 return false;
24100 op1 = const1_rtx;
24101 code = (code == EQ ? LTU : GEU);
24102 break;
24104 /* Convert a>b into b<a or a>=b-1. */
24105 case GTU:
24106 case LEU:
24107 if (CONST_INT_P (op1))
24109 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
24110 /* Bail out on overflow. We could still swap the operands, but that
24111 would force loading the constant into a register. */
24112 if (op1 == const0_rtx
24113 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
24114 return false;
24115 code = (code == GTU ? GEU : LTU);
24117 else
24119 std::swap (op0, op1);
24120 code = (code == GTU ? LTU : GEU);
24122 break;
24124 /* Convert a>=0 into (unsigned)a<0x80000000. */
24125 case LT:
24126 case GE:
24127 if (mode == DImode || op1 != const0_rtx)
24128 return false;
24129 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
24130 code = (code == LT ? GEU : LTU);
24131 break;
24132 case LE:
24133 case GT:
24134 if (mode == DImode || op1 != constm1_rtx)
24135 return false;
24136 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
24137 code = (code == LE ? GEU : LTU);
24138 break;
24140 default:
24141 return false;
24143 /* Swapping operands may cause constant to appear as first operand. */
24144 if (!nonimmediate_operand (op0, VOIDmode))
24146 if (!can_create_pseudo_p ())
24147 return false;
24148 op0 = force_reg (mode, op0);
24150 *pop = ix86_expand_compare (code, op0, op1);
24151 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
24152 return true;
24155 bool
24156 ix86_expand_int_movcc (rtx operands[])
24158 enum rtx_code code = GET_CODE (operands[1]), compare_code;
24159 rtx_insn *compare_seq;
24160 rtx compare_op;
24161 machine_mode mode = GET_MODE (operands[0]);
24162 bool sign_bit_compare_p = false;
24163 rtx op0 = XEXP (operands[1], 0);
24164 rtx op1 = XEXP (operands[1], 1);
24166 if (GET_MODE (op0) == TImode
24167 || (GET_MODE (op0) == DImode
24168 && !TARGET_64BIT))
24169 return false;
24171 start_sequence ();
24172 compare_op = ix86_expand_compare (code, op0, op1);
24173 compare_seq = get_insns ();
24174 end_sequence ();
24176 compare_code = GET_CODE (compare_op);
24178 if ((op1 == const0_rtx && (code == GE || code == LT))
24179 || (op1 == constm1_rtx && (code == GT || code == LE)))
24180 sign_bit_compare_p = true;
24182 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
24183 HImode insns, we'd be swallowed in word prefix ops. */
24185 if ((mode != HImode || TARGET_FAST_PREFIX)
24186 && (mode != (TARGET_64BIT ? TImode : DImode))
24187 && CONST_INT_P (operands[2])
24188 && CONST_INT_P (operands[3]))
24190 rtx out = operands[0];
24191 HOST_WIDE_INT ct = INTVAL (operands[2]);
24192 HOST_WIDE_INT cf = INTVAL (operands[3]);
24193 HOST_WIDE_INT diff;
24195 diff = ct - cf;
24196 /* Sign-bit compares are better done using shifts than by using
24197 sbb. */
24198 if (sign_bit_compare_p
24199 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
24201 /* Detect overlap between destination and compare sources. */
24202 rtx tmp = out;
24204 if (!sign_bit_compare_p)
24206 rtx flags;
24207 bool fpcmp = false;
24209 compare_code = GET_CODE (compare_op);
24211 flags = XEXP (compare_op, 0);
24213 if (GET_MODE (flags) == CCFPmode
24214 || GET_MODE (flags) == CCFPUmode)
24216 fpcmp = true;
24217 compare_code
24218 = ix86_fp_compare_code_to_integer (compare_code);
24221 /* To simplify rest of code, restrict to the GEU case. */
24222 if (compare_code == LTU)
24224 std::swap (ct, cf);
24225 compare_code = reverse_condition (compare_code);
24226 code = reverse_condition (code);
24228 else
24230 if (fpcmp)
24231 PUT_CODE (compare_op,
24232 reverse_condition_maybe_unordered
24233 (GET_CODE (compare_op)));
24234 else
24235 PUT_CODE (compare_op,
24236 reverse_condition (GET_CODE (compare_op)));
24238 diff = ct - cf;
24240 if (reg_overlap_mentioned_p (out, op0)
24241 || reg_overlap_mentioned_p (out, op1))
24242 tmp = gen_reg_rtx (mode);
24244 if (mode == DImode)
24245 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
24246 else
24247 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
24248 flags, compare_op));
24250 else
24252 if (code == GT || code == GE)
24253 code = reverse_condition (code);
24254 else
24256 std::swap (ct, cf);
24257 diff = ct - cf;
24259 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
24262 if (diff == 1)
24265 * cmpl op0,op1
24266 * sbbl dest,dest
24267 * [addl dest, ct]
24269 * Size 5 - 8.
24271 if (ct)
24272 tmp = expand_simple_binop (mode, PLUS,
24273 tmp, GEN_INT (ct),
24274 copy_rtx (tmp), 1, OPTAB_DIRECT);
24276 else if (cf == -1)
24279 * cmpl op0,op1
24280 * sbbl dest,dest
24281 * orl $ct, dest
24283 * Size 8.
24285 tmp = expand_simple_binop (mode, IOR,
24286 tmp, GEN_INT (ct),
24287 copy_rtx (tmp), 1, OPTAB_DIRECT);
24289 else if (diff == -1 && ct)
24292 * cmpl op0,op1
24293 * sbbl dest,dest
24294 * notl dest
24295 * [addl dest, cf]
24297 * Size 8 - 11.
24299 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
24300 if (cf)
24301 tmp = expand_simple_binop (mode, PLUS,
24302 copy_rtx (tmp), GEN_INT (cf),
24303 copy_rtx (tmp), 1, OPTAB_DIRECT);
24305 else
24308 * cmpl op0,op1
24309 * sbbl dest,dest
24310 * [notl dest]
24311 * andl cf - ct, dest
24312 * [addl dest, ct]
24314 * Size 8 - 11.
24317 if (cf == 0)
24319 cf = ct;
24320 ct = 0;
24321 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
24324 tmp = expand_simple_binop (mode, AND,
24325 copy_rtx (tmp),
24326 gen_int_mode (cf - ct, mode),
24327 copy_rtx (tmp), 1, OPTAB_DIRECT);
24328 if (ct)
24329 tmp = expand_simple_binop (mode, PLUS,
24330 copy_rtx (tmp), GEN_INT (ct),
24331 copy_rtx (tmp), 1, OPTAB_DIRECT);
24334 if (!rtx_equal_p (tmp, out))
24335 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
24337 return true;
24340 if (diff < 0)
24342 machine_mode cmp_mode = GET_MODE (op0);
24343 enum rtx_code new_code;
24345 if (SCALAR_FLOAT_MODE_P (cmp_mode))
24347 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
24349 /* We may be reversing an unordered compare to a normal compare, which
24350 is not valid in general (we may convert a non-trapping condition
24351 to a trapping one); however, on i386 we currently emit all
24352 comparisons unordered. */
24353 new_code = reverse_condition_maybe_unordered (code);
24355 else
24356 new_code = ix86_reverse_condition (code, cmp_mode);
24357 if (new_code != UNKNOWN)
24359 std::swap (ct, cf);
24360 diff = -diff;
24361 code = new_code;
24365 compare_code = UNKNOWN;
24366 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
24367 && CONST_INT_P (op1))
24369 if (op1 == const0_rtx
24370 && (code == LT || code == GE))
24371 compare_code = code;
24372 else if (op1 == constm1_rtx)
24374 if (code == LE)
24375 compare_code = LT;
24376 else if (code == GT)
24377 compare_code = GE;
24381 /* Optimize dest = (op0 < 0) ? -1 : cf. */
24382 if (compare_code != UNKNOWN
24383 && GET_MODE (op0) == GET_MODE (out)
24384 && (cf == -1 || ct == -1))
24386 /* If lea code below could be used, only optimize
24387 if it results in a 2 insn sequence. */
24389 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
24390 || diff == 3 || diff == 5 || diff == 9)
24391 || (compare_code == LT && ct == -1)
24392 || (compare_code == GE && cf == -1))
24395 * notl op1 (if necessary)
24396 * sarl $31, op1
24397 * orl cf, op1
24399 if (ct != -1)
24401 cf = ct;
24402 ct = -1;
24403 code = reverse_condition (code);
24406 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
24408 out = expand_simple_binop (mode, IOR,
24409 out, GEN_INT (cf),
24410 out, 1, OPTAB_DIRECT);
24411 if (out != operands[0])
24412 emit_move_insn (operands[0], out);
24414 return true;
24419 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
24420 || diff == 3 || diff == 5 || diff == 9)
24421 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
24422 && (mode != DImode
24423 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
24426 * xorl dest,dest
24427 * cmpl op1,op2
24428 * setcc dest
24429 * lea cf(dest*(ct-cf)),dest
24431 * Size 14.
24433 * This also catches the degenerate setcc-only case.
24436 rtx tmp;
24437 int nops;
24439 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
24441 nops = 0;
24442 /* On x86_64 the lea instruction operates on Pmode, so we need
24443 to get arithmetics done in proper mode to match. */
24444 if (diff == 1)
24445 tmp = copy_rtx (out);
24446 else
24448 rtx out1;
24449 out1 = copy_rtx (out);
24450 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
24451 nops++;
24452 if (diff & 1)
24454 tmp = gen_rtx_PLUS (mode, tmp, out1);
24455 nops++;
24458 if (cf != 0)
24460 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
24461 nops++;
24463 if (!rtx_equal_p (tmp, out))
24465 if (nops == 1)
24466 out = force_operand (tmp, copy_rtx (out));
24467 else
24468 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
24470 if (!rtx_equal_p (out, operands[0]))
24471 emit_move_insn (operands[0], copy_rtx (out));
24473 return true;
24477 * General case: Jumpful:
24478 * xorl dest,dest cmpl op1, op2
24479 * cmpl op1, op2 movl ct, dest
24480 * setcc dest jcc 1f
24481 * decl dest movl cf, dest
24482 * andl (cf-ct),dest 1:
24483 * addl ct,dest
24485 * Size 20. Size 14.
24487 * This is reasonably steep, but branch mispredict costs are
24488 * high on modern cpus, so consider failing only if optimizing
24489 * for space.
24492 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
24493 && BRANCH_COST (optimize_insn_for_speed_p (),
24494 false) >= 2)
24496 if (cf == 0)
24498 machine_mode cmp_mode = GET_MODE (op0);
24499 enum rtx_code new_code;
24501 if (SCALAR_FLOAT_MODE_P (cmp_mode))
24503 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
24505 /* We may be reversing an unordered compare to a normal compare,
24506 which is not valid in general (we may convert a non-trapping
24507 condition to a trapping one); however, on i386 we currently
24508 emit all comparisons unordered. */
24509 new_code = reverse_condition_maybe_unordered (code);
24511 else
24513 new_code = ix86_reverse_condition (code, cmp_mode);
24514 if (compare_code != UNKNOWN && new_code != UNKNOWN)
24515 compare_code = reverse_condition (compare_code);
24518 if (new_code != UNKNOWN)
24520 cf = ct;
24521 ct = 0;
24522 code = new_code;
24526 if (compare_code != UNKNOWN)
24528 /* notl op1 (if needed)
24529 sarl $31, op1
24530 andl (cf-ct), op1
24531 addl ct, op1
24533 For x < 0 (resp. x <= -1) there will be no notl,
24534 so if possible swap the constants to get rid of the
24535 complement.
24536 True/false will be -1/0 while code below (store flag
24537 followed by decrement) is 0/-1, so the constants need
24538 to be exchanged once more. */
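/* For example, for dest = (x < 0 ? 5 : 3): sarl $31 leaves -1 or 0
   in op1, andl $2 (cf - ct) gives 2 or 0, and addl $3 gives 5 or 3.  */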
24540 if (compare_code == GE || !cf)
24542 code = reverse_condition (code);
24543 compare_code = LT;
24545 else
24546 std::swap (ct, cf);
24548 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
24550 else
24552 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
24554 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
24555 constm1_rtx,
24556 copy_rtx (out), 1, OPTAB_DIRECT);
24559 out = expand_simple_binop (mode, AND, copy_rtx (out),
24560 gen_int_mode (cf - ct, mode),
24561 copy_rtx (out), 1, OPTAB_DIRECT);
24562 if (ct)
24563 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
24564 copy_rtx (out), 1, OPTAB_DIRECT);
24565 if (!rtx_equal_p (out, operands[0]))
24566 emit_move_insn (operands[0], copy_rtx (out));
24568 return true;
24572 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
24574 /* Try a few things more with specific constants and a variable. */
24576 optab op;
24577 rtx var, orig_out, out, tmp;
24579 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
24580 return false;
24582 /* If one of the two operands is an interesting constant, load that
24583 constant first (by recursing into this expander) and then mask in the variable with a logical operation. */
24585 if (CONST_INT_P (operands[2]))
24587 var = operands[3];
24588 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
24589 operands[3] = constm1_rtx, op = and_optab;
24590 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
24591 operands[3] = const0_rtx, op = ior_optab;
24592 else
24593 return false;
24595 else if (CONST_INT_P (operands[3]))
24597 var = operands[2];
24598 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
24599 operands[2] = constm1_rtx, op = and_optab;
24600 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
24601 operands[2] = const0_rtx, op = ior_optab;
24602 else
24603 return false;
24605 else
24606 return false;
24608 orig_out = operands[0];
24609 tmp = gen_reg_rtx (mode);
24610 operands[0] = tmp;
24612 /* Recurse to get the constant loaded. */
24613 if (!ix86_expand_int_movcc (operands))
24614 return false;
24616 /* Mask in the interesting variable. */
24617 out = expand_binop (mode, op, var, tmp, orig_out, 0,
24618 OPTAB_WIDEN);
24619 if (!rtx_equal_p (out, orig_out))
24620 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
24622 return true;
24626 * For comparison with above,
24628 * movl cf,dest
24629 * movl ct,tmp
24630 * cmpl op1,op2
24631 * cmovcc tmp,dest
24633 * Size 15.
24636 if (! nonimmediate_operand (operands[2], mode))
24637 operands[2] = force_reg (mode, operands[2]);
24638 if (! nonimmediate_operand (operands[3], mode))
24639 operands[3] = force_reg (mode, operands[3]);
24641 if (! register_operand (operands[2], VOIDmode)
24642 && (mode == QImode
24643 || ! register_operand (operands[3], VOIDmode)))
24644 operands[2] = force_reg (mode, operands[2]);
24646 if (mode == QImode
24647 && ! register_operand (operands[3], VOIDmode))
24648 operands[3] = force_reg (mode, operands[3]);
24650 emit_insn (compare_seq);
24651 emit_insn (gen_rtx_SET (operands[0],
24652 gen_rtx_IF_THEN_ELSE (mode,
24653 compare_op, operands[2],
24654 operands[3])));
24655 return true;
24658 /* Swap, force into registers, or otherwise massage the two operands
24659 to an sse comparison with a mask result. Thus we differ a bit from
24660 ix86_prepare_fp_compare_args which expects to produce a flags result.
24662 The DEST operand exists to help determine whether to commute commutative
24663 operators. The POP0/POP1 operands are updated in place. The new
24664 comparison code is returned, or UNKNOWN if not implementable. */
24666 static enum rtx_code
24667 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
24668 rtx *pop0, rtx *pop1)
24670 switch (code)
24672 case LTGT:
24673 case UNEQ:
24674 /* AVX supports all the needed comparisons. */
24675 if (TARGET_AVX)
24676 break;
24677 /* We have no LTGT as an operator. We could implement it with
24678 NE & ORDERED, but this requires an extra temporary. It's
24679 not clear that it's worth it. */
24680 return UNKNOWN;
24682 case LT:
24683 case LE:
24684 case UNGT:
24685 case UNGE:
24686 /* These are supported directly. */
24687 break;
24689 case EQ:
24690 case NE:
24691 case UNORDERED:
24692 case ORDERED:
24693 /* AVX has 3 operand comparisons, no need to swap anything. */
24694 if (TARGET_AVX)
24695 break;
24696 /* For commutative operators, try to canonicalize the destination
24697 operand to be first in the comparison - this helps reload to
24698 avoid extra moves. */
24699 if (!dest || !rtx_equal_p (dest, *pop1))
24700 break;
24701 /* FALLTHRU */
24703 case GE:
24704 case GT:
24705 case UNLE:
24706 case UNLT:
24707 /* These are not supported directly before AVX, and furthermore
24708 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
24709 comparison operands to transform into something that is
24710 supported. */
24711 std::swap (*pop0, *pop1);
24712 code = swap_condition (code);
24713 break;
24715 default:
24716 gcc_unreachable ();
24719 return code;
24722 /* Detect conditional moves that exactly match min/max operational
24723 semantics. Note that this is IEEE safe, as long as we don't
24724 interchange the operands.
24726 Returns FALSE if this conditional move doesn't match a MIN/MAX,
24727 and TRUE if the operation is successful and instructions are emitted. */
24729 static bool
24730 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
24731 rtx cmp_op1, rtx if_true, rtx if_false)
24733 machine_mode mode;
24734 bool is_min;
24735 rtx tmp;
24737 if (code == LT)
24739 else if (code == UNGE)
24740 std::swap (if_true, if_false);
24741 else
24742 return false;
24744 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
24745 is_min = true;
24746 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
24747 is_min = false;
24748 else
24749 return false;
24751 mode = GET_MODE (dest);
24753 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
24754 but MODE may be a vector mode and thus not appropriate. */
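/* The SSE min/max instructions return their second operand when
   either input is a NaN and when comparing -0.0 with +0.0, so when
   NaNs or signed zeros matter the operand order is significant;
   the UNSPEC below keeps the operation from being treated as a
   commutative SMIN/SMAX.  */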
24755 if (!flag_finite_math_only || flag_signed_zeros)
24757 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
24758 rtvec v;
24760 if_true = force_reg (mode, if_true);
24761 v = gen_rtvec (2, if_true, if_false);
24762 tmp = gen_rtx_UNSPEC (mode, v, u);
24764 else
24766 code = is_min ? SMIN : SMAX;
24767 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
24770 emit_insn (gen_rtx_SET (dest, tmp));
24771 return true;
24774 /* Expand an sse vector comparison. Return the register with the result. */
24776 static rtx
24777 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
24778 rtx op_true, rtx op_false)
24780 machine_mode mode = GET_MODE (dest);
24781 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
24783 /* In the general case the result of the comparison can differ from the operands' type. */
24784 machine_mode cmp_mode;
24786 /* In AVX512F the result of comparison is an integer mask. */
24787 bool maskcmp = false;
24788 rtx x;
24790 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
24792 cmp_mode = mode_for_size (GET_MODE_NUNITS (cmp_ops_mode), MODE_INT, 0);
24793 gcc_assert (cmp_mode != BLKmode);
24795 maskcmp = true;
24797 else
24798 cmp_mode = cmp_ops_mode;
24801 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
24802 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
24803 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
24805 if (optimize
24806 || (maskcmp && cmp_mode != mode)
24807 || (op_true && reg_overlap_mentioned_p (dest, op_true))
24808 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
24809 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
24811 /* Compare patterns for int modes are unspec in AVX512F only. */
24812 if (maskcmp && (code == GT || code == EQ))
24814 rtx (*gen)(rtx, rtx, rtx);
24816 switch (cmp_ops_mode)
24818 case E_V64QImode:
24819 gcc_assert (TARGET_AVX512BW);
24820 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
24821 break;
24822 case E_V32HImode:
24823 gcc_assert (TARGET_AVX512BW);
24824 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
24825 break;
24826 case E_V16SImode:
24827 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
24828 break;
24829 case E_V8DImode:
24830 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
24831 break;
24832 default:
24833 gen = NULL;
24836 if (gen)
24838 emit_insn (gen (dest, cmp_op0, cmp_op1));
24839 return dest;
24842 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
24844 if (cmp_mode != mode && !maskcmp)
24846 x = force_reg (cmp_ops_mode, x);
24847 convert_move (dest, x, false);
24849 else
24850 emit_insn (gen_rtx_SET (dest, x));
24852 return dest;
24855 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
24856 operations. This is used for both scalar and vector conditional moves. */
24858 void
24859 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
24861 machine_mode mode = GET_MODE (dest);
24862 machine_mode cmpmode = GET_MODE (cmp);
24864 /* In AVX512F the result of comparison is an integer mask. */
24865 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
24867 rtx t2, t3, x;
24869 /* If we have an integer mask and an FP value then we need
24870 to cast the mask to FP mode. */
24871 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
24873 cmp = force_reg (cmpmode, cmp);
24874 cmp = gen_rtx_SUBREG (mode, cmp, 0);
24877 if (vector_all_ones_operand (op_true, mode)
24878 && rtx_equal_p (op_false, CONST0_RTX (mode))
24879 && !maskcmp)
24881 emit_insn (gen_rtx_SET (dest, cmp));
24883 else if (op_false == CONST0_RTX (mode)
24884 && !maskcmp)
24886 op_true = force_reg (mode, op_true);
24887 x = gen_rtx_AND (mode, cmp, op_true);
24888 emit_insn (gen_rtx_SET (dest, x));
24890 else if (op_true == CONST0_RTX (mode)
24891 && !maskcmp)
24893 op_false = force_reg (mode, op_false);
24894 x = gen_rtx_NOT (mode, cmp);
24895 x = gen_rtx_AND (mode, x, op_false);
24896 emit_insn (gen_rtx_SET (dest, x));
24898 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
24899 && !maskcmp)
24901 op_false = force_reg (mode, op_false);
24902 x = gen_rtx_IOR (mode, cmp, op_false);
24903 emit_insn (gen_rtx_SET (dest, x));
24905 else if (TARGET_XOP
24906 && !maskcmp)
24908 op_true = force_reg (mode, op_true);
24910 if (!nonimmediate_operand (op_false, mode))
24911 op_false = force_reg (mode, op_false);
24913 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
24914 op_true,
24915 op_false)));
24917 else
24919 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24920 rtx d = dest;
24922 if (!nonimmediate_operand (op_true, mode))
24923 op_true = force_reg (mode, op_true);
24925 op_false = force_reg (mode, op_false);
24927 switch (mode)
24929 case E_V4SFmode:
24930 if (TARGET_SSE4_1)
24931 gen = gen_sse4_1_blendvps;
24932 break;
24933 case E_V2DFmode:
24934 if (TARGET_SSE4_1)
24935 gen = gen_sse4_1_blendvpd;
24936 break;
24937 case E_V16QImode:
24938 case E_V8HImode:
24939 case E_V4SImode:
24940 case E_V2DImode:
24941 if (TARGET_SSE4_1)
24943 gen = gen_sse4_1_pblendvb;
24944 if (mode != V16QImode)
24945 d = gen_reg_rtx (V16QImode);
24946 op_false = gen_lowpart (V16QImode, op_false);
24947 op_true = gen_lowpart (V16QImode, op_true);
24948 cmp = gen_lowpart (V16QImode, cmp);
24950 break;
24951 case E_V8SFmode:
24952 if (TARGET_AVX)
24953 gen = gen_avx_blendvps256;
24954 break;
24955 case E_V4DFmode:
24956 if (TARGET_AVX)
24957 gen = gen_avx_blendvpd256;
24958 break;
24959 case E_V32QImode:
24960 case E_V16HImode:
24961 case E_V8SImode:
24962 case E_V4DImode:
24963 if (TARGET_AVX2)
24965 gen = gen_avx2_pblendvb;
24966 if (mode != V32QImode)
24967 d = gen_reg_rtx (V32QImode);
24968 op_false = gen_lowpart (V32QImode, op_false);
24969 op_true = gen_lowpart (V32QImode, op_true);
24970 cmp = gen_lowpart (V32QImode, cmp);
24972 break;
24974 case E_V64QImode:
24975 gen = gen_avx512bw_blendmv64qi;
24976 break;
24977 case E_V32HImode:
24978 gen = gen_avx512bw_blendmv32hi;
24979 break;
24980 case E_V16SImode:
24981 gen = gen_avx512f_blendmv16si;
24982 break;
24983 case E_V8DImode:
24984 gen = gen_avx512f_blendmv8di;
24985 break;
24986 case E_V8DFmode:
24987 gen = gen_avx512f_blendmv8df;
24988 break;
24989 case E_V16SFmode:
24990 gen = gen_avx512f_blendmv16sf;
24991 break;
24993 default:
24994 break;
24997 if (gen != NULL)
24999 emit_insn (gen (d, op_false, op_true, cmp));
25000 if (d != dest)
25001 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
25003 else
25005 op_true = force_reg (mode, op_true);
25007 t2 = gen_reg_rtx (mode);
25008 if (optimize)
25009 t3 = gen_reg_rtx (mode);
25010 else
25011 t3 = dest;
25013 x = gen_rtx_AND (mode, op_true, cmp);
25014 emit_insn (gen_rtx_SET (t2, x));
25016 x = gen_rtx_NOT (mode, cmp);
25017 x = gen_rtx_AND (mode, x, op_false);
25018 emit_insn (gen_rtx_SET (t3, x));
25020 x = gen_rtx_IOR (mode, t3, t2);
25021 emit_insn (gen_rtx_SET (dest, x));
25026 /* Expand a floating-point conditional move. Return true if successful. */
25028 bool
25029 ix86_expand_fp_movcc (rtx operands[])
25031 machine_mode mode = GET_MODE (operands[0]);
25032 enum rtx_code code = GET_CODE (operands[1]);
25033 rtx tmp, compare_op;
25034 rtx op0 = XEXP (operands[1], 0);
25035 rtx op1 = XEXP (operands[1], 1);
25037 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
25039 machine_mode cmode;
25041 /* Since we've no cmove for sse registers, don't force bad register
25042 allocation just to gain access to it. Deny movcc when the
25043 comparison mode doesn't match the move mode. */
25044 cmode = GET_MODE (op0);
25045 if (cmode == VOIDmode)
25046 cmode = GET_MODE (op1);
25047 if (cmode != mode)
25048 return false;
25050 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
25051 if (code == UNKNOWN)
25052 return false;
25054 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
25055 operands[2], operands[3]))
25056 return true;
25058 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
25059 operands[2], operands[3]);
25060 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
25061 return true;
25064 if (GET_MODE (op0) == TImode
25065 || (GET_MODE (op0) == DImode
25066 && !TARGET_64BIT))
25067 return false;
25069 /* The floating point conditional move instructions don't directly
25070 support conditions resulting from a signed integer comparison. */
25072 compare_op = ix86_expand_compare (code, op0, op1);
25073 if (!fcmov_comparison_operator (compare_op, VOIDmode))
25075 tmp = gen_reg_rtx (QImode);
25076 ix86_expand_setcc (tmp, code, op0, op1);
25078 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
25081 emit_insn (gen_rtx_SET (operands[0],
25082 gen_rtx_IF_THEN_ELSE (mode, compare_op,
25083 operands[2], operands[3])));
25085 return true;
25088 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
25090 static int
25091 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
25093 switch (code)
25095 case EQ:
25096 return 0;
25097 case LT:
25098 case LTU:
25099 return 1;
25100 case LE:
25101 case LEU:
25102 return 2;
25103 case NE:
25104 return 4;
25105 case GE:
25106 case GEU:
25107 return 5;
25108 case GT:
25109 case GTU:
25110 return 6;
25111 default:
25112 gcc_unreachable ();
25116 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
25118 static int
25119 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
25121 switch (code)
25123 case EQ:
25124 return 0x00;
25125 case NE:
25126 return 0x04;
25127 case GT:
25128 return 0x0e;
25129 case LE:
25130 return 0x02;
25131 case GE:
25132 return 0x0d;
25133 case LT:
25134 return 0x01;
25135 case UNLE:
25136 return 0x0a;
25137 case UNLT:
25138 return 0x09;
25139 case UNGE:
25140 return 0x05;
25141 case UNGT:
25142 return 0x06;
25143 case UNEQ:
25144 return 0x18;
25145 case LTGT:
25146 return 0x0c;
25147 case ORDERED:
25148 return 0x07;
25149 case UNORDERED:
25150 return 0x03;
25151 default:
25152 gcc_unreachable ();
25156 /* Return immediate value to be used in UNSPEC_PCMP
25157 for comparison CODE in MODE. */
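/* The values returned by the two helpers above correspond to the
   predicate-immediate encodings of the AVX-512 VPCMP and VCMP
   instructions.  */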
25159 static int
25160 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
25162 if (FLOAT_MODE_P (mode))
25163 return ix86_fp_cmp_code_to_pcmp_immediate (code);
25164 return ix86_int_cmp_code_to_pcmp_immediate (code);
25167 /* Expand AVX-512 vector comparison. */
25169 bool
25170 ix86_expand_mask_vec_cmp (rtx operands[])
25172 machine_mode mask_mode = GET_MODE (operands[0]);
25173 machine_mode cmp_mode = GET_MODE (operands[2]);
25174 enum rtx_code code = GET_CODE (operands[1]);
25175 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
25176 int unspec_code;
25177 rtx unspec;
25179 switch (code)
25181 case LEU:
25182 case GTU:
25183 case GEU:
25184 case LTU:
25185 unspec_code = UNSPEC_UNSIGNED_PCMP;
25186 break;
25188 default:
25189 unspec_code = UNSPEC_PCMP;
25192 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
25193 operands[3], imm),
25194 unspec_code);
25195 emit_insn (gen_rtx_SET (operands[0], unspec));
25197 return true;
25200 /* Expand fp vector comparison. */
25202 bool
25203 ix86_expand_fp_vec_cmp (rtx operands[])
25205 enum rtx_code code = GET_CODE (operands[1]);
25206 rtx cmp;
25208 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
25209 &operands[2], &operands[3]);
25210 if (code == UNKNOWN)
25212 rtx temp;
25213 switch (GET_CODE (operands[1]))
25215 case LTGT:
25216 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
25217 operands[3], NULL, NULL);
25218 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
25219 operands[3], NULL, NULL);
25220 code = AND;
25221 break;
25222 case UNEQ:
25223 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
25224 operands[3], NULL, NULL);
25225 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
25226 operands[3], NULL, NULL);
25227 code = IOR;
25228 break;
25229 default:
25230 gcc_unreachable ();
25232 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
25233 OPTAB_DIRECT);
25235 else
25236 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
25237 operands[1], operands[2]);
25239 if (operands[0] != cmp)
25240 emit_move_insn (operands[0], cmp);
25242 return true;
25245 static rtx
25246 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
25247 rtx op_true, rtx op_false, bool *negate)
25249 machine_mode data_mode = GET_MODE (dest);
25250 machine_mode mode = GET_MODE (cop0);
25251 rtx x;
25253 *negate = false;
25255 /* XOP supports all of the comparisons on all 128-bit vector int types. */
25256 if (TARGET_XOP
25257 && (mode == V16QImode || mode == V8HImode
25258 || mode == V4SImode || mode == V2DImode))
25260 else
25262 /* Canonicalize the comparison to EQ, GT, GTU. */
25263 switch (code)
25265 case EQ:
25266 case GT:
25267 case GTU:
25268 break;
25270 case NE:
25271 case LE:
25272 case LEU:
25273 code = reverse_condition (code);
25274 *negate = true;
25275 break;
25277 case GE:
25278 case GEU:
25279 code = reverse_condition (code);
25280 *negate = true;
25281 /* FALLTHRU */
25283 case LT:
25284 case LTU:
25285 std::swap (cop0, cop1);
25286 code = swap_condition (code);
25287 break;
25289 default:
25290 gcc_unreachable ();
25293 /* Only SSE4.1/SSE4.2 supports V2DImode. */
25294 if (mode == V2DImode)
25296 switch (code)
25298 case EQ:
25299 /* SSE4.1 supports EQ. */
25300 if (!TARGET_SSE4_1)
25301 return NULL;
25302 break;
25304 case GT:
25305 case GTU:
25306 /* SSE4.2 supports GT/GTU. */
25307 if (!TARGET_SSE4_2)
25308 return NULL;
25309 break;
25311 default:
25312 gcc_unreachable ();
25316 /* Unsigned parallel compare is not supported by the hardware.
25317 Play some tricks to turn this into a signed comparison
25318 against 0. */
25319 if (code == GTU)
25321 cop0 = force_reg (mode, cop0);
25323 switch (mode)
25325 case E_V16SImode:
25326 case E_V8DImode:
25327 case E_V8SImode:
25328 case E_V4DImode:
25329 case E_V4SImode:
25330 case E_V2DImode:
25332 rtx t1, t2, mask;
25333 rtx (*gen_sub3) (rtx, rtx, rtx);
25335 switch (mode)
25337 case E_V16SImode: gen_sub3 = gen_subv16si3; break;
25338 case E_V8DImode: gen_sub3 = gen_subv8di3; break;
25339 case E_V8SImode: gen_sub3 = gen_subv8si3; break;
25340 case E_V4DImode: gen_sub3 = gen_subv4di3; break;
25341 case E_V4SImode: gen_sub3 = gen_subv4si3; break;
25342 case E_V2DImode: gen_sub3 = gen_subv2di3; break;
25343 default:
25344 gcc_unreachable ();
25346 /* Subtract (-(INT MAX) - 1) from both operands to make
25347 them signed. */
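/* E.g. 0xffffffff GTU 1 becomes 0x7fffffff GT 0x80000001 after
   subtracting 0x80000000 from both sides; the signed comparison
   then gives the same result as the original unsigned one.  */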
25348 mask = ix86_build_signbit_mask (mode, true, false);
25349 t1 = gen_reg_rtx (mode);
25350 emit_insn (gen_sub3 (t1, cop0, mask));
25352 t2 = gen_reg_rtx (mode);
25353 emit_insn (gen_sub3 (t2, cop1, mask));
25355 cop0 = t1;
25356 cop1 = t2;
25357 code = GT;
25359 break;
25361 case E_V64QImode:
25362 case E_V32HImode:
25363 case E_V32QImode:
25364 case E_V16HImode:
25365 case E_V16QImode:
25366 case E_V8HImode:
25367 /* Perform a parallel unsigned saturating subtraction. */
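/* a GTU b is equivalent to (a -us b) != 0, where -us is unsigned
   saturating subtraction.  Only EQ against zero is available here,
   so compute the EQ form and flip *negate to obtain NE.  */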
25368 x = gen_reg_rtx (mode);
25369 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
25370 cop1)));
25372 cop0 = x;
25373 cop1 = CONST0_RTX (mode);
25374 code = EQ;
25375 *negate = !*negate;
25376 break;
25378 default:
25379 gcc_unreachable ();
25384 if (*negate)
25385 std::swap (op_true, op_false);
25387 /* Allow the comparison to be done in one mode, but the movcc to
25388 happen in another mode. */
25389 if (data_mode == mode)
25391 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
25392 op_true, op_false);
25394 else
25396 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
25397 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
25398 op_true, op_false);
25399 if (GET_MODE (x) == mode)
25400 x = gen_lowpart (data_mode, x);
25403 return x;
25406 /* Expand integer vector comparison. */
25408 bool
25409 ix86_expand_int_vec_cmp (rtx operands[])
25411 rtx_code code = GET_CODE (operands[1]);
25412 bool negate = false;
25413 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
25414 operands[3], NULL, NULL, &negate);
25416 if (!cmp)
25417 return false;
25419 if (negate)
25420 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
25421 CONST0_RTX (GET_MODE (cmp)),
25422 NULL, NULL, &negate);
25424 gcc_assert (!negate);
25426 if (operands[0] != cmp)
25427 emit_move_insn (operands[0], cmp);
25429 return true;
25432 /* Expand a floating-point vector conditional move; a vcond operation
25433 rather than a movcc operation. */
25435 bool
25436 ix86_expand_fp_vcond (rtx operands[])
25438 enum rtx_code code = GET_CODE (operands[3]);
25439 rtx cmp;
25441 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
25442 &operands[4], &operands[5]);
25443 if (code == UNKNOWN)
25445 rtx temp;
25446 switch (GET_CODE (operands[3]))
25448 case LTGT:
25449 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
25450 operands[5], operands[0], operands[0]);
25451 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
25452 operands[5], operands[1], operands[2]);
25453 code = AND;
25454 break;
25455 case UNEQ:
25456 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
25457 operands[5], operands[0], operands[0]);
25458 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
25459 operands[5], operands[1], operands[2]);
25460 code = IOR;
25461 break;
25462 default:
25463 gcc_unreachable ();
25465 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
25466 OPTAB_DIRECT);
25467 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
25468 return true;
25471 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
25472 operands[5], operands[1], operands[2]))
25473 return true;
25475 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
25476 operands[1], operands[2]);
25477 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
25478 return true;
25481 /* Expand a signed/unsigned integral vector conditional move. */
25483 bool
25484 ix86_expand_int_vcond (rtx operands[])
25486 machine_mode data_mode = GET_MODE (operands[0]);
25487 machine_mode mode = GET_MODE (operands[4]);
25488 enum rtx_code code = GET_CODE (operands[3]);
25489 bool negate = false;
25490 rtx x, cop0, cop1;
25492 cop0 = operands[4];
25493 cop1 = operands[5];
25495 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
25496 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
25497 if ((code == LT || code == GE)
25498 && data_mode == mode
25499 && cop1 == CONST0_RTX (mode)
25500 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
25501 && GET_MODE_UNIT_SIZE (data_mode) > 1
25502 && GET_MODE_UNIT_SIZE (data_mode) <= 8
25503 && (GET_MODE_SIZE (data_mode) == 16
25504 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
25506 rtx negop = operands[2 - (code == LT)];
25507 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
25508 if (negop == CONST1_RTX (data_mode))
25510 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
25511 operands[0], 1, OPTAB_DIRECT);
25512 if (res != operands[0])
25513 emit_move_insn (operands[0], res);
25514 return true;
25516 else if (GET_MODE_INNER (data_mode) != DImode
25517 && vector_all_ones_operand (negop, data_mode))
25519 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
25520 operands[0], 0, OPTAB_DIRECT);
25521 if (res != operands[0])
25522 emit_move_insn (operands[0], res);
25523 return true;
25527 if (!nonimmediate_operand (cop1, mode))
25528 cop1 = force_reg (mode, cop1);
25529 if (!general_operand (operands[1], data_mode))
25530 operands[1] = force_reg (data_mode, operands[1]);
25531 if (!general_operand (operands[2], data_mode))
25532 operands[2] = force_reg (data_mode, operands[2]);
25534 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
25535 operands[1], operands[2], &negate);
25537 if (!x)
25538 return false;
25540 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
25541 operands[2-negate]);
25542 return true;
25545 /* AVX512F does support 64-byte integer vector operations,
25546 thus the longest vector we are faced with is V64QImode. */
25547 #define MAX_VECT_LEN 64
25549 struct expand_vec_perm_d
25551 rtx target, op0, op1;
25552 unsigned char perm[MAX_VECT_LEN];
25553 machine_mode vmode;
25554 unsigned char nelt;
25555 bool one_operand_p;
25556 bool testing_p;
25559 static bool
25560 ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
25561 struct expand_vec_perm_d *d)
25563 /* ix86_expand_vec_perm_vpermi2 is called from both the const and non-const
25564 expanders, so the args are either in d, or in op0, op1, etc. */
25565 machine_mode mode = GET_MODE (d ? d->op0 : op0);
25566 machine_mode maskmode = mode;
25567 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
25569 switch (mode)
25571 case E_V8HImode:
25572 if (TARGET_AVX512VL && TARGET_AVX512BW)
25573 gen = gen_avx512vl_vpermi2varv8hi3;
25574 break;
25575 case E_V16HImode:
25576 if (TARGET_AVX512VL && TARGET_AVX512BW)
25577 gen = gen_avx512vl_vpermi2varv16hi3;
25578 break;
25579 case E_V64QImode:
25580 if (TARGET_AVX512VBMI)
25581 gen = gen_avx512bw_vpermi2varv64qi3;
25582 break;
25583 case E_V32HImode:
25584 if (TARGET_AVX512BW)
25585 gen = gen_avx512bw_vpermi2varv32hi3;
25586 break;
25587 case E_V4SImode:
25588 if (TARGET_AVX512VL)
25589 gen = gen_avx512vl_vpermi2varv4si3;
25590 break;
25591 case E_V8SImode:
25592 if (TARGET_AVX512VL)
25593 gen = gen_avx512vl_vpermi2varv8si3;
25594 break;
25595 case E_V16SImode:
25596 if (TARGET_AVX512F)
25597 gen = gen_avx512f_vpermi2varv16si3;
25598 break;
25599 case E_V4SFmode:
25600 if (TARGET_AVX512VL)
25602 gen = gen_avx512vl_vpermi2varv4sf3;
25603 maskmode = V4SImode;
25605 break;
25606 case E_V8SFmode:
25607 if (TARGET_AVX512VL)
25609 gen = gen_avx512vl_vpermi2varv8sf3;
25610 maskmode = V8SImode;
25612 break;
25613 case E_V16SFmode:
25614 if (TARGET_AVX512F)
25616 gen = gen_avx512f_vpermi2varv16sf3;
25617 maskmode = V16SImode;
25619 break;
25620 case E_V2DImode:
25621 if (TARGET_AVX512VL)
25622 gen = gen_avx512vl_vpermi2varv2di3;
25623 break;
25624 case E_V4DImode:
25625 if (TARGET_AVX512VL)
25626 gen = gen_avx512vl_vpermi2varv4di3;
25627 break;
25628 case E_V8DImode:
25629 if (TARGET_AVX512F)
25630 gen = gen_avx512f_vpermi2varv8di3;
25631 break;
25632 case E_V2DFmode:
25633 if (TARGET_AVX512VL)
25635 gen = gen_avx512vl_vpermi2varv2df3;
25636 maskmode = V2DImode;
25638 break;
25639 case E_V4DFmode:
25640 if (TARGET_AVX512VL)
25642 gen = gen_avx512vl_vpermi2varv4df3;
25643 maskmode = V4DImode;
25645 break;
25646 case E_V8DFmode:
25647 if (TARGET_AVX512F)
25649 gen = gen_avx512f_vpermi2varv8df3;
25650 maskmode = V8DImode;
25652 break;
25653 default:
25654 break;
25657 if (gen == NULL)
25658 return false;
25660 /* ix86_expand_vec_perm_vpermi2 is called from both the const and non-const
25661 expanders, so the args are either in d, or in op0, op1, etc. */
25662 if (d)
25664 rtx vec[64];
25665 target = d->target;
25666 op0 = d->op0;
25667 op1 = d->op1;
25668 for (int i = 0; i < d->nelt; ++i)
25669 vec[i] = GEN_INT (d->perm[i]);
25670 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
25673 emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
25674 return true;
25677 /* Expand a variable vector permutation. */
25679 void
25680 ix86_expand_vec_perm (rtx operands[])
25682 rtx target = operands[0];
25683 rtx op0 = operands[1];
25684 rtx op1 = operands[2];
25685 rtx mask = operands[3];
25686 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
25687 machine_mode mode = GET_MODE (op0);
25688 machine_mode maskmode = GET_MODE (mask);
25689 int w, e, i;
25690 bool one_operand_shuffle = rtx_equal_p (op0, op1);
25692 /* Number of elements in the vector. */
25693 w = GET_MODE_NUNITS (mode);
25694 e = GET_MODE_UNIT_SIZE (mode);
25695 gcc_assert (w <= 64);
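/* Strategy: first try a single variable-permute instruction
   (AVX-512 vpermvar or vpermi2var), then the AVX2-specific handling
   below, and finally fall back to byte shuffles on V16QImode using
   pshufb (or XOP vpperm).  */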
25697 if (TARGET_AVX512F && one_operand_shuffle)
25699 rtx (*gen) (rtx, rtx, rtx) = NULL;
25700 switch (mode)
25702 case E_V16SImode:
25703 gen = gen_avx512f_permvarv16si;
25704 break;
25705 case E_V16SFmode:
25706 gen = gen_avx512f_permvarv16sf;
25707 break;
25708 case E_V8DImode:
25709 gen = gen_avx512f_permvarv8di;
25710 break;
25711 case E_V8DFmode:
25712 gen = gen_avx512f_permvarv8df;
25713 break;
25714 default:
25715 break;
25717 if (gen != NULL)
25719 emit_insn (gen (target, op0, mask));
25720 return;
25724 if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1, NULL))
25725 return;
25727 if (TARGET_AVX2)
25729 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
25731 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
25732 a constant shuffle operand. With a tiny bit of effort we can
25733 use VPERMD instead. A re-interpretation stall for V4DFmode is
25734 unfortunate but there's no avoiding it.
25735 Similarly, for V16HImode we don't have instructions for variable
25736 shuffling, while for V32QImode we can, after preparing suitable
25737 masks, use vpshufb; vpshufb; vpermq; vpor. */
25739 if (mode == V16HImode)
25741 maskmode = mode = V32QImode;
25742 w = 32;
25743 e = 1;
25745 else
25747 maskmode = mode = V8SImode;
25748 w = 8;
25749 e = 4;
25751 t1 = gen_reg_rtx (maskmode);
25753 /* Replicate the low bits of the V4DImode mask into V8SImode:
25754 mask = { A B C D }
25755 t1 = { A A B B C C D D }. */
25756 for (i = 0; i < w / 2; ++i)
25757 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
25758 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25759 vt = force_reg (maskmode, vt);
25760 mask = gen_lowpart (maskmode, mask);
25761 if (maskmode == V8SImode)
25762 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
25763 else
25764 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
25766 /* Multiply the shuffle indices by two. */
25767 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
25768 OPTAB_DIRECT);
25770 /* Add one to the odd shuffle indices:
25771 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
25772 for (i = 0; i < w / 2; ++i)
25774 vec[i * 2] = const0_rtx;
25775 vec[i * 2 + 1] = const1_rtx;
25777 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25778 vt = validize_mem (force_const_mem (maskmode, vt));
25779 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
25780 OPTAB_DIRECT);
25782 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
25783 operands[3] = mask = t1;
25784 target = gen_reg_rtx (mode);
25785 op0 = gen_lowpart (mode, op0);
25786 op1 = gen_lowpart (mode, op1);
25789 switch (mode)
25791 case E_V8SImode:
25792 /* The VPERMD and VPERMPS instructions already properly ignore
25793 the high bits of the shuffle elements. No need for us to
25794 perform an AND ourselves. */
25795 if (one_operand_shuffle)
25797 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
25798 if (target != operands[0])
25799 emit_move_insn (operands[0],
25800 gen_lowpart (GET_MODE (operands[0]), target));
25802 else
25804 t1 = gen_reg_rtx (V8SImode);
25805 t2 = gen_reg_rtx (V8SImode);
25806 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
25807 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
25808 goto merge_two;
25810 return;
25812 case E_V8SFmode:
25813 mask = gen_lowpart (V8SImode, mask);
25814 if (one_operand_shuffle)
25815 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
25816 else
25818 t1 = gen_reg_rtx (V8SFmode);
25819 t2 = gen_reg_rtx (V8SFmode);
25820 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
25821 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
25822 goto merge_two;
25824 return;
25826 case E_V4SImode:
25827 /* By combining the two 128-bit input vectors into one 256-bit
25828 input vector, we can use VPERMD and VPERMPS for the full
25829 two-operand shuffle. */
25830 t1 = gen_reg_rtx (V8SImode);
25831 t2 = gen_reg_rtx (V8SImode);
25832 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
25833 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
25834 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
25835 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
25836 return;
25838 case E_V4SFmode:
25839 t1 = gen_reg_rtx (V8SFmode);
25840 t2 = gen_reg_rtx (V8SImode);
25841 mask = gen_lowpart (V4SImode, mask);
25842 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
25843 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
25844 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
25845 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
25846 return;
25848 case E_V32QImode:
25849 t1 = gen_reg_rtx (V32QImode);
25850 t2 = gen_reg_rtx (V32QImode);
25851 t3 = gen_reg_rtx (V32QImode);
25852 vt2 = GEN_INT (-128);
25853 for (i = 0; i < 32; i++)
25854 vec[i] = vt2;
25855 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
25856 vt = force_reg (V32QImode, vt);
25857 for (i = 0; i < 32; i++)
25858 vec[i] = i < 16 ? vt2 : const0_rtx;
25859 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
25860 vt2 = force_reg (V32QImode, vt2);
25861 /* From mask create two adjusted masks, which contain the same
25862 bits as mask in the low 7 bits of each vector element.
25863 The first mask will have the most significant bit clear
25864 if it requests element from the same 128-bit lane
25865 and MSB set if it requests element from the other 128-bit lane.
25866 The second mask will have the opposite values of the MSB,
25867 and additionally will have its 128-bit lanes swapped.
25868 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
25869 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
25870 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
25871 stands for other 12 bytes. */
25872 /* The bit that tells whether an element comes from the same lane or
25873 the other lane is bit 4, so shift it up by 3 to the MSB position. */
25874 t5 = gen_reg_rtx (V4DImode);
25875 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
25876 GEN_INT (3)));
25877 /* Clear MSB bits from the mask just in case it had them set. */
25878 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
25879 /* After this t1 will have MSB set for elements from other lane. */
25880 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
25881 /* Clear bits other than MSB. */
25882 emit_insn (gen_andv32qi3 (t1, t1, vt));
25883 /* Or in the lower bits from mask into t3. */
25884 emit_insn (gen_iorv32qi3 (t3, t1, t2));
25885 /* And invert MSB bits in t1, so MSB is set for elements from the same
25886 lane. */
25887 emit_insn (gen_xorv32qi3 (t1, t1, vt));
25888 /* Swap 128-bit lanes in t3. */
25889 t6 = gen_reg_rtx (V4DImode);
25890 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
25891 const2_rtx, GEN_INT (3),
25892 const0_rtx, const1_rtx));
25893 /* And or in the lower bits from mask into t1. */
25894 emit_insn (gen_iorv32qi3 (t1, t1, t2));
25895 if (one_operand_shuffle)
25897 /* Each of these shuffles will put 0s in places where
25898 element from the other 128-bit lane is needed, otherwise
25899 will shuffle in the requested value. */
25900 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
25901 gen_lowpart (V32QImode, t6)));
25902 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
25903 /* For t3 the 128-bit lanes are swapped again. */
25904 t7 = gen_reg_rtx (V4DImode);
25905 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
25906 const2_rtx, GEN_INT (3),
25907 const0_rtx, const1_rtx));
25908 /* And oring both together leads to the result. */
25909 emit_insn (gen_iorv32qi3 (target, t1,
25910 gen_lowpart (V32QImode, t7)));
25911 if (target != operands[0])
25912 emit_move_insn (operands[0],
25913 gen_lowpart (GET_MODE (operands[0]), target));
25914 return;
25917 t4 = gen_reg_rtx (V32QImode);
25918 /* Similar to the one_operand_shuffle code above,
25919 just repeated twice, once for each operand. The merge_two:
25920 code below will merge the two results together. */
25921 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
25922 gen_lowpart (V32QImode, t6)));
25923 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
25924 gen_lowpart (V32QImode, t6)));
25925 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
25926 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
25927 t7 = gen_reg_rtx (V4DImode);
25928 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
25929 const2_rtx, GEN_INT (3),
25930 const0_rtx, const1_rtx));
25931 t8 = gen_reg_rtx (V4DImode);
25932 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
25933 const2_rtx, GEN_INT (3),
25934 const0_rtx, const1_rtx));
25935 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
25936 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
25937 t1 = t4;
25938 t2 = t3;
25939 goto merge_two;
25941 default:
25942 gcc_assert (GET_MODE_SIZE (mode) <= 16);
25943 break;
25947 if (TARGET_XOP)
25949 /* The XOP VPPERM insn supports three inputs. By ignoring the
25950 one_operand_shuffle special case, we avoid creating another
25951 set of constant vectors in memory. */
25952 one_operand_shuffle = false;
25954 /* mask = mask & {2*w-1, ...} */
25955 vt = GEN_INT (2*w - 1);
25957 else
25959 /* mask = mask & {w-1, ...} */
25960 vt = GEN_INT (w - 1);
25963 for (i = 0; i < w; i++)
25964 vec[i] = vt;
25965 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
25966 mask = expand_simple_binop (maskmode, AND, mask, vt,
25967 NULL_RTX, 0, OPTAB_DIRECT);
25969 /* For non-QImode operations, convert the word permutation control
25970 into a byte permutation control. */
25971 if (mode != V16QImode)
25973 mask = expand_simple_binop (maskmode, ASHIFT, mask,
25974 GEN_INT (exact_log2 (e)),
25975 NULL_RTX, 0, OPTAB_DIRECT);
25977 /* Convert mask to vector of chars. */
25978 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
25980 /* Replicate each of the input bytes into byte positions:
25981 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
25982 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
25983 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
25984 for (i = 0; i < 16; ++i)
25985 vec[i] = GEN_INT (i/e * e);
25986 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25987 vt = validize_mem (force_const_mem (V16QImode, vt));
25988 if (TARGET_XOP)
25989 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
25990 else
25991 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
25993 /* Convert it into the byte positions by doing
25994 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
25995 for (i = 0; i < 16; ++i)
25996 vec[i] = GEN_INT (i % e);
25997 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25998 vt = validize_mem (force_const_mem (V16QImode, vt));
25999 emit_insn (gen_addv16qi3 (mask, mask, vt));
26002 /* The actual shuffle operations all operate on V16QImode. */
26003 op0 = gen_lowpart (V16QImode, op0);
26004 op1 = gen_lowpart (V16QImode, op1);
26006 if (TARGET_XOP)
26008 if (GET_MODE (target) != V16QImode)
26009 target = gen_reg_rtx (V16QImode);
26010 emit_insn (gen_xop_pperm (target, op0, op1, mask));
26011 if (target != operands[0])
26012 emit_move_insn (operands[0],
26013 gen_lowpart (GET_MODE (operands[0]), target));
26015 else if (one_operand_shuffle)
26017 if (GET_MODE (target) != V16QImode)
26018 target = gen_reg_rtx (V16QImode);
26019 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
26020 if (target != operands[0])
26021 emit_move_insn (operands[0],
26022 gen_lowpart (GET_MODE (operands[0]), target));
26024 else
26026 rtx xops[6];
26027 bool ok;
26029 /* Shuffle the two input vectors independently. */
26030 t1 = gen_reg_rtx (V16QImode);
26031 t2 = gen_reg_rtx (V16QImode);
26032 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
26033 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
26035 merge_two:
26036 /* Then merge them together. The key is whether any given control
26037 element contained a bit set that indicates the second word. */
26038 mask = operands[3];
26039 vt = GEN_INT (w);
26040 if (maskmode == V2DImode && !TARGET_SSE4_1)
26042 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
26043 more shuffle to convert the V2DI input mask into a V4SI
26044 input mask. At which point the masking that expand_int_vcond
26045 will work as desired. */
26046 rtx t3 = gen_reg_rtx (V4SImode);
26047 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
26048 const0_rtx, const0_rtx,
26049 const2_rtx, const2_rtx));
26050 mask = t3;
26051 maskmode = V4SImode;
26052 e = w = 4;
26055 for (i = 0; i < w; i++)
26056 vec[i] = vt;
26057 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
26058 vt = force_reg (maskmode, vt);
26059 mask = expand_simple_binop (maskmode, AND, mask, vt,
26060 NULL_RTX, 0, OPTAB_DIRECT);
26062 if (GET_MODE (target) != mode)
26063 target = gen_reg_rtx (mode);
26064 xops[0] = target;
26065 xops[1] = gen_lowpart (mode, t2);
26066 xops[2] = gen_lowpart (mode, t1);
26067 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
26068 xops[4] = mask;
26069 xops[5] = vt;
26070 ok = ix86_expand_int_vcond (xops);
26071 gcc_assert (ok);
26072 if (target != operands[0])
26073 emit_move_insn (operands[0],
26074 gen_lowpart (GET_MODE (operands[0]), target));
26078 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
26079 true if we should do zero extension, else sign extension. HIGH_P is
26080 true if we want the N/2 high elements, else the low elements. */
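/* With SSE4.1 this uses the pmovsx/pmovzx-style extension patterns,
   extracting or shifting the high half into place first when needed;
   without SSE4.1 it interleaves SRC with zero (zero extension) or
   with a sign mask obtained from a 0 > SRC comparison (sign
   extension).  */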
26082 void
26083 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
26085 machine_mode imode = GET_MODE (src);
26086 rtx tmp;
26088 if (TARGET_SSE4_1)
26090 rtx (*unpack)(rtx, rtx);
26091 rtx (*extract)(rtx, rtx) = NULL;
26092 machine_mode halfmode = BLKmode;
26094 switch (imode)
26096 case E_V64QImode:
26097 if (unsigned_p)
26098 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
26099 else
26100 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
26101 halfmode = V32QImode;
26102 extract
26103 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
26104 break;
26105 case E_V32QImode:
26106 if (unsigned_p)
26107 unpack = gen_avx2_zero_extendv16qiv16hi2;
26108 else
26109 unpack = gen_avx2_sign_extendv16qiv16hi2;
26110 halfmode = V16QImode;
26111 extract
26112 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
26113 break;
26114 case E_V32HImode:
26115 if (unsigned_p)
26116 unpack = gen_avx512f_zero_extendv16hiv16si2;
26117 else
26118 unpack = gen_avx512f_sign_extendv16hiv16si2;
26119 halfmode = V16HImode;
26120 extract
26121 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
26122 break;
26123 case E_V16HImode:
26124 if (unsigned_p)
26125 unpack = gen_avx2_zero_extendv8hiv8si2;
26126 else
26127 unpack = gen_avx2_sign_extendv8hiv8si2;
26128 halfmode = V8HImode;
26129 extract
26130 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
26131 break;
26132 case E_V16SImode:
26133 if (unsigned_p)
26134 unpack = gen_avx512f_zero_extendv8siv8di2;
26135 else
26136 unpack = gen_avx512f_sign_extendv8siv8di2;
26137 halfmode = V8SImode;
26138 extract
26139 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
26140 break;
26141 case E_V8SImode:
26142 if (unsigned_p)
26143 unpack = gen_avx2_zero_extendv4siv4di2;
26144 else
26145 unpack = gen_avx2_sign_extendv4siv4di2;
26146 halfmode = V4SImode;
26147 extract
26148 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
26149 break;
26150 case E_V16QImode:
26151 if (unsigned_p)
26152 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
26153 else
26154 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
26155 break;
26156 case E_V8HImode:
26157 if (unsigned_p)
26158 unpack = gen_sse4_1_zero_extendv4hiv4si2;
26159 else
26160 unpack = gen_sse4_1_sign_extendv4hiv4si2;
26161 break;
26162 case E_V4SImode:
26163 if (unsigned_p)
26164 unpack = gen_sse4_1_zero_extendv2siv2di2;
26165 else
26166 unpack = gen_sse4_1_sign_extendv2siv2di2;
26167 break;
26168 default:
26169 gcc_unreachable ();
26172 if (GET_MODE_SIZE (imode) >= 32)
26174 tmp = gen_reg_rtx (halfmode);
26175 emit_insn (extract (tmp, src));
26177 else if (high_p)
26179 /* Shift higher 8 bytes to lower 8 bytes. */
26180 tmp = gen_reg_rtx (V1TImode);
26181 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
26182 GEN_INT (64)));
26183 tmp = gen_lowpart (imode, tmp);
26185 else
26186 tmp = src;
26188 emit_insn (unpack (dest, tmp));
26190 else
26192 rtx (*unpack)(rtx, rtx, rtx);
26194 switch (imode)
26196 case E_V16QImode:
26197 if (high_p)
26198 unpack = gen_vec_interleave_highv16qi;
26199 else
26200 unpack = gen_vec_interleave_lowv16qi;
26201 break;
26202 case E_V8HImode:
26203 if (high_p)
26204 unpack = gen_vec_interleave_highv8hi;
26205 else
26206 unpack = gen_vec_interleave_lowv8hi;
26207 break;
26208 case E_V4SImode:
26209 if (high_p)
26210 unpack = gen_vec_interleave_highv4si;
26211 else
26212 unpack = gen_vec_interleave_lowv4si;
26213 break;
26214 default:
26215 gcc_unreachable ();
26218 if (unsigned_p)
26219 tmp = force_reg (imode, CONST0_RTX (imode));
26220 else
26221 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
26222 src, pc_rtx, pc_rtx);
26224 rtx tmp2 = gen_reg_rtx (imode);
26225 emit_insn (unpack (tmp2, src, tmp));
26226 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
26230 /* Expand conditional increment or decrement using adc/sbb instructions.
26231 The default case using setcc followed by the conditional move can be
26232 done by generic code. */
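/* For example, dest = src + (a <u b) becomes a compare that leaves
   the carry flag set exactly when a <u b, followed by an adc of 0
   into dest; the decrement variant uses sbb in the same way.  */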
26233 bool
26234 ix86_expand_int_addcc (rtx operands[])
26236 enum rtx_code code = GET_CODE (operands[1]);
26237 rtx flags;
26238 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
26239 rtx compare_op;
26240 rtx val = const0_rtx;
26241 bool fpcmp = false;
26242 machine_mode mode;
26243 rtx op0 = XEXP (operands[1], 0);
26244 rtx op1 = XEXP (operands[1], 1);
26246 if (operands[3] != const1_rtx
26247 && operands[3] != constm1_rtx)
26248 return false;
26249 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
26250 return false;
26251 code = GET_CODE (compare_op);
26253 flags = XEXP (compare_op, 0);
26255 if (GET_MODE (flags) == CCFPmode
26256 || GET_MODE (flags) == CCFPUmode)
26258 fpcmp = true;
26259 code = ix86_fp_compare_code_to_integer (code);
26262 if (code != LTU)
26264 val = constm1_rtx;
26265 if (fpcmp)
26266 PUT_CODE (compare_op,
26267 reverse_condition_maybe_unordered
26268 (GET_CODE (compare_op)));
26269 else
26270 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
26273 mode = GET_MODE (operands[0]);
26275 /* Construct either adc or sbb insn. */
26276 if ((code == LTU) == (operands[3] == constm1_rtx))
26278 switch (mode)
26280 case E_QImode:
26281 insn = gen_subqi3_carry;
26282 break;
26283 case E_HImode:
26284 insn = gen_subhi3_carry;
26285 break;
26286 case E_SImode:
26287 insn = gen_subsi3_carry;
26288 break;
26289 case E_DImode:
26290 insn = gen_subdi3_carry;
26291 break;
26292 default:
26293 gcc_unreachable ();
26296 else
26298 switch (mode)
26300 case E_QImode:
26301 insn = gen_addqi3_carry;
26302 break;
26303 case E_HImode:
26304 insn = gen_addhi3_carry;
26305 break;
26306 case E_SImode:
26307 insn = gen_addsi3_carry;
26308 break;
26309 case E_DImode:
26310 insn = gen_adddi3_carry;
26311 break;
26312 default:
26313 gcc_unreachable ();
26316 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
26318 return true;
26322 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
26323 but works for floating point parameters and non-offsettable memories.
26324 For pushes, it returns just stack offsets; the values will be saved
26325 in the right order. Maximally four parts are generated. */
26327 static int
26328 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
26330 int size;
26332 if (!TARGET_64BIT)
26333 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
26334 else
26335 size = (GET_MODE_SIZE (mode) + 4) / 8;
26337 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
26338 gcc_assert (size >= 2 && size <= 4);
26340 /* Optimize constant pool reference to immediates. This is used by fp
26341 moves, that force all constants to memory to allow combining. */
26342 if (MEM_P (operand) && MEM_READONLY_P (operand))
26344 rtx tmp = maybe_get_pool_constant (operand);
26345 if (tmp)
26346 operand = tmp;
26349 if (MEM_P (operand) && !offsettable_memref_p (operand))
26351 /* The only non-offsettable memories we handle are pushes. */
26352 int ok = push_operand (operand, VOIDmode);
26354 gcc_assert (ok);
26356 operand = copy_rtx (operand);
26357 PUT_MODE (operand, word_mode);
26358 parts[0] = parts[1] = parts[2] = parts[3] = operand;
26359 return size;
26362 if (GET_CODE (operand) == CONST_VECTOR)
26364 machine_mode imode = int_mode_for_mode (mode);
26365 /* Caution: if we looked through a constant pool memory above,
26366 the operand may actually have a different mode now. That's
26367 ok, since we want to pun this all the way back to an integer. */
26368 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
26369 gcc_assert (operand != NULL);
26370 mode = imode;
26373 if (!TARGET_64BIT)
26375 if (mode == DImode)
26376 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
26377 else
26379 int i;
26381 if (REG_P (operand))
26383 gcc_assert (reload_completed);
26384 for (i = 0; i < size; i++)
26385 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
26387 else if (offsettable_memref_p (operand))
26389 operand = adjust_address (operand, SImode, 0);
26390 parts[0] = operand;
26391 for (i = 1; i < size; i++)
26392 parts[i] = adjust_address (operand, SImode, 4 * i);
26394 else if (CONST_DOUBLE_P (operand))
26396 const REAL_VALUE_TYPE *r;
26397 long l[4];
26399 r = CONST_DOUBLE_REAL_VALUE (operand);
26400 switch (mode)
26402 case E_TFmode:
26403 real_to_target (l, r, mode);
26404 parts[3] = gen_int_mode (l[3], SImode);
26405 parts[2] = gen_int_mode (l[2], SImode);
26406 break;
26407 case E_XFmode:
26408 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
26409 long double may not be 80-bit. */
26410 real_to_target (l, r, mode);
26411 parts[2] = gen_int_mode (l[2], SImode);
26412 break;
26413 case E_DFmode:
26414 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
26415 break;
26416 default:
26417 gcc_unreachable ();
26419 parts[1] = gen_int_mode (l[1], SImode);
26420 parts[0] = gen_int_mode (l[0], SImode);
26422 else
26423 gcc_unreachable ();
26426 else
26428 if (mode == TImode)
26429 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
26430 if (mode == XFmode || mode == TFmode)
26432 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
26433 if (REG_P (operand))
26435 gcc_assert (reload_completed);
26436 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
26437 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
26439 else if (offsettable_memref_p (operand))
26441 operand = adjust_address (operand, DImode, 0);
26442 parts[0] = operand;
26443 parts[1] = adjust_address (operand, upper_mode, 8);
26445 else if (CONST_DOUBLE_P (operand))
26447 long l[4];
26449 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
26451 /* real_to_target puts 32-bit pieces in each long. */
26452 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
26453 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
26454 << 32), DImode);
26456 if (upper_mode == SImode)
26457 parts[1] = gen_int_mode (l[2], SImode);
26458 else
26459 parts[1]
26460 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
26461 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
26462 << 32), DImode);
26464 else
26465 gcc_unreachable ();
26469 return size;
26472 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
26473 Return false when normal moves are needed; true when all required
26474 insns have been emitted. Operands 2-4 contain the input values
26475 in the correct order; operands 5-7 contain the output values. */
26477 void
26478 ix86_split_long_move (rtx operands[])
26480 rtx part[2][4];
26481 int nparts, i, j;
26482 int push = 0;
26483 int collisions = 0;
26484 machine_mode mode = GET_MODE (operands[0]);
26485 bool collisionparts[4];
26487 /* The DFmode expanders may ask us to move a double.
26488 For a 64-bit target this is a single move. By hiding that fact
26489 here we simplify the i386.md splitters. */
26490 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
26492 /* Optimize constant pool reference to immediates. This is used by
26493 fp moves, that force all constants to memory to allow combining. */
26495 if (MEM_P (operands[1])
26496 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
26497 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
26498 operands[1] = get_pool_constant (XEXP (operands[1], 0));
26499 if (push_operand (operands[0], VOIDmode))
26501 operands[0] = copy_rtx (operands[0]);
26502 PUT_MODE (operands[0], word_mode);
26504 else
26505 operands[0] = gen_lowpart (DImode, operands[0]);
26506 operands[1] = gen_lowpart (DImode, operands[1]);
26507 emit_move_insn (operands[0], operands[1]);
26508 return;
26511 /* The only non-offsettable memory we handle is a push. */
26512 if (push_operand (operands[0], VOIDmode))
26513 push = 1;
26514 else
26515 gcc_assert (!MEM_P (operands[0])
26516 || offsettable_memref_p (operands[0]));
26518 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
26519 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
26521 /* When emitting a push, take care of source operands on the stack. */
26522 if (push && MEM_P (operands[1])
26523 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
26525 rtx src_base = XEXP (part[1][nparts - 1], 0);
26527 /* Compensate for the stack decrement by 4. */
26528 if (!TARGET_64BIT && nparts == 3
26529 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
26530 src_base = plus_constant (Pmode, src_base, 4);
26532 /* src_base refers to the stack pointer and is
26533 automatically decreased by emitted push. */
26534 for (i = 0; i < nparts; i++)
26535 part[1][i] = change_address (part[1][i],
26536 GET_MODE (part[1][i]), src_base);
26539 /* We need to do the copy in the right order in case an address register
26540 of the source overlaps the destination.  */
26541 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
26543 rtx tmp;
26545 for (i = 0; i < nparts; i++)
26547 collisionparts[i]
26548 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
26549 if (collisionparts[i])
26550 collisions++;
26553 /* Collision in the middle part can be handled by reordering. */
26554 if (collisions == 1 && nparts == 3 && collisionparts [1])
26556 std::swap (part[0][1], part[0][2]);
26557 std::swap (part[1][1], part[1][2]);
26559 else if (collisions == 1
26560 && nparts == 4
26561 && (collisionparts [1] || collisionparts [2]))
26563 if (collisionparts [1])
26565 std::swap (part[0][1], part[0][2]);
26566 std::swap (part[1][1], part[1][2]);
26568 else
26570 std::swap (part[0][2], part[0][3]);
26571 std::swap (part[1][2], part[1][3]);
26575 /* If there are more collisions, we can't handle them by reordering.
26576 Do an lea to the last part and use only one colliding move.  */
26577 else if (collisions > 1)
26579 rtx base, addr, tls_base = NULL_RTX;
26581 collisions = 1;
26583 base = part[0][nparts - 1];
26585 /* Handle the case when the last part isn't valid for lea.
26586 Happens in 64-bit mode storing the 12-byte XFmode. */
26587 if (GET_MODE (base) != Pmode)
26588 base = gen_rtx_REG (Pmode, REGNO (base));
26590 addr = XEXP (part[1][0], 0);
26591 if (TARGET_TLS_DIRECT_SEG_REFS)
26593 struct ix86_address parts;
26594 int ok = ix86_decompose_address (addr, &parts);
26595 gcc_assert (ok);
26596 if (parts.seg == DEFAULT_TLS_SEG_REG)
26598 /* It is not valid to use %gs: or %fs: in
26599 lea though, so we need to remove it from the
26600 address used for lea and add it to each individual
26601 memory load instead.  */
26602 addr = copy_rtx (addr);
26603 rtx *x = &addr;
26604 while (GET_CODE (*x) == PLUS)
26606 for (i = 0; i < 2; i++)
26608 rtx u = XEXP (*x, i);
26609 if (GET_CODE (u) == ZERO_EXTEND)
26610 u = XEXP (u, 0);
26611 if (GET_CODE (u) == UNSPEC
26612 && XINT (u, 1) == UNSPEC_TP)
26614 tls_base = XEXP (*x, i);
26615 *x = XEXP (*x, 1 - i);
26616 break;
26619 if (tls_base)
26620 break;
26621 x = &XEXP (*x, 0);
26623 gcc_assert (tls_base);
26626 emit_insn (gen_rtx_SET (base, addr));
26627 if (tls_base)
26628 base = gen_rtx_PLUS (GET_MODE (base), base, tls_base);
26629 part[1][0] = replace_equiv_address (part[1][0], base);
26630 for (i = 1; i < nparts; i++)
26632 if (tls_base)
26633 base = copy_rtx (base);
26634 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
26635 part[1][i] = replace_equiv_address (part[1][i], tmp);
26640 if (push)
26642 if (!TARGET_64BIT)
26644 if (nparts == 3)
26646 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
26647 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
26648 stack_pointer_rtx, GEN_INT (-4)));
26649 emit_move_insn (part[0][2], part[1][2]);
26651 else if (nparts == 4)
26653 emit_move_insn (part[0][3], part[1][3]);
26654 emit_move_insn (part[0][2], part[1][2]);
26657 else
26659 /* In 64bit mode we don't have a 32bit push available.  In case this is
26660 a register, it is OK - we will just use the larger counterpart.  We also
26661 retype memory - this comes from an attempt to avoid a REX prefix on
26662 the move of the second half of a TFmode value.  */
26663 if (GET_MODE (part[1][1]) == SImode)
26665 switch (GET_CODE (part[1][1]))
26667 case MEM:
26668 part[1][1] = adjust_address (part[1][1], DImode, 0);
26669 break;
26671 case REG:
26672 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
26673 break;
26675 default:
26676 gcc_unreachable ();
26679 if (GET_MODE (part[1][0]) == SImode)
26680 part[1][0] = part[1][1];
26683 emit_move_insn (part[0][1], part[1][1]);
26684 emit_move_insn (part[0][0], part[1][0]);
26685 return;
26688 /* Choose correct order to not overwrite the source before it is copied. */
26689 if ((REG_P (part[0][0])
26690 && REG_P (part[1][1])
26691 && (REGNO (part[0][0]) == REGNO (part[1][1])
26692 || (nparts == 3
26693 && REGNO (part[0][0]) == REGNO (part[1][2]))
26694 || (nparts == 4
26695 && REGNO (part[0][0]) == REGNO (part[1][3]))))
26696 || (collisions > 0
26697 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
26699 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
26701 operands[2 + i] = part[0][j];
26702 operands[6 + i] = part[1][j];
26705 else
26707 for (i = 0; i < nparts; i++)
26709 operands[2 + i] = part[0][i];
26710 operands[6 + i] = part[1][i];
26714 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
26715 if (optimize_insn_for_size_p ())
26717 for (j = 0; j < nparts - 1; j++)
26718 if (CONST_INT_P (operands[6 + j])
26719 && operands[6 + j] != const0_rtx
26720 && REG_P (operands[2 + j]))
26721 for (i = j; i < nparts - 1; i++)
26722 if (CONST_INT_P (operands[7 + i])
26723 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
26724 operands[7 + i] = operands[2 + j];
26727 for (i = 0; i < nparts; i++)
26728 emit_move_insn (operands[2 + i], operands[6 + i]);
26730 return;
26733 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
26734 left shift by a constant, either using a single shift or
26735 a sequence of add instructions. */
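/* For instance, a left shift by 1 of register R is emitted as "add R, R"
   (doubling the value); small constant counts are expanded as repeated
   additions whenever COUNT additions are estimated to be no more expensive
   than a single shift by a constant and we are not optimizing for size.  */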
26737 static void
26738 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
26740 rtx (*insn)(rtx, rtx, rtx);
26742 if (count == 1
26743 || (count * ix86_cost->add <= ix86_cost->shift_const
26744 && !optimize_insn_for_size_p ()))
26746 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
26747 while (count-- > 0)
26748 emit_insn (insn (operand, operand, operand));
26750 else
26752 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
26753 emit_insn (insn (operand, operand, GEN_INT (count)));
26757 void
26758 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
26760 rtx (*gen_ashl3)(rtx, rtx, rtx);
26761 rtx (*gen_shld)(rtx, rtx, rtx);
26762 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26764 rtx low[2], high[2];
26765 int count;
26767 if (CONST_INT_P (operands[2]))
26769 split_double_mode (mode, operands, 2, low, high);
26770 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26772 if (count >= half_width)
26774 emit_move_insn (high[0], low[1]);
26775 emit_move_insn (low[0], const0_rtx);
26777 if (count > half_width)
26778 ix86_expand_ashl_const (high[0], count - half_width, mode);
26780 else
26782 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
26784 if (!rtx_equal_p (operands[0], operands[1]))
26785 emit_move_insn (operands[0], operands[1]);
26787 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
26788 ix86_expand_ashl_const (low[0], count, mode);
26790 return;
26793 split_double_mode (mode, operands, 1, low, high);
26795 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
26797 if (operands[1] == const1_rtx)
26799 /* Assuming we've chosen QImode-capable registers, 1 << N
26800 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
26801 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
26803 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
26805 ix86_expand_clear (low[0]);
26806 ix86_expand_clear (high[0]);
26807 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
26809 d = gen_lowpart (QImode, low[0]);
26810 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
26811 s = gen_rtx_EQ (QImode, flags, const0_rtx);
26812 emit_insn (gen_rtx_SET (d, s));
26814 d = gen_lowpart (QImode, high[0]);
26815 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
26816 s = gen_rtx_NE (QImode, flags, const0_rtx);
26817 emit_insn (gen_rtx_SET (d, s));
26820 /* Otherwise, we can get the same results by manually performing
26821 a bit extract operation on bit 5/6, and then performing the two
26822 shifts. The two methods of getting 0/1 into low/high are exactly
26823 the same size. Avoiding the shift in the bit extract case helps
26824 pentium4 a bit; no one else seems to care much either way. */
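/* (In the bit extract path below, bit 5 of the count for DImode, or bit 6
   for TImode, tells which half receives the single 1 bit; the other half
   gets 0.  The final shifts by the full count then place that bit
   correctly, presumably relying on the hardware masking the shift count
   to the width of the half-word operand.)  */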
26825 else
26827 machine_mode half_mode;
26828 rtx (*gen_lshr3)(rtx, rtx, rtx);
26829 rtx (*gen_and3)(rtx, rtx, rtx);
26830 rtx (*gen_xor3)(rtx, rtx, rtx);
26831 HOST_WIDE_INT bits;
26832 rtx x;
26834 if (mode == DImode)
26836 half_mode = SImode;
26837 gen_lshr3 = gen_lshrsi3;
26838 gen_and3 = gen_andsi3;
26839 gen_xor3 = gen_xorsi3;
26840 bits = 5;
26842 else
26844 half_mode = DImode;
26845 gen_lshr3 = gen_lshrdi3;
26846 gen_and3 = gen_anddi3;
26847 gen_xor3 = gen_xordi3;
26848 bits = 6;
26851 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
26852 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
26853 else
26854 x = gen_lowpart (half_mode, operands[2]);
26855 emit_insn (gen_rtx_SET (high[0], x));
26857 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
26858 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
26859 emit_move_insn (low[0], high[0]);
26860 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
26863 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
26864 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
26865 return;
26868 if (operands[1] == constm1_rtx)
26870 /* For -1 << N, we can avoid the shld instruction, because we
26871 know that we're shifting 0...31/63 ones into a -1. */
26872 emit_move_insn (low[0], constm1_rtx);
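/* When optimizing for size, reuse the -1 already placed in LOW[0]: a
   register-to-register move is shorter than materializing the -1
   immediate a second time.  */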
26873 if (optimize_insn_for_size_p ())
26874 emit_move_insn (high[0], low[0]);
26875 else
26876 emit_move_insn (high[0], constm1_rtx);
26878 else
26880 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
26882 if (!rtx_equal_p (operands[0], operands[1]))
26883 emit_move_insn (operands[0], operands[1]);
26885 split_double_mode (mode, operands, 1, low, high);
26886 emit_insn (gen_shld (high[0], low[0], operands[2]));
26889 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
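/* The sequence above is only correct for shift counts below HALF_WIDTH.
   Emit a fixup for larger counts - roughly: if the count has the
   HALF_WIDTH bit set, the high half takes the value computed for the low
   half and the low half is cleared.  With cmove and a scratch register
   this can be done without branches; otherwise a conditional jump is
   used.  */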
26891 if (TARGET_CMOVE && scratch)
26893 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26894 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26896 ix86_expand_clear (scratch);
26897 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
26899 else
26901 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
26902 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
26904 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
26908 void
26909 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
26911 rtx (*gen_ashr3)(rtx, rtx, rtx)
26912 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
26913 rtx (*gen_shrd)(rtx, rtx, rtx);
26914 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26916 rtx low[2], high[2];
26917 int count;
26919 if (CONST_INT_P (operands[2]))
26921 split_double_mode (mode, operands, 2, low, high);
26922 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
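/* Shifting right arithmetically by the full bit width minus one just
   replicates the sign bit into every bit of both halves, so a single
   arithmetic shift of the high part suffices.  */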
26924 if (count == GET_MODE_BITSIZE (mode) - 1)
26926 emit_move_insn (high[0], high[1]);
26927 emit_insn (gen_ashr3 (high[0], high[0],
26928 GEN_INT (half_width - 1)));
26929 emit_move_insn (low[0], high[0]);
26932 else if (count >= half_width)
26934 emit_move_insn (low[0], high[1]);
26935 emit_move_insn (high[0], low[0]);
26936 emit_insn (gen_ashr3 (high[0], high[0],
26937 GEN_INT (half_width - 1)));
26939 if (count > half_width)
26940 emit_insn (gen_ashr3 (low[0], low[0],
26941 GEN_INT (count - half_width)));
26943 else
26945 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26947 if (!rtx_equal_p (operands[0], operands[1]))
26948 emit_move_insn (operands[0], operands[1]);
26950 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
26951 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
26954 else
26956 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26958 if (!rtx_equal_p (operands[0], operands[1]))
26959 emit_move_insn (operands[0], operands[1]);
26961 split_double_mode (mode, operands, 1, low, high);
26963 emit_insn (gen_shrd (low[0], high[0], operands[2]));
26964 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
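/* As in the left-shift case, the sequence above is only correct for
   counts below HALF_WIDTH; emit a fixup for larger counts.  Roughly: if
   the count has the HALF_WIDTH bit set, the low half takes the value
   computed for the high half and the high half is filled with copies of
   the sign bit, precomputed into SCRATCH when cmove is available.  */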
26966 if (TARGET_CMOVE && scratch)
26968 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26969 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26971 emit_move_insn (scratch, high[0]);
26972 emit_insn (gen_ashr3 (scratch, scratch,
26973 GEN_INT (half_width - 1)));
26974 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26975 scratch));
26977 else
26979 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
26980 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
26982 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
26987 void
26988 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
26990 rtx (*gen_lshr3)(rtx, rtx, rtx)
26991 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
26992 rtx (*gen_shrd)(rtx, rtx, rtx);
26993 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26995 rtx low[2], high[2];
26996 int count;
26998 if (CONST_INT_P (operands[2]))
27000 split_double_mode (mode, operands, 2, low, high);
27001 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
27003 if (count >= half_width)
27005 emit_move_insn (low[0], high[1]);
27006 ix86_expand_clear (high[0]);
27008 if (count > half_width)
27009 emit_insn (gen_lshr3 (low[0], low[0],
27010 GEN_INT (count - half_width)));
27012 else
27014 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
27016 if (!rtx_equal_p (operands[0], operands[1]))
27017 emit_move_insn (operands[0], operands[1]);
27019 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
27020 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
27023 else
27025 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
27027 if (!rtx_equal_p (operands[0], operands[1]))
27028 emit_move_insn (operands[0], operands[1]);
27030 split_double_mode (mode, operands, 1, low, high);
27032 emit_insn (gen_shrd (low[0], high[0], operands[2]));
27033 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
27035 if (TARGET_CMOVE && scratch)
27037 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
27038 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
27040 ix86_expand_clear (scratch);
27041 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
27042 scratch));
27044 else
27046 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
27047 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
27049 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
27054 /* Predict just emitted jump instruction to be taken with probability PROB. */
27055 static void
27056 predict_jump (int prob)
27058 rtx_insn *insn = get_last_insn ();
27059 gcc_assert (JUMP_P (insn));
27060 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
27063 /* Helper function for the string operations below.  Test whether VARIABLE
27064 is aligned to VALUE bytes.  If it is, jump to the returned label.  */
27065 static rtx_code_label *
27066 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
27068 rtx_code_label *label = gen_label_rtx ();
27069 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
27070 if (GET_MODE (variable) == DImode)
27071 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
27072 else
27073 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
27074 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
27075 1, label);
27076 if (epilogue)
27077 predict_jump (REG_BR_PROB_BASE * 50 / 100);
27078 else
27079 predict_jump (REG_BR_PROB_BASE * 90 / 100);
27080 return label;
27083 /* Adjust COUNTREG by subtracting VALUE from it.  */
27084 static void
27085 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
27087 rtx (*gen_add)(rtx, rtx, rtx)
27088 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
27090 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
27093 /* Zero extend possibly SImode EXP to Pmode register. */
27095 ix86_zero_extend_to_Pmode (rtx exp)
27097 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
27100 /* Divide COUNTREG by SCALE. */
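/* SCALE is expected to be a power of two here (a mode size), so the
   non-constant case can be handled with a logical shift right by
   exact_log2 (SCALE).  */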
27101 static rtx
27102 scale_counter (rtx countreg, int scale)
27104 rtx sc;
27106 if (scale == 1)
27107 return countreg;
27108 if (CONST_INT_P (countreg))
27109 return GEN_INT (INTVAL (countreg) / scale);
27110 gcc_assert (REG_P (countreg));
27112 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
27113 GEN_INT (exact_log2 (scale)),
27114 NULL, 1, OPTAB_DIRECT);
27115 return sc;
27118 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
27119 DImode for constant loop counts. */
27121 static machine_mode
27122 counter_mode (rtx count_exp)
27124 if (GET_MODE (count_exp) != VOIDmode)
27125 return GET_MODE (count_exp);
27126 if (!CONST_INT_P (count_exp))
27127 return Pmode;
27128 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
27129 return DImode;
27130 return SImode;
27133 /* Copy the address to a Pmode register. This is used for x32 to
27134 truncate DImode TLS address to a SImode register. */
27136 static rtx
27137 ix86_copy_addr_to_reg (rtx addr)
27139 rtx reg;
27140 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
27142 reg = copy_addr_to_reg (addr);
27143 REG_POINTER (reg) = 1;
27144 return reg;
27146 else
27148 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
27149 reg = copy_to_mode_reg (DImode, addr);
27150 REG_POINTER (reg) = 1;
27151 return gen_rtx_SUBREG (SImode, reg, 0);
27155 /* When ISSETMEM is FALSE, output a simple loop to move the memory pointed to
27156 by SRCPTR to DESTPTR in chunks of MODE unrolled UNROLL times; the overall
27157 size is COUNT, specified in bytes.  When ISSETMEM is TRUE, output the
27158 equivalent loop to set memory with VALUE (supposed to be in MODE).
27160 The size is rounded down to a whole number of chunks moved at once.
27161 SRCMEM and DESTMEM provide MEM rtxen to feed proper aliasing info.  */
27164 static void
27165 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
27166 rtx destptr, rtx srcptr, rtx value,
27167 rtx count, machine_mode mode, int unroll,
27168 int expected_size, bool issetmem)
27170 rtx_code_label *out_label, *top_label;
27171 rtx iter, tmp;
27172 machine_mode iter_mode = counter_mode (count);
27173 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
27174 rtx piece_size = GEN_INT (piece_size_n);
27175 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
27176 rtx size;
27177 int i;
27179 top_label = gen_label_rtx ();
27180 out_label = gen_label_rtx ();
27181 iter = gen_reg_rtx (iter_mode);
27183 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
27184 NULL, 1, OPTAB_DIRECT);
27185 /* Those two should combine. */
27186 if (piece_size == const1_rtx)
27188 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
27189 true, out_label);
27190 predict_jump (REG_BR_PROB_BASE * 10 / 100);
27192 emit_move_insn (iter, const0_rtx);
27194 emit_label (top_label);
27196 tmp = convert_modes (Pmode, iter_mode, iter, true);
27198 /* This assert could be relaxed - in that case we'll need to compute
27199 the smallest power of two containing PIECE_SIZE_N and pass it to
27200 offset_address.  */
27201 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
27202 destmem = offset_address (destmem, tmp, piece_size_n);
27203 destmem = adjust_address (destmem, mode, 0);
27205 if (!issetmem)
27207 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
27208 srcmem = adjust_address (srcmem, mode, 0);
27210 /* When unrolling for chips that reorder memory reads and writes,
27211 we can save registers by using a single temporary.
27212 Also, using 4 temporaries is overkill in 32bit mode.  */
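/* (The single-temporary variant below is currently disabled by the
   "&& 0" in its condition, so the path using up to four temporaries is
   always taken for copies.)  */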
27213 if (!TARGET_64BIT && 0)
27215 for (i = 0; i < unroll; i++)
27217 if (i)
27219 destmem =
27220 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
27221 srcmem =
27222 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
27224 emit_move_insn (destmem, srcmem);
27227 else
27229 rtx tmpreg[4];
27230 gcc_assert (unroll <= 4);
27231 for (i = 0; i < unroll; i++)
27233 tmpreg[i] = gen_reg_rtx (mode);
27234 if (i)
27236 srcmem =
27237 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
27239 emit_move_insn (tmpreg[i], srcmem);
27241 for (i = 0; i < unroll; i++)
27243 if (i)
27245 destmem =
27246 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
27248 emit_move_insn (destmem, tmpreg[i]);
27252 else
27253 for (i = 0; i < unroll; i++)
27255 if (i)
27256 destmem =
27257 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
27258 emit_move_insn (destmem, value);
27261 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
27262 true, OPTAB_LIB_WIDEN);
27263 if (tmp != iter)
27264 emit_move_insn (iter, tmp);
27266 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
27267 true, top_label);
27268 if (expected_size != -1)
27270 expected_size /= GET_MODE_SIZE (mode) * unroll;
27271 if (expected_size == 0)
27272 predict_jump (0);
27273 else if (expected_size > REG_BR_PROB_BASE)
27274 predict_jump (REG_BR_PROB_BASE - 1);
27275 else
27276 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
27278 else
27279 predict_jump (REG_BR_PROB_BASE * 80 / 100);
27280 iter = ix86_zero_extend_to_Pmode (iter);
27281 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
27282 true, OPTAB_LIB_WIDEN);
27283 if (tmp != destptr)
27284 emit_move_insn (destptr, tmp);
27285 if (!issetmem)
27287 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
27288 true, OPTAB_LIB_WIDEN);
27289 if (tmp != srcptr)
27290 emit_move_insn (srcptr, tmp);
27292 emit_label (out_label);
27295 /* Output a "rep; mov" or "rep; stos" instruction depending on the ISSETMEM
27296 argument.  When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
27297 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
27298 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
27299 ORIG_VALUE is the original value passed to memset to fill the memory with.
27300 The other arguments have the same meaning as for the previous function.  */
27302 static void
27303 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
27304 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
27305 rtx count,
27306 machine_mode mode, bool issetmem)
27308 rtx destexp;
27309 rtx srcexp;
27310 rtx countreg;
27311 HOST_WIDE_INT rounded_count;
27313 /* If possible, it is shorter to use rep movs.
27314 TODO: Maybe it is better to move this logic to decide_alg. */
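/* (With a byte count divisible by 4, "rep stosd"/"rep movsd" can be used,
   processing 4 bytes per iteration instead of 1.)  */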
27315 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
27316 && (!issetmem || orig_value == const0_rtx))
27317 mode = SImode;
27319 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
27320 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
27322 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
27323 GET_MODE_SIZE (mode)));
27324 if (mode != QImode)
27326 destexp = gen_rtx_ASHIFT (Pmode, countreg,
27327 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
27328 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
27330 else
27331 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
27332 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
27334 rounded_count
27335 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
27336 destmem = shallow_copy_rtx (destmem);
27337 set_mem_size (destmem, rounded_count);
27339 else if (MEM_SIZE_KNOWN_P (destmem))
27340 clear_mem_size (destmem);
27342 if (issetmem)
27344 value = force_reg (mode, gen_lowpart (mode, value));
27345 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
27347 else
27349 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
27350 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
27351 if (mode != QImode)
27353 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
27354 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
27355 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
27357 else
27358 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
27359 if (CONST_INT_P (count))
27361 rounded_count
27362 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
27363 srcmem = shallow_copy_rtx (srcmem);
27364 set_mem_size (srcmem, rounded_count);
27366 else
27368 if (MEM_SIZE_KNOWN_P (srcmem))
27369 clear_mem_size (srcmem);
27371 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
27372 destexp, srcexp));
27376 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
27377 DESTMEM.
27378 SRCMEM is passed by pointer so that it can be updated on return.
27379 The return value is the updated DESTMEM.  */
27380 static rtx
27381 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
27382 HOST_WIDE_INT size_to_move)
27384 rtx dst = destmem, src = *srcmem, adjust, tempreg;
27385 enum insn_code code;
27386 machine_mode move_mode;
27387 int piece_size, i;
27389 /* Find the widest mode in which we could perform moves.
27390 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
27391 it until move of such size is supported. */
27392 piece_size = 1 << floor_log2 (size_to_move);
27393 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
27394 code = optab_handler (mov_optab, move_mode);
27395 while (code == CODE_FOR_nothing && piece_size > 1)
27397 piece_size >>= 1;
27398 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
27399 code = optab_handler (mov_optab, move_mode);
27402 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27403 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27404 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27406 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27407 move_mode = mode_for_vector (word_mode, nunits);
27408 code = optab_handler (mov_optab, move_mode);
27409 if (code == CODE_FOR_nothing)
27411 move_mode = word_mode;
27412 piece_size = GET_MODE_SIZE (move_mode);
27413 code = optab_handler (mov_optab, move_mode);
27416 gcc_assert (code != CODE_FOR_nothing);
27418 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
27419 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
27421 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
27422 gcc_assert (size_to_move % piece_size == 0);
27423 adjust = GEN_INT (piece_size);
27424 for (i = 0; i < size_to_move; i += piece_size)
27426 /* We move from memory to memory, so we'll need to do it via
27427 a temporary register. */
27428 tempreg = gen_reg_rtx (move_mode);
27429 emit_insn (GEN_FCN (code) (tempreg, src));
27430 emit_insn (GEN_FCN (code) (dst, tempreg));
27432 emit_move_insn (destptr,
27433 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
27434 emit_move_insn (srcptr,
27435 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
27437 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
27438 piece_size);
27439 src = adjust_automodify_address_nv (src, move_mode, srcptr,
27440 piece_size);
27443 /* Update DST and SRC rtx. */
27444 *srcmem = src;
27445 return dst;
27448 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
27449 static void
27450 expand_movmem_epilogue (rtx destmem, rtx srcmem,
27451 rtx destptr, rtx srcptr, rtx count, int max_size)
27453 rtx src, dest;
27454 if (CONST_INT_P (count))
27456 HOST_WIDE_INT countval = INTVAL (count);
27457 HOST_WIDE_INT epilogue_size = countval % max_size;
27458 int i;
27460 /* For now MAX_SIZE should be a power of 2. This assert could be
27461 relaxed, but it'll require a bit more complicated epilogue
27462 expanding. */
27463 gcc_assert ((max_size & (max_size - 1)) == 0);
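/* Each set bit of EPILOGUE_SIZE gets one move of the corresponding size,
   from the largest piece down to a single byte.  */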
27464 for (i = max_size; i >= 1; i >>= 1)
27466 if (epilogue_size & i)
27467 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
27469 return;
27471 if (max_size > 8)
27473 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
27474 count, 1, OPTAB_DIRECT);
27475 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
27476 count, QImode, 1, 4, false);
27477 return;
27480 /* When there are stringops, we can cheaply increase dest and src pointers.
27481 Otherwise we save code size by maintaining an offset (zero is readily
27482 available from the preceding rep operation) and using x86 addressing modes.
27484 if (TARGET_SINGLE_STRINGOP)
27486 if (max_size > 4)
27488 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
27489 src = change_address (srcmem, SImode, srcptr);
27490 dest = change_address (destmem, SImode, destptr);
27491 emit_insn (gen_strmov (destptr, dest, srcptr, src));
27492 emit_label (label);
27493 LABEL_NUSES (label) = 1;
27495 if (max_size > 2)
27497 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
27498 src = change_address (srcmem, HImode, srcptr);
27499 dest = change_address (destmem, HImode, destptr);
27500 emit_insn (gen_strmov (destptr, dest, srcptr, src));
27501 emit_label (label);
27502 LABEL_NUSES (label) = 1;
27504 if (max_size > 1)
27506 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
27507 src = change_address (srcmem, QImode, srcptr);
27508 dest = change_address (destmem, QImode, destptr);
27509 emit_insn (gen_strmov (destptr, dest, srcptr, src));
27510 emit_label (label);
27511 LABEL_NUSES (label) = 1;
27514 else
27516 rtx offset = force_reg (Pmode, const0_rtx);
27517 rtx tmp;
27519 if (max_size > 4)
27521 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
27522 src = change_address (srcmem, SImode, srcptr);
27523 dest = change_address (destmem, SImode, destptr);
27524 emit_move_insn (dest, src);
27525 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
27526 true, OPTAB_LIB_WIDEN);
27527 if (tmp != offset)
27528 emit_move_insn (offset, tmp);
27529 emit_label (label);
27530 LABEL_NUSES (label) = 1;
27532 if (max_size > 2)
27534 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
27535 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
27536 src = change_address (srcmem, HImode, tmp);
27537 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
27538 dest = change_address (destmem, HImode, tmp);
27539 emit_move_insn (dest, src);
27540 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
27541 true, OPTAB_LIB_WIDEN);
27542 if (tmp != offset)
27543 emit_move_insn (offset, tmp);
27544 emit_label (label);
27545 LABEL_NUSES (label) = 1;
27547 if (max_size > 1)
27549 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
27550 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
27551 src = change_address (srcmem, QImode, tmp);
27552 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
27553 dest = change_address (destmem, QImode, tmp);
27554 emit_move_insn (dest, src);
27555 emit_label (label);
27556 LABEL_NUSES (label) = 1;
27561 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
27562 with value PROMOTED_VAL.
27563 Unlike emit_memmov there is no source operand to update here.
27564 The return value is the updated DESTMEM.  */
27565 static rtx
27566 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
27567 HOST_WIDE_INT size_to_move)
27569 rtx dst = destmem, adjust;
27570 enum insn_code code;
27571 machine_mode move_mode;
27572 int piece_size, i;
27574 /* Find the widest mode in which we could perform moves.
27575 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
27576 it until move of such size is supported. */
27577 move_mode = GET_MODE (promoted_val);
27578 if (move_mode == VOIDmode)
27579 move_mode = QImode;
27580 if (size_to_move < GET_MODE_SIZE (move_mode))
27582 move_mode = mode_for_size (size_to_move * BITS_PER_UNIT, MODE_INT, 0);
27583 promoted_val = gen_lowpart (move_mode, promoted_val);
27585 piece_size = GET_MODE_SIZE (move_mode);
27586 code = optab_handler (mov_optab, move_mode);
27587 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
27589 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
27591 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
27592 gcc_assert (size_to_move % piece_size == 0);
27593 adjust = GEN_INT (piece_size);
27594 for (i = 0; i < size_to_move; i += piece_size)
27596 if (piece_size <= GET_MODE_SIZE (word_mode))
27598 emit_insn (gen_strset (destptr, dst, promoted_val));
27599 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
27600 piece_size);
27601 continue;
27604 emit_insn (GEN_FCN (code) (dst, promoted_val));
27606 emit_move_insn (destptr,
27607 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
27609 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
27610 piece_size);
27613 /* Update DST rtx. */
27614 return dst;
27616 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
27617 static void
27618 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
27619 rtx count, int max_size)
27621 count =
27622 expand_simple_binop (counter_mode (count), AND, count,
27623 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
27624 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
27625 gen_lowpart (QImode, value), count, QImode,
27626 1, max_size / 2, true);
27629 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
27630 static void
27631 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
27632 rtx count, int max_size)
27634 rtx dest;
27636 if (CONST_INT_P (count))
27638 HOST_WIDE_INT countval = INTVAL (count);
27639 HOST_WIDE_INT epilogue_size = countval % max_size;
27640 int i;
27642 /* For now MAX_SIZE should be a power of 2. This assert could be
27643 relaxed, but it'll require a bit more complicated epilogue
27644 expanding. */
27645 gcc_assert ((max_size & (max_size - 1)) == 0);
27646 for (i = max_size; i >= 1; i >>= 1)
27648 if (epilogue_size & i)
27650 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
27651 destmem = emit_memset (destmem, destptr, vec_value, i);
27652 else
27653 destmem = emit_memset (destmem, destptr, value, i);
27656 return;
27658 if (max_size > 32)
27660 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
27661 return;
27663 if (max_size > 16)
27665 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
27666 if (TARGET_64BIT)
27668 dest = change_address (destmem, DImode, destptr);
27669 emit_insn (gen_strset (destptr, dest, value));
27670 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
27671 emit_insn (gen_strset (destptr, dest, value));
27673 else
27675 dest = change_address (destmem, SImode, destptr);
27676 emit_insn (gen_strset (destptr, dest, value));
27677 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
27678 emit_insn (gen_strset (destptr, dest, value));
27679 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
27680 emit_insn (gen_strset (destptr, dest, value));
27681 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
27682 emit_insn (gen_strset (destptr, dest, value));
27684 emit_label (label);
27685 LABEL_NUSES (label) = 1;
27687 if (max_size > 8)
27689 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
27690 if (TARGET_64BIT)
27692 dest = change_address (destmem, DImode, destptr);
27693 emit_insn (gen_strset (destptr, dest, value));
27695 else
27697 dest = change_address (destmem, SImode, destptr);
27698 emit_insn (gen_strset (destptr, dest, value));
27699 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
27700 emit_insn (gen_strset (destptr, dest, value));
27702 emit_label (label);
27703 LABEL_NUSES (label) = 1;
27705 if (max_size > 4)
27707 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
27708 dest = change_address (destmem, SImode, destptr);
27709 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
27710 emit_label (label);
27711 LABEL_NUSES (label) = 1;
27713 if (max_size > 2)
27715 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
27716 dest = change_address (destmem, HImode, destptr);
27717 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
27718 emit_label (label);
27719 LABEL_NUSES (label) = 1;
27721 if (max_size > 1)
27723 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
27724 dest = change_address (destmem, QImode, destptr);
27725 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
27726 emit_label (label);
27727 LABEL_NUSES (label) = 1;
27731 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
27732 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
27733 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
27734 ignored.
27735 Return value is updated DESTMEM. */
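/* For every power of two between 1 and DESIRED_ALIGNMENT the loop below
   tests the corresponding bit of DESTPTR (unless ALIGN already guarantees
   it is zero) and, when the bit is set, copies or sets that many bytes,
   which advances DESTPTR to the next alignment boundary.  */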
27736 static rtx
27737 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
27738 rtx destptr, rtx srcptr, rtx value,
27739 rtx vec_value, rtx count, int align,
27740 int desired_alignment, bool issetmem)
27742 int i;
27743 for (i = 1; i < desired_alignment; i <<= 1)
27745 if (align <= i)
27747 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
27748 if (issetmem)
27750 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
27751 destmem = emit_memset (destmem, destptr, vec_value, i);
27752 else
27753 destmem = emit_memset (destmem, destptr, value, i);
27755 else
27756 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
27757 ix86_adjust_counter (count, i);
27758 emit_label (label);
27759 LABEL_NUSES (label) = 1;
27760 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
27763 return destmem;
27766 /* Test if COUNT&SIZE is nonzero and if so, expand a movmem
27767 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
27768 and jump to DONE_LABEL.  */
27769 static void
27770 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
27771 rtx destptr, rtx srcptr,
27772 rtx value, rtx vec_value,
27773 rtx count, int size,
27774 rtx done_label, bool issetmem)
27776 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
27777 machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
27778 rtx modesize;
27779 int n;
27781 /* If we do not have vector value to copy, we must reduce size. */
27782 if (issetmem)
27784 if (!vec_value)
27786 if (GET_MODE (value) == VOIDmode && size > 8)
27787 mode = Pmode;
27788 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
27789 mode = GET_MODE (value);
27791 else
27792 mode = GET_MODE (vec_value), value = vec_value;
27794 else
27796 /* Choose appropriate vector mode. */
27797 if (size >= 32)
27798 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
27799 else if (size >= 16)
27800 mode = TARGET_SSE ? V16QImode : DImode;
27801 srcmem = change_address (srcmem, mode, srcptr);
27803 destmem = change_address (destmem, mode, destptr);
27804 modesize = GEN_INT (GET_MODE_SIZE (mode));
27805 gcc_assert (GET_MODE_SIZE (mode) <= size);
27806 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
27808 if (issetmem)
27809 emit_move_insn (destmem, gen_lowpart (mode, value));
27810 else
27812 emit_move_insn (destmem, srcmem);
27813 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27815 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27818 destmem = offset_address (destmem, count, 1);
27819 destmem = offset_address (destmem, GEN_INT (-2 * size),
27820 GET_MODE_SIZE (mode));
27821 if (!issetmem)
27823 srcmem = offset_address (srcmem, count, 1);
27824 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
27825 GET_MODE_SIZE (mode));
27827 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
27829 if (issetmem)
27830 emit_move_insn (destmem, gen_lowpart (mode, value));
27831 else
27833 emit_move_insn (destmem, srcmem);
27834 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27836 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27838 emit_jump_insn (gen_jump (done_label));
27839 emit_barrier ();
27841 emit_label (label);
27842 LABEL_NUSES (label) = 1;
27845 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
27846 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
27847 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way that we can
27848 proceed with a loop copying SIZE bytes at once.  Do moves in MODE.
27849 DONE_LABEL is a label after the whole copying sequence.  The label is created
27850 on demand if *DONE_LABEL is NULL.
27851 MIN_SIZE is the minimal size of the copied block.  This value gets adjusted for new
27852 bounds after the initial copies.
27854 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
27855 DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates whether
27856 we will dispatch to a library call for large blocks.
27858 In pseudocode we do:
27860 if (COUNT < SIZE)
27862 Assume that SIZE is 4. Bigger sizes are handled analogously
27863 if (COUNT & 4)
27865 copy 4 bytes from SRCPTR to DESTPTR
27866 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
27867 goto done_label
27869 if (!COUNT)
27870 goto done_label;
27871 copy 1 byte from SRCPTR to DESTPTR
27872 if (COUNT & 2)
27874 copy 2 bytes from SRCPTR to DESTPTR
27875 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
27878 else
27880 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
27881 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
27883 OLD_DESTPTR = DESTPTR;
27884 Align DESTPTR up to DESIRED_ALIGN
27885 SRCPTR += DESTPTR - OLD_DESTPTR
27886 COUNT -= DESTPTR - OLD_DESTPTR
27887 if (DYNAMIC_CHECK)
27888 Round COUNT down to multiple of SIZE
27889 << optional caller supplied zero size guard is here >>
27890 << optional caller supplied dynamic check is here >>
27891 << caller supplied main copy loop is here >>
27893 done_label:
27895 static void
27896 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
27897 rtx *destptr, rtx *srcptr,
27898 machine_mode mode,
27899 rtx value, rtx vec_value,
27900 rtx *count,
27901 rtx_code_label **done_label,
27902 int size,
27903 int desired_align,
27904 int align,
27905 unsigned HOST_WIDE_INT *min_size,
27906 bool dynamic_check,
27907 bool issetmem)
27909 rtx_code_label *loop_label = NULL, *label;
27910 int n;
27911 rtx modesize;
27912 int prolog_size = 0;
27913 rtx mode_value;
27915 /* Choose the proper value to copy.  */
27916 if (issetmem && VECTOR_MODE_P (mode))
27917 mode_value = vec_value;
27918 else
27919 mode_value = value;
27920 gcc_assert (GET_MODE_SIZE (mode) <= size);
27922 /* See if block is big or small, handle small blocks. */
27923 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
27925 int size2 = size;
27926 loop_label = gen_label_rtx ();
27928 if (!*done_label)
27929 *done_label = gen_label_rtx ();
27931 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
27932 1, loop_label);
27933 size2 >>= 1;
27935 /* Handle sizes > 3. */
27936 for (;size2 > 2; size2 >>= 1)
27937 expand_small_movmem_or_setmem (destmem, srcmem,
27938 *destptr, *srcptr,
27939 value, vec_value,
27940 *count,
27941 size2, *done_label, issetmem);
27942 /* Nothing to copy? Jump to DONE_LABEL if so */
27943 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
27944 1, *done_label);
27946 /* Do a byte copy. */
27947 destmem = change_address (destmem, QImode, *destptr);
27948 if (issetmem)
27949 emit_move_insn (destmem, gen_lowpart (QImode, value));
27950 else
27952 srcmem = change_address (srcmem, QImode, *srcptr);
27953 emit_move_insn (destmem, srcmem);
27956 /* Handle sizes 2 and 3. */
27957 label = ix86_expand_aligntest (*count, 2, false);
27958 destmem = change_address (destmem, HImode, *destptr);
27959 destmem = offset_address (destmem, *count, 1);
27960 destmem = offset_address (destmem, GEN_INT (-2), 2);
27961 if (issetmem)
27962 emit_move_insn (destmem, gen_lowpart (HImode, value));
27963 else
27965 srcmem = change_address (srcmem, HImode, *srcptr);
27966 srcmem = offset_address (srcmem, *count, 1);
27967 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
27968 emit_move_insn (destmem, srcmem);
27971 emit_label (label);
27972 LABEL_NUSES (label) = 1;
27973 emit_jump_insn (gen_jump (*done_label));
27974 emit_barrier ();
27976 else
27977 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
27978 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
27980 /* Start memcpy for COUNT >= SIZE. */
27981 if (loop_label)
27983 emit_label (loop_label);
27984 LABEL_NUSES (loop_label) = 1;
27987 /* Copy first desired_align bytes. */
27988 if (!issetmem)
27989 srcmem = change_address (srcmem, mode, *srcptr);
27990 destmem = change_address (destmem, mode, *destptr);
27991 modesize = GEN_INT (GET_MODE_SIZE (mode));
27992 for (n = 0; prolog_size < desired_align - align; n++)
27994 if (issetmem)
27995 emit_move_insn (destmem, mode_value);
27996 else
27998 emit_move_insn (destmem, srcmem);
27999 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
28001 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
28002 prolog_size += GET_MODE_SIZE (mode);
28006 /* Copy last SIZE bytes. */
28007 destmem = offset_address (destmem, *count, 1);
28008 destmem = offset_address (destmem,
28009 GEN_INT (-size - prolog_size),
28011 if (issetmem)
28012 emit_move_insn (destmem, mode_value);
28013 else
28015 srcmem = offset_address (srcmem, *count, 1);
28016 srcmem = offset_address (srcmem,
28017 GEN_INT (-size - prolog_size),
28019 emit_move_insn (destmem, srcmem);
28021 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
28023 destmem = offset_address (destmem, modesize, 1);
28024 if (issetmem)
28025 emit_move_insn (destmem, mode_value);
28026 else
28028 srcmem = offset_address (srcmem, modesize, 1);
28029 emit_move_insn (destmem, srcmem);
28033 /* Align destination. */
28034 if (desired_align > 1 && desired_align > align)
28036 rtx saveddest = *destptr;
28038 gcc_assert (desired_align <= size);
28039 /* Align destptr up, place it to new register. */
28040 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
28041 GEN_INT (prolog_size),
28042 NULL_RTX, 1, OPTAB_DIRECT);
28043 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
28044 REG_POINTER (*destptr) = 1;
28045 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
28046 GEN_INT (-desired_align),
28047 *destptr, 1, OPTAB_DIRECT);
28048 /* See how many bytes we skipped. */
28049 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
28050 *destptr,
28051 saveddest, 1, OPTAB_DIRECT);
28052 /* Adjust srcptr and count. */
28053 if (!issetmem)
28054 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
28055 saveddest, *srcptr, 1, OPTAB_DIRECT);
28056 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
28057 saveddest, *count, 1, OPTAB_DIRECT);
28058 /* We copied at most size + prolog_size. */
28059 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
28060 *min_size
28061 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
28062 else
28063 *min_size = 0;
28065 /* Our loops always round down the block size, but for dispatch to a
28066 library call we need the precise value.  */
28067 if (dynamic_check)
28068 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
28069 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
28071 else
28073 gcc_assert (prolog_size == 0);
28074 /* Decrease count, so we won't end up copying last word twice. */
28075 if (!CONST_INT_P (*count))
28076 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
28077 constm1_rtx, *count, 1, OPTAB_DIRECT);
28078 else
28079 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
28080 (unsigned HOST_WIDE_INT)size));
28081 if (*min_size)
28082 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
28087 /* This function is like the previous one, except here we know how many bytes
28088 need to be copied. That allows us to update alignment not only of DST, which
28089 is returned, but also of SRC, which is passed as a pointer for that
28090 reason. */
28091 static rtx
28092 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
28093 rtx srcreg, rtx value, rtx vec_value,
28094 int desired_align, int align_bytes,
28095 bool issetmem)
28097 rtx src = NULL;
28098 rtx orig_dst = dst;
28099 rtx orig_src = NULL;
28100 int piece_size = 1;
28101 int copied_bytes = 0;
28103 if (!issetmem)
28105 gcc_assert (srcp != NULL);
28106 src = *srcp;
28107 orig_src = src;
28110 for (piece_size = 1;
28111 piece_size <= desired_align && copied_bytes < align_bytes;
28112 piece_size <<= 1)
28114 if (align_bytes & piece_size)
28116 if (issetmem)
28118 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
28119 dst = emit_memset (dst, destreg, vec_value, piece_size);
28120 else
28121 dst = emit_memset (dst, destreg, value, piece_size);
28123 else
28124 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
28125 copied_bytes += piece_size;
28128 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
28129 set_mem_align (dst, desired_align * BITS_PER_UNIT);
28130 if (MEM_SIZE_KNOWN_P (orig_dst))
28131 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
28133 if (!issetmem)
28135 int src_align_bytes = get_mem_align_offset (src, desired_align
28136 * BITS_PER_UNIT);
28137 if (src_align_bytes >= 0)
28138 src_align_bytes = desired_align - src_align_bytes;
28139 if (src_align_bytes >= 0)
28141 unsigned int src_align;
28142 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
28144 if ((src_align_bytes & (src_align - 1))
28145 == (align_bytes & (src_align - 1)))
28146 break;
28148 if (src_align > (unsigned int) desired_align)
28149 src_align = desired_align;
28150 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
28151 set_mem_align (src, src_align * BITS_PER_UNIT);
28153 if (MEM_SIZE_KNOWN_P (orig_src))
28154 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
28155 *srcp = src;
28158 return dst;
28161 /* Return true if ALG can be used in current context.
28162 Assume we expand memset if MEMSET is true. */
28163 static bool
28164 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
28166 if (alg == no_stringop)
28167 return false;
28168 if (alg == vector_loop)
28169 return TARGET_SSE || TARGET_AVX;
28170 /* Algorithms using the rep prefix want at least edi and ecx;
28171 additionally, memset wants eax and memcpy wants esi. Don't
28172 consider such algorithms if the user has appropriated those
28173 registers for their own purposes, or if we have a non-default
28174 address space, since some string insns cannot override the segment. */
28175 if (alg == rep_prefix_1_byte
28176 || alg == rep_prefix_4_byte
28177 || alg == rep_prefix_8_byte)
28179 if (have_as)
28180 return false;
28181 if (fixed_regs[CX_REG]
28182 || fixed_regs[DI_REG]
28183 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
28184 return false;
28186 return true;
28189 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
28190 static enum stringop_alg
28191 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
28192 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
28193 bool memset, bool zero_memset, bool have_as,
28194 int *dynamic_check, bool *noalign, bool recur)
28196 const struct stringop_algs *algs;
28197 bool optimize_for_speed;
28198 int max = 0;
28199 const struct processor_costs *cost;
28200 int i;
28201 bool any_alg_usable_p = false;
28203 *noalign = false;
28204 *dynamic_check = -1;
28206 /* Even if the string operation call is cold, we still might spend a lot
28207 of time processing large blocks. */
28208 if (optimize_function_for_size_p (cfun)
28209 || (optimize_insn_for_size_p ()
28210 && (max_size < 256
28211 || (expected_size != -1 && expected_size < 256))))
28212 optimize_for_speed = false;
28213 else
28214 optimize_for_speed = true;
28216 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
28217 if (memset)
28218 algs = &cost->memset[TARGET_64BIT != 0];
28219 else
28220 algs = &cost->memcpy[TARGET_64BIT != 0];
28222 /* See maximal size for user defined algorithm. */
28223 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
28225 enum stringop_alg candidate = algs->size[i].alg;
28226 bool usable = alg_usable_p (candidate, memset, have_as);
28227 any_alg_usable_p |= usable;
28229 if (candidate != libcall && candidate && usable)
28230 max = algs->size[i].max;
28233 /* If the expected size is not known but the max size is small enough
28234 so that the inline version is a win, set the expected size into
28235 the range.  */
28236 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
28237 && expected_size == -1)
28238 expected_size = min_size / 2 + max_size / 2;
28240 /* If user specified the algorithm, honor it if possible. */
28241 if (ix86_stringop_alg != no_stringop
28242 && alg_usable_p (ix86_stringop_alg, memset, have_as))
28243 return ix86_stringop_alg;
28244 /* rep; movq or rep; movl is the smallest variant. */
28245 else if (!optimize_for_speed)
28247 *noalign = true;
28248 if (!count || (count & 3) || (memset && !zero_memset))
28249 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
28250 ? rep_prefix_1_byte : loop_1_byte;
28251 else
28252 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
28253 ? rep_prefix_4_byte : loop;
28255 /* Very tiny blocks are best handled via the loop; REP is expensive to
28256 set up.  */
28257 else if (expected_size != -1 && expected_size < 4)
28258 return loop_1_byte;
28259 else if (expected_size != -1)
28261 enum stringop_alg alg = libcall;
28262 bool alg_noalign = false;
28263 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
28265 /* We get here if the algorithms that were not libcall-based
28266 were rep-prefix based and we are unable to use rep prefixes
28267 based on global register usage. Break out of the loop and
28268 use the heuristic below. */
28269 if (algs->size[i].max == 0)
28270 break;
28271 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
28273 enum stringop_alg candidate = algs->size[i].alg;
28275 if (candidate != libcall
28276 && alg_usable_p (candidate, memset, have_as))
28278 alg = candidate;
28279 alg_noalign = algs->size[i].noalign;
28281 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
28282 last non-libcall inline algorithm. */
28283 if (TARGET_INLINE_ALL_STRINGOPS)
28285 /* When the current size is best to be copied by a libcall,
28286 but we are still forced to inline, run the heuristic below
28287 that will pick code for medium sized blocks. */
28288 if (alg != libcall)
28290 *noalign = alg_noalign;
28291 return alg;
28293 else if (!any_alg_usable_p)
28294 break;
28296 else if (alg_usable_p (candidate, memset, have_as))
28298 *noalign = algs->size[i].noalign;
28299 return candidate;
28304 /* When asked to inline the call anyway, try to pick a meaningful choice.
28305 We look for the maximal size of block that is faster to copy by hand and
28306 take blocks of at most that size, guessing that the average size will
28307 be roughly half of the block.
28309 If this turns out to be bad, we might simply specify the preferred
28310 choice in ix86_costs. */
28311 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
28312 && (algs->unknown_size == libcall
28313 || !alg_usable_p (algs->unknown_size, memset, have_as)))
28315 enum stringop_alg alg;
28316 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
28318 /* If there aren't any usable algorithms or if recursing already,
28319 then recursing on smaller sizes or same size isn't going to
28320 find anything. Just return the simple byte-at-a-time copy loop. */
28321 if (!any_alg_usable_p || recur)
28323 /* Pick something reasonable. */
28324 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
28325 *dynamic_check = 128;
28326 return loop_1_byte;
28328 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
28329 zero_memset, have_as, dynamic_check, noalign, true);
28330 gcc_assert (*dynamic_check == -1);
28331 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
28332 *dynamic_check = max;
28333 else
28334 gcc_assert (alg != libcall);
28335 return alg;
28337 return (alg_usable_p (algs->unknown_size, memset, have_as)
28338 ? algs->unknown_size : libcall);
28341 /* Decide on alignment. We know that the operand is already aligned to ALIGN
28342 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
28343 static int
28344 decide_alignment (int align,
28345 enum stringop_alg alg,
28346 int expected_size,
28347 machine_mode move_mode)
28349 int desired_align = 0;
28351 gcc_assert (alg != no_stringop);
28353 if (alg == libcall)
28354 return 0;
28355 if (move_mode == VOIDmode)
28356 return 0;
28358 desired_align = GET_MODE_SIZE (move_mode);
28359 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
28360 copying a whole cache line at once. */
28361 if (TARGET_PENTIUMPRO
28362 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
28363 desired_align = 8;
28365 if (optimize_size)
28366 desired_align = 1;
28367 if (desired_align < align)
28368 desired_align = align;
28369 if (expected_size != -1 && expected_size < 4)
28370 desired_align = align;
28372 return desired_align;
28376 /* Helper function for memset. For the QImode value 0xXY produce
28377 0xXYXYXYXY of the width specified by MODE. This is essentially
28378 a * 0x01010101, but we can do slightly better than
28379 synth_mult by unwinding the sequence by hand on CPUs with
28380 a slow multiply. */
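/* As an illustrative sketch only (assuming SImode; the DImode case below adds
   one more shift/or step for bits 32..63), the hand-unwound sequence is
   roughly:

       r  = 0x000000XY;
       r |= r << 8;      r is now 0x0000XYXY
       r |= r << 16;     r is now 0xXYXYXYXY

   which the code below weighs against a single multiply by 0x01010101 using
   the ix86_cost tables.  */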
28381 static rtx
28382 promote_duplicated_reg (machine_mode mode, rtx val)
28384 machine_mode valmode = GET_MODE (val);
28385 rtx tmp;
28386 int nops = mode == DImode ? 3 : 2;
28388 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
28389 if (val == const0_rtx)
28390 return copy_to_mode_reg (mode, CONST0_RTX (mode));
28391 if (CONST_INT_P (val))
28393 HOST_WIDE_INT v = INTVAL (val) & 255;
28395 v |= v << 8;
28396 v |= v << 16;
28397 if (mode == DImode)
28398 v |= (v << 16) << 16;
28399 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
28402 if (valmode == VOIDmode)
28403 valmode = QImode;
28404 if (valmode != QImode)
28405 val = gen_lowpart (QImode, val);
28406 if (mode == QImode)
28407 return val;
28408 if (!TARGET_PARTIAL_REG_STALL)
28409 nops--;
28410 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
28411 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
28412 <= (ix86_cost->shift_const + ix86_cost->add) * nops
28413 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
28415 rtx reg = convert_modes (mode, QImode, val, true);
28416 tmp = promote_duplicated_reg (mode, const1_rtx);
28417 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
28418 OPTAB_DIRECT);
28420 else
28422 rtx reg = convert_modes (mode, QImode, val, true);
28424 if (!TARGET_PARTIAL_REG_STALL)
28425 if (mode == SImode)
28426 emit_insn (gen_insvsi_1 (reg, reg));
28427 else
28428 emit_insn (gen_insvdi_1 (reg, reg));
28429 else
28431 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
28432 NULL, 1, OPTAB_DIRECT);
28433 reg =
28434 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
28436 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
28437 NULL, 1, OPTAB_DIRECT);
28438 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
28439 if (mode == SImode)
28440 return reg;
28441 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
28442 NULL, 1, OPTAB_DIRECT);
28443 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
28444 return reg;
28448 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
28449 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
28450 raising alignment from ALIGN to DESIRED_ALIGN. */
28451 static rtx
28452 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
28453 int align)
28455 rtx promoted_val;
28457 if (TARGET_64BIT
28458 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
28459 promoted_val = promote_duplicated_reg (DImode, val);
28460 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
28461 promoted_val = promote_duplicated_reg (SImode, val);
28462 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
28463 promoted_val = promote_duplicated_reg (HImode, val);
28464 else
28465 promoted_val = val;
28467 return promoted_val;
28470 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
28471 operations when profitable. The code depends upon architecture, block size
28472 and alignment, but always has one of the following overall structures:
28474 Aligned move sequence:
28476 1) Prologue guard: Conditional that jumps up to epilogues for small
28477 blocks that can be handled by the epilogue alone. This is faster
28478 but also needed for correctness, since the prologue assumes the block
28479 is larger than the desired alignment.
28481 An optional dynamic check for size and a libcall for large
28482 blocks is emitted here too, with -minline-stringops-dynamically.
28484 2) Prologue: copy the first few bytes in order to get the destination
28485 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
28486 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
28487 copied. We emit either a jump tree on power of two sized
28488 blocks, or a byte loop.
28490 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
28491 with the specified algorithm.
28493 4) Epilogue: code copying the tail of the block that is too small to be
28494 handled by the main body (or up to the size guarded by the prologue guard).
28496 Misaligned move sequence:
28498 1) Misaligned move prologue/epilogue containing:
28499 a) Prologue handling small memory blocks and jumping to done_label
28500 (skipped if blocks are known to be large enough)
28501 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment is
28502 needed by a single possibly misaligned move
28503 (skipped if alignment is not needed)
28504 c) Copy of the last SIZE_NEEDED bytes by possibly misaligned moves
28506 2) Zero size guard dispatching to done_label, if needed
28508 3) Dispatch to library call, if needed
28510 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
28511 with the specified algorithm. */
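/* A purely illustrative sketch of the aligned shape above, assuming the
   rep_prefix_4_byte memset algorithm is chosen (real output varies with the
   target, options and operand values):

       cmp   $epilogue_size, count
       jb    .Lepilogue                         ; 1) prologue guard
       ...a few byte stores...                  ; 2) align dst to DESIRED_ALIGN
       rep stosd                                ; 3) main body
   .Lepilogue:
       ...store the remaining tail bytes...     ; 4) epilogue  */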
28512 bool
28513 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
28514 rtx align_exp, rtx expected_align_exp,
28515 rtx expected_size_exp, rtx min_size_exp,
28516 rtx max_size_exp, rtx probable_max_size_exp,
28517 bool issetmem)
28519 rtx destreg;
28520 rtx srcreg = NULL;
28521 rtx_code_label *label = NULL;
28522 rtx tmp;
28523 rtx_code_label *jump_around_label = NULL;
28524 HOST_WIDE_INT align = 1;
28525 unsigned HOST_WIDE_INT count = 0;
28526 HOST_WIDE_INT expected_size = -1;
28527 int size_needed = 0, epilogue_size_needed;
28528 int desired_align = 0, align_bytes = 0;
28529 enum stringop_alg alg;
28530 rtx promoted_val = NULL;
28531 rtx vec_promoted_val = NULL;
28532 bool force_loopy_epilogue = false;
28533 int dynamic_check;
28534 bool need_zero_guard = false;
28535 bool noalign;
28536 machine_mode move_mode = VOIDmode;
28537 int unroll_factor = 1;
28538 /* TODO: Once value ranges are available, fill in proper data. */
28539 unsigned HOST_WIDE_INT min_size = 0;
28540 unsigned HOST_WIDE_INT max_size = -1;
28541 unsigned HOST_WIDE_INT probable_max_size = -1;
28542 bool misaligned_prologue_used = false;
28543 bool have_as;
28545 if (CONST_INT_P (align_exp))
28546 align = INTVAL (align_exp);
28547 /* i386 can do misaligned accesses at a reasonable increase in cost. */
28548 if (CONST_INT_P (expected_align_exp)
28549 && INTVAL (expected_align_exp) > align)
28550 align = INTVAL (expected_align_exp);
28551 /* ALIGN is the minimum of destination and source alignment, but we care here
28552 just about destination alignment. */
28553 else if (!issetmem
28554 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
28555 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
28557 if (CONST_INT_P (count_exp))
28559 min_size = max_size = probable_max_size = count = expected_size
28560 = INTVAL (count_exp);
28561 /* When COUNT is 0, there is nothing to do. */
28562 if (!count)
28563 return true;
28565 else
28567 if (min_size_exp)
28568 min_size = INTVAL (min_size_exp);
28569 if (max_size_exp)
28570 max_size = INTVAL (max_size_exp);
28571 if (probable_max_size_exp)
28572 probable_max_size = INTVAL (probable_max_size_exp);
28573 if (CONST_INT_P (expected_size_exp))
28574 expected_size = INTVAL (expected_size_exp);
28577 /* Make sure we don't need to care about overflow later on. */
28578 if (count > (HOST_WIDE_INT_1U << 30))
28579 return false;
28581 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
28582 if (!issetmem)
28583 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
28585 /* Step 0: Decide on preferred algorithm, desired alignment and
28586 size of chunks to be copied by main loop. */
28587 alg = decide_alg (count, expected_size, min_size, probable_max_size,
28588 issetmem,
28589 issetmem && val_exp == const0_rtx, have_as,
28590 &dynamic_check, &noalign, false);
28591 if (alg == libcall)
28592 return false;
28593 gcc_assert (alg != no_stringop);
28595 /* For now the vector version of memset is generated only for memory zeroing, as
28596 creating the promoted vector value is very cheap in this case. */
28597 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
28598 alg = unrolled_loop;
28600 if (!count)
28601 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
28602 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
28603 if (!issetmem)
28604 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
28606 unroll_factor = 1;
28607 move_mode = word_mode;
28608 switch (alg)
28610 case libcall:
28611 case no_stringop:
28612 case last_alg:
28613 gcc_unreachable ();
28614 case loop_1_byte:
28615 need_zero_guard = true;
28616 move_mode = QImode;
28617 break;
28618 case loop:
28619 need_zero_guard = true;
28620 break;
28621 case unrolled_loop:
28622 need_zero_guard = true;
28623 unroll_factor = (TARGET_64BIT ? 4 : 2);
28624 break;
28625 case vector_loop:
28626 need_zero_guard = true;
28627 unroll_factor = 4;
28628 /* Find the widest supported mode. */
28629 move_mode = word_mode;
28630 while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
28631 != CODE_FOR_nothing)
28632 move_mode = GET_MODE_WIDER_MODE (move_mode);
28634 /* Find the corresponding vector mode with the same size as MOVE_MODE.
28635 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
28636 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
28638 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
28639 move_mode = mode_for_vector (word_mode, nunits);
28640 if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
28641 move_mode = word_mode;
28643 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
28644 break;
28645 case rep_prefix_8_byte:
28646 move_mode = DImode;
28647 break;
28648 case rep_prefix_4_byte:
28649 move_mode = SImode;
28650 break;
28651 case rep_prefix_1_byte:
28652 move_mode = QImode;
28653 break;
28655 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
28656 epilogue_size_needed = size_needed;
28658 /* If we are going to make any library calls conditionally, make sure any
28659 pending stack adjustments happen before the first conditional branch;
28660 otherwise they will be emitted only before the library call and won't
28661 happen on the other branches. */
28662 if (dynamic_check != -1)
28663 do_pending_stack_adjust ();
28665 desired_align = decide_alignment (align, alg, expected_size, move_mode);
28666 if (!TARGET_ALIGN_STRINGOPS || noalign)
28667 align = desired_align;
28669 /* Step 1: Prologue guard. */
28671 /* Alignment code needs count to be in register. */
28672 if (CONST_INT_P (count_exp) && desired_align > align)
28674 if (INTVAL (count_exp) > desired_align
28675 && INTVAL (count_exp) > size_needed)
28677 align_bytes
28678 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
28679 if (align_bytes <= 0)
28680 align_bytes = 0;
28681 else
28682 align_bytes = desired_align - align_bytes;
28684 if (align_bytes == 0)
28685 count_exp = force_reg (counter_mode (count_exp), count_exp);
28687 gcc_assert (desired_align >= 1 && align >= 1);
28689 /* Misaligned move sequences handle both prologue and epilogue at once.
28690 Default code generation results in smaller code for large alignments
28691 and also avoids redundant work when sizes are known precisely. */
28692 misaligned_prologue_used
28693 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
28694 && MAX (desired_align, epilogue_size_needed) <= 32
28695 && desired_align <= epilogue_size_needed
28696 && ((desired_align > align && !align_bytes)
28697 || (!count && epilogue_size_needed > 1)));
28699 /* Do the cheap promotion to allow better CSE across the
28700 main loop and epilogue (i.e. one load of the big constant in
28701 front of all the code).
28702 For now the misaligned move sequences do not have a fast path
28703 without broadcasting. */
28704 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
28706 if (alg == vector_loop)
28708 gcc_assert (val_exp == const0_rtx);
28709 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
28710 promoted_val = promote_duplicated_reg_to_size (val_exp,
28711 GET_MODE_SIZE (word_mode),
28712 desired_align, align);
28714 else
28716 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
28717 desired_align, align);
28720 /* Misaligned move sequences handle both prologues and epilogues at once.
28721 Default code generation results in smaller code for large alignments and
28722 also avoids redundant work when sizes are known precisely. */
28723 if (misaligned_prologue_used)
28725 /* The misaligned move prologue handles small blocks by itself. */
28726 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
28727 (dst, src, &destreg, &srcreg,
28728 move_mode, promoted_val, vec_promoted_val,
28729 &count_exp,
28730 &jump_around_label,
28731 desired_align < align
28732 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
28733 desired_align, align, &min_size, dynamic_check, issetmem);
28734 if (!issetmem)
28735 src = change_address (src, BLKmode, srcreg);
28736 dst = change_address (dst, BLKmode, destreg);
28737 set_mem_align (dst, desired_align * BITS_PER_UNIT);
28738 epilogue_size_needed = 0;
28739 if (need_zero_guard
28740 && min_size < (unsigned HOST_WIDE_INT) size_needed)
28742 /* It is possible that we copied enough so the main loop will not
28743 execute. */
28744 gcc_assert (size_needed > 1);
28745 if (jump_around_label == NULL_RTX)
28746 jump_around_label = gen_label_rtx ();
28747 emit_cmp_and_jump_insns (count_exp,
28748 GEN_INT (size_needed),
28749 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
28750 if (expected_size == -1
28751 || expected_size < (desired_align - align) / 2 + size_needed)
28752 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28753 else
28754 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28757 /* Ensure that alignment prologue won't copy past end of block. */
28758 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
28760 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
28761 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
28762 Make sure it is a power of 2. */
28763 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
28765 /* To improve performance on small blocks, we jump around the VAL
28766 promoting code. This means that if the promoted VAL is not constant,
28767 we might not use it in the epilogue and have to use the byte
28768 loop variant. */
28769 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
28770 force_loopy_epilogue = true;
28771 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28772 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28774 /* If main algorithm works on QImode, no epilogue is needed.
28775 For small sizes just don't align anything. */
28776 if (size_needed == 1)
28777 desired_align = align;
28778 else
28779 goto epilogue;
28781 else if (!count
28782 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
28784 label = gen_label_rtx ();
28785 emit_cmp_and_jump_insns (count_exp,
28786 GEN_INT (epilogue_size_needed),
28787 LTU, 0, counter_mode (count_exp), 1, label);
28788 if (expected_size == -1 || expected_size < epilogue_size_needed)
28789 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28790 else
28791 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28795 /* Emit code to decide at runtime whether a library call or inline code should
28796 be used. */
28797 if (dynamic_check != -1)
28799 if (!issetmem && CONST_INT_P (count_exp))
28801 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
28803 emit_block_copy_via_libcall (dst, src, count_exp);
28804 count_exp = const0_rtx;
28805 goto epilogue;
28808 else
28810 rtx_code_label *hot_label = gen_label_rtx ();
28811 if (jump_around_label == NULL_RTX)
28812 jump_around_label = gen_label_rtx ();
28813 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
28814 LEU, 0, counter_mode (count_exp),
28815 1, hot_label);
28816 predict_jump (REG_BR_PROB_BASE * 90 / 100);
28817 if (issetmem)
28818 set_storage_via_libcall (dst, count_exp, val_exp);
28819 else
28820 emit_block_copy_via_libcall (dst, src, count_exp);
28821 emit_jump (jump_around_label);
28822 emit_label (hot_label);
28826 /* Step 2: Alignment prologue. */
28827 /* Do the expensive promotion once we branched off the small blocks. */
28828 if (issetmem && !promoted_val)
28829 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
28830 desired_align, align);
28832 if (desired_align > align && !misaligned_prologue_used)
28834 if (align_bytes == 0)
28836 /* Except for the first move in the prologue, we no longer know
28837 the constant offset in the aliasing info. It doesn't seem worth
28838 the pain to maintain it for the first move, so throw away
28839 the info early. */
28840 dst = change_address (dst, BLKmode, destreg);
28841 if (!issetmem)
28842 src = change_address (src, BLKmode, srcreg);
28843 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
28844 promoted_val, vec_promoted_val,
28845 count_exp, align, desired_align,
28846 issetmem);
28847 /* At most desired_align - align bytes are copied. */
28848 if (min_size < (unsigned)(desired_align - align))
28849 min_size = 0;
28850 else
28851 min_size -= desired_align - align;
28853 else
28855 /* If we know how many bytes need to be stored before dst is
28856 sufficiently aligned, maintain aliasing info accurately. */
28857 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
28858 srcreg,
28859 promoted_val,
28860 vec_promoted_val,
28861 desired_align,
28862 align_bytes,
28863 issetmem);
28865 count_exp = plus_constant (counter_mode (count_exp),
28866 count_exp, -align_bytes);
28867 count -= align_bytes;
28868 min_size -= align_bytes;
28869 max_size -= align_bytes;
28871 if (need_zero_guard
28872 && min_size < (unsigned HOST_WIDE_INT) size_needed
28873 && (count < (unsigned HOST_WIDE_INT) size_needed
28874 || (align_bytes == 0
28875 && count < ((unsigned HOST_WIDE_INT) size_needed
28876 + desired_align - align))))
28878 /* It is possible that we copied enough so the main loop will not
28879 execute. */
28880 gcc_assert (size_needed > 1);
28881 if (label == NULL_RTX)
28882 label = gen_label_rtx ();
28883 emit_cmp_and_jump_insns (count_exp,
28884 GEN_INT (size_needed),
28885 LTU, 0, counter_mode (count_exp), 1, label);
28886 if (expected_size == -1
28887 || expected_size < (desired_align - align) / 2 + size_needed)
28888 predict_jump (REG_BR_PROB_BASE * 20 / 100);
28889 else
28890 predict_jump (REG_BR_PROB_BASE * 60 / 100);
28893 if (label && size_needed == 1)
28895 emit_label (label);
28896 LABEL_NUSES (label) = 1;
28897 label = NULL;
28898 epilogue_size_needed = 1;
28899 if (issetmem)
28900 promoted_val = val_exp;
28902 else if (label == NULL_RTX && !misaligned_prologue_used)
28903 epilogue_size_needed = size_needed;
28905 /* Step 3: Main loop. */
28907 switch (alg)
28909 case libcall:
28910 case no_stringop:
28911 case last_alg:
28912 gcc_unreachable ();
28913 case loop_1_byte:
28914 case loop:
28915 case unrolled_loop:
28916 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
28917 count_exp, move_mode, unroll_factor,
28918 expected_size, issetmem);
28919 break;
28920 case vector_loop:
28921 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
28922 vec_promoted_val, count_exp, move_mode,
28923 unroll_factor, expected_size, issetmem);
28924 break;
28925 case rep_prefix_8_byte:
28926 case rep_prefix_4_byte:
28927 case rep_prefix_1_byte:
28928 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
28929 val_exp, count_exp, move_mode, issetmem);
28930 break;
28932 /* Properly adjust the offset of the src and dest memory for aliasing. */
28933 if (CONST_INT_P (count_exp))
28935 if (!issetmem)
28936 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
28937 (count / size_needed) * size_needed);
28938 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
28939 (count / size_needed) * size_needed);
28941 else
28943 if (!issetmem)
28944 src = change_address (src, BLKmode, srcreg);
28945 dst = change_address (dst, BLKmode, destreg);
28948 /* Step 4: Epilogue to copy the remaining bytes. */
28949 epilogue:
28950 if (label)
28952 /* When the main loop is done, COUNT_EXP might hold original count,
28953 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
28954 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
28955 bytes. Compensate if needed. */
28957 if (size_needed < epilogue_size_needed)
28959 tmp =
28960 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
28961 GEN_INT (size_needed - 1), count_exp, 1,
28962 OPTAB_DIRECT);
28963 if (tmp != count_exp)
28964 emit_move_insn (count_exp, tmp);
28966 emit_label (label);
28967 LABEL_NUSES (label) = 1;
28970 if (count_exp != const0_rtx && epilogue_size_needed > 1)
28972 if (force_loopy_epilogue)
28973 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
28974 epilogue_size_needed);
28975 else
28977 if (issetmem)
28978 expand_setmem_epilogue (dst, destreg, promoted_val,
28979 vec_promoted_val, count_exp,
28980 epilogue_size_needed);
28981 else
28982 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
28983 epilogue_size_needed);
28986 if (jump_around_label)
28987 emit_label (jump_around_label);
28988 return true;
28992 /* Expand the appropriate insns for doing strlen if not just doing
28993 repnz; scasb
28995 out = result, initialized with the start address
28996 align_rtx = alignment of the address.
28997 scratch = scratch register, initialized with the start address when
28998 not aligned, otherwise undefined
29000 This is just the body. It needs the initializations mentioned above and
29001 some address computation at the end. These things are done in i386.md. */
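/* An illustrative outline of the expansion built below (a sketch, not the
   literal output): compare up to 3 leading bytes one at a time until OUT is
   4-byte aligned, then loop loading 4 bytes (SImode) at a time and testing
   each word for a zero byte with the bit trick noted further down, and
   finally step OUT back to the exact position of the terminating zero.  */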
29003 static void
29004 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
29006 int align;
29007 rtx tmp;
29008 rtx_code_label *align_2_label = NULL;
29009 rtx_code_label *align_3_label = NULL;
29010 rtx_code_label *align_4_label = gen_label_rtx ();
29011 rtx_code_label *end_0_label = gen_label_rtx ();
29012 rtx mem;
29013 rtx tmpreg = gen_reg_rtx (SImode);
29014 rtx scratch = gen_reg_rtx (SImode);
29015 rtx cmp;
29017 align = 0;
29018 if (CONST_INT_P (align_rtx))
29019 align = INTVAL (align_rtx);
29021 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
29023 /* Is there a known alignment and is it less than 4? */
29024 if (align < 4)
29026 rtx scratch1 = gen_reg_rtx (Pmode);
29027 emit_move_insn (scratch1, out);
29028 /* Is there a known alignment and is it not 2? */
29029 if (align != 2)
29031 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
29032 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
29034 /* Leave just the 3 lower bits. */
29035 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
29036 NULL_RTX, 0, OPTAB_WIDEN);
29038 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
29039 Pmode, 1, align_4_label);
29040 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
29041 Pmode, 1, align_2_label);
29042 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
29043 Pmode, 1, align_3_label);
29045 else
29047 /* Since the alignment is 2, we have to check 2 or 0 bytes;
29048 check if it is aligned to a 4-byte boundary. */
29050 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
29051 NULL_RTX, 0, OPTAB_WIDEN);
29053 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
29054 Pmode, 1, align_4_label);
29057 mem = change_address (src, QImode, out);
29059 /* Now compare the bytes. */
29061 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
29062 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
29063 QImode, 1, end_0_label);
29065 /* Increment the address. */
29066 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
29068 /* Not needed with an alignment of 2 */
29069 if (align != 2)
29071 emit_label (align_2_label);
29073 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
29074 end_0_label);
29076 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
29078 emit_label (align_3_label);
29081 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
29082 end_0_label);
29084 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
29087 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
29088 align this loop; doing so only enlarges the code and does not
29089 speed it up. */
29090 emit_label (align_4_label);
29092 mem = change_address (src, SImode, out);
29093 emit_move_insn (scratch, mem);
29094 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
29096 /* This formula yields a nonzero result iff one of the bytes is zero.
29097 This saves three branches inside the loop and many cycles. */
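/* A worked example (illustration only): for the little-endian word
   X = 0xAABB00CC, whose byte 1 is zero,
       (X - 0x01010101) & ~X & 0x80808080
     = 0xA9B9FFCB & 0x5544FF33 & 0x80808080
     = 0x00008000,
   which is nonzero, with the set bit marking the zero byte.  For a word with
   no zero byte, e.g. X = 0x11223344, the expression evaluates to 0.  */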
29099 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
29100 emit_insn (gen_one_cmplsi2 (scratch, scratch));
29101 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
29102 emit_insn (gen_andsi3 (tmpreg, tmpreg,
29103 gen_int_mode (0x80808080, SImode)));
29104 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
29105 align_4_label);
29107 if (TARGET_CMOVE)
29109 rtx reg = gen_reg_rtx (SImode);
29110 rtx reg2 = gen_reg_rtx (Pmode);
29111 emit_move_insn (reg, tmpreg);
29112 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
29114 /* If zero is not in the first two bytes, move two bytes forward. */
29115 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
29116 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
29117 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
29118 emit_insn (gen_rtx_SET (tmpreg,
29119 gen_rtx_IF_THEN_ELSE (SImode, tmp,
29120 reg,
29121 tmpreg)));
29122 /* Emit lea manually to avoid clobbering of flags. */
29123 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
29125 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
29126 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
29127 emit_insn (gen_rtx_SET (out,
29128 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
29129 reg2,
29130 out)));
29132 else
29134 rtx_code_label *end_2_label = gen_label_rtx ();
29135 /* Is zero in the first two bytes? */
29137 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
29138 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
29139 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
29140 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
29141 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
29142 pc_rtx);
29143 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
29144 JUMP_LABEL (tmp) = end_2_label;
29146 /* Not in the first two. Move two bytes forward. */
29147 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
29148 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
29150 emit_label (end_2_label);
29154 /* Avoid branch in fixing the byte. */
29155 tmpreg = gen_lowpart (QImode, tmpreg);
29156 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
29157 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
29158 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
29159 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
29161 emit_label (end_0_label);
29164 /* Expand strlen. */
29166 bool
29167 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
29169 rtx addr, scratch1, scratch2, scratch3, scratch4;
29171 /* The generic case of the strlen expander is long. Avoid expanding it
29172 unless TARGET_INLINE_ALL_STRINGOPS. */
29174 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
29175 && !TARGET_INLINE_ALL_STRINGOPS
29176 && !optimize_insn_for_size_p ()
29177 && (!CONST_INT_P (align) || INTVAL (align) < 4))
29178 return false;
29180 addr = force_reg (Pmode, XEXP (src, 0));
29181 scratch1 = gen_reg_rtx (Pmode);
29183 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
29184 && !optimize_insn_for_size_p ())
29186 /* Well, it seems that some optimizer does not combine a call like
29187 foo(strlen(bar), strlen(bar));
29188 when the move and the subtraction are done here. It does calculate
29189 the length just once when these instructions are done inside of
29190 output_strlen_unroll(). But I think that since &bar[strlen(bar)] is
29191 often used, and one fewer register is live for the lifetime of
29192 output_strlen_unroll(), this is better. */
29194 emit_move_insn (out, addr);
29196 ix86_expand_strlensi_unroll_1 (out, src, align);
29198 /* strlensi_unroll_1 returns the address of the zero at the end of
29199 the string, like memchr(), so compute the length by subtracting
29200 the start address. */
29201 emit_insn (ix86_gen_sub3 (out, out, addr));
29203 else
29205 rtx unspec;
29207 /* Can't use this if the user has appropriated eax, ecx, or edi. */
29208 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
29209 return false;
29210 /* Can't use this for non-default address spaces. */
29211 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
29212 return false;
29214 scratch2 = gen_reg_rtx (Pmode);
29215 scratch3 = gen_reg_rtx (Pmode);
29216 scratch4 = force_reg (Pmode, constm1_rtx);
29218 emit_move_insn (scratch3, addr);
29219 eoschar = force_reg (QImode, eoschar);
29221 src = replace_equiv_address_nv (src, scratch3);
29223 /* If .md starts supporting :P, this can be done in .md. */
29224 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
29225 scratch4), UNSPEC_SCAS);
29226 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
29227 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
29228 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
29230 return true;
29233 /* For a given symbol (function), construct code to compute the address of its
29234 PLT entry in the large x86-64 PIC model. */
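/* Illustrative only: the sequence built below amounts to something like

       movabsq $symbol@PLTOFF, %tmp
       addq    %pic_reg, %tmp

   i.e. the PLT entry address is the PIC base plus the symbol's PLTOFF
   displacement (the register names here are placeholders, not the actual
   allocation).  */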
29235 static rtx
29236 construct_plt_address (rtx symbol)
29238 rtx tmp, unspec;
29240 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
29241 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
29242 gcc_assert (Pmode == DImode);
29244 tmp = gen_reg_rtx (Pmode);
29245 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
29247 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
29248 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
29249 return tmp;
29253 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
29254 rtx callarg2,
29255 rtx pop, bool sibcall)
29257 rtx vec[3];
29258 rtx use = NULL, call;
29259 unsigned int vec_len = 0;
29260 tree fndecl;
29262 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
29264 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
29265 if (fndecl
29266 && (lookup_attribute ("interrupt",
29267 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
29268 error ("interrupt service routine can't be called directly");
29270 else
29271 fndecl = NULL_TREE;
29273 if (pop == const0_rtx)
29274 pop = NULL;
29275 gcc_assert (!TARGET_64BIT || !pop);
29277 if (TARGET_MACHO && !TARGET_64BIT)
29279 #if TARGET_MACHO
29280 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
29281 fnaddr = machopic_indirect_call_target (fnaddr);
29282 #endif
29284 else
29286 /* Static functions and indirect calls don't need the pic register. Also,
29287 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
29288 it an indirect call. */
29289 rtx addr = XEXP (fnaddr, 0);
29290 if (flag_pic
29291 && GET_CODE (addr) == SYMBOL_REF
29292 && !SYMBOL_REF_LOCAL_P (addr))
29294 if (flag_plt
29295 && (SYMBOL_REF_DECL (addr) == NULL_TREE
29296 || !lookup_attribute ("noplt",
29297 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
29299 if (!TARGET_64BIT
29300 || (ix86_cmodel == CM_LARGE_PIC
29301 && DEFAULT_ABI != MS_ABI))
29303 use_reg (&use, gen_rtx_REG (Pmode,
29304 REAL_PIC_OFFSET_TABLE_REGNUM));
29305 if (ix86_use_pseudo_pic_reg ())
29306 emit_move_insn (gen_rtx_REG (Pmode,
29307 REAL_PIC_OFFSET_TABLE_REGNUM),
29308 pic_offset_table_rtx);
29311 else if (!TARGET_PECOFF && !TARGET_MACHO)
29313 if (TARGET_64BIT)
29315 fnaddr = gen_rtx_UNSPEC (Pmode,
29316 gen_rtvec (1, addr),
29317 UNSPEC_GOTPCREL);
29318 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
29320 else
29322 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
29323 UNSPEC_GOT);
29324 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
29325 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
29326 fnaddr);
29328 fnaddr = gen_const_mem (Pmode, fnaddr);
29329 /* Pmode may not be the same as word_mode for x32, which
29330 doesn't support indirect branch via 32-bit memory slot.
29331 Since x32 GOT slot is 64 bit with zero upper 32 bits,
29332 indirect branch via x32 GOT slot is OK. */
29333 if (GET_MODE (fnaddr) != word_mode)
29334 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
29335 fnaddr = gen_rtx_MEM (QImode, fnaddr);
29340 /* Skip setting up RAX register for -mskip-rax-setup when there are no
29341 parameters passed in vector registers. */
29342 if (TARGET_64BIT
29343 && (INTVAL (callarg2) > 0
29344 || (INTVAL (callarg2) == 0
29345 && (TARGET_SSE || !flag_skip_rax_setup))))
29347 rtx al = gen_rtx_REG (QImode, AX_REG);
29348 emit_move_insn (al, callarg2);
29349 use_reg (&use, al);
29352 if (ix86_cmodel == CM_LARGE_PIC
29353 && !TARGET_PECOFF
29354 && MEM_P (fnaddr)
29355 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
29356 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
29357 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
29358 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
29359 branch via x32 GOT slot is OK. */
29360 else if (!(TARGET_X32
29361 && MEM_P (fnaddr)
29362 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
29363 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
29364 && (sibcall
29365 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
29366 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
29368 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
29369 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
29372 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
29374 if (retval)
29376 /* We should add the bound registers as destinations in case
29377 a pointer with bounds may be returned. */
29378 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
29380 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
29381 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
29382 if (GET_CODE (retval) == PARALLEL)
29384 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
29385 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
29386 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
29387 retval = chkp_join_splitted_slot (retval, par);
29389 else
29391 retval = gen_rtx_PARALLEL (VOIDmode,
29392 gen_rtvec (3, retval, b0, b1));
29393 chkp_put_regs_to_expr_list (retval);
29397 call = gen_rtx_SET (retval, call);
29399 vec[vec_len++] = call;
29401 if (pop)
29403 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
29404 pop = gen_rtx_SET (stack_pointer_rtx, pop);
29405 vec[vec_len++] = pop;
29408 if (cfun->machine->no_caller_saved_registers
29409 && (!fndecl
29410 || (!TREE_THIS_VOLATILE (fndecl)
29411 && !lookup_attribute ("no_caller_saved_registers",
29412 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
29414 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
29415 bool is_64bit_ms_abi = (TARGET_64BIT
29416 && ix86_function_abi (fndecl) == MS_ABI);
29417 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
29419 /* If there are no caller-saved registers, add all registers
29420 that are clobbered by the call which returns. */
29421 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
29422 if (!fixed_regs[i]
29423 && (ix86_call_used_regs[i] == 1
29424 || (ix86_call_used_regs[i] & c_mask))
29425 && !STACK_REGNO_P (i)
29426 && !MMX_REGNO_P (i))
29427 clobber_reg (&use,
29428 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
29430 else if (TARGET_64BIT_MS_ABI
29431 && (!callarg2 || INTVAL (callarg2) != -2))
29433 unsigned i;
29435 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
29437 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
29438 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
29440 clobber_reg (&use, gen_rtx_REG (mode, regno));
29443 /* Set here, but it may get cleared later. */
29444 if (TARGET_CALL_MS2SYSV_XLOGUES)
29446 if (!TARGET_SSE)
29449 /* Don't break hot-patched functions. */
29450 else if (ix86_function_ms_hook_prologue (current_function_decl))
29453 /* TODO: Cases not yet examined. */
29454 else if (flag_split_stack)
29455 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
29457 else
29459 gcc_assert (!reload_completed);
29460 cfun->machine->call_ms2sysv = true;
29465 if (vec_len > 1)
29466 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
29467 call = emit_call_insn (call);
29468 if (use)
29469 CALL_INSN_FUNCTION_USAGE (call) = use;
29471 return call;
29474 /* Return true if the function being called was marked with the attribute
29475 "noplt", or -fno-plt is in use, and we are compiling for non-PIC. We need
29476 to handle the non-PIC case in the backend because there is no easy
29477 interface for the front-end to force non-PLT calls to use the GOT.
29478 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
29479 to call functions marked "noplt" indirectly. */
29481 static bool
29482 ix86_nopic_noplt_attribute_p (rtx call_op)
29484 if (flag_pic || ix86_cmodel == CM_LARGE
29485 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
29486 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
29487 || SYMBOL_REF_LOCAL_P (call_op))
29488 return false;
29490 tree symbol_decl = SYMBOL_REF_DECL (call_op);
29492 if (!flag_plt
29493 || (symbol_decl != NULL_TREE
29494 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
29495 return true;
29497 return false;
29500 /* Output the assembly for a call instruction. */
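/* For illustration (AT&T syntax, assuming a 64-bit non-PIC "noplt" callee):
   the GOTPCREL templates below come out roughly as

       jmp *foo@GOTPCREL(%rip)      or      call *foo@GOTPCREL(%rip)

   i.e. an indirect branch through the callee's GOT slot instead of a PLT
   stub, while the ordinary direct case is simply "jmp foo" / "call foo".  */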
29502 const char *
29503 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
29505 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
29506 bool seh_nop_p = false;
29507 const char *xasm;
29509 if (SIBLING_CALL_P (insn))
29511 if (direct_p)
29513 if (ix86_nopic_noplt_attribute_p (call_op))
29515 if (TARGET_64BIT)
29516 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29517 else
29518 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29520 else
29521 xasm = "%!jmp\t%P0";
29523 /* SEH epilogue detection requires the indirect branch case
29524 to include REX.W. */
29525 else if (TARGET_SEH)
29526 xasm = "%!rex.W jmp\t%A0";
29527 else
29528 xasm = "%!jmp\t%A0";
29530 output_asm_insn (xasm, &call_op);
29531 return "";
29534 /* SEH unwinding can require an extra nop to be emitted in several
29535 circumstances. Determine if we have one of those. */
29536 if (TARGET_SEH)
29538 rtx_insn *i;
29540 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
29542 /* If we get to another real insn, we don't need the nop. */
29543 if (INSN_P (i))
29544 break;
29546 /* If we get to the epilogue note, prevent a catch region from
29547 being adjacent to the standard epilogue sequence. If non-
29548 call-exceptions, we'll have done this during epilogue emission. */
29549 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
29550 && !flag_non_call_exceptions
29551 && !can_throw_internal (insn))
29553 seh_nop_p = true;
29554 break;
29558 /* If we didn't find a real insn following the call, prevent the
29559 unwinder from looking into the next function. */
29560 if (i == NULL)
29561 seh_nop_p = true;
29564 if (direct_p)
29566 if (ix86_nopic_noplt_attribute_p (call_op))
29568 if (TARGET_64BIT)
29569 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29570 else
29571 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29573 else
29574 xasm = "%!call\t%P0";
29576 else
29577 xasm = "%!call\t%A0";
29579 output_asm_insn (xasm, &call_op);
29581 if (seh_nop_p)
29582 return "nop";
29584 return "";
29587 /* Clear stack slot assignments remembered from previous functions.
29588 This is called from INIT_EXPANDERS once before RTL is emitted for each
29589 function. */
29591 static struct machine_function *
29592 ix86_init_machine_status (void)
29594 struct machine_function *f;
29596 f = ggc_cleared_alloc<machine_function> ();
29597 f->call_abi = ix86_abi;
29599 return f;
29602 /* Return a MEM corresponding to a stack slot with mode MODE.
29603 Allocate a new slot if necessary.
29605 The RTL for a function can have several slots available: N is
29606 which slot to use. */
29609 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
29611 struct stack_local_entry *s;
29613 gcc_assert (n < MAX_386_STACK_LOCALS);
29615 for (s = ix86_stack_locals; s; s = s->next)
29616 if (s->mode == mode && s->n == n)
29617 return validize_mem (copy_rtx (s->rtl));
29619 s = ggc_alloc<stack_local_entry> ();
29620 s->n = n;
29621 s->mode = mode;
29622 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
29624 s->next = ix86_stack_locals;
29625 ix86_stack_locals = s;
29626 return validize_mem (copy_rtx (s->rtl));
29629 static void
29630 ix86_instantiate_decls (void)
29632 struct stack_local_entry *s;
29634 for (s = ix86_stack_locals; s; s = s->next)
29635 if (s->rtl != NULL_RTX)
29636 instantiate_decl_rtl (s->rtl);
29639 /* Return the number used for encoding REG, in the range 0..7. */
29641 static int
29642 reg_encoded_number (rtx reg)
29644 unsigned regno = REGNO (reg);
29645 switch (regno)
29647 case AX_REG:
29648 return 0;
29649 case CX_REG:
29650 return 1;
29651 case DX_REG:
29652 return 2;
29653 case BX_REG:
29654 return 3;
29655 case SP_REG:
29656 return 4;
29657 case BP_REG:
29658 return 5;
29659 case SI_REG:
29660 return 6;
29661 case DI_REG:
29662 return 7;
29663 default:
29664 break;
29666 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
29667 return regno - FIRST_STACK_REG;
29668 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
29669 return regno - FIRST_SSE_REG;
29670 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
29671 return regno - FIRST_MMX_REG;
29672 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
29673 return regno - FIRST_REX_SSE_REG;
29674 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
29675 return regno - FIRST_REX_INT_REG;
29676 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
29677 return regno - FIRST_MASK_REG;
29678 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
29679 return regno - FIRST_BND_REG;
29680 return -1;
29683 /* Given an insn INSN with NOPERANDS OPERANDS, return the modr/m byte used
29684 in its encoding if it could be relevant for ROP mitigation, otherwise
29685 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
29686 used for calculating it into them. */
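/* Illustration: the value returned below follows the usual modr/m layout
   (mod << 6) | (reg << 3) | rm with mod = 3 for register-register forms, so
   e.g. with operand 1 in %ecx (encoding 1) and operand 0 in %eax (encoding 0)
   the result is 0xc0 + (1 << 3) + 0 = 0xc8.  */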
29688 static int
29689 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
29690 int *popno0 = 0, int *popno1 = 0)
29692 if (asm_noperands (PATTERN (insn)) >= 0)
29693 return -1;
29694 int has_modrm = get_attr_modrm (insn);
29695 if (!has_modrm)
29696 return -1;
29697 enum attr_modrm_class cls = get_attr_modrm_class (insn);
29698 rtx op0, op1;
29699 switch (cls)
29701 case MODRM_CLASS_OP02:
29702 gcc_assert (noperands >= 3);
29703 if (popno0)
29705 *popno0 = 0;
29706 *popno1 = 2;
29708 op0 = operands[0];
29709 op1 = operands[2];
29710 break;
29711 case MODRM_CLASS_OP01:
29712 gcc_assert (noperands >= 2);
29713 if (popno0)
29715 *popno0 = 0;
29716 *popno1 = 1;
29718 op0 = operands[0];
29719 op1 = operands[1];
29720 break;
29721 default:
29722 return -1;
29724 if (REG_P (op0) && REG_P (op1))
29726 int enc0 = reg_encoded_number (op0);
29727 int enc1 = reg_encoded_number (op1);
29728 return 0xc0 + (enc1 << 3) + enc0;
29730 return -1;
29733 /* Check whether x86 address PARTS is a pc-relative address. */
29735 static bool
29736 rip_relative_addr_p (struct ix86_address *parts)
29738 rtx base, index, disp;
29740 base = parts->base;
29741 index = parts->index;
29742 disp = parts->disp;
29744 if (disp && !base && !index)
29746 if (TARGET_64BIT)
29748 rtx symbol = disp;
29750 if (GET_CODE (disp) == CONST)
29751 symbol = XEXP (disp, 0);
29752 if (GET_CODE (symbol) == PLUS
29753 && CONST_INT_P (XEXP (symbol, 1)))
29754 symbol = XEXP (symbol, 0);
29756 if (GET_CODE (symbol) == LABEL_REF
29757 || (GET_CODE (symbol) == SYMBOL_REF
29758 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
29759 || (GET_CODE (symbol) == UNSPEC
29760 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
29761 || XINT (symbol, 1) == UNSPEC_PCREL
29762 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
29763 return true;
29766 return false;
29769 /* Calculate the length of the memory address in the instruction encoding.
29770 This includes the addr32 prefix but not the one-byte modrm, opcode,
29771 or other prefixes. We never generate the addr32 prefix for an LEA insn. */
29774 memory_address_length (rtx addr, bool lea)
29776 struct ix86_address parts;
29777 rtx base, index, disp;
29778 int len;
29779 int ok;
29781 if (GET_CODE (addr) == PRE_DEC
29782 || GET_CODE (addr) == POST_INC
29783 || GET_CODE (addr) == PRE_MODIFY
29784 || GET_CODE (addr) == POST_MODIFY)
29785 return 0;
29787 ok = ix86_decompose_address (addr, &parts);
29788 gcc_assert (ok);
29790 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
29792 /* If this is not LEA instruction, add the length of addr32 prefix. */
29793 if (TARGET_64BIT && !lea
29794 && (SImode_address_operand (addr, VOIDmode)
29795 || (parts.base && GET_MODE (parts.base) == SImode)
29796 || (parts.index && GET_MODE (parts.index) == SImode)))
29797 len++;
29799 base = parts.base;
29800 index = parts.index;
29801 disp = parts.disp;
29803 if (base && SUBREG_P (base))
29804 base = SUBREG_REG (base);
29805 if (index && SUBREG_P (index))
29806 index = SUBREG_REG (index);
29808 gcc_assert (base == NULL_RTX || REG_P (base));
29809 gcc_assert (index == NULL_RTX || REG_P (index));
29811 /* Rule of thumb:
29812 - esp as the base always wants an index,
29813 - ebp as the base always wants a displacement,
29814 - r12 as the base always wants an index,
29815 - r13 as the base always wants a displacement. */
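/* A few illustrative byte counts returned below (on top of the opcode and
   modrm byte): (%ecx) adds 0; (%esp) adds 1 for the SIB byte; (%ebp) adds 1
   for the mandatory disp8; 8(%eax) adds 1 (disp8); 0x1234(%eax,%ebx) adds
   1 (SIB) + 4 (disp32); and a bare 32-bit absolute address adds 4 + 1 in
   64-bit mode, since avoiding the RIP-relative encoding requires a SIB
   byte.  */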
29817 /* Register Indirect. */
29818 if (base && !index && !disp)
29820 /* esp (for its index) and ebp (for its displacement) need
29821 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
29822 code. */
29823 if (base == arg_pointer_rtx
29824 || base == frame_pointer_rtx
29825 || REGNO (base) == SP_REG
29826 || REGNO (base) == BP_REG
29827 || REGNO (base) == R12_REG
29828 || REGNO (base) == R13_REG)
29829 len++;
29832 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
29833 is not disp32, but disp32(%rip), so for disp32
29834 SIB byte is needed, unless print_operand_address
29835 optimizes it into disp32(%rip) or (%rip) is implied
29836 by UNSPEC. */
29837 else if (disp && !base && !index)
29839 len += 4;
29840 if (!rip_relative_addr_p (&parts))
29841 len++;
29843 else
29845 /* Find the length of the displacement constant. */
29846 if (disp)
29848 if (base && satisfies_constraint_K (disp))
29849 len += 1;
29850 else
29851 len += 4;
29853 /* ebp always wants a displacement. Similarly r13. */
29854 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
29855 len++;
29857 /* An index requires the two-byte modrm form.... */
29858 if (index
29859 /* ...like esp (or r12), which always wants an index. */
29860 || base == arg_pointer_rtx
29861 || base == frame_pointer_rtx
29862 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
29863 len++;
29866 return len;
29869 /* Compute default value for "length_immediate" attribute. When SHORTFORM
29870 is set, expect that the insn has an 8-bit immediate alternative. */
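/* For illustration: with SHORTFORM set, an insn like "add $3, %eax" counts a
   1-byte immediate because 3 fits in the signed 8-bit range, while
   "add $1000, %eax" counts 4 bytes; DImode immediates also count 4 bytes,
   since they are encoded sign-extended from 32 bits.  */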
29872 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
29874 int len = 0;
29875 int i;
29876 extract_insn_cached (insn);
29877 for (i = recog_data.n_operands - 1; i >= 0; --i)
29878 if (CONSTANT_P (recog_data.operand[i]))
29880 enum attr_mode mode = get_attr_mode (insn);
29882 gcc_assert (!len);
29883 if (shortform && CONST_INT_P (recog_data.operand[i]))
29885 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
29886 switch (mode)
29888 case MODE_QI:
29889 len = 1;
29890 continue;
29891 case MODE_HI:
29892 ival = trunc_int_for_mode (ival, HImode);
29893 break;
29894 case MODE_SI:
29895 ival = trunc_int_for_mode (ival, SImode);
29896 break;
29897 default:
29898 break;
29900 if (IN_RANGE (ival, -128, 127))
29902 len = 1;
29903 continue;
29906 switch (mode)
29908 case MODE_QI:
29909 len = 1;
29910 break;
29911 case MODE_HI:
29912 len = 2;
29913 break;
29914 case MODE_SI:
29915 len = 4;
29916 break;
29917 /* Immediates for DImode instructions are encoded
29918 as 32bit sign extended values. */
29919 case MODE_DI:
29920 len = 4;
29921 break;
29922 default:
29923 fatal_insn ("unknown insn mode", insn);
29926 return len;
29929 /* Compute default value for "length_address" attribute. */
29931 ix86_attr_length_address_default (rtx_insn *insn)
29933 int i;
29935 if (get_attr_type (insn) == TYPE_LEA)
29937 rtx set = PATTERN (insn), addr;
29939 if (GET_CODE (set) == PARALLEL)
29940 set = XVECEXP (set, 0, 0);
29942 gcc_assert (GET_CODE (set) == SET);
29944 addr = SET_SRC (set);
29946 return memory_address_length (addr, true);
29949 extract_insn_cached (insn);
29950 for (i = recog_data.n_operands - 1; i >= 0; --i)
29952 rtx op = recog_data.operand[i];
29953 if (MEM_P (op))
29955 constrain_operands_cached (insn, reload_completed);
29956 if (which_alternative != -1)
29958 const char *constraints = recog_data.constraints[i];
29959 int alt = which_alternative;
29961 while (*constraints == '=' || *constraints == '+')
29962 constraints++;
29963 while (alt-- > 0)
29964 while (*constraints++ != ',')
29966 /* Skip ignored operands. */
29967 if (*constraints == 'X')
29968 continue;
29971 int len = memory_address_length (XEXP (op, 0), false);
29973 /* Account for segment prefix for non-default addr spaces. */
29974 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
29975 len++;
29977 return len;
29980 return 0;
29983 /* Compute default value for "length_vex" attribute. It includes
29984 2 or 3 byte VEX prefix and 1 opcode byte. */
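/* For illustration: a VEX insn with a 0f opcode, no VEX.W, no DImode general
   register operand and no extended register in a memory address, e.g.
   vaddps %xmm2, %xmm1, %xmm0, can use the 2-byte (C5) prefix, giving
   2 + 1 = 3; a non-0f opcode, VEX.W, a DImode general register operand, or an
   extended register mentioned in a memory operand forces the 3-byte (C4)
   form, giving 3 + 1 = 4.  */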
29987 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
29988 bool has_vex_w)
29990 int i;
29992 /* Only the 0f opcode can use the 2-byte VEX prefix, and the VEX.W bit
29993 requires the 3-byte VEX prefix. */
29994 if (!has_0f_opcode || has_vex_w)
29995 return 3 + 1;
29997 /* We can always use 2 byte VEX prefix in 32bit. */
29998 if (!TARGET_64BIT)
29999 return 2 + 1;
30001 extract_insn_cached (insn);
30003 for (i = recog_data.n_operands - 1; i >= 0; --i)
30004 if (REG_P (recog_data.operand[i]))
30006 /* REX.W bit uses 3 byte VEX prefix. */
30007 if (GET_MODE (recog_data.operand[i]) == DImode
30008 && GENERAL_REG_P (recog_data.operand[i]))
30009 return 3 + 1;
30011 else
30013 /* REX.X or REX.B bits use 3 byte VEX prefix. */
30014 if (MEM_P (recog_data.operand[i])
30015 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
30016 return 3 + 1;
30019 return 2 + 1;
30022 /* Return the maximum number of instructions a cpu can issue. */
30024 static int
30025 ix86_issue_rate (void)
30027 switch (ix86_tune)
30029 case PROCESSOR_PENTIUM:
30030 case PROCESSOR_LAKEMONT:
30031 case PROCESSOR_BONNELL:
30032 case PROCESSOR_SILVERMONT:
30033 case PROCESSOR_KNL:
30034 case PROCESSOR_INTEL:
30035 case PROCESSOR_K6:
30036 case PROCESSOR_BTVER2:
30037 case PROCESSOR_PENTIUM4:
30038 case PROCESSOR_NOCONA:
30039 return 2;
30041 case PROCESSOR_PENTIUMPRO:
30042 case PROCESSOR_ATHLON:
30043 case PROCESSOR_K8:
30044 case PROCESSOR_AMDFAM10:
30045 case PROCESSOR_GENERIC:
30046 case PROCESSOR_BTVER1:
30047 return 3;
30049 case PROCESSOR_BDVER1:
30050 case PROCESSOR_BDVER2:
30051 case PROCESSOR_BDVER3:
30052 case PROCESSOR_BDVER4:
30053 case PROCESSOR_ZNVER1:
30054 case PROCESSOR_CORE2:
30055 case PROCESSOR_NEHALEM:
30056 case PROCESSOR_SANDYBRIDGE:
30057 case PROCESSOR_HASWELL:
30058 return 4;
30060 default:
30061 return 1;
30065 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags set
30066 by DEP_INSN and nothing else set by DEP_INSN. */
30068 static bool
30069 ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
30071 rtx set, set2;
30073 /* Simplify the test for uninteresting insns. */
30074 if (insn_type != TYPE_SETCC
30075 && insn_type != TYPE_ICMOV
30076 && insn_type != TYPE_FCMOV
30077 && insn_type != TYPE_IBR)
30078 return false;
30080 if ((set = single_set (dep_insn)) != 0)
30082 set = SET_DEST (set);
30083 set2 = NULL_RTX;
30085 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
30086 && XVECLEN (PATTERN (dep_insn), 0) == 2
30087 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
30088 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
30090 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
30091 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
30093 else
30094 return false;
30096 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
30097 return false;
30099 /* This test is true if the dependent insn reads the flags but
30100 not any other potentially set register. */
30101 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
30102 return false;
30104 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
30105 return false;
30107 return true;
30110 /* Return true iff USE_INSN has a memory address with operands set by
30111 SET_INSN. */
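/* Illustration: for SET_INSN "addl $4, %esi" followed by USE_INSN
   "movl (%esi), %eax" this returns true, since the load's address register is
   modified by the previous insn; a push or pop adjusting %esp followed by an
   %esp-relative access is explicitly exempted below.  */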
30113 bool
30114 ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
30116 int i;
30117 extract_insn_cached (use_insn);
30118 for (i = recog_data.n_operands - 1; i >= 0; --i)
30119 if (MEM_P (recog_data.operand[i]))
30121 rtx addr = XEXP (recog_data.operand[i], 0);
30122 if (modified_in_p (addr, set_insn) != 0)
30124 /* No AGI stall if SET_INSN is a push or pop and USE_INSN
30125 has SP based memory (unless index reg is modified in a pop). */
30126 rtx set = single_set (set_insn);
30127 if (set
30128 && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
30129 || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
30131 struct ix86_address parts;
30132 if (ix86_decompose_address (addr, &parts)
30133 && parts.base == stack_pointer_rtx
30134 && (parts.index == NULL_RTX
30135 || MEM_P (SET_DEST (set))
30136 || !modified_in_p (parts.index, set_insn)))
30137 return false;
30139 return true;
30141 return false;
30143 return false;
30146 /* Helper function for exact_store_load_dependency.
30147 Return true if addr is found in insn. */
30148 static bool
30149 exact_dependency_1 (rtx addr, rtx insn)
30151 enum rtx_code code;
30152 const char *format_ptr;
30153 int i, j;
30155 code = GET_CODE (insn);
30156 switch (code)
30158 case MEM:
30159 if (rtx_equal_p (addr, insn))
30160 return true;
30161 break;
30162 case REG:
30163 CASE_CONST_ANY:
30164 case SYMBOL_REF:
30165 case CODE_LABEL:
30166 case PC:
30167 case CC0:
30168 case EXPR_LIST:
30169 return false;
30170 default:
30171 break;
30174 format_ptr = GET_RTX_FORMAT (code);
30175 for (i = 0; i < GET_RTX_LENGTH (code); i++)
30177 switch (*format_ptr++)
30179 case 'e':
30180 if (exact_dependency_1 (addr, XEXP (insn, i)))
30181 return true;
30182 break;
30183 case 'E':
30184 for (j = 0; j < XVECLEN (insn, i); j++)
30185 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
30186 return true;
30187 break;
30190 return false;
30193 /* Return true if there exists an exact dependency between the store and the
30194 load, i.e. the same memory address is used in both. */
30195 static bool
30196 exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
30198 rtx set1, set2;
30200 set1 = single_set (store);
30201 if (!set1)
30202 return false;
30203 if (!MEM_P (SET_DEST (set1)))
30204 return false;
30205 set2 = single_set (load);
30206 if (!set2)
30207 return false;
30208 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
30209 return true;
30210 return false;
30213 static int
30214 ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
30215 unsigned int)
30217 enum attr_type insn_type, dep_insn_type;
30218 enum attr_memory memory;
30219 rtx set, set2;
30220 int dep_insn_code_number;
30222 /* Anti and output dependencies have zero cost on all CPUs. */
30223 if (dep_type != 0)
30224 return 0;
30226 dep_insn_code_number = recog_memoized (dep_insn);
30228 /* If we can't recognize the insns, we can't really do anything. */
30229 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
30230 return cost;
30232 insn_type = get_attr_type (insn);
30233 dep_insn_type = get_attr_type (dep_insn);
30235 switch (ix86_tune)
30237 case PROCESSOR_PENTIUM:
30238 case PROCESSOR_LAKEMONT:
30239 /* Address Generation Interlock adds a cycle of latency. */
30240 if (insn_type == TYPE_LEA)
30242 rtx addr = PATTERN (insn);
30244 if (GET_CODE (addr) == PARALLEL)
30245 addr = XVECEXP (addr, 0, 0);
30247 gcc_assert (GET_CODE (addr) == SET);
30249 addr = SET_SRC (addr);
30250 if (modified_in_p (addr, dep_insn))
30251 cost += 1;
30253 else if (ix86_agi_dependent (dep_insn, insn))
30254 cost += 1;
30256 /* ??? Compares pair with jump/setcc. */
30257 if (ix86_flags_dependent (insn, dep_insn, insn_type))
30258 cost = 0;
30260 /* Floating point stores require the value to be ready one cycle earlier. */
30261 if (insn_type == TYPE_FMOV
30262 && get_attr_memory (insn) == MEMORY_STORE
30263 && !ix86_agi_dependent (dep_insn, insn))
30264 cost += 1;
30265 break;
30267 case PROCESSOR_PENTIUMPRO:
30268 /* INT->FP conversion is expensive. */
30269 if (get_attr_fp_int_src (dep_insn))
30270 cost += 5;
30272 /* There is one cycle extra latency between an FP op and a store. */
30273 if (insn_type == TYPE_FMOV
30274 && (set = single_set (dep_insn)) != NULL_RTX
30275 && (set2 = single_set (insn)) != NULL_RTX
30276 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
30277 && MEM_P (SET_DEST (set2)))
30278 cost += 1;
30280 memory = get_attr_memory (insn);
30282 /* Model the ability of the reorder buffer to hide the latency of a load
30283 by executing it in parallel with the previous instruction, provided the
30284 previous instruction is not needed to compute the address. */
30285 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30286 && !ix86_agi_dependent (dep_insn, insn))
30288 /* Claim moves take one cycle, as the core can issue one load
30289 at a time and the next load can start a cycle later. */
30290 if (dep_insn_type == TYPE_IMOV
30291 || dep_insn_type == TYPE_FMOV)
30292 cost = 1;
30293 else if (cost > 1)
30294 cost--;
30296 break;
30298 case PROCESSOR_K6:
30299 /* The esp dependency is resolved before
30300 the instruction is really finished. */
30301 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
30302 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
30303 return 1;
30305 /* INT->FP conversion is expensive. */
30306 if (get_attr_fp_int_src (dep_insn))
30307 cost += 5;
30309 memory = get_attr_memory (insn);
30311 /* Model the ability of the reorder buffer to hide the latency of a load
30312 by executing it in parallel with the previous instruction, provided the
30313 previous instruction is not needed to compute the address. */
30314 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30315 && !ix86_agi_dependent (dep_insn, insn))
30317 /* Claim moves take one cycle, as the core can issue one load
30318 at a time and the next load can start a cycle later. */
30319 if (dep_insn_type == TYPE_IMOV
30320 || dep_insn_type == TYPE_FMOV)
30321 cost = 1;
30322 else if (cost > 2)
30323 cost -= 2;
30324 else
30325 cost = 1;
30327 break;
30329 case PROCESSOR_AMDFAM10:
30330 case PROCESSOR_BDVER1:
30331 case PROCESSOR_BDVER2:
30332 case PROCESSOR_BDVER3:
30333 case PROCESSOR_BDVER4:
30334 case PROCESSOR_ZNVER1:
30335 case PROCESSOR_BTVER1:
30336 case PROCESSOR_BTVER2:
30337 case PROCESSOR_GENERIC:
30338 /* The stack engine allows push&pop instructions to execute in parallel. */
30339 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
30340 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
30341 return 0;
30342 /* FALLTHRU */
30344 case PROCESSOR_ATHLON:
30345 case PROCESSOR_K8:
30346 memory = get_attr_memory (insn);
30348 /* Model the ability of the reorder buffer to hide the latency of a load
30349 by executing it in parallel with the previous instruction, provided the
30350 previous instruction is not needed to compute the address. */
30351 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30352 && !ix86_agi_dependent (dep_insn, insn))
30354 enum attr_unit unit = get_attr_unit (insn);
30355 int loadcost = 3;
30357 /* Because of the difference between the length of integer and
30358 floating unit pipeline preparation stages, the memory operands
30359 for floating point are cheaper.
30361 ??? For Athlon the difference is most probably 2. */
30362 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
30363 loadcost = 3;
30364 else
30365 loadcost = TARGET_ATHLON ? 2 : 0;
30367 if (cost >= loadcost)
30368 cost -= loadcost;
30369 else
30370 cost = 0;
30372 break;
30374 case PROCESSOR_CORE2:
30375 case PROCESSOR_NEHALEM:
30376 case PROCESSOR_SANDYBRIDGE:
30377 case PROCESSOR_HASWELL:
30378 /* The stack engine allows push&pop instructions to execute in parallel. */
30379 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
30380 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
30381 return 0;
30383 memory = get_attr_memory (insn);
30385 /* Model the ability of the reorder buffer to hide the latency of a load
30386 by executing it in parallel with the previous instruction, provided the
30387 previous instruction is not needed to compute the address. */
30388 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30389 && !ix86_agi_dependent (dep_insn, insn))
30391 if (cost >= 4)
30392 cost -= 4;
30393 else
30394 cost = 0;
30396 break;
30398 case PROCESSOR_SILVERMONT:
30399 case PROCESSOR_KNL:
30400 case PROCESSOR_INTEL:
30401 if (!reload_completed)
30402 return cost;
30404 /* Increase cost of integer loads. */
30405 memory = get_attr_memory (dep_insn);
30406 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
30408 enum attr_unit unit = get_attr_unit (dep_insn);
30409 if (unit == UNIT_INTEGER && cost == 1)
30411 if (memory == MEMORY_LOAD)
30412 cost = 3;
30413 else
30415 /* Increase cost of ld/st for short int types only
30416 because of store forwarding issue. */
30417 rtx set = single_set (dep_insn);
30418 if (set && (GET_MODE (SET_DEST (set)) == QImode
30419 || GET_MODE (SET_DEST (set)) == HImode))
30421 /* Increase cost of store/load insn if exact
30422 dependence exists and it is load insn. */
30423 enum attr_memory insn_memory = get_attr_memory (insn);
30424 if (insn_memory == MEMORY_LOAD
30425 && exact_store_load_dependency (dep_insn, insn))
30426 cost = 3;
30432 default:
30433 break;
30436 return cost;
30439 /* How many alternative schedules to try. This should be as wide as the
30440 scheduling freedom in the DFA, but no wider. Making this value too
30441 large results in extra work for the scheduler. */
30443 static int
30444 ia32_multipass_dfa_lookahead (void)
30446 switch (ix86_tune)
30448 case PROCESSOR_PENTIUM:
30449 case PROCESSOR_LAKEMONT:
30450 return 2;
30452 case PROCESSOR_PENTIUMPRO:
30453 case PROCESSOR_K6:
30454 return 1;
30456 case PROCESSOR_BDVER1:
30457 case PROCESSOR_BDVER2:
30458 case PROCESSOR_BDVER3:
30459 case PROCESSOR_BDVER4:
30460 /* We use lookahead value 4 for BD both before and after reload
30461 schedules. The plan is to include value 8 for -O3. */
30462 return 4;
30464 case PROCESSOR_CORE2:
30465 case PROCESSOR_NEHALEM:
30466 case PROCESSOR_SANDYBRIDGE:
30467 case PROCESSOR_HASWELL:
30468 case PROCESSOR_BONNELL:
30469 case PROCESSOR_SILVERMONT:
30470 case PROCESSOR_KNL:
30471 case PROCESSOR_INTEL:
30472 /* Generally, we want haifa-sched:max_issue() to look ahead as far
30473 as the number of instructions that can be executed in one cycle,
30474 i.e., issue_rate. I wonder why tuning for many CPUs does not do this. */
30475 if (reload_completed)
30476 return ix86_issue_rate ();
30477 /* Don't use lookahead for pre-reload schedule to save compile time. */
30478 return 0;
30480 default:
30481 return 0;
30485 /* Return true if target platform supports macro-fusion. */
30487 static bool
30488 ix86_macro_fusion_p ()
30490 return TARGET_FUSE_CMP_AND_BRANCH;
30493 /* Check whether the current microarchitecture supports macro fusion
30494 for insn pair "CONDGEN + CONDJMP". Refer to
30495 "Intel Architectures Optimization Reference Manual". */
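/* For illustration only (hypothetical insns): with
   TARGET_FUSE_CMP_AND_BRANCH the pair

     cmpl %esi, %edi
     jne  .L3

   may fuse into a single macro-op, whereas a cmp/test with a MEM-IMM
   operand pair, a RIP-relative memory operand, or (without
   TARGET_FUSE_ALU_AND_BRANCH) an ALU op feeding the jump is rejected by
   the checks below.  */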
30497 static bool
30498 ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
30500 rtx src, dest;
30501 enum rtx_code ccode;
30502 rtx compare_set = NULL_RTX, test_if, cond;
30503 rtx alu_set = NULL_RTX, addr = NULL_RTX;
30505 if (!any_condjump_p (condjmp))
30506 return false;
30508 unsigned int condreg1, condreg2;
30509 rtx cc_reg_1;
30510 ix86_fixed_condition_code_regs (&condreg1, &condreg2);
30511 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
30512 if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
30513 || !condgen
30514 || !modified_in_p (cc_reg_1, condgen))
30515 return false;
30517 if (get_attr_type (condgen) != TYPE_TEST
30518 && get_attr_type (condgen) != TYPE_ICMP
30519 && get_attr_type (condgen) != TYPE_INCDEC
30520 && get_attr_type (condgen) != TYPE_ALU)
30521 return false;
30523 compare_set = single_set (condgen);
30524 if (compare_set == NULL_RTX
30525 && !TARGET_FUSE_ALU_AND_BRANCH)
30526 return false;
30528 if (compare_set == NULL_RTX)
30530 int i;
30531 rtx pat = PATTERN (condgen);
30532 for (i = 0; i < XVECLEN (pat, 0); i++)
30533 if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
30535 rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
30536 if (GET_CODE (set_src) == COMPARE)
30537 compare_set = XVECEXP (pat, 0, i);
30538 else
30539 alu_set = XVECEXP (pat, 0, i);
30542 if (compare_set == NULL_RTX)
30543 return false;
30544 src = SET_SRC (compare_set);
30545 if (GET_CODE (src) != COMPARE)
30546 return false;
30548 /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
30549 supported. */
30550 if ((MEM_P (XEXP (src, 0))
30551 && CONST_INT_P (XEXP (src, 1)))
30552 || (MEM_P (XEXP (src, 1))
30553 && CONST_INT_P (XEXP (src, 0))))
30554 return false;
30556 /* No fusion for RIP-relative address. */
30557 if (MEM_P (XEXP (src, 0)))
30558 addr = XEXP (XEXP (src, 0), 0);
30559 else if (MEM_P (XEXP (src, 1)))
30560 addr = XEXP (XEXP (src, 1), 0);
30562 if (addr) {
30563 ix86_address parts;
30564 int ok = ix86_decompose_address (addr, &parts);
30565 gcc_assert (ok);
30567 if (rip_relative_addr_p (&parts))
30568 return false;
30571 test_if = SET_SRC (pc_set (condjmp));
30572 cond = XEXP (test_if, 0);
30573 ccode = GET_CODE (cond);
30574 /* Check whether the conditional jump uses the Sign or Overflow flags. */
30575 if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
30576 && (ccode == GE
30577 || ccode == GT
30578 || ccode == LE
30579 || ccode == LT))
30580 return false;
30582 /* Return true for TYPE_TEST and TYPE_ICMP. */
30583 if (get_attr_type (condgen) == TYPE_TEST
30584 || get_attr_type (condgen) == TYPE_ICMP)
30585 return true;
30587 /* The following handles the case of macro-fusion for alu + jmp. */
30588 if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
30589 return false;
30591 /* No fusion for alu op with memory destination operand. */
30592 dest = SET_DEST (alu_set);
30593 if (MEM_P (dest))
30594 return false;
30596 /* Macro-fusion for inc/dec + unsigned conditional jump is not
30597 supported. */
30598 if (get_attr_type (condgen) == TYPE_INCDEC
30599 && (ccode == GEU
30600 || ccode == GTU
30601 || ccode == LEU
30602 || ccode == LTU))
30603 return false;
30605 return true;
30608 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
30609 execution. It is applied if
30610 (1) an IMUL instruction is on the top of the list;
30611 (2) the ready list contains exactly one producer of an independent
30612 IMUL instruction.
30613 Return the index of the IMUL producer if it was found and -1 otherwise. */
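/* A sketch of the intended effect (hypothetical ready list):

     ready[n_ready - 1]:  r3 = r1 * r2    <- SImode IMUL on top
     ready[i]:            r1 = <load>     <- its only producer

   The producer's index i is returned so that ix86_sched_reorder can move
   it to the top of the list and keep Bonnell's pipelined IMUL unit busy.  */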
30614 static int
30615 do_reorder_for_imul (rtx_insn **ready, int n_ready)
30617 rtx_insn *insn;
30618 rtx set, insn1, insn2;
30619 sd_iterator_def sd_it;
30620 dep_t dep;
30621 int index = -1;
30622 int i;
30624 if (!TARGET_BONNELL)
30625 return index;
30627 /* Check that IMUL instruction is on the top of ready list. */
30628 insn = ready[n_ready - 1];
30629 set = single_set (insn);
30630 if (!set)
30631 return index;
30632 if (!(GET_CODE (SET_SRC (set)) == MULT
30633 && GET_MODE (SET_SRC (set)) == SImode))
30634 return index;
30636 /* Search for producer of independent IMUL instruction. */
30637 for (i = n_ready - 2; i >= 0; i--)
30639 insn = ready[i];
30640 if (!NONDEBUG_INSN_P (insn))
30641 continue;
30642 /* Skip IMUL instruction. */
30643 insn2 = PATTERN (insn);
30644 if (GET_CODE (insn2) == PARALLEL)
30645 insn2 = XVECEXP (insn2, 0, 0);
30646 if (GET_CODE (insn2) == SET
30647 && GET_CODE (SET_SRC (insn2)) == MULT
30648 && GET_MODE (SET_SRC (insn2)) == SImode)
30649 continue;
30651 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
30653 rtx con;
30654 con = DEP_CON (dep);
30655 if (!NONDEBUG_INSN_P (con))
30656 continue;
30657 insn1 = PATTERN (con);
30658 if (GET_CODE (insn1) == PARALLEL)
30659 insn1 = XVECEXP (insn1, 0, 0);
30661 if (GET_CODE (insn1) == SET
30662 && GET_CODE (SET_SRC (insn1)) == MULT
30663 && GET_MODE (SET_SRC (insn1)) == SImode)
30665 sd_iterator_def sd_it1;
30666 dep_t dep1;
30667 /* Check if there is no other dependee for IMUL. */
30668 index = i;
30669 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
30671 rtx pro;
30672 pro = DEP_PRO (dep1);
30673 if (!NONDEBUG_INSN_P (pro))
30674 continue;
30675 if (pro != insn)
30676 index = -1;
30678 if (index >= 0)
30679 break;
30682 if (index >= 0)
30683 break;
30685 return index;
30688 /* Try to find the best candidate at the top of the ready list if two insns
30689 have the same priority - the best candidate is the one whose producers
30690 were scheduled earlier. Applied for Silvermont only.
30691 Return true if the top 2 insns must be interchanged. */
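/* A sketch (hypothetical ticks): when the two candidates have equal
   priority and the producers of ready[n_ready - 2] finished at an earlier
   tick (smaller maximum INSN_TICK) than those of ready[n_ready - 1], or
   the ticks tie and only ready[n_ready - 2] is a load, the function below
   returns true and the caller swaps the two insns.  */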
30692 static bool
30693 swap_top_of_ready_list (rtx_insn **ready, int n_ready)
30695 rtx_insn *top = ready[n_ready - 1];
30696 rtx_insn *next = ready[n_ready - 2];
30697 rtx set;
30698 sd_iterator_def sd_it;
30699 dep_t dep;
30700 int clock1 = -1;
30701 int clock2 = -1;
30702 #define INSN_TICK(INSN) (HID (INSN)->tick)
30704 if (!TARGET_SILVERMONT && !TARGET_INTEL)
30705 return false;
30707 if (!NONDEBUG_INSN_P (top))
30708 return false;
30709 if (!NONJUMP_INSN_P (top))
30710 return false;
30711 if (!NONDEBUG_INSN_P (next))
30712 return false;
30713 if (!NONJUMP_INSN_P (next))
30714 return false;
30715 set = single_set (top);
30716 if (!set)
30717 return false;
30718 set = single_set (next);
30719 if (!set)
30720 return false;
30722 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
30724 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
30725 return false;
30726 /* Determine the winner more precisely. */
30727 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
30729 rtx pro;
30730 pro = DEP_PRO (dep);
30731 if (!NONDEBUG_INSN_P (pro))
30732 continue;
30733 if (INSN_TICK (pro) > clock1)
30734 clock1 = INSN_TICK (pro);
30736 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
30738 rtx pro;
30739 pro = DEP_PRO (dep);
30740 if (!NONDEBUG_INSN_P (pro))
30741 continue;
30742 if (INSN_TICK (pro) > clock2)
30743 clock2 = INSN_TICK (pro);
30746 if (clock1 == clock2)
30748 /* Determine the winner - a load must win. */
30749 enum attr_memory memory1, memory2;
30750 memory1 = get_attr_memory (top);
30751 memory2 = get_attr_memory (next);
30752 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
30753 return true;
30755 return (bool) (clock2 < clock1);
30757 return false;
30758 #undef INSN_TICK
30761 /* Perform possible reordering of the ready list, for Atom/Silvermont only.
30762 Return the issue rate. */
30763 static int
30764 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx_insn **ready,
30765 int *pn_ready, int clock_var)
30767 int issue_rate = -1;
30768 int n_ready = *pn_ready;
30769 int i;
30770 rtx_insn *insn;
30771 int index = -1;
30773 /* Set up issue rate. */
30774 issue_rate = ix86_issue_rate ();
30776 /* Do reordering for BONNELL/SILVERMONT only. */
30777 if (!TARGET_BONNELL && !TARGET_SILVERMONT && !TARGET_INTEL)
30778 return issue_rate;
30780 /* Nothing to do if ready list contains only 1 instruction. */
30781 if (n_ready <= 1)
30782 return issue_rate;
30784 /* Do reordering for the post-reload scheduler only. */
30785 if (!reload_completed)
30786 return issue_rate;
30788 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
30790 if (sched_verbose > 1)
30791 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
30792 INSN_UID (ready[index]));
30794 /* Put IMUL producer (ready[index]) at the top of ready list. */
30795 insn = ready[index];
30796 for (i = index; i < n_ready - 1; i++)
30797 ready[i] = ready[i + 1];
30798 ready[n_ready - 1] = insn;
30799 return issue_rate;
30802 /* Skip selective scheduling since HID is not populated in it. */
30803 if (clock_var != 0
30804 && !sel_sched_p ()
30805 && swap_top_of_ready_list (ready, n_ready))
30807 if (sched_verbose > 1)
30808 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
30809 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
30810 /* Swap 2 top elements of ready list. */
30811 insn = ready[n_ready - 1];
30812 ready[n_ready - 1] = ready[n_ready - 2];
30813 ready[n_ready - 2] = insn;
30815 return issue_rate;
30818 static bool
30819 ix86_class_likely_spilled_p (reg_class_t);
30821 /* Return true if the lhs of INSN is a HW function argument register, and set
30822 IS_SPILLED to true if it is a likely-spilled HW register. */
30823 static bool
30824 insn_is_function_arg (rtx insn, bool* is_spilled)
30826 rtx dst;
30828 if (!NONDEBUG_INSN_P (insn))
30829 return false;
30830 /* Call instructions are not movable; ignore them. */
30831 if (CALL_P (insn))
30832 return false;
30833 insn = PATTERN (insn);
30834 if (GET_CODE (insn) == PARALLEL)
30835 insn = XVECEXP (insn, 0, 0);
30836 if (GET_CODE (insn) != SET)
30837 return false;
30838 dst = SET_DEST (insn);
30839 if (REG_P (dst) && HARD_REGISTER_P (dst)
30840 && ix86_function_arg_regno_p (REGNO (dst)))
30842 /* Is it likely spilled HW register? */
30843 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
30844 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
30845 *is_spilled = true;
30846 return true;
30848 return false;
30851 /* Add output dependencies for a chain of adjacent function arguments, but
30852 only if there is a move to a likely-spilled HW register. Return the first
30853 argument if at least one dependence was added, or NULL otherwise. */
30854 static rtx_insn *
30855 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
30857 rtx_insn *insn;
30858 rtx_insn *last = call;
30859 rtx_insn *first_arg = NULL;
30860 bool is_spilled = false;
30862 head = PREV_INSN (head);
30864 /* Find the argument-passing instruction nearest to the call. */
30865 while (true)
30867 last = PREV_INSN (last);
30868 if (last == head)
30869 return NULL;
30870 if (!NONDEBUG_INSN_P (last))
30871 continue;
30872 if (insn_is_function_arg (last, &is_spilled))
30873 break;
30874 return NULL;
30877 first_arg = last;
30878 while (true)
30880 insn = PREV_INSN (last);
30881 if (!INSN_P (insn))
30882 break;
30883 if (insn == head)
30884 break;
30885 if (!NONDEBUG_INSN_P (insn))
30887 last = insn;
30888 continue;
30890 if (insn_is_function_arg (insn, &is_spilled))
30892 /* Add an output dependence between two function arguments if the chain
30893 of output arguments contains likely-spilled HW registers. */
30894 if (is_spilled)
30895 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
30896 first_arg = last = insn;
30898 else
30899 break;
30901 if (!is_spilled)
30902 return NULL;
30903 return first_arg;
30906 /* Add output or anti dependency from insn to first_arg to restrict its code
30907 motion. */
30908 static void
30909 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
30911 rtx set;
30912 rtx tmp;
30914 /* Add anti dependencies for bounds stores. */
30915 if (INSN_P (insn)
30916 && GET_CODE (PATTERN (insn)) == PARALLEL
30917 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
30918 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
30920 add_dependence (first_arg, insn, REG_DEP_ANTI);
30921 return;
30924 set = single_set (insn);
30925 if (!set)
30926 return;
30927 tmp = SET_DEST (set);
30928 if (REG_P (tmp))
30930 /* Add output dependency to the first function argument. */
30931 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
30932 return;
30934 /* Add anti dependency. */
30935 add_dependence (first_arg, insn, REG_DEP_ANTI);
30938 /* Avoid cross-block motion of a function argument by adding a dependency
30939 from the first non-jump instruction in BB. */
30940 static void
30941 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
30943 rtx_insn *insn = BB_END (bb);
30945 while (insn)
30947 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
30949 rtx set = single_set (insn);
30950 if (set)
30952 avoid_func_arg_motion (arg, insn);
30953 return;
30956 if (insn == BB_HEAD (bb))
30957 return;
30958 insn = PREV_INSN (insn);
30962 /* Hook for pre-reload schedule - avoid motion of function arguments
30963 passed in likely spilled HW registers. */
30964 static void
30965 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
30967 rtx_insn *insn;
30968 rtx_insn *first_arg = NULL;
30969 if (reload_completed)
30970 return;
30971 while (head != tail && DEBUG_INSN_P (head))
30972 head = NEXT_INSN (head);
30973 for (insn = tail; insn != head; insn = PREV_INSN (insn))
30974 if (INSN_P (insn) && CALL_P (insn))
30976 first_arg = add_parameter_dependencies (insn, head);
30977 if (first_arg)
30979 /* Add a dependee for the first argument to predecessors, but only
30980 if the region contains more than one block. */
30981 basic_block bb = BLOCK_FOR_INSN (insn);
30982 int rgn = CONTAINING_RGN (bb->index);
30983 int nr_blks = RGN_NR_BLOCKS (rgn);
30984 /* Skip trivial regions and region head blocks that can have
30985 predecessors outside of region. */
30986 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
30988 edge e;
30989 edge_iterator ei;
30991 /* Regions are SCCs with the exception of selective
30992 scheduling with pipelining of outer blocks enabled.
30993 So also check that immediate predecessors of a non-head
30994 block are in the same region. */
30995 FOR_EACH_EDGE (e, ei, bb->preds)
30997 /* Avoid creating loop-carried dependencies by using
30998 the topological ordering in the region. */
30999 if (rgn == CONTAINING_RGN (e->src->index)
31000 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
31001 add_dependee_for_func_arg (first_arg, e->src);
31004 insn = first_arg;
31005 if (insn == head)
31006 break;
31009 else if (first_arg)
31010 avoid_func_arg_motion (first_arg, insn);
31013 /* Hook for pre-reload schedule - set priority of moves from likely spilled
31014 HW registers to the maximum, to schedule them as soon as possible. These are
31015 moves from function argument registers at the top of the function entry
31016 and moves from function return value registers after a call. */
31017 static int
31018 ix86_adjust_priority (rtx_insn *insn, int priority)
31020 rtx set;
31022 if (reload_completed)
31023 return priority;
31025 if (!NONDEBUG_INSN_P (insn))
31026 return priority;
31028 set = single_set (insn);
31029 if (set)
31031 rtx tmp = SET_SRC (set);
31032 if (REG_P (tmp)
31033 && HARD_REGISTER_P (tmp)
31034 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
31035 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
31036 return current_sched_info->sched_max_insns_priority;
31039 return priority;
31042 /* Model decoder of Core 2/i7.
31043 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
31044 track the instruction fetch block boundaries and make sure that long
31045 (9+ bytes) instructions are assigned to D0. */
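/* A sketch of the model, with the parameters set in ix86_sched_init_global
   (16-byte fetch block, at most 6 insns per block, 8-byte limit for the
   secondary decoders): after issuing a 9-byte insn and a 5-byte insn in
   one cycle, ifetch_block_len is 14, so a further 3-byte insn no longer
   fits in the block and is filtered out of ready_try until the cycle is
   advanced.  */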
31047 /* Maximum length of an insn that can be handled by
31048 a secondary decoder unit. '8' for Core 2/i7. */
31049 static int core2i7_secondary_decoder_max_insn_size;
31051 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
31052 '16' for Core 2/i7. */
31053 static int core2i7_ifetch_block_size;
31055 /* Maximum number of instructions decoder can handle per cycle.
31056 '6' for Core 2/i7. */
31057 static int core2i7_ifetch_block_max_insns;
31059 typedef struct ix86_first_cycle_multipass_data_ *
31060 ix86_first_cycle_multipass_data_t;
31061 typedef const struct ix86_first_cycle_multipass_data_ *
31062 const_ix86_first_cycle_multipass_data_t;
31064 /* A variable to store target state across calls to max_issue within
31065 one cycle. */
31066 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
31067 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
31069 /* Initialize DATA. */
31070 static void
31071 core2i7_first_cycle_multipass_init (void *_data)
31073 ix86_first_cycle_multipass_data_t data
31074 = (ix86_first_cycle_multipass_data_t) _data;
31076 data->ifetch_block_len = 0;
31077 data->ifetch_block_n_insns = 0;
31078 data->ready_try_change = NULL;
31079 data->ready_try_change_size = 0;
31082 /* Advancing the cycle; reset ifetch block counts. */
31083 static void
31084 core2i7_dfa_post_advance_cycle (void)
31086 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
31088 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
31090 data->ifetch_block_len = 0;
31091 data->ifetch_block_n_insns = 0;
31094 static int min_insn_size (rtx_insn *);
31096 /* Filter out insns from ready_try that the core will not be able to issue
31097 on the current cycle due to decoder restrictions. */
31098 static void
31099 core2i7_first_cycle_multipass_filter_ready_try
31100 (const_ix86_first_cycle_multipass_data_t data,
31101 signed char *ready_try, int n_ready, bool first_cycle_insn_p)
31103 while (n_ready--)
31105 rtx_insn *insn;
31106 int insn_size;
31108 if (ready_try[n_ready])
31109 continue;
31111 insn = get_ready_element (n_ready);
31112 insn_size = min_insn_size (insn);
31114 if (/* If this is too long an insn for a secondary decoder ... */
31115 (!first_cycle_insn_p
31116 && insn_size > core2i7_secondary_decoder_max_insn_size)
31117 /* ... or it would not fit into the ifetch block ... */
31118 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
31119 /* ... or the decoder is full already ... */
31120 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
31121 /* ... mask the insn out. */
31123 ready_try[n_ready] = 1;
31125 if (data->ready_try_change)
31126 bitmap_set_bit (data->ready_try_change, n_ready);
31131 /* Prepare for a new round of multipass lookahead scheduling. */
31132 static void
31133 core2i7_first_cycle_multipass_begin (void *_data,
31134 signed char *ready_try, int n_ready,
31135 bool first_cycle_insn_p)
31137 ix86_first_cycle_multipass_data_t data
31138 = (ix86_first_cycle_multipass_data_t) _data;
31139 const_ix86_first_cycle_multipass_data_t prev_data
31140 = ix86_first_cycle_multipass_data;
31142 /* Restore the state from the end of the previous round. */
31143 data->ifetch_block_len = prev_data->ifetch_block_len;
31144 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
31146 /* Filter instructions that cannot be issued on current cycle due to
31147 decoder restrictions. */
31148 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
31149 first_cycle_insn_p);
31152 /* INSN is being issued in current solution. Account for its impact on
31153 the decoder model. */
31154 static void
31155 core2i7_first_cycle_multipass_issue (void *_data,
31156 signed char *ready_try, int n_ready,
31157 rtx_insn *insn, const void *_prev_data)
31159 ix86_first_cycle_multipass_data_t data
31160 = (ix86_first_cycle_multipass_data_t) _data;
31161 const_ix86_first_cycle_multipass_data_t prev_data
31162 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
31164 int insn_size = min_insn_size (insn);
31166 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
31167 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
31168 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
31169 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
31171 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
31172 if (!data->ready_try_change)
31174 data->ready_try_change = sbitmap_alloc (n_ready);
31175 data->ready_try_change_size = n_ready;
31177 else if (data->ready_try_change_size < n_ready)
31179 data->ready_try_change = sbitmap_resize (data->ready_try_change,
31180 n_ready, 0);
31181 data->ready_try_change_size = n_ready;
31183 bitmap_clear (data->ready_try_change);
31185 /* Filter out insns from ready_try that the core will not be able to issue
31186 on the current cycle due to decoder restrictions. */
31187 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
31188 false);
31191 /* Revert the effect on ready_try. */
31192 static void
31193 core2i7_first_cycle_multipass_backtrack (const void *_data,
31194 signed char *ready_try,
31195 int n_ready ATTRIBUTE_UNUSED)
31197 const_ix86_first_cycle_multipass_data_t data
31198 = (const_ix86_first_cycle_multipass_data_t) _data;
31199 unsigned int i = 0;
31200 sbitmap_iterator sbi;
31202 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
31203 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
31205 ready_try[i] = 0;
31209 /* Save the result of multipass lookahead scheduling for the next round. */
31210 static void
31211 core2i7_first_cycle_multipass_end (const void *_data)
31213 const_ix86_first_cycle_multipass_data_t data
31214 = (const_ix86_first_cycle_multipass_data_t) _data;
31215 ix86_first_cycle_multipass_data_t next_data
31216 = ix86_first_cycle_multipass_data;
31218 if (data != NULL)
31220 next_data->ifetch_block_len = data->ifetch_block_len;
31221 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
31225 /* Deallocate target data. */
31226 static void
31227 core2i7_first_cycle_multipass_fini (void *_data)
31229 ix86_first_cycle_multipass_data_t data
31230 = (ix86_first_cycle_multipass_data_t) _data;
31232 if (data->ready_try_change)
31234 sbitmap_free (data->ready_try_change);
31235 data->ready_try_change = NULL;
31236 data->ready_try_change_size = 0;
31240 /* Prepare for scheduling pass. */
31241 static void
31242 ix86_sched_init_global (FILE *, int, int)
31244 /* Install scheduling hooks for current CPU. Some of these hooks are used
31245 in time-critical parts of the scheduler, so we only set them up when
31246 they are actually used. */
31247 switch (ix86_tune)
31249 case PROCESSOR_CORE2:
31250 case PROCESSOR_NEHALEM:
31251 case PROCESSOR_SANDYBRIDGE:
31252 case PROCESSOR_HASWELL:
31253 /* Do not perform multipass scheduling for pre-reload schedule
31254 to save compile time. */
31255 if (reload_completed)
31257 targetm.sched.dfa_post_advance_cycle
31258 = core2i7_dfa_post_advance_cycle;
31259 targetm.sched.first_cycle_multipass_init
31260 = core2i7_first_cycle_multipass_init;
31261 targetm.sched.first_cycle_multipass_begin
31262 = core2i7_first_cycle_multipass_begin;
31263 targetm.sched.first_cycle_multipass_issue
31264 = core2i7_first_cycle_multipass_issue;
31265 targetm.sched.first_cycle_multipass_backtrack
31266 = core2i7_first_cycle_multipass_backtrack;
31267 targetm.sched.first_cycle_multipass_end
31268 = core2i7_first_cycle_multipass_end;
31269 targetm.sched.first_cycle_multipass_fini
31270 = core2i7_first_cycle_multipass_fini;
31272 /* Set decoder parameters. */
31273 core2i7_secondary_decoder_max_insn_size = 8;
31274 core2i7_ifetch_block_size = 16;
31275 core2i7_ifetch_block_max_insns = 6;
31276 break;
31278 /* Fall through. */
31279 default:
31280 targetm.sched.dfa_post_advance_cycle = NULL;
31281 targetm.sched.first_cycle_multipass_init = NULL;
31282 targetm.sched.first_cycle_multipass_begin = NULL;
31283 targetm.sched.first_cycle_multipass_issue = NULL;
31284 targetm.sched.first_cycle_multipass_backtrack = NULL;
31285 targetm.sched.first_cycle_multipass_end = NULL;
31286 targetm.sched.first_cycle_multipass_fini = NULL;
31287 break;
31292 /* Compute the alignment given to a constant that is being placed in memory.
31293 EXP is the constant and ALIGN is the alignment that the object would
31294 ordinarily have.
31295 The value of this function is used instead of that alignment to align
31296 the object. */
31299 ix86_constant_alignment (tree exp, int align)
31301 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
31302 || TREE_CODE (exp) == INTEGER_CST)
31304 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
31305 return 64;
31306 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
31307 return 128;
31309 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
31310 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
31311 return BITS_PER_WORD;
31313 return align;
31316 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
31317 the data type, and ALIGN is the alignment that the object would
31318 ordinarily have. */
31320 static int
31321 iamcu_alignment (tree type, int align)
31323 machine_mode mode;
31325 if (align < 32 || TYPE_USER_ALIGN (type))
31326 return align;
31328 /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4
31329 bytes. */
31330 mode = TYPE_MODE (strip_array_types (type));
31331 switch (GET_MODE_CLASS (mode))
31333 case MODE_INT:
31334 case MODE_COMPLEX_INT:
31335 case MODE_COMPLEX_FLOAT:
31336 case MODE_FLOAT:
31337 case MODE_DECIMAL_FLOAT:
31338 return 32;
31339 default:
31340 return align;
31344 /* Compute the alignment for a static variable.
31345 TYPE is the data type, and ALIGN is the alignment that
31346 the object would ordinarily have. The value of this function is used
31347 instead of that alignment to align the object. */
31350 ix86_data_alignment (tree type, int align, bool opt)
31352 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
31353 for symbols from other compilation units or symbols that don't need
31354 to bind locally. In order to preserve some ABI compatibility with
31355 those compilers, ensure we don't decrease alignment from what we
31356 used to assume. */
31358 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
31360 /* A data structure equal to or greater than the size of a cache line
31361 (64 bytes in the Pentium 4 and other recent Intel processors, including
31362 processors based on the Intel Core microarchitecture) should be aligned
31363 so that its base address is a multiple of the cache line size. */
31365 int max_align
31366 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
31368 if (max_align < BITS_PER_WORD)
31369 max_align = BITS_PER_WORD;
31371 switch (ix86_align_data_type)
31373 case ix86_align_data_type_abi: opt = false; break;
31374 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
31375 case ix86_align_data_type_cacheline: break;
31378 if (TARGET_IAMCU)
31379 align = iamcu_alignment (type, align);
31381 if (opt
31382 && AGGREGATE_TYPE_P (type)
31383 && TYPE_SIZE (type)
31384 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
31386 if (wi::geu_p (TYPE_SIZE (type), max_align_compat)
31387 && align < max_align_compat)
31388 align = max_align_compat;
31389 if (wi::geu_p (TYPE_SIZE (type), max_align)
31390 && align < max_align)
31391 align = max_align;
31394 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
31395 to a 16-byte boundary. */
31396 if (TARGET_64BIT)
31398 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
31399 && TYPE_SIZE (type)
31400 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
31401 && wi::geu_p (TYPE_SIZE (type), 128)
31402 && align < 128)
31403 return 128;
31406 if (!opt)
31407 return align;
31409 if (TREE_CODE (type) == ARRAY_TYPE)
31411 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
31412 return 64;
31413 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
31414 return 128;
31416 else if (TREE_CODE (type) == COMPLEX_TYPE)
31419 if (TYPE_MODE (type) == DCmode && align < 64)
31420 return 64;
31421 if ((TYPE_MODE (type) == XCmode
31422 || TYPE_MODE (type) == TCmode) && align < 128)
31423 return 128;
31425 else if ((TREE_CODE (type) == RECORD_TYPE
31426 || TREE_CODE (type) == UNION_TYPE
31427 || TREE_CODE (type) == QUAL_UNION_TYPE)
31428 && TYPE_FIELDS (type))
31430 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
31431 return 64;
31432 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
31433 return 128;
31435 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
31436 || TREE_CODE (type) == INTEGER_TYPE)
31438 if (TYPE_MODE (type) == DFmode && align < 64)
31439 return 64;
31440 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
31441 return 128;
31444 return align;
31447 /* Compute the alignment for a local variable or a stack slot. EXP is
31448 the data type or decl itself, MODE is the widest mode available and
31449 ALIGN is the alignment that the object would ordinarily have. The
31450 value of this macro is used instead of that alignment to align the
31451 object. */
31453 unsigned int
31454 ix86_local_alignment (tree exp, machine_mode mode,
31455 unsigned int align)
31457 tree type, decl;
31459 if (exp && DECL_P (exp))
31461 type = TREE_TYPE (exp);
31462 decl = exp;
31464 else
31466 type = exp;
31467 decl = NULL;
31470 /* Don't do dynamic stack realignment for long long objects with
31471 -mpreferred-stack-boundary=2. */
31472 if (!TARGET_64BIT
31473 && align == 64
31474 && ix86_preferred_stack_boundary < 64
31475 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
31476 && (!type || !TYPE_USER_ALIGN (type))
31477 && (!decl || !DECL_USER_ALIGN (decl)))
31478 align = 32;
31480 /* If TYPE is NULL, we are allocating a stack slot for caller-save
31481 register in MODE. We will return the largest alignment of XF
31482 and DF. */
31483 if (!type)
31485 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
31486 align = GET_MODE_ALIGNMENT (DFmode);
31487 return align;
31490 /* Don't increase alignment for Intel MCU psABI. */
31491 if (TARGET_IAMCU)
31492 return align;
31494 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
31495 to a 16-byte boundary. The exact wording is:
31497 An array uses the same alignment as its elements, except that a local or
31498 global array variable of length at least 16 bytes or
31499 a C99 variable-length array variable always has alignment of at least 16 bytes.
31501 This was added to allow use of aligned SSE instructions on arrays. The
31502 rule is meant for static storage (where the compiler cannot do the analysis
31503 by itself). We follow it for automatic variables only when convenient.
31504 We fully control everything in the function being compiled, and functions
31505 from other units cannot rely on the alignment.
31507 Exclude the va_list type. It is the common case of a local array where
31508 we cannot benefit from the alignment.
31510 TODO: Probably one should optimize for size only when the var is not escaping. */
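/* For illustration only (hypothetical declarations): with -O2 on x86-64
   (SSE enabled), a local  char buf[32];  falls under the rule above and
   gets 128-bit alignment here, while a local  va_list ap;  is excluded
   and keeps its default alignment.  */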
31511 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
31512 && TARGET_SSE)
31514 if (AGGREGATE_TYPE_P (type)
31515 && (va_list_type_node == NULL_TREE
31516 || (TYPE_MAIN_VARIANT (type)
31517 != TYPE_MAIN_VARIANT (va_list_type_node)))
31518 && TYPE_SIZE (type)
31519 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
31520 && wi::geu_p (TYPE_SIZE (type), 128)
31521 && align < 128)
31522 return 128;
31524 if (TREE_CODE (type) == ARRAY_TYPE)
31526 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
31527 return 64;
31528 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
31529 return 128;
31531 else if (TREE_CODE (type) == COMPLEX_TYPE)
31533 if (TYPE_MODE (type) == DCmode && align < 64)
31534 return 64;
31535 if ((TYPE_MODE (type) == XCmode
31536 || TYPE_MODE (type) == TCmode) && align < 128)
31537 return 128;
31539 else if ((TREE_CODE (type) == RECORD_TYPE
31540 || TREE_CODE (type) == UNION_TYPE
31541 || TREE_CODE (type) == QUAL_UNION_TYPE)
31542 && TYPE_FIELDS (type))
31544 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
31545 return 64;
31546 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
31547 return 128;
31549 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
31550 || TREE_CODE (type) == INTEGER_TYPE)
31553 if (TYPE_MODE (type) == DFmode && align < 64)
31554 return 64;
31555 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
31556 return 128;
31558 return align;
31561 /* Compute the minimum required alignment for dynamic stack realignment
31562 purposes for a local variable, parameter or a stack slot. EXP is
31563 the data type or decl itself, MODE is its mode and ALIGN is the
31564 alignment that the object would ordinarily have. */
31566 unsigned int
31567 ix86_minimum_alignment (tree exp, machine_mode mode,
31568 unsigned int align)
31570 tree type, decl;
31572 if (exp && DECL_P (exp))
31574 type = TREE_TYPE (exp);
31575 decl = exp;
31577 else
31579 type = exp;
31580 decl = NULL;
31583 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
31584 return align;
31586 /* Don't do dynamic stack realignment for long long objects with
31587 -mpreferred-stack-boundary=2. */
31588 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
31589 && (!type || !TYPE_USER_ALIGN (type))
31590 && (!decl || !DECL_USER_ALIGN (decl)))
31592 gcc_checking_assert (!TARGET_STV);
31593 return 32;
31596 return align;
31599 /* Find a location for the static chain incoming to a nested function.
31600 This is a register, unless all free registers are used by arguments. */
31602 static rtx
31603 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
31605 unsigned regno;
31607 /* While this function won't be called by the middle-end when a static
31608 chain isn't needed, it's also used throughout the backend so it's
31609 easiest to keep this check centralized. */
31610 if (DECL_P (fndecl_or_type) && !DECL_STATIC_CHAIN (fndecl_or_type))
31611 return NULL;
31613 if (TARGET_64BIT)
31615 /* We always use R10 in 64-bit mode. */
31616 regno = R10_REG;
31618 else
31620 const_tree fntype, fndecl;
31621 unsigned int ccvt;
31623 /* By default in 32-bit mode we use ECX to pass the static chain. */
31624 regno = CX_REG;
31626 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
31628 fntype = TREE_TYPE (fndecl_or_type);
31629 fndecl = fndecl_or_type;
31631 else
31633 fntype = fndecl_or_type;
31634 fndecl = NULL;
31637 ccvt = ix86_get_callcvt (fntype);
31638 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
31640 /* Fastcall functions use ecx/edx for arguments, which leaves
31641 us with EAX for the static chain.
31642 Thiscall functions use ecx for arguments, which also
31643 leaves us with EAX for the static chain. */
31644 regno = AX_REG;
31646 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
31648 /* Thiscall functions use ecx for arguments, which leaves
31649 us with EAX and EDX for the static chain.
31650 We use EAX for ABI compatibility. */
31651 regno = AX_REG;
31653 else if (ix86_function_regparm (fntype, fndecl) == 3)
31655 /* For regparm 3, we have no free call-clobbered registers in
31656 which to store the static chain. In order to implement this,
31657 we have the trampoline push the static chain to the stack.
31658 However, we can't push a value below the return address when
31659 we call the nested function directly, so we have to use an
31660 alternate entry point. For this we use ESI, and have the
31661 alternate entry point push ESI, so that things appear the
31662 same once we're executing the nested function. */
31663 if (incoming_p)
31665 if (fndecl == current_function_decl
31666 && !ix86_static_chain_on_stack)
31668 gcc_assert (!reload_completed);
31669 ix86_static_chain_on_stack = true;
31671 return gen_frame_mem (SImode,
31672 plus_constant (Pmode,
31673 arg_pointer_rtx, -8));
31675 regno = SI_REG;
31679 return gen_rtx_REG (Pmode, regno);
31682 /* Emit RTL insns to initialize the variable parts of a trampoline.
31683 FNDECL is the decl of the target address; M_TRAMP is a MEM for
31684 the trampoline, and CHAIN_VALUE is an RTX for the static chain
31685 to be passed to the target function. */
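/* For reference, a sketch of the bytes emitted below for the 64-bit LP64
   case (the shorter 41 bb / 41 ba movl forms are used instead when a
   32-bit immediate suffices):

     49 bb <8-byte fnaddr>    movabs $fnaddr, %r11
     49 ba <8-byte chain>     movabs $chain_value, %r10
     49 ff e3 90              jmp *%r11 ; nop (pads to a 4-byte store)

   The 32-bit case instead loads or pushes the static chain and ends with
   an e9 rel32 jump to the target.  */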
31687 static void
31688 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
31690 rtx mem, fnaddr;
31691 int opcode;
31692 int offset = 0;
31694 fnaddr = XEXP (DECL_RTL (fndecl), 0);
31696 if (TARGET_64BIT)
31698 int size;
31700 /* Load the function address to r11. Try to load the address using
31701 the shorter movl instead of movabs. We may want to support
31702 movq for kernel mode, but the kernel does not use trampolines at
31703 the moment. FNADDR is a 32-bit address and may not be in
31704 DImode when ptr_mode == SImode. Always use movl in this
31705 case. */
31706 if (ptr_mode == SImode
31707 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
31709 fnaddr = copy_addr_to_reg (fnaddr);
31711 mem = adjust_address (m_tramp, HImode, offset);
31712 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
31714 mem = adjust_address (m_tramp, SImode, offset + 2);
31715 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
31716 offset += 6;
31718 else
31720 mem = adjust_address (m_tramp, HImode, offset);
31721 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
31723 mem = adjust_address (m_tramp, DImode, offset + 2);
31724 emit_move_insn (mem, fnaddr);
31725 offset += 10;
31728 /* Load static chain using movabs to r10. Use the shorter movl
31729 instead of movabs when ptr_mode == SImode. */
31730 if (ptr_mode == SImode)
31732 opcode = 0xba41;
31733 size = 6;
31735 else
31737 opcode = 0xba49;
31738 size = 10;
31741 mem = adjust_address (m_tramp, HImode, offset);
31742 emit_move_insn (mem, gen_int_mode (opcode, HImode));
31744 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
31745 emit_move_insn (mem, chain_value);
31746 offset += size;
31748 /* Jump to r11; the last (unused) byte is a nop, only there to
31749 pad the write out to a single 32-bit store. */
31750 mem = adjust_address (m_tramp, SImode, offset);
31751 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
31752 offset += 4;
31754 else
31756 rtx disp, chain;
31758 /* Depending on the static chain location, either load a register
31759 with a constant, or push the constant to the stack. All of the
31760 instructions are the same size. */
31761 chain = ix86_static_chain (fndecl, true);
31762 if (REG_P (chain))
31764 switch (REGNO (chain))
31766 case AX_REG:
31767 opcode = 0xb8; break;
31768 case CX_REG:
31769 opcode = 0xb9; break;
31770 default:
31771 gcc_unreachable ();
31774 else
31775 opcode = 0x68;
31777 mem = adjust_address (m_tramp, QImode, offset);
31778 emit_move_insn (mem, gen_int_mode (opcode, QImode));
31780 mem = adjust_address (m_tramp, SImode, offset + 1);
31781 emit_move_insn (mem, chain_value);
31782 offset += 5;
31784 mem = adjust_address (m_tramp, QImode, offset);
31785 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
31787 mem = adjust_address (m_tramp, SImode, offset + 1);
31789 /* Compute offset from the end of the jmp to the target function.
31790 In the case in which the trampoline stores the static chain on
31791 the stack, we need to skip the first insn which pushes the
31792 (call-saved) register static chain; this push is 1 byte. */
31793 offset += 5;
31794 disp = expand_binop (SImode, sub_optab, fnaddr,
31795 plus_constant (Pmode, XEXP (m_tramp, 0),
31796 offset - (MEM_P (chain) ? 1 : 0)),
31797 NULL_RTX, 1, OPTAB_DIRECT);
31798 emit_move_insn (mem, disp);
31801 gcc_assert (offset <= TRAMPOLINE_SIZE);
31803 #ifdef HAVE_ENABLE_EXECUTE_STACK
31804 #ifdef CHECK_EXECUTE_STACK_ENABLED
31805 if (CHECK_EXECUTE_STACK_ENABLED)
31806 #endif
31807 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
31808 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
31809 #endif
31812 static bool
31813 ix86_allocate_stack_slots_for_args (void)
31815 /* Naked functions should not allocate stack slots for arguments. */
31816 return !ix86_function_naked (current_function_decl);
31819 static bool
31820 ix86_warn_func_return (tree decl)
31822 /* Naked functions are implemented entirely in assembly, including the
31823 return sequence, so suppress warnings about this. */
31824 return !ix86_function_naked (decl);
31827 /* The following file contains several enumerations and data structures
31828 built from the definitions in i386-builtin-types.def. */
31830 #include "i386-builtin-types.inc"
31832 /* Table for the ix86 builtin non-function types. */
31833 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
31835 /* Retrieve an element from the above table, building some of
31836 the types lazily. */
31838 static tree
31839 ix86_get_builtin_type (enum ix86_builtin_type tcode)
31841 unsigned int index;
31842 tree type, itype;
31844 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
31846 type = ix86_builtin_type_tab[(int) tcode];
31847 if (type != NULL)
31848 return type;
31850 gcc_assert (tcode > IX86_BT_LAST_PRIM);
31851 if (tcode <= IX86_BT_LAST_VECT)
31853 machine_mode mode;
31855 index = tcode - IX86_BT_LAST_PRIM - 1;
31856 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
31857 mode = ix86_builtin_type_vect_mode[index];
31859 type = build_vector_type_for_mode (itype, mode);
31861 else
31863 int quals;
31865 index = tcode - IX86_BT_LAST_VECT - 1;
31866 if (tcode <= IX86_BT_LAST_PTR)
31867 quals = TYPE_UNQUALIFIED;
31868 else
31869 quals = TYPE_QUAL_CONST;
31871 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
31872 if (quals != TYPE_UNQUALIFIED)
31873 itype = build_qualified_type (itype, quals);
31875 type = build_pointer_type (itype);
31878 ix86_builtin_type_tab[(int) tcode] = type;
31879 return type;
31882 /* Table for the ix86 builtin function types. */
31883 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
31885 /* Retrieve an element from the above table, building some of
31886 the types lazily. */
31888 static tree
31889 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
31891 tree type;
31893 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
31895 type = ix86_builtin_func_type_tab[(int) tcode];
31896 if (type != NULL)
31897 return type;
31899 if (tcode <= IX86_BT_LAST_FUNC)
31901 unsigned start = ix86_builtin_func_start[(int) tcode];
31902 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
31903 tree rtype, atype, args = void_list_node;
31904 unsigned i;
31906 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
31907 for (i = after - 1; i > start; --i)
31909 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
31910 args = tree_cons (NULL, atype, args);
31913 type = build_function_type (rtype, args);
31915 else
31917 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
31918 enum ix86_builtin_func_type icode;
31920 icode = ix86_builtin_func_alias_base[index];
31921 type = ix86_get_builtin_func_type (icode);
31924 ix86_builtin_func_type_tab[(int) tcode] = type;
31925 return type;
31929 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
31930 bdesc_* arrays below should come first, then builtins for each bdesc_*
31931 array in ascending order, so that we can use direct array accesses. */
31932 enum ix86_builtins
31934 IX86_BUILTIN_MASKMOVQ,
31935 IX86_BUILTIN_LDMXCSR,
31936 IX86_BUILTIN_STMXCSR,
31937 IX86_BUILTIN_MASKMOVDQU,
31938 IX86_BUILTIN_PSLLDQ128,
31939 IX86_BUILTIN_CLFLUSH,
31940 IX86_BUILTIN_MONITOR,
31941 IX86_BUILTIN_MWAIT,
31942 IX86_BUILTIN_CLZERO,
31943 IX86_BUILTIN_VEC_INIT_V2SI,
31944 IX86_BUILTIN_VEC_INIT_V4HI,
31945 IX86_BUILTIN_VEC_INIT_V8QI,
31946 IX86_BUILTIN_VEC_EXT_V2DF,
31947 IX86_BUILTIN_VEC_EXT_V2DI,
31948 IX86_BUILTIN_VEC_EXT_V4SF,
31949 IX86_BUILTIN_VEC_EXT_V4SI,
31950 IX86_BUILTIN_VEC_EXT_V8HI,
31951 IX86_BUILTIN_VEC_EXT_V2SI,
31952 IX86_BUILTIN_VEC_EXT_V4HI,
31953 IX86_BUILTIN_VEC_EXT_V16QI,
31954 IX86_BUILTIN_VEC_SET_V2DI,
31955 IX86_BUILTIN_VEC_SET_V4SF,
31956 IX86_BUILTIN_VEC_SET_V4SI,
31957 IX86_BUILTIN_VEC_SET_V8HI,
31958 IX86_BUILTIN_VEC_SET_V4HI,
31959 IX86_BUILTIN_VEC_SET_V16QI,
31960 IX86_BUILTIN_GATHERSIV2DF,
31961 IX86_BUILTIN_GATHERSIV4DF,
31962 IX86_BUILTIN_GATHERDIV2DF,
31963 IX86_BUILTIN_GATHERDIV4DF,
31964 IX86_BUILTIN_GATHERSIV4SF,
31965 IX86_BUILTIN_GATHERSIV8SF,
31966 IX86_BUILTIN_GATHERDIV4SF,
31967 IX86_BUILTIN_GATHERDIV8SF,
31968 IX86_BUILTIN_GATHERSIV2DI,
31969 IX86_BUILTIN_GATHERSIV4DI,
31970 IX86_BUILTIN_GATHERDIV2DI,
31971 IX86_BUILTIN_GATHERDIV4DI,
31972 IX86_BUILTIN_GATHERSIV4SI,
31973 IX86_BUILTIN_GATHERSIV8SI,
31974 IX86_BUILTIN_GATHERDIV4SI,
31975 IX86_BUILTIN_GATHERDIV8SI,
31976 IX86_BUILTIN_VFMSUBSD3_MASK3,
31977 IX86_BUILTIN_VFMSUBSS3_MASK3,
31978 IX86_BUILTIN_GATHER3SIV8SF,
31979 IX86_BUILTIN_GATHER3SIV4SF,
31980 IX86_BUILTIN_GATHER3SIV4DF,
31981 IX86_BUILTIN_GATHER3SIV2DF,
31982 IX86_BUILTIN_GATHER3DIV8SF,
31983 IX86_BUILTIN_GATHER3DIV4SF,
31984 IX86_BUILTIN_GATHER3DIV4DF,
31985 IX86_BUILTIN_GATHER3DIV2DF,
31986 IX86_BUILTIN_GATHER3SIV8SI,
31987 IX86_BUILTIN_GATHER3SIV4SI,
31988 IX86_BUILTIN_GATHER3SIV4DI,
31989 IX86_BUILTIN_GATHER3SIV2DI,
31990 IX86_BUILTIN_GATHER3DIV8SI,
31991 IX86_BUILTIN_GATHER3DIV4SI,
31992 IX86_BUILTIN_GATHER3DIV4DI,
31993 IX86_BUILTIN_GATHER3DIV2DI,
31994 IX86_BUILTIN_SCATTERSIV8SF,
31995 IX86_BUILTIN_SCATTERSIV4SF,
31996 IX86_BUILTIN_SCATTERSIV4DF,
31997 IX86_BUILTIN_SCATTERSIV2DF,
31998 IX86_BUILTIN_SCATTERDIV8SF,
31999 IX86_BUILTIN_SCATTERDIV4SF,
32000 IX86_BUILTIN_SCATTERDIV4DF,
32001 IX86_BUILTIN_SCATTERDIV2DF,
32002 IX86_BUILTIN_SCATTERSIV8SI,
32003 IX86_BUILTIN_SCATTERSIV4SI,
32004 IX86_BUILTIN_SCATTERSIV4DI,
32005 IX86_BUILTIN_SCATTERSIV2DI,
32006 IX86_BUILTIN_SCATTERDIV8SI,
32007 IX86_BUILTIN_SCATTERDIV4SI,
32008 IX86_BUILTIN_SCATTERDIV4DI,
32009 IX86_BUILTIN_SCATTERDIV2DI,
32010 /* Alternate 4 and 8 element gather/scatter for the vectorizer
32011 where all operands are 32-byte or 64-byte wide respectively. */
32012 IX86_BUILTIN_GATHERALTSIV4DF,
32013 IX86_BUILTIN_GATHERALTDIV8SF,
32014 IX86_BUILTIN_GATHERALTSIV4DI,
32015 IX86_BUILTIN_GATHERALTDIV8SI,
32016 IX86_BUILTIN_GATHER3ALTDIV16SF,
32017 IX86_BUILTIN_GATHER3ALTDIV16SI,
32018 IX86_BUILTIN_GATHER3ALTSIV4DF,
32019 IX86_BUILTIN_GATHER3ALTDIV8SF,
32020 IX86_BUILTIN_GATHER3ALTSIV4DI,
32021 IX86_BUILTIN_GATHER3ALTDIV8SI,
32022 IX86_BUILTIN_GATHER3ALTSIV8DF,
32023 IX86_BUILTIN_GATHER3ALTSIV8DI,
32024 IX86_BUILTIN_GATHER3DIV16SF,
32025 IX86_BUILTIN_GATHER3DIV16SI,
32026 IX86_BUILTIN_GATHER3DIV8DF,
32027 IX86_BUILTIN_GATHER3DIV8DI,
32028 IX86_BUILTIN_GATHER3SIV16SF,
32029 IX86_BUILTIN_GATHER3SIV16SI,
32030 IX86_BUILTIN_GATHER3SIV8DF,
32031 IX86_BUILTIN_GATHER3SIV8DI,
32032 IX86_BUILTIN_SCATTERALTSIV8DF,
32033 IX86_BUILTIN_SCATTERALTDIV16SF,
32034 IX86_BUILTIN_SCATTERALTSIV8DI,
32035 IX86_BUILTIN_SCATTERALTDIV16SI,
32036 IX86_BUILTIN_SCATTERDIV16SF,
32037 IX86_BUILTIN_SCATTERDIV16SI,
32038 IX86_BUILTIN_SCATTERDIV8DF,
32039 IX86_BUILTIN_SCATTERDIV8DI,
32040 IX86_BUILTIN_SCATTERSIV16SF,
32041 IX86_BUILTIN_SCATTERSIV16SI,
32042 IX86_BUILTIN_SCATTERSIV8DF,
32043 IX86_BUILTIN_SCATTERSIV8DI,
32044 IX86_BUILTIN_GATHERPFQPD,
32045 IX86_BUILTIN_GATHERPFDPS,
32046 IX86_BUILTIN_GATHERPFDPD,
32047 IX86_BUILTIN_GATHERPFQPS,
32048 IX86_BUILTIN_SCATTERPFDPD,
32049 IX86_BUILTIN_SCATTERPFDPS,
32050 IX86_BUILTIN_SCATTERPFQPD,
32051 IX86_BUILTIN_SCATTERPFQPS,
32052 IX86_BUILTIN_CLWB,
32053 IX86_BUILTIN_CLFLUSHOPT,
32054 IX86_BUILTIN_INFQ,
32055 IX86_BUILTIN_HUGE_VALQ,
32056 IX86_BUILTIN_NANQ,
32057 IX86_BUILTIN_NANSQ,
32058 IX86_BUILTIN_XABORT,
32059 IX86_BUILTIN_ADDCARRYX32,
32060 IX86_BUILTIN_ADDCARRYX64,
32061 IX86_BUILTIN_SBB32,
32062 IX86_BUILTIN_SBB64,
32063 IX86_BUILTIN_RDRAND16_STEP,
32064 IX86_BUILTIN_RDRAND32_STEP,
32065 IX86_BUILTIN_RDRAND64_STEP,
32066 IX86_BUILTIN_RDSEED16_STEP,
32067 IX86_BUILTIN_RDSEED32_STEP,
32068 IX86_BUILTIN_RDSEED64_STEP,
32069 IX86_BUILTIN_MONITORX,
32070 IX86_BUILTIN_MWAITX,
32071 IX86_BUILTIN_CFSTRING,
32072 IX86_BUILTIN_CPU_INIT,
32073 IX86_BUILTIN_CPU_IS,
32074 IX86_BUILTIN_CPU_SUPPORTS,
32075 IX86_BUILTIN_READ_FLAGS,
32076 IX86_BUILTIN_WRITE_FLAGS,
32078 /* All the remaining builtins are tracked in bdesc_* arrays in
32079 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
32080 this point. */
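/* A sketch of what the first include below expands to (the kind name ARGS
   is used only as an example):

     IX86_BUILTIN_<first-entry-code>,
     IX86_BUILTIN__BDESC_ARGS_FIRST = IX86_BUILTIN_<first-entry-code>,
     IX86_BUILTIN_<second-entry-code>,
     ...

   i.e. every bdesc entry contributes one enumerator and each bdesc_*
   section gets a _FIRST alias; the second include then defines each
   _LAST alias as the following section's _FIRST - 1.  */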
32081 #define BDESC(mask, icode, name, code, comparison, flag) \
32082 code,
32083 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
32084 code, \
32085 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
32086 #define BDESC_END(kind, next_kind)
32088 #include "i386-builtin.def"
32090 #undef BDESC
32091 #undef BDESC_FIRST
32092 #undef BDESC_END
32094 IX86_BUILTIN_MAX,
32096 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
32098 /* Now just the aliases for bdesc_* start/end. */
32099 #define BDESC(mask, icode, name, code, comparison, flag)
32100 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
32101 #define BDESC_END(kind, next_kind) \
32102 IX86_BUILTIN__BDESC_##kind##_LAST \
32103 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
32105 #include "i386-builtin.def"
32107 #undef BDESC
32108 #undef BDESC_FIRST
32109 #undef BDESC_END
32111 /* Just to make sure there is no comma after the last enumerator. */
32112 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
32115 /* Table for the ix86 builtin decls. */
32116 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
32118 /* Table of all of the builtin functions that are possible with different ISAs
32119 but are waiting to be built until a function is declared to use that
32120 ISA. */
32121 struct builtin_isa {
32122 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
32123 HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
32124 const char *name; /* function name */
32125 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
32126 unsigned char const_p:1; /* true if the declaration is constant */
32127 unsigned char pure_p:1; /* true if the declaration has pure attribute */
32128 bool leaf_p; /* true if the declaration has leaf attribute */
32129 bool nothrow_p; /* true if the declaration has nothrow attribute */
32130 bool set_and_not_built_p; /* true if the builtin was deferred and has not been built yet */
32133 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
32135 /* Bits that can still enable the inclusion of a deferred builtin. */
32136 static HOST_WIDE_INT deferred_isa_values = 0;
32137 static HOST_WIDE_INT deferred_isa_values2 = 0;
32139 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
32140 of which isa_flags to use in the ix86_builtins_isa array. Stores the
32141 function decl in the ix86_builtins array. Returns the function decl or
32142 NULL_TREE, if the builtin was not added.
32144 If the front end has a special hook for builtin functions, delay adding
32145 builtin functions that aren't in the current ISA until the ISA is changed
32146 with function specific optimization. Doing so can save about 300K for the
32147 default compiler. When the builtin is expanded, check at that time whether
32148 it is valid.
32150 If the front end doesn't have a special hook, record all builtins, even if
32151 they aren't enabled in the current ISA, in case the user uses
32152 function specific options for a different ISA, so that we don't get scope
32153 errors if a builtin is added in the middle of a function scope. */
32155 static inline tree
32156 def_builtin (HOST_WIDE_INT mask, const char *name,
32157 enum ix86_builtin_func_type tcode,
32158 enum ix86_builtins code)
32160 tree decl = NULL_TREE;
32162 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
32164 ix86_builtins_isa[(int) code].isa = mask;
32166 /* OPTION_MASK_ISA_AVX512VL has a special meaning. Unlike the generic case,
32167 where any set bit means the built-in is enabled, this bit must be *and-ed*
32168 with another one. E.g.: OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL
32169 means that *both* cpuid bits must be set for the built-in to be available.
32170 Handle this here. */
32171 if (mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
32172 mask &= ~OPTION_MASK_ISA_AVX512VL;
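/* Illustrative example (editorial): if MASK is OPTION_MASK_ISA_AVX512DQ
   | OPTION_MASK_ISA_AVX512VL and AVX512VL is enabled in ix86_isa_flags,
   the VL bit has just been cleared from MASK, so the test below
   effectively requires the remaining AVX512DQ bit as well.  */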
32174 mask &= ~OPTION_MASK_ISA_64BIT;
32175 if (mask == 0
32176 || (mask & ix86_isa_flags) != 0
32177 || (lang_hooks.builtin_function
32178 == lang_hooks.builtin_function_ext_scope))
32181 tree type = ix86_get_builtin_func_type (tcode);
32182 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
32183 NULL, NULL_TREE);
32184 ix86_builtins[(int) code] = decl;
32185 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
32187 else
32189 /* Only a MASK recorded while set_and_not_built_p == true can potentially
32190 include a builtin later. */
32191 deferred_isa_values |= mask;
32192 ix86_builtins[(int) code] = NULL_TREE;
32193 ix86_builtins_isa[(int) code].tcode = tcode;
32194 ix86_builtins_isa[(int) code].name = name;
32195 ix86_builtins_isa[(int) code].leaf_p = false;
32196 ix86_builtins_isa[(int) code].nothrow_p = false;
32197 ix86_builtins_isa[(int) code].const_p = false;
32198 ix86_builtins_isa[(int) code].pure_p = false;
32199 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
32203 return decl;
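/* Usage sketch (editorial; the builtin name and enumerator below are
   hypothetical):
     def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_foo",
                  INT_FTYPE_V4SF_V4SF, IX86_BUILTIN_FOO);
   either registers the decl immediately when SSE2 is enabled, or records
   the request in ix86_builtins_isa[] so that ix86_add_new_builtins can
   create it later, e.g. for a function using a target("sse2") attribute.  */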
32206 /* Like def_builtin, but also marks the function decl "const". */
32208 static inline tree
32209 def_builtin_const (HOST_WIDE_INT mask, const char *name,
32210 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32212 tree decl = def_builtin (mask, name, tcode, code);
32213 if (decl)
32214 TREE_READONLY (decl) = 1;
32215 else
32216 ix86_builtins_isa[(int) code].const_p = true;
32218 return decl;
32221 /* Like def_builtin, but also marks the function decl "pure". */
32223 static inline tree
32224 def_builtin_pure (HOST_WIDE_INT mask, const char *name,
32225 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32227 tree decl = def_builtin (mask, name, tcode, code);
32228 if (decl)
32229 DECL_PURE_P (decl) = 1;
32230 else
32231 ix86_builtins_isa[(int) code].pure_p = true;
32233 return decl;
32236 /* Like def_builtin, but for additional isa2 flags. */
32238 static inline tree
32239 def_builtin2 (HOST_WIDE_INT mask, const char *name,
32240 enum ix86_builtin_func_type tcode,
32241 enum ix86_builtins code)
32243 tree decl = NULL_TREE;
32245 ix86_builtins_isa[(int) code].isa2 = mask;
32247 if (mask == 0
32248 || (mask & ix86_isa_flags2) != 0
32249 || (lang_hooks.builtin_function
32250 == lang_hooks.builtin_function_ext_scope))
32253 tree type = ix86_get_builtin_func_type (tcode);
32254 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
32255 NULL, NULL_TREE);
32256 ix86_builtins[(int) code] = decl;
32257 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
32259 else
32261 /* Only a MASK recorded while set_and_not_built_p == true can potentially
32262 include a builtin later. */
32263 deferred_isa_values2 |= mask;
32264 ix86_builtins[(int) code] = NULL_TREE;
32265 ix86_builtins_isa[(int) code].tcode = tcode;
32266 ix86_builtins_isa[(int) code].name = name;
32267 ix86_builtins_isa[(int) code].leaf_p = false;
32268 ix86_builtins_isa[(int) code].nothrow_p = false;
32269 ix86_builtins_isa[(int) code].const_p = false;
32270 ix86_builtins_isa[(int) code].pure_p = false;
32271 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
32274 return decl;
32277 /* Like def_builtin, but also marks the function decl "const". */
32279 static inline tree
32280 def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
32281 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32283 tree decl = def_builtin2 (mask, name, tcode, code);
32284 if (decl)
32285 TREE_READONLY (decl) = 1;
32286 else
32287 ix86_builtins_isa[(int) code].const_p = true;
32289 return decl;
32292 /* Like def_builtin, but also marks the function decl "pure". */
32294 static inline tree
32295 def_builtin_pure2 (HOST_WIDE_INT mask, const char *name,
32296 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
32298 tree decl = def_builtin2 (mask, name, tcode, code);
32299 if (decl)
32300 DECL_PURE_P (decl) = 1;
32301 else
32302 ix86_builtins_isa[(int) code].pure_p = true;
32304 return decl;
32307 /* Add any new builtin functions for a given ISA that may not have been
32308 declared. This saves a bit of space compared to adding all of the
32309 declarations to the tree up front, whether or not they are used. */
32311 static void
32312 ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
32314 if ((isa & deferred_isa_values) == 0
32315 && (isa2 & deferred_isa_values2) == 0)
32316 return;
32318 /* Bits in ISA value can be removed from potential isa values. */
32319 deferred_isa_values &= ~isa;
32320 deferred_isa_values2 &= ~isa2;
32322 int i;
32323 tree saved_current_target_pragma = current_target_pragma;
32324 current_target_pragma = NULL_TREE;
32326 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
32328 if (((ix86_builtins_isa[i].isa & isa) != 0
32329 || (ix86_builtins_isa[i].isa2 & isa2) != 0)
32330 && ix86_builtins_isa[i].set_and_not_built_p)
32332 tree decl, type;
32334 /* Don't define the builtin again. */
32335 ix86_builtins_isa[i].set_and_not_built_p = false;
32337 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
32338 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
32339 type, i, BUILT_IN_MD, NULL,
32340 NULL_TREE);
32342 ix86_builtins[i] = decl;
32343 if (ix86_builtins_isa[i].const_p)
32344 TREE_READONLY (decl) = 1;
32345 if (ix86_builtins_isa[i].pure_p)
32346 DECL_PURE_P (decl) = 1;
32347 if (ix86_builtins_isa[i].leaf_p)
32348 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
32349 NULL_TREE);
32350 if (ix86_builtins_isa[i].nothrow_p)
32351 TREE_NOTHROW (decl) = 1;
32355 current_target_pragma = saved_current_target_pragma;
32358 /* Bits for builtin_description.flag. */
32360 /* Set when we don't support the comparison natively, and should
32361 swap_comparison in order to support it. */
32362 #define BUILTIN_DESC_SWAP_OPERANDS 1
32364 struct builtin_description
32366 const HOST_WIDE_INT mask;
32367 const enum insn_code icode;
32368 const char *const name;
32369 const enum ix86_builtins code;
32370 const enum rtx_code comparison;
32371 const int flag;
32374 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
32375 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
32376 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
32377 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
32378 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
32379 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
32380 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
32381 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
32382 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
32383 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
32384 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
32385 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
32386 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
32387 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
32388 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
32389 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
32390 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
32391 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
32392 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
32393 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
32394 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
32395 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
32396 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
32397 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
32398 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
32399 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
32400 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
32401 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
32402 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
32403 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
32404 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
32405 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
32406 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
32407 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
32408 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
32409 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
32410 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
32411 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
32412 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
32413 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
32414 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
32415 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
32416 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
32417 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
32418 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
32419 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
32420 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
32421 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
32422 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
32423 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
32424 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
32425 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
32427 #define BDESC(mask, icode, name, code, comparison, flag) \
32428 { mask, icode, name, code, comparison, flag },
32429 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
32430 static const struct builtin_description bdesc_##kind[] = \
32432 BDESC (mask, icode, name, code, comparison, flag)
32433 #define BDESC_END(kind, next_kind) \
32436 #include "i386-builtin.def"
32438 #undef BDESC
32439 #undef BDESC_FIRST
32440 #undef BDESC_END
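/* Editorial illustration: this pass over i386-builtin.def expands
   BDESC_FIRST/BDESC into one array per kind, roughly of the form
     static const struct builtin_description bdesc_comi[] =
       { { mask, icode, name, code, comparison, flag }, ... };
   with entries in exactly the same order as the corresponding
   IX86_BUILTIN_* enumerators generated earlier.  */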
32442 /* TM vector builtins. */
32444 /* Reuse the existing x86-specific `struct builtin_description' because
32445 we're lazy. Add casts to make them fit. */
32446 static const struct builtin_description bdesc_tm[] =
32448 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
32449 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
32450 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
32451 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32452 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32453 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32454 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
32456 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
32457 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
32458 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
32459 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32460 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32461 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32462 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
32464 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
32465 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
32466 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
32467 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32468 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32469 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32470 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
32472 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
32473 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
32474 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
32477 /* Initialize the transactional memory vector load/store builtins. */
32479 static void
32480 ix86_init_tm_builtins (void)
32482 enum ix86_builtin_func_type ftype;
32483 const struct builtin_description *d;
32484 size_t i;
32485 tree decl;
32486 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
32487 tree attrs_log, attrs_type_log;
32489 if (!flag_tm)
32490 return;
32492 /* If there are no builtins defined, we must be compiling in a
32493 language without trans-mem support. */
32494 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
32495 return;
32497 /* Use whatever attributes a normal TM load has. */
32498 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
32499 attrs_load = DECL_ATTRIBUTES (decl);
32500 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
32501 /* Use whatever attributes a normal TM store has. */
32502 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
32503 attrs_store = DECL_ATTRIBUTES (decl);
32504 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
32505 /* Use whatever attributes a normal TM log has. */
32506 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
32507 attrs_log = DECL_ATTRIBUTES (decl);
32508 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
32510 for (i = 0, d = bdesc_tm;
32511 i < ARRAY_SIZE (bdesc_tm);
32512 i++, d++)
32514 if ((d->mask & ix86_isa_flags) != 0
32515 || (lang_hooks.builtin_function
32516 == lang_hooks.builtin_function_ext_scope))
32518 tree type, attrs, attrs_type;
32519 enum built_in_function code = (enum built_in_function) d->code;
32521 ftype = (enum ix86_builtin_func_type) d->flag;
32522 type = ix86_get_builtin_func_type (ftype);
32524 if (BUILTIN_TM_LOAD_P (code))
32526 attrs = attrs_load;
32527 attrs_type = attrs_type_load;
32529 else if (BUILTIN_TM_STORE_P (code))
32531 attrs = attrs_store;
32532 attrs_type = attrs_type_store;
32534 else
32536 attrs = attrs_log;
32537 attrs_type = attrs_type_log;
32539 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
32540 /* The builtin without the prefix for
32541 calling it directly. */
32542 d->name + strlen ("__builtin_"),
32543 attrs);
32544 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
32545 set the TYPE_ATTRIBUTES. */
32546 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
32548 set_builtin_decl (code, decl, false);
32553 /* Macros for verification of enum ix86_builtins order. */
32554 #define BDESC_VERIFY(x, y, z) \
32555 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
32556 #define BDESC_VERIFYS(x, y, z) \
32557 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
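/* Editorial note: BDESC_VERIFY is a runtime check (gcc_checking_assert,
   active only in checking builds) used inside the init loops below, while
   BDESC_VERIFYS is a compile-time STATIC_ASSERT used at file scope to pin
   each kind's _FIRST value to one past the previous kind's _LAST value.  */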
32559 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
32560 IX86_BUILTIN__BDESC_COMI_LAST, 1);
32561 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
32562 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
32563 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
32564 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
32565 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
32566 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
32567 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
32568 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
32569 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
32570 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
32571 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
32572 IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
32573 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
32574 IX86_BUILTIN__BDESC_MPX_LAST, 1);
32575 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
32576 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
32577 BDESC_VERIFYS (IX86_BUILTIN_MAX,
32578 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
32580 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
32581 in the current target ISA, to allow the user to compile particular modules
32582 with different target specific options that differ from the command line
32583 options. */
32584 static void
32585 ix86_init_mmx_sse_builtins (void)
32587 const struct builtin_description * d;
32588 enum ix86_builtin_func_type ftype;
32589 size_t i;
32591 /* Add all special builtins with variable number of operands. */
32592 for (i = 0, d = bdesc_special_args;
32593 i < ARRAY_SIZE (bdesc_special_args);
32594 i++, d++)
32596 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
32597 if (d->name == 0)
32598 continue;
32600 ftype = (enum ix86_builtin_func_type) d->flag;
32601 def_builtin (d->mask, d->name, ftype, d->code);
32603 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
32604 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
32605 ARRAY_SIZE (bdesc_special_args) - 1);
32607 /* Add all builtins with variable number of operands. */
32608 for (i = 0, d = bdesc_args;
32609 i < ARRAY_SIZE (bdesc_args);
32610 i++, d++)
32612 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
32613 if (d->name == 0)
32614 continue;
32616 ftype = (enum ix86_builtin_func_type) d->flag;
32617 def_builtin_const (d->mask, d->name, ftype, d->code);
32619 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
32620 IX86_BUILTIN__BDESC_ARGS_FIRST,
32621 ARRAY_SIZE (bdesc_args) - 1);
32623 /* Add all builtins with variable number of operands. */
32624 for (i = 0, d = bdesc_args2;
32625 i < ARRAY_SIZE (bdesc_args2);
32626 i++, d++)
32628 if (d->name == 0)
32629 continue;
32631 ftype = (enum ix86_builtin_func_type) d->flag;
32632 def_builtin_const2 (d->mask, d->name, ftype, d->code);
32635 /* Add all builtins with rounding. */
32636 for (i = 0, d = bdesc_round_args;
32637 i < ARRAY_SIZE (bdesc_round_args);
32638 i++, d++)
32640 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
32641 if (d->name == 0)
32642 continue;
32644 ftype = (enum ix86_builtin_func_type) d->flag;
32645 def_builtin_const (d->mask, d->name, ftype, d->code);
32647 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
32648 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
32649 ARRAY_SIZE (bdesc_round_args) - 1);
32651 /* pcmpestr[im] insns. */
32652 for (i = 0, d = bdesc_pcmpestr;
32653 i < ARRAY_SIZE (bdesc_pcmpestr);
32654 i++, d++)
32656 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
32657 if (d->code == IX86_BUILTIN_PCMPESTRM128)
32658 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
32659 else
32660 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
32661 def_builtin_const (d->mask, d->name, ftype, d->code);
32663 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
32664 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
32665 ARRAY_SIZE (bdesc_pcmpestr) - 1);
32667 /* pcmpistr[im] insns. */
32668 for (i = 0, d = bdesc_pcmpistr;
32669 i < ARRAY_SIZE (bdesc_pcmpistr);
32670 i++, d++)
32672 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
32673 if (d->code == IX86_BUILTIN_PCMPISTRM128)
32674 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
32675 else
32676 ftype = INT_FTYPE_V16QI_V16QI_INT;
32677 def_builtin_const (d->mask, d->name, ftype, d->code);
32679 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
32680 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
32681 ARRAY_SIZE (bdesc_pcmpistr) - 1);
32683 /* comi/ucomi insns. */
32684 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
32686 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
32687 if (d->mask == OPTION_MASK_ISA_SSE2)
32688 ftype = INT_FTYPE_V2DF_V2DF;
32689 else
32690 ftype = INT_FTYPE_V4SF_V4SF;
32691 def_builtin_const (d->mask, d->name, ftype, d->code);
32693 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
32694 IX86_BUILTIN__BDESC_COMI_FIRST,
32695 ARRAY_SIZE (bdesc_comi) - 1);
32697 /* SSE */
32698 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
32699 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
32700 def_builtin_pure (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
32701 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
32703 /* SSE or 3DNow!A */
32704 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
32705 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
32706 IX86_BUILTIN_MASKMOVQ);
32708 /* SSE2 */
32709 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
32710 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
32712 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
32713 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
32714 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
32715 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
32717 /* SSE3. */
32718 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
32719 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
32720 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
32721 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
32723 /* AES */
32724 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
32725 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
32726 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
32727 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
32728 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
32729 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
32730 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
32731 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
32732 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
32733 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
32734 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
32735 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
32737 /* PCLMUL */
32738 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
32739 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
32741 /* RDRND */
32742 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
32743 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
32744 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
32745 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
32746 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
32747 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
32748 IX86_BUILTIN_RDRAND64_STEP);
32750 /* AVX2 */
32751 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
32752 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
32753 IX86_BUILTIN_GATHERSIV2DF);
32755 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
32756 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
32757 IX86_BUILTIN_GATHERSIV4DF);
32759 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
32760 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
32761 IX86_BUILTIN_GATHERDIV2DF);
32763 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
32764 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
32765 IX86_BUILTIN_GATHERDIV4DF);
32767 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
32768 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
32769 IX86_BUILTIN_GATHERSIV4SF);
32771 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
32772 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
32773 IX86_BUILTIN_GATHERSIV8SF);
32775 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
32776 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
32777 IX86_BUILTIN_GATHERDIV4SF);
32779 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
32780 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
32781 IX86_BUILTIN_GATHERDIV8SF);
32783 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
32784 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
32785 IX86_BUILTIN_GATHERSIV2DI);
32787 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
32788 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
32789 IX86_BUILTIN_GATHERSIV4DI);
32791 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
32792 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
32793 IX86_BUILTIN_GATHERDIV2DI);
32795 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
32796 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
32797 IX86_BUILTIN_GATHERDIV4DI);
32799 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
32800 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
32801 IX86_BUILTIN_GATHERSIV4SI);
32803 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
32804 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
32805 IX86_BUILTIN_GATHERSIV8SI);
32807 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
32808 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
32809 IX86_BUILTIN_GATHERDIV4SI);
32811 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
32812 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
32813 IX86_BUILTIN_GATHERDIV8SI);
32815 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
32816 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
32817 IX86_BUILTIN_GATHERALTSIV4DF);
32819 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
32820 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
32821 IX86_BUILTIN_GATHERALTDIV8SF);
32823 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
32824 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
32825 IX86_BUILTIN_GATHERALTSIV4DI);
32827 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
32828 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
32829 IX86_BUILTIN_GATHERALTDIV8SI);
32831 /* AVX512F */
32832 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
32833 V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT,
32834 IX86_BUILTIN_GATHER3SIV16SF);
32836 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
32837 V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT,
32838 IX86_BUILTIN_GATHER3SIV8DF);
32840 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
32841 V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT,
32842 IX86_BUILTIN_GATHER3DIV16SF);
32844 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
32845 V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT,
32846 IX86_BUILTIN_GATHER3DIV8DF);
32848 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
32849 V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT,
32850 IX86_BUILTIN_GATHER3SIV16SI);
32852 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
32853 V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT,
32854 IX86_BUILTIN_GATHER3SIV8DI);
32856 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
32857 V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT,
32858 IX86_BUILTIN_GATHER3DIV16SI);
32860 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
32861 V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT,
32862 IX86_BUILTIN_GATHER3DIV8DI);
32864 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
32865 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
32866 IX86_BUILTIN_GATHER3ALTSIV8DF);
32868 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
32869 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
32870 IX86_BUILTIN_GATHER3ALTDIV16SF);
32872 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
32873 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
32874 IX86_BUILTIN_GATHER3ALTSIV8DI);
32876 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
32877 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
32878 IX86_BUILTIN_GATHER3ALTDIV16SI);
32880 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
32881 VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT,
32882 IX86_BUILTIN_SCATTERSIV16SF);
32884 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
32885 VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT,
32886 IX86_BUILTIN_SCATTERSIV8DF);
32888 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
32889 VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT,
32890 IX86_BUILTIN_SCATTERDIV16SF);
32892 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
32893 VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT,
32894 IX86_BUILTIN_SCATTERDIV8DF);
32896 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
32897 VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT,
32898 IX86_BUILTIN_SCATTERSIV16SI);
32900 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
32901 VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT,
32902 IX86_BUILTIN_SCATTERSIV8DI);
32904 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
32905 VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT,
32906 IX86_BUILTIN_SCATTERDIV16SI);
32908 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
32909 VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT,
32910 IX86_BUILTIN_SCATTERDIV8DI);
32912 /* AVX512VL */
32913 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
32914 V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT,
32915 IX86_BUILTIN_GATHER3SIV2DF);
32917 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
32918 V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT,
32919 IX86_BUILTIN_GATHER3SIV4DF);
32921 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
32922 V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT,
32923 IX86_BUILTIN_GATHER3DIV2DF);
32925 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
32926 V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT,
32927 IX86_BUILTIN_GATHER3DIV4DF);
32929 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
32930 V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT,
32931 IX86_BUILTIN_GATHER3SIV4SF);
32933 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
32934 V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT,
32935 IX86_BUILTIN_GATHER3SIV8SF);
32937 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
32938 V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT,
32939 IX86_BUILTIN_GATHER3DIV4SF);
32941 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
32942 V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT,
32943 IX86_BUILTIN_GATHER3DIV8SF);
32945 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
32946 V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT,
32947 IX86_BUILTIN_GATHER3SIV2DI);
32949 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
32950 V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT,
32951 IX86_BUILTIN_GATHER3SIV4DI);
32953 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
32954 V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT,
32955 IX86_BUILTIN_GATHER3DIV2DI);
32957 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
32958 V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT,
32959 IX86_BUILTIN_GATHER3DIV4DI);
32961 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
32962 V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT,
32963 IX86_BUILTIN_GATHER3SIV4SI);
32965 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
32966 V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT,
32967 IX86_BUILTIN_GATHER3SIV8SI);
32969 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
32970 V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT,
32971 IX86_BUILTIN_GATHER3DIV4SI);
32973 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
32974 V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT,
32975 IX86_BUILTIN_GATHER3DIV8SI);
32977 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
32978 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
32979 IX86_BUILTIN_GATHER3ALTSIV4DF);
32981 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
32982 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
32983 IX86_BUILTIN_GATHER3ALTDIV8SF);
32985 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
32986 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
32987 IX86_BUILTIN_GATHER3ALTSIV4DI);
32989 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
32990 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
32991 IX86_BUILTIN_GATHER3ALTDIV8SI);
32993 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
32994 VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT,
32995 IX86_BUILTIN_SCATTERSIV8SF);
32997 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
32998 VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT,
32999 IX86_BUILTIN_SCATTERSIV4SF);
33001 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
33002 VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT,
33003 IX86_BUILTIN_SCATTERSIV4DF);
33005 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
33006 VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT,
33007 IX86_BUILTIN_SCATTERSIV2DF);
33009 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
33010 VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT,
33011 IX86_BUILTIN_SCATTERDIV8SF);
33013 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
33014 VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT,
33015 IX86_BUILTIN_SCATTERDIV4SF);
33017 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
33018 VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT,
33019 IX86_BUILTIN_SCATTERDIV4DF);
33021 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
33022 VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT,
33023 IX86_BUILTIN_SCATTERDIV2DF);
33025 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
33026 VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT,
33027 IX86_BUILTIN_SCATTERSIV8SI);
33029 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
33030 VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT,
33031 IX86_BUILTIN_SCATTERSIV4SI);
33033 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
33034 VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT,
33035 IX86_BUILTIN_SCATTERSIV4DI);
33037 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
33038 VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT,
33039 IX86_BUILTIN_SCATTERSIV2DI);
33041 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
33042 VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT,
33043 IX86_BUILTIN_SCATTERDIV8SI);
33045 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
33046 VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT,
33047 IX86_BUILTIN_SCATTERDIV4SI);
33049 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
33050 VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT,
33051 IX86_BUILTIN_SCATTERDIV4DI);
33053 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
33054 VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT,
33055 IX86_BUILTIN_SCATTERDIV2DI);
33056 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
33057 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
33058 IX86_BUILTIN_SCATTERALTSIV8DF);
33060 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
33061 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
33062 IX86_BUILTIN_SCATTERALTDIV16SF);
33064 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
33065 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
33066 IX86_BUILTIN_SCATTERALTSIV8DI);
33068 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
33069 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
33070 IX86_BUILTIN_SCATTERALTDIV16SI);
33072 /* AVX512PF */
33073 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
33074 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
33075 IX86_BUILTIN_GATHERPFDPD);
33076 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
33077 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
33078 IX86_BUILTIN_GATHERPFDPS);
33079 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
33080 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
33081 IX86_BUILTIN_GATHERPFQPD);
33082 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
33083 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
33084 IX86_BUILTIN_GATHERPFQPS);
33085 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
33086 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
33087 IX86_BUILTIN_SCATTERPFDPD);
33088 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
33089 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
33090 IX86_BUILTIN_SCATTERPFDPS);
33091 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
33092 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
33093 IX86_BUILTIN_SCATTERPFQPD);
33094 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
33095 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
33096 IX86_BUILTIN_SCATTERPFQPS);
33098 /* SHA */
33099 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
33100 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
33101 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
33102 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
33103 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
33104 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
33105 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
33106 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
33107 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
33108 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
33109 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
33110 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
33111 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
33112 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
33114 /* RTM. */
33115 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
33116 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
33118 /* MMX access to the vec_init patterns. */
33119 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
33120 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
33122 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
33123 V4HI_FTYPE_HI_HI_HI_HI,
33124 IX86_BUILTIN_VEC_INIT_V4HI);
33126 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
33127 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
33128 IX86_BUILTIN_VEC_INIT_V8QI);
33130 /* Access to the vec_extract patterns. */
33131 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
33132 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
33133 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
33134 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
33135 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
33136 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
33137 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
33138 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
33139 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
33140 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
33142 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
33143 "__builtin_ia32_vec_ext_v4hi",
33144 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
33146 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
33147 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
33149 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
33150 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
33152 /* Access to the vec_set patterns. */
33153 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
33154 "__builtin_ia32_vec_set_v2di",
33155 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
33157 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
33158 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
33160 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
33161 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
33163 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
33164 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
33166 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
33167 "__builtin_ia32_vec_set_v4hi",
33168 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
33170 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
33171 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
33173 /* RDSEED */
33174 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
33175 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
33176 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
33177 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
33178 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
33179 "__builtin_ia32_rdseed_di_step",
33180 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
33182 /* ADCX */
33183 def_builtin (0, "__builtin_ia32_addcarryx_u32",
33184 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
33185 def_builtin (OPTION_MASK_ISA_64BIT,
33186 "__builtin_ia32_addcarryx_u64",
33187 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
33188 IX86_BUILTIN_ADDCARRYX64);
33190 /* SBB */
33191 def_builtin (0, "__builtin_ia32_sbb_u32",
33192 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
33193 def_builtin (OPTION_MASK_ISA_64BIT,
33194 "__builtin_ia32_sbb_u64",
33195 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
33196 IX86_BUILTIN_SBB64);
33198 /* Read/write FLAGS. */
33199 def_builtin (0, "__builtin_ia32_readeflags_u32",
33200 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
33201 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
33202 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
33203 def_builtin (0, "__builtin_ia32_writeeflags_u32",
33204 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
33205 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
33206 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
33208 /* CLFLUSHOPT. */
33209 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
33210 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
33212 /* CLWB. */
33213 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
33214 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
33216 /* MONITORX and MWAITX. */
33217 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
33218 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
33219 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
33220 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
33222 /* CLZERO. */
33223 def_builtin (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
33224 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
33226 /* Add FMA4 multi-arg instructions. */
33227 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
33229 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
33230 if (d->name == 0)
33231 continue;
33233 ftype = (enum ix86_builtin_func_type) d->flag;
33234 def_builtin_const (d->mask, d->name, ftype, d->code);
33236 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
33237 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
33238 ARRAY_SIZE (bdesc_multi_arg) - 1);
33241 static void
33242 ix86_init_mpx_builtins ()
33244 const struct builtin_description * d;
33245 enum ix86_builtin_func_type ftype;
33246 tree decl;
33247 size_t i;
33249 for (i = 0, d = bdesc_mpx;
33250 i < ARRAY_SIZE (bdesc_mpx);
33251 i++, d++)
33253 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
33254 if (d->name == 0)
33255 continue;
33257 ftype = (enum ix86_builtin_func_type) d->flag;
33258 decl = def_builtin (d->mask, d->name, ftype, d->code);
33260 /* Without leaf and nothrow flags on MPX builtins,
33261 abnormal edges may follow their calls when setjmp
33262 is present in the function. Since we may have a lot
33263 of MPX builtin calls, this causes lots of useless
33264 edges and enormous PHI nodes. To avoid this we mark
33265 MPX builtins as leaf and nothrow. */
33266 if (decl)
33268 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
33269 NULL_TREE);
33270 TREE_NOTHROW (decl) = 1;
33272 else
33274 ix86_builtins_isa[(int)d->code].leaf_p = true;
33275 ix86_builtins_isa[(int)d->code].nothrow_p = true;
33278 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
33279 IX86_BUILTIN__BDESC_MPX_FIRST,
33280 ARRAY_SIZE (bdesc_mpx) - 1);
33282 for (i = 0, d = bdesc_mpx_const;
33283 i < ARRAY_SIZE (bdesc_mpx_const);
33284 i++, d++)
33286 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
33287 if (d->name == 0)
33288 continue;
33290 ftype = (enum ix86_builtin_func_type) d->flag;
33291 decl = def_builtin_const (d->mask, d->name, ftype, d->code);
33293 if (decl)
33295 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
33296 NULL_TREE);
33297 TREE_NOTHROW (decl) = 1;
33299 else
33301 ix86_builtins_isa[(int)d->code].leaf_p = true;
33302 ix86_builtins_isa[(int)d->code].nothrow_p = true;
33305 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
33306 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
33307 ARRAY_SIZE (bdesc_mpx_const) - 1);
33309 #undef BDESC_VERIFY
33310 #undef BDESC_VERIFYS
33312 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
33313 to return a pointer to VERSION_DECL if the outcome of the expression
33314 formed by PREDICATE_CHAIN is true. This function will be called during
33315 version dispatch to decide which function version to execute. It returns
33316 the basic block at the end, to which more conditions can be added. */
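/* Editorial sketch of the GIMPLE emitted for one version (names are
   illustrative):
     cond_1 = __builtin_cpu_supports ("feature");   <- one call per predicate
     and_1  = MIN_EXPR <cond_1, and_0>;             <- predicates ANDed via MIN
     if (and_1 > 0)
       return (void *) &some_version_decl;          <- true edge, bb2
     ...                                            <- false edge, bb3
   The returned block (bb3) is where the next version's condition is added.  */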
33318 static basic_block
33319 add_condition_to_bb (tree function_decl, tree version_decl,
33320 tree predicate_chain, basic_block new_bb)
33322 gimple *return_stmt;
33323 tree convert_expr, result_var;
33324 gimple *convert_stmt;
33325 gimple *call_cond_stmt;
33326 gimple *if_else_stmt;
33328 basic_block bb1, bb2, bb3;
33329 edge e12, e23;
33331 tree cond_var, and_expr_var = NULL_TREE;
33332 gimple_seq gseq;
33334 tree predicate_decl, predicate_arg;
33336 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
33338 gcc_assert (new_bb != NULL);
33339 gseq = bb_seq (new_bb);
33342 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
33343 build_fold_addr_expr (version_decl));
33344 result_var = create_tmp_var (ptr_type_node);
33345 convert_stmt = gimple_build_assign (result_var, convert_expr);
33346 return_stmt = gimple_build_return (result_var);
33348 if (predicate_chain == NULL_TREE)
33350 gimple_seq_add_stmt (&gseq, convert_stmt);
33351 gimple_seq_add_stmt (&gseq, return_stmt);
33352 set_bb_seq (new_bb, gseq);
33353 gimple_set_bb (convert_stmt, new_bb);
33354 gimple_set_bb (return_stmt, new_bb);
33355 pop_cfun ();
33356 return new_bb;
33359 while (predicate_chain != NULL)
33361 cond_var = create_tmp_var (integer_type_node);
33362 predicate_decl = TREE_PURPOSE (predicate_chain);
33363 predicate_arg = TREE_VALUE (predicate_chain);
33364 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
33365 gimple_call_set_lhs (call_cond_stmt, cond_var);
33367 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
33368 gimple_set_bb (call_cond_stmt, new_bb);
33369 gimple_seq_add_stmt (&gseq, call_cond_stmt);
33371 predicate_chain = TREE_CHAIN (predicate_chain);
33373 if (and_expr_var == NULL)
33374 and_expr_var = cond_var;
33375 else
33377 gimple *assign_stmt;
33378 /* Use MIN_EXPR to check whether any integer is zero:
33379 and_expr_var = min_expr <cond_var, and_expr_var>. */
33380 assign_stmt = gimple_build_assign (and_expr_var,
33381 build2 (MIN_EXPR, integer_type_node,
33382 cond_var, and_expr_var));
33384 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
33385 gimple_set_bb (assign_stmt, new_bb);
33386 gimple_seq_add_stmt (&gseq, assign_stmt);
33390 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
33391 integer_zero_node,
33392 NULL_TREE, NULL_TREE);
33393 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
33394 gimple_set_bb (if_else_stmt, new_bb);
33395 gimple_seq_add_stmt (&gseq, if_else_stmt);
33397 gimple_seq_add_stmt (&gseq, convert_stmt);
33398 gimple_seq_add_stmt (&gseq, return_stmt);
33399 set_bb_seq (new_bb, gseq);
33401 bb1 = new_bb;
33402 e12 = split_block (bb1, if_else_stmt);
33403 bb2 = e12->dest;
33404 e12->flags &= ~EDGE_FALLTHRU;
33405 e12->flags |= EDGE_TRUE_VALUE;
33407 e23 = split_block (bb2, return_stmt);
33409 gimple_set_bb (convert_stmt, bb2);
33410 gimple_set_bb (return_stmt, bb2);
33412 bb3 = e23->dest;
33413 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
33415 remove_edge (e23);
33416 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
33418 pop_cfun ();
33420 return bb3;
33423 /* This parses the attribute arguments to target in DECL and determines
33424 the right builtin to use to match the platform specification.
33425 It returns the priority value for this version decl. If PREDICATE_LIST
33426 is not NULL, it stores the list of cpu features that need to be checked
33427 before dispatching this function. */
33429 static unsigned int
33430 get_builtin_code_for_version (tree decl, tree *predicate_list)
33432 tree attrs;
33433 struct cl_target_option cur_target;
33434 tree target_node;
33435 struct cl_target_option *new_target;
33436 const char *arg_str = NULL;
33437 const char *attrs_str = NULL;
33438 char *tok_str = NULL;
33439 char *token;
33441 /* Priority of i386 features; a greater value means a higher priority. This is
33442 used to decide the order in which function dispatch must happen. For
33443 instance, a version specialized for SSE4.2 should be checked for dispatch
33444 before a version for SSE3, as SSE4.2 implies SSE3. */
33445 enum feature_priority
33447 P_ZERO = 0,
33448 P_MMX,
33449 P_SSE,
33450 P_SSE2,
33451 P_SSE3,
33452 P_SSSE3,
33453 P_PROC_SSSE3,
33454 P_SSE4_A,
33455 P_PROC_SSE4_A,
33456 P_SSE4_1,
33457 P_SSE4_2,
33458 P_PROC_SSE4_2,
33459 P_POPCNT,
33460 P_AES,
33461 P_PCLMUL,
33462 P_AVX,
33463 P_PROC_AVX,
33464 P_BMI,
33465 P_PROC_BMI,
33466 P_FMA4,
33467 P_XOP,
33468 P_PROC_XOP,
33469 P_FMA,
33470 P_PROC_FMA,
33471 P_BMI2,
33472 P_AVX2,
33473 P_PROC_AVX2,
33474 P_AVX512F,
33475 P_PROC_AVX512F
33478 enum feature_priority priority = P_ZERO;
33480 /* These are the target attribute strings for which a dispatcher is
33481 available, from fold_builtin_cpu. */
33483 static struct _feature_list
33485 const char *const name;
33486 const enum feature_priority priority;
33488 const feature_list[] =
33490 {"mmx", P_MMX},
33491 {"sse", P_SSE},
33492 {"sse2", P_SSE2},
33493 {"sse3", P_SSE3},
33494 {"sse4a", P_SSE4_A},
33495 {"ssse3", P_SSSE3},
33496 {"sse4.1", P_SSE4_1},
33497 {"sse4.2", P_SSE4_2},
33498 {"popcnt", P_POPCNT},
33499 {"aes", P_AES},
33500 {"pclmul", P_PCLMUL},
33501 {"avx", P_AVX},
33502 {"bmi", P_BMI},
33503 {"fma4", P_FMA4},
33504 {"xop", P_XOP},
33505 {"fma", P_FMA},
33506 {"bmi2", P_BMI2},
33507 {"avx2", P_AVX2},
33508 {"avx512f", P_AVX512F}
33512 static unsigned int NUM_FEATURES
33513 = sizeof (feature_list) / sizeof (struct _feature_list);
33515 unsigned int i;
33517 tree predicate_chain = NULL_TREE;
33518 tree predicate_decl, predicate_arg;
33520 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
33521 gcc_assert (attrs != NULL);
33523 attrs = TREE_VALUE (TREE_VALUE (attrs));
33525 gcc_assert (TREE_CODE (attrs) == STRING_CST);
33526 attrs_str = TREE_STRING_POINTER (attrs);
33528 /* Return priority zero for default function. */
33529 if (strcmp (attrs_str, "default") == 0)
33530 return 0;
33532 /* Handle arch= if specified. For priority, set it to be 1 more than
33533 the best instruction set the processor can handle. For instance, if
33534 there is a version for atom and a version for ssse3 (the highest ISA
33535 priority for atom), the atom version must be checked for dispatch
33536 before the ssse3 version. */
33537 if (strstr (attrs_str, "arch=") != NULL)
33539 cl_target_option_save (&cur_target, &global_options);
33540 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
33541 &global_options_set);
33543 gcc_assert (target_node);
33544 new_target = TREE_TARGET_OPTION (target_node);
33545 gcc_assert (new_target);
33547 if (new_target->arch_specified && new_target->arch > 0)
33549 switch (new_target->arch)
33551 case PROCESSOR_CORE2:
33552 arg_str = "core2";
33553 priority = P_PROC_SSSE3;
33554 break;
33555 case PROCESSOR_NEHALEM:
33556 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
33558 arg_str = "westmere";
33559 priority = P_AES;
33561 else
33563 /* We translate "arch=corei7" and "arch=nehalem" to
33564 "corei7" so that it will be mapped to M_INTEL_COREI7
33565 as cpu type to cover all M_INTEL_COREI7_XXXs. */
33566 arg_str = "corei7";
33567 priority = P_PROC_SSE4_2;
33569 break;
33570 case PROCESSOR_SANDYBRIDGE:
33571 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
33572 arg_str = "ivybridge";
33573 else
33574 arg_str = "sandybridge";
33575 priority = P_PROC_AVX;
33576 break;
33577 case PROCESSOR_HASWELL:
33578 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
33579 arg_str = "skylake-avx512";
33580 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
33581 arg_str = "skylake";
33582 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
33583 arg_str = "broadwell";
33584 else
33585 arg_str = "haswell";
33586 priority = P_PROC_AVX2;
33587 break;
33588 case PROCESSOR_BONNELL:
33589 arg_str = "bonnell";
33590 priority = P_PROC_SSSE3;
33591 break;
33592 case PROCESSOR_KNL:
33593 arg_str = "knl";
33594 priority = P_PROC_AVX512F;
33595 break;
33596 case PROCESSOR_SILVERMONT:
33597 arg_str = "silvermont";
33598 priority = P_PROC_SSE4_2;
33599 break;
33600 case PROCESSOR_AMDFAM10:
33601 arg_str = "amdfam10h";
33602 priority = P_PROC_SSE4_A;
33603 break;
33604 case PROCESSOR_BTVER1:
33605 arg_str = "btver1";
33606 priority = P_PROC_SSE4_A;
33607 break;
33608 case PROCESSOR_BTVER2:
33609 arg_str = "btver2";
33610 priority = P_PROC_BMI;
33611 break;
33612 case PROCESSOR_BDVER1:
33613 arg_str = "bdver1";
33614 priority = P_PROC_XOP;
33615 break;
33616 case PROCESSOR_BDVER2:
33617 arg_str = "bdver2";
33618 priority = P_PROC_FMA;
33619 break;
33620 case PROCESSOR_BDVER3:
33621 arg_str = "bdver3";
33622 priority = P_PROC_FMA;
33623 break;
33624 case PROCESSOR_BDVER4:
33625 arg_str = "bdver4";
33626 priority = P_PROC_AVX2;
33627 break;
33628 case PROCESSOR_ZNVER1:
33629 arg_str = "znver1";
33630 priority = P_PROC_AVX2;
33631 break;
33635 cl_target_option_restore (&global_options, &cur_target);
33637 if (predicate_list && arg_str == NULL)
33639 error_at (DECL_SOURCE_LOCATION (decl),
33640 "No dispatcher found for the versioning attributes");
33641 return 0;
33644 if (predicate_list)
33646 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
33647 /* For a C string literal the length includes the trailing NULL. */
33648 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
33649 predicate_chain = tree_cons (predicate_decl, predicate_arg,
33650 predicate_chain);
33654 /* Process feature name. */
33655 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
33656 strcpy (tok_str, attrs_str);
33657 token = strtok (tok_str, ",");
33658 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
33660 while (token != NULL)
33662 /* Do not process "arch=" */
33663 if (strncmp (token, "arch=", 5) == 0)
33665 token = strtok (NULL, ",");
33666 continue;
33668 for (i = 0; i < NUM_FEATURES; ++i)
33670 if (strcmp (token, feature_list[i].name) == 0)
33672 if (predicate_list)
33674 predicate_arg = build_string_literal (
33675 strlen (feature_list[i].name) + 1,
33676 feature_list[i].name);
33677 predicate_chain = tree_cons (predicate_decl, predicate_arg,
33678 predicate_chain);
33680 /* Find the maximum priority feature. */
33681 if (feature_list[i].priority > priority)
33682 priority = feature_list[i].priority;
33684 break;
33687 if (predicate_list && i == NUM_FEATURES)
33689 error_at (DECL_SOURCE_LOCATION (decl),
33690 "No dispatcher found for %s", token);
33691 return 0;
33693 token = strtok (NULL, ",");
33695 free (tok_str);
33697 if (predicate_list && predicate_chain == NULL_TREE)
33699 error_at (DECL_SOURCE_LOCATION (decl),
33700 "No dispatcher found for the versioning attributes : %s",
33701 attrs_str);
33702 return 0;
33704 else if (predicate_list)
33706 predicate_chain = nreverse (predicate_chain);
33707 *predicate_list = predicate_chain;
33710 return priority;
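/* Illustrative note on get_builtin_code_for_version above (example only,
   not taken from the original sources): for a version declared as

     __attribute__ ((target ("arch=haswell"))) int foo (void);

   the arch= clause selects priority P_PROC_AVX2 via the switch above,
   whereas a version declared with, say, target ("sse4.2,aes") receives
   the highest priority among the matching feature_list entries.  The
   returned numbers only need to impose a total order, so that the most
   specific version is tested first by the generated dispatcher.  */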
33713 /* This compares the priority of target features in function DECL1
33714 and DECL2. It returns positive value if DECL1 is higher priority,
33715 negative value if DECL2 is higher priority and 0 if they are the
33716 same. */
33718 static int
33719 ix86_compare_version_priority (tree decl1, tree decl2)
33721 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
33722 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
33724 return (int)priority1 - (int)priority2;
33727 /* V1 and V2 point to function versions with different priorities
33728 based on the target ISA. This function compares their priorities. */
33730 static int
33731 feature_compare (const void *v1, const void *v2)
33733 typedef struct _function_version_info
33735 tree version_decl;
33736 tree predicate_chain;
33737 unsigned int dispatch_priority;
33738 } function_version_info;
33740 const function_version_info c1 = *(const function_version_info *)v1;
33741 const function_version_info c2 = *(const function_version_info *)v2;
33742 return (c2.dispatch_priority - c1.dispatch_priority);
33745 /* This function generates the dispatch function for
33746 multi-versioned functions. DISPATCH_DECL is the function which will
33747 contain the dispatch logic. FNDECLS are the function choices for
33748 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
33749 in DISPATCH_DECL in which the dispatch code is generated. */
33751 static int
33752 dispatch_function_versions (tree dispatch_decl,
33753 void *fndecls_p,
33754 basic_block *empty_bb)
33756 tree default_decl;
33757 gimple *ifunc_cpu_init_stmt;
33758 gimple_seq gseq;
33759 int ix;
33760 tree ele;
33761 vec<tree> *fndecls;
33762 unsigned int num_versions = 0;
33763 unsigned int actual_versions = 0;
33764 unsigned int i;
33766 struct _function_version_info
33768 tree version_decl;
33769 tree predicate_chain;
33770 unsigned int dispatch_priority;
33771 }*function_version_info;
33773 gcc_assert (dispatch_decl != NULL
33774 && fndecls_p != NULL
33775 && empty_bb != NULL);
33777 /* fndecls_p is actually a vector. */
33778 fndecls = static_cast<vec<tree> *> (fndecls_p);
33780 /* At least one more version other than the default. */
33781 num_versions = fndecls->length ();
33782 gcc_assert (num_versions >= 2);
33784 function_version_info = (struct _function_version_info *)
33785 XNEWVEC (struct _function_version_info, (num_versions - 1));
33787 /* The first version in the vector is the default decl. */
33788 default_decl = (*fndecls)[0];
33790 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
33792 gseq = bb_seq (*empty_bb);
33793 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
33794 constructors, so explicitly call __builtin_cpu_init here. */
33795 ifunc_cpu_init_stmt = gimple_build_call_vec (
33796 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
33797 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
33798 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
33799 set_bb_seq (*empty_bb, gseq);
33801 pop_cfun ();
33804 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
33806 tree version_decl = ele;
33807 tree predicate_chain = NULL_TREE;
33808 unsigned int priority;
33809 /* Get attribute string, parse it and find the right predicate decl.
33810 The predicate function could be a lengthy combination of many
33811 features, like arch-type and various isa-variants. */
33812 priority = get_builtin_code_for_version (version_decl,
33813 &predicate_chain);
33815 if (predicate_chain == NULL_TREE)
33816 continue;
33818 function_version_info [actual_versions].version_decl = version_decl;
33819 function_version_info [actual_versions].predicate_chain
33820 = predicate_chain;
33821 function_version_info [actual_versions].dispatch_priority = priority;
33822 actual_versions++;
33825 /* Sort the versions according to descending order of dispatch priority. The
33826 priority is based on the ISA. This is not a perfect solution. There
33827 could still be ambiguity. If more than one function version is suitable
33828 to execute, which one should be dispatched? In the future, allow the user
33829 to specify a dispatch priority next to the version. */
33830 qsort (function_version_info, actual_versions,
33831 sizeof (struct _function_version_info), feature_compare);
33833 for (i = 0; i < actual_versions; ++i)
33834 *empty_bb = add_condition_to_bb (dispatch_decl,
33835 function_version_info[i].version_decl,
33836 function_version_info[i].predicate_chain,
33837 *empty_bb);
33839 /* Dispatch the default version at the end. */
33840 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
33841 NULL, *empty_bb);
33843 free (function_version_info);
33844 return 0;
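/* Usage sketch for dispatch_function_versions above (illustrative only;
   function multiversioning via the "target" attribute is a C++ feature):

     __attribute__ ((target ("default")))      int foo (void) { return 0; }
     __attribute__ ((target ("avx2")))         int foo (void) { return 1; }
     __attribute__ ((target ("arch=haswell"))) int foo (void) { return 2; }

   Calls to foo are routed through the generated dispatcher, which tests
   the non-default versions in decreasing priority order and falls back
   to the "default" definition.  */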
33847 /* This function changes the assembler name for functions that are
33848 versions. If DECL is a function version and has a "target"
33849 attribute, it appends the attribute string to its assembler name. */
33851 static tree
33852 ix86_mangle_function_version_assembler_name (tree decl, tree id)
33854 tree version_attr;
33855 const char *orig_name, *version_string;
33856 char *attr_str, *assembler_name;
33858 if (DECL_DECLARED_INLINE_P (decl)
33859 && lookup_attribute ("gnu_inline",
33860 DECL_ATTRIBUTES (decl)))
33861 error_at (DECL_SOURCE_LOCATION (decl),
33862 "Function versions cannot be marked as gnu_inline,"
33863 " bodies have to be generated");
33865 if (DECL_VIRTUAL_P (decl)
33866 || DECL_VINDEX (decl))
33867 sorry ("Virtual function multiversioning not supported");
33869 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
33871 /* target attribute string cannot be NULL. */
33872 gcc_assert (version_attr != NULL_TREE);
33874 orig_name = IDENTIFIER_POINTER (id);
33875 version_string
33876 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
33878 if (strcmp (version_string, "default") == 0)
33879 return id;
33881 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
33882 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
33884 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
33886 /* Allow assembler name to be modified if already set. */
33887 if (DECL_ASSEMBLER_NAME_SET_P (decl))
33888 SET_DECL_RTL (decl, NULL);
33890 tree ret = get_identifier (assembler_name);
33891 XDELETEVEC (attr_str);
33892 XDELETEVEC (assembler_name);
33893 return ret;
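/* Illustrative mangling example for the function above (the exact
   suffixes are an assumption based on sorted_attr_string, not verified
   for every attribute combination): a C++ version of "foo" declared
   with __attribute__ ((target ("avx2"))) gets the assembler name
   "_Z3foov.avx2", while target ("arch=haswell") yields a suffix along
   the lines of ".arch_haswell".  The "default" version keeps its
   original assembler name.  */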
33897 static tree
33898 ix86_mangle_decl_assembler_name (tree decl, tree id)
33900 /* For function version, add the target suffix to the assembler name. */
33901 if (TREE_CODE (decl) == FUNCTION_DECL
33902 && DECL_FUNCTION_VERSIONED (decl))
33903 id = ix86_mangle_function_version_assembler_name (decl, id);
33904 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
33905 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
33906 #endif
33908 return id;
33911 /* Make a dispatcher declaration for the multi-versioned function DECL.
33912 Calls to DECL function will be replaced with calls to the dispatcher
33913 by the front-end. Returns the decl of the dispatcher function. */
33915 static tree
33916 ix86_get_function_versions_dispatcher (void *decl)
33918 tree fn = (tree) decl;
33919 struct cgraph_node *node = NULL;
33920 struct cgraph_node *default_node = NULL;
33921 struct cgraph_function_version_info *node_v = NULL;
33922 struct cgraph_function_version_info *first_v = NULL;
33924 tree dispatch_decl = NULL;
33926 struct cgraph_function_version_info *default_version_info = NULL;
33928 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
33930 node = cgraph_node::get (fn);
33931 gcc_assert (node != NULL);
33933 node_v = node->function_version ();
33934 gcc_assert (node_v != NULL);
33936 if (node_v->dispatcher_resolver != NULL)
33937 return node_v->dispatcher_resolver;
33939 /* Find the default version and make it the first node. */
33940 first_v = node_v;
33941 /* Go to the beginning of the chain. */
33942 while (first_v->prev != NULL)
33943 first_v = first_v->prev;
33944 default_version_info = first_v;
33945 while (default_version_info != NULL)
33947 if (is_function_default_version
33948 (default_version_info->this_node->decl))
33949 break;
33950 default_version_info = default_version_info->next;
33953 /* If there is no default node, just return NULL. */
33954 if (default_version_info == NULL)
33955 return NULL;
33957 /* Make default info the first node. */
33958 if (first_v != default_version_info)
33960 default_version_info->prev->next = default_version_info->next;
33961 if (default_version_info->next)
33962 default_version_info->next->prev = default_version_info->prev;
33963 first_v->prev = default_version_info;
33964 default_version_info->next = first_v;
33965 default_version_info->prev = NULL;
33968 default_node = default_version_info->this_node;
33970 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
33971 if (targetm.has_ifunc_p ())
33973 struct cgraph_function_version_info *it_v = NULL;
33974 struct cgraph_node *dispatcher_node = NULL;
33975 struct cgraph_function_version_info *dispatcher_version_info = NULL;
33977 /* Right now, the dispatching is done via ifunc. */
33978 dispatch_decl = make_dispatcher_decl (default_node->decl);
33980 dispatcher_node = cgraph_node::get_create (dispatch_decl);
33981 gcc_assert (dispatcher_node != NULL);
33982 dispatcher_node->dispatcher_function = 1;
33983 dispatcher_version_info
33984 = dispatcher_node->insert_new_function_version ();
33985 dispatcher_version_info->next = default_version_info;
33986 dispatcher_node->definition = 1;
33988 /* Set the dispatcher for all the versions. */
33989 it_v = default_version_info;
33990 while (it_v != NULL)
33992 it_v->dispatcher_resolver = dispatch_decl;
33993 it_v = it_v->next;
33996 else
33997 #endif
33999 error_at (DECL_SOURCE_LOCATION (default_node->decl),
34000 "multiversioning needs ifunc which is not supported "
34001 "on this target");
34004 return dispatch_decl;
34007 /* Make the resolver function decl to dispatch the versions of
34008 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is the
34009 ifunc alias that will point to the created resolver. Create an
34010 empty basic block in the resolver and store the pointer in
34011 EMPTY_BB. Return the decl of the resolver function. */
34013 static tree
34014 make_resolver_func (const tree default_decl,
34015 const tree ifunc_alias_decl,
34016 basic_block *empty_bb)
34018 char *resolver_name;
34019 tree decl, type, decl_name, t;
34021 /* IFUNCs have to be globally visible. So, if the default_decl is
34022 not, then the name of the IFUNC should be made unique. */
34023 if (TREE_PUBLIC (default_decl) == 0)
34025 char *ifunc_name = make_unique_name (default_decl, "ifunc", true);
34026 symtab->change_decl_assembler_name (ifunc_alias_decl,
34027 get_identifier (ifunc_name));
34028 XDELETEVEC (ifunc_name);
34031 resolver_name = make_unique_name (default_decl, "resolver", false);
34033 /* The resolver function should return a (void *). */
34034 type = build_function_type_list (ptr_type_node, NULL_TREE);
34036 decl = build_fn_decl (resolver_name, type);
34037 decl_name = get_identifier (resolver_name);
34038 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
34040 DECL_NAME (decl) = decl_name;
34041 TREE_USED (decl) = 1;
34042 DECL_ARTIFICIAL (decl) = 1;
34043 DECL_IGNORED_P (decl) = 1;
34044 TREE_PUBLIC (decl) = 0;
34045 DECL_UNINLINABLE (decl) = 1;
34047 /* The resolver is not external; its body is generated. */
34048 DECL_EXTERNAL (decl) = 0;
34049 DECL_EXTERNAL (ifunc_alias_decl) = 0;
34051 DECL_CONTEXT (decl) = NULL_TREE;
34052 DECL_INITIAL (decl) = make_node (BLOCK);
34053 DECL_STATIC_CONSTRUCTOR (decl) = 0;
34055 if (DECL_COMDAT_GROUP (default_decl)
34056 || TREE_PUBLIC (default_decl))
34058 /* In this case, each translation unit with a call to this
34059 versioned function will put out a resolver. Ensure it
34060 is comdat to keep just one copy. */
34061 DECL_COMDAT (decl) = 1;
34062 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
34064 /* Build result decl and add to function_decl. */
34065 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
34066 DECL_ARTIFICIAL (t) = 1;
34067 DECL_IGNORED_P (t) = 1;
34068 DECL_RESULT (decl) = t;
34070 gimplify_function_tree (decl);
34071 push_cfun (DECL_STRUCT_FUNCTION (decl));
34072 *empty_bb = init_lowered_empty_function (decl, false,
34073 profile_count::uninitialized ());
34075 cgraph_node::add_new_function (decl, true);
34076 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
34078 pop_cfun ();
34080 gcc_assert (ifunc_alias_decl != NULL);
34081 /* Mark ifunc_alias_decl as "ifunc" with its resolver set to resolver_name. */
34082 DECL_ATTRIBUTES (ifunc_alias_decl)
34083 = make_attribute ("ifunc", resolver_name,
34084 DECL_ATTRIBUTES (ifunc_alias_decl));
34086 /* Create the alias for dispatch to resolver here. */
34087 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
34088 XDELETEVEC (resolver_name);
34089 return decl;
34092 /* Generate the dispatching code body to dispatch multi-versioned function
34093 DECL. The target hook is called to process the "target" attributes and
34094 provide the code to dispatch the right function at run-time. NODE points
34095 to the dispatcher decl whose body will be created. */
34097 static tree
34098 ix86_generate_version_dispatcher_body (void *node_p)
34100 tree resolver_decl;
34101 basic_block empty_bb;
34102 tree default_ver_decl;
34103 struct cgraph_node *versn;
34104 struct cgraph_node *node;
34106 struct cgraph_function_version_info *node_version_info = NULL;
34107 struct cgraph_function_version_info *versn_info = NULL;
34109 node = (cgraph_node *)node_p;
34111 node_version_info = node->function_version ();
34112 gcc_assert (node->dispatcher_function
34113 && node_version_info != NULL);
34115 if (node_version_info->dispatcher_resolver)
34116 return node_version_info->dispatcher_resolver;
34118 /* The first version in the chain corresponds to the default version. */
34119 default_ver_decl = node_version_info->next->this_node->decl;
34121 /* node is going to be an alias, so remove the finalized bit. */
34122 node->definition = false;
34124 resolver_decl = make_resolver_func (default_ver_decl,
34125 node->decl, &empty_bb);
34127 node_version_info->dispatcher_resolver = resolver_decl;
34129 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
34131 auto_vec<tree, 2> fn_ver_vec;
34133 for (versn_info = node_version_info->next; versn_info;
34134 versn_info = versn_info->next)
34136 versn = versn_info->this_node;
34137 /* Check for virtual functions here again, as by this time it should
34138 have been determined if this function needs a vtable index or
34139 not. This happens for methods in derived classes that override
34140 virtual methods in base classes but are not explicitly marked as
34141 virtual. */
34142 if (DECL_VINDEX (versn->decl))
34143 sorry ("Virtual function multiversioning not supported");
34145 fn_ver_vec.safe_push (versn->decl);
34148 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
34149 cgraph_edge::rebuild_edges ();
34150 pop_cfun ();
34151 return resolver_decl;
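/* Rough shape of the resolver produced by the two functions above
   (illustrative pseudo-source only; the real body is built directly in
   GIMPLE and the names are schematic):

     void *foo.resolver (void)
     {
       __builtin_cpu_init ();
       if (__builtin_cpu_is ("haswell"))
         return foo.arch_haswell;
       if (__builtin_cpu_supports ("avx2"))
         return foo.avx2;
       return foo;   /+ the default version +/
     }

   The IFUNC alias for foo points at this resolver, so the selection is
   made once by the dynamic loader rather than on every call.  */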
34153 /* This builds the processor_model struct type defined in
34154 libgcc/config/i386/cpuinfo.c. */
34156 static tree
34157 build_processor_model_struct (void)
34159 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
34160 "__cpu_features"};
34161 tree field = NULL_TREE, field_chain = NULL_TREE;
34162 int i;
34163 tree type = make_node (RECORD_TYPE);
34165 /* The first 3 fields are unsigned int. */
34166 for (i = 0; i < 3; ++i)
34168 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
34169 get_identifier (field_name[i]), unsigned_type_node);
34170 if (field_chain != NULL_TREE)
34171 DECL_CHAIN (field) = field_chain;
34172 field_chain = field;
34175 /* The last field is an array of unsigned integers of size one. */
34176 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
34177 get_identifier (field_name[3]),
34178 build_array_type (unsigned_type_node,
34179 build_index_type (size_one_node)));
34180 if (field_chain != NULL_TREE)
34181 DECL_CHAIN (field) = field_chain;
34182 field_chain = field;
34184 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
34185 return type;
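/* For reference, the record built above is meant to stay
   layout-compatible with the variable defined in
   libgcc/config/i386/cpuinfo.c, roughly (treat as an assumption about
   that file rather than a definition here):

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };

   __builtin_cpu_is and __builtin_cpu_supports are folded into direct
   reads of the __cpu_model instance of this struct.  */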
34188 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
34190 static tree
34191 make_var_decl (tree type, const char *name)
34193 tree new_decl;
34195 new_decl = build_decl (UNKNOWN_LOCATION,
34196 VAR_DECL,
34197 get_identifier(name),
34198 type);
34200 DECL_EXTERNAL (new_decl) = 1;
34201 TREE_STATIC (new_decl) = 1;
34202 TREE_PUBLIC (new_decl) = 1;
34203 DECL_INITIAL (new_decl) = 0;
34204 DECL_ARTIFICIAL (new_decl) = 0;
34205 DECL_PRESERVE_P (new_decl) = 1;
34207 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
34208 assemble_variable (new_decl, 0, 0, 0);
34210 return new_decl;
34213 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
34214 into an integer defined in libgcc/config/i386/cpuinfo.c */
34216 static tree
34217 fold_builtin_cpu (tree fndecl, tree *args)
34219 unsigned int i;
34220 enum ix86_builtins fn_code = (enum ix86_builtins)
34221 DECL_FUNCTION_CODE (fndecl);
34222 tree param_string_cst = NULL;
34224 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
34225 enum processor_features
34227 F_CMOV = 0,
34228 F_MMX,
34229 F_POPCNT,
34230 F_SSE,
34231 F_SSE2,
34232 F_SSE3,
34233 F_SSSE3,
34234 F_SSE4_1,
34235 F_SSE4_2,
34236 F_AVX,
34237 F_AVX2,
34238 F_SSE4_A,
34239 F_FMA4,
34240 F_XOP,
34241 F_FMA,
34242 F_AVX512F,
34243 F_BMI,
34244 F_BMI2,
34245 F_AES,
34246 F_PCLMUL,
34247 F_AVX512VL,
34248 F_AVX512BW,
34249 F_AVX512DQ,
34250 F_AVX512CD,
34251 F_AVX512ER,
34252 F_AVX512PF,
34253 F_AVX512VBMI,
34254 F_AVX512IFMA,
34255 F_AVX5124VNNIW,
34256 F_AVX5124FMAPS,
34257 F_AVX512VPOPCNTDQ,
34258 F_MAX
34261 /* These are the values for vendor types and cpu types and subtypes
34262 in cpuinfo.c. Cpu types and subtypes must have the corresponding
34263 start value subtracted before being compared. */
34264 enum processor_model
34266 M_INTEL = 1,
34267 M_AMD,
34268 M_CPU_TYPE_START,
34269 M_INTEL_BONNELL,
34270 M_INTEL_CORE2,
34271 M_INTEL_COREI7,
34272 M_AMDFAM10H,
34273 M_AMDFAM15H,
34274 M_INTEL_SILVERMONT,
34275 M_INTEL_KNL,
34276 M_AMD_BTVER1,
34277 M_AMD_BTVER2,
34278 M_CPU_SUBTYPE_START,
34279 M_INTEL_COREI7_NEHALEM,
34280 M_INTEL_COREI7_WESTMERE,
34281 M_INTEL_COREI7_SANDYBRIDGE,
34282 M_AMDFAM10H_BARCELONA,
34283 M_AMDFAM10H_SHANGHAI,
34284 M_AMDFAM10H_ISTANBUL,
34285 M_AMDFAM15H_BDVER1,
34286 M_AMDFAM15H_BDVER2,
34287 M_AMDFAM15H_BDVER3,
34288 M_AMDFAM15H_BDVER4,
34289 M_AMDFAM17H_ZNVER1,
34290 M_INTEL_COREI7_IVYBRIDGE,
34291 M_INTEL_COREI7_HASWELL,
34292 M_INTEL_COREI7_BROADWELL,
34293 M_INTEL_COREI7_SKYLAKE,
34294 M_INTEL_COREI7_SKYLAKE_AVX512
34297 static struct _arch_names_table
34299 const char *const name;
34300 const enum processor_model model;
34302 const arch_names_table[] =
34304 {"amd", M_AMD},
34305 {"intel", M_INTEL},
34306 {"atom", M_INTEL_BONNELL},
34307 {"slm", M_INTEL_SILVERMONT},
34308 {"core2", M_INTEL_CORE2},
34309 {"corei7", M_INTEL_COREI7},
34310 {"nehalem", M_INTEL_COREI7_NEHALEM},
34311 {"westmere", M_INTEL_COREI7_WESTMERE},
34312 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
34313 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
34314 {"haswell", M_INTEL_COREI7_HASWELL},
34315 {"broadwell", M_INTEL_COREI7_BROADWELL},
34316 {"skylake", M_INTEL_COREI7_SKYLAKE},
34317 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
34318 {"bonnell", M_INTEL_BONNELL},
34319 {"silvermont", M_INTEL_SILVERMONT},
34320 {"knl", M_INTEL_KNL},
34321 {"amdfam10h", M_AMDFAM10H},
34322 {"barcelona", M_AMDFAM10H_BARCELONA},
34323 {"shanghai", M_AMDFAM10H_SHANGHAI},
34324 {"istanbul", M_AMDFAM10H_ISTANBUL},
34325 {"btver1", M_AMD_BTVER1},
34326 {"amdfam15h", M_AMDFAM15H},
34327 {"bdver1", M_AMDFAM15H_BDVER1},
34328 {"bdver2", M_AMDFAM15H_BDVER2},
34329 {"bdver3", M_AMDFAM15H_BDVER3},
34330 {"bdver4", M_AMDFAM15H_BDVER4},
34331 {"btver2", M_AMD_BTVER2},
34332 {"znver1", M_AMDFAM17H_ZNVER1},
34335 static struct _isa_names_table
34337 const char *const name;
34338 const enum processor_features feature;
34340 const isa_names_table[] =
34342 {"cmov", F_CMOV},
34343 {"mmx", F_MMX},
34344 {"popcnt", F_POPCNT},
34345 {"sse", F_SSE},
34346 {"sse2", F_SSE2},
34347 {"sse3", F_SSE3},
34348 {"ssse3", F_SSSE3},
34349 {"sse4a", F_SSE4_A},
34350 {"sse4.1", F_SSE4_1},
34351 {"sse4.2", F_SSE4_2},
34352 {"avx", F_AVX},
34353 {"fma4", F_FMA4},
34354 {"xop", F_XOP},
34355 {"fma", F_FMA},
34356 {"avx2", F_AVX2},
34357 {"avx512f", F_AVX512F},
34358 {"bmi", F_BMI},
34359 {"bmi2", F_BMI2},
34360 {"aes", F_AES},
34361 {"pclmul", F_PCLMUL},
34362 {"avx512vl",F_AVX512VL},
34363 {"avx512bw",F_AVX512BW},
34364 {"avx512dq",F_AVX512DQ},
34365 {"avx512cd",F_AVX512CD},
34366 {"avx512er",F_AVX512ER},
34367 {"avx512pf",F_AVX512PF},
34368 {"avx512vbmi",F_AVX512VBMI},
34369 {"avx512ifma",F_AVX512IFMA},
34370 {"avx5124vnniw",F_AVX5124VNNIW},
34371 {"avx5124fmaps",F_AVX5124FMAPS},
34372 {"avx512vpopcntdq",F_AVX512VPOPCNTDQ}
34375 tree __processor_model_type = build_processor_model_struct ();
34376 tree __cpu_model_var = make_var_decl (__processor_model_type,
34377 "__cpu_model");
34380 varpool_node::add (__cpu_model_var);
34382 gcc_assert ((args != NULL) && (*args != NULL));
34384 param_string_cst = *args;
34385 while (param_string_cst
34386 && TREE_CODE (param_string_cst) != STRING_CST)
34388 /* *args must be an expr that can contain other EXPRs leading to a
34389 STRING_CST. */
34390 if (!EXPR_P (param_string_cst))
34392 error ("Parameter to builtin must be a string constant or literal");
34393 return integer_zero_node;
34395 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
34398 gcc_assert (param_string_cst);
34400 if (fn_code == IX86_BUILTIN_CPU_IS)
34402 tree ref;
34403 tree field;
34404 tree final;
34406 unsigned int field_val = 0;
34407 unsigned int NUM_ARCH_NAMES
34408 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
34410 for (i = 0; i < NUM_ARCH_NAMES; i++)
34411 if (strcmp (arch_names_table[i].name,
34412 TREE_STRING_POINTER (param_string_cst)) == 0)
34413 break;
34415 if (i == NUM_ARCH_NAMES)
34417 error ("Parameter to builtin not valid: %s",
34418 TREE_STRING_POINTER (param_string_cst));
34419 return integer_zero_node;
34422 field = TYPE_FIELDS (__processor_model_type);
34423 field_val = arch_names_table[i].model;
34425 /* CPU types are stored in the next field. */
34426 if (field_val > M_CPU_TYPE_START
34427 && field_val < M_CPU_SUBTYPE_START)
34429 field = DECL_CHAIN (field);
34430 field_val -= M_CPU_TYPE_START;
34433 /* CPU subtypes are stored in the next field. */
34434 if (field_val > M_CPU_SUBTYPE_START)
34436 field = DECL_CHAIN ( DECL_CHAIN (field));
34437 field_val -= M_CPU_SUBTYPE_START;
34440 /* Get the appropriate field in __cpu_model. */
34441 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
34442 field, NULL_TREE);
34444 /* Check the value. */
34445 final = build2 (EQ_EXPR, unsigned_type_node, ref,
34446 build_int_cstu (unsigned_type_node, field_val));
34447 return build1 (CONVERT_EXPR, integer_type_node, final);
34449 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
34451 tree ref;
34452 tree array_elt;
34453 tree field;
34454 tree final;
34456 unsigned int field_val = 0;
34457 unsigned int NUM_ISA_NAMES
34458 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
34460 for (i = 0; i < NUM_ISA_NAMES; i++)
34461 if (strcmp (isa_names_table[i].name,
34462 TREE_STRING_POINTER (param_string_cst)) == 0)
34463 break;
34465 if (i == NUM_ISA_NAMES)
34467 error ("Parameter to builtin not valid: %s",
34468 TREE_STRING_POINTER (param_string_cst));
34469 return integer_zero_node;
34472 field = TYPE_FIELDS (__processor_model_type);
34473 /* Get the last field, which is __cpu_features. */
34474 while (DECL_CHAIN (field))
34475 field = DECL_CHAIN (field);
34477 /* Get the appropriate field: __cpu_model.__cpu_features */
34478 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
34479 field, NULL_TREE);
34481 /* Access the 0th element of __cpu_features array. */
34482 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
34483 integer_zero_node, NULL_TREE, NULL_TREE);
34485 field_val = (1 << isa_names_table[i].feature);
34486 /* Return __cpu_model.__cpu_features[0] & field_val */
34487 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
34488 build_int_cstu (unsigned_type_node, field_val));
34489 return build1 (CONVERT_EXPR, integer_type_node, final);
34491 gcc_unreachable ();
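/* Illustrative folding results for fold_builtin_cpu above (assuming the
   cpuinfo.c layout sketched earlier):

     __builtin_cpu_is ("haswell")
       -> (int) (__cpu_model.__cpu_subtype
                 == M_INTEL_COREI7_HASWELL - M_CPU_SUBTYPE_START)

     __builtin_cpu_supports ("avx2")
       -> (int) (__cpu_model.__cpu_features[0] & (1 << F_AVX2))

   Vendor names such as "intel" compare against the first field, cpu
   type names against the second, and subtype names against the third. */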
34494 static tree
34495 ix86_fold_builtin (tree fndecl, int n_args,
34496 tree *args, bool ignore ATTRIBUTE_UNUSED)
34498 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
34500 enum ix86_builtins fn_code = (enum ix86_builtins)
34501 DECL_FUNCTION_CODE (fndecl);
34502 switch (fn_code)
34504 case IX86_BUILTIN_CPU_IS:
34505 case IX86_BUILTIN_CPU_SUPPORTS:
34506 gcc_assert (n_args == 1);
34507 return fold_builtin_cpu (fndecl, args);
34509 case IX86_BUILTIN_NANQ:
34510 case IX86_BUILTIN_NANSQ:
34512 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34513 const char *str = c_getstr (*args);
34514 int quiet = fn_code == IX86_BUILTIN_NANQ;
34515 REAL_VALUE_TYPE real;
34517 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
34518 return build_real (type, real);
34519 return NULL_TREE;
34522 case IX86_BUILTIN_INFQ:
34523 case IX86_BUILTIN_HUGE_VALQ:
34525 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34526 REAL_VALUE_TYPE inf;
34527 real_inf (&inf);
34528 return build_real (type, inf);
34531 case IX86_BUILTIN_TZCNT16:
34532 case IX86_BUILTIN_CTZS:
34533 case IX86_BUILTIN_TZCNT32:
34534 case IX86_BUILTIN_TZCNT64:
34535 gcc_assert (n_args == 1);
34536 if (TREE_CODE (args[0]) == INTEGER_CST)
34538 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34539 tree arg = args[0];
34540 if (fn_code == IX86_BUILTIN_TZCNT16
34541 || fn_code == IX86_BUILTIN_CTZS)
34542 arg = fold_convert (short_unsigned_type_node, arg);
34543 if (integer_zerop (arg))
34544 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
34545 else
34546 return fold_const_call (CFN_CTZ, type, arg);
34548 break;
34550 case IX86_BUILTIN_LZCNT16:
34551 case IX86_BUILTIN_CLZS:
34552 case IX86_BUILTIN_LZCNT32:
34553 case IX86_BUILTIN_LZCNT64:
34554 gcc_assert (n_args == 1);
34555 if (TREE_CODE (args[0]) == INTEGER_CST)
34557 tree type = TREE_TYPE (TREE_TYPE (fndecl));
34558 tree arg = args[0];
34559 if (fn_code == IX86_BUILTIN_LZCNT16
34560 || fn_code == IX86_BUILTIN_CLZS)
34561 arg = fold_convert (short_unsigned_type_node, arg);
34562 if (integer_zerop (arg))
34563 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
34564 else
34565 return fold_const_call (CFN_CLZ, type, arg);
34567 break;
34569 case IX86_BUILTIN_BEXTR32:
34570 case IX86_BUILTIN_BEXTR64:
34571 case IX86_BUILTIN_BEXTRI32:
34572 case IX86_BUILTIN_BEXTRI64:
34573 gcc_assert (n_args == 2);
34574 if (tree_fits_uhwi_p (args[1]))
34576 unsigned HOST_WIDE_INT res = 0;
34577 unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
34578 unsigned int start = tree_to_uhwi (args[1]);
34579 unsigned int len = (start & 0xff00) >> 8;
34580 start &= 0xff;
34581 if (start >= prec || len == 0)
34582 res = 0;
34583 else if (!tree_fits_uhwi_p (args[0]))
34584 break;
34585 else
34586 res = tree_to_uhwi (args[0]) >> start;
34587 if (len > prec)
34588 len = prec;
34589 if (len < HOST_BITS_PER_WIDE_INT)
34590 res &= (HOST_WIDE_INT_1U << len) - 1;
34591 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34593 break;
34595 case IX86_BUILTIN_BZHI32:
34596 case IX86_BUILTIN_BZHI64:
34597 gcc_assert (n_args == 2);
34598 if (tree_fits_uhwi_p (args[1]))
34600 unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
34601 if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
34602 return args[0];
34603 if (!tree_fits_uhwi_p (args[0]))
34604 break;
34605 unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
34606 res &= ~(HOST_WIDE_INT_M1U << idx);
34607 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34609 break;
34611 case IX86_BUILTIN_PDEP32:
34612 case IX86_BUILTIN_PDEP64:
34613 gcc_assert (n_args == 2);
34614 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
34616 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
34617 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
34618 unsigned HOST_WIDE_INT res = 0;
34619 unsigned HOST_WIDE_INT m, k = 1;
34620 for (m = 1; m; m <<= 1)
34621 if ((mask & m) != 0)
34623 if ((src & k) != 0)
34624 res |= m;
34625 k <<= 1;
34627 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34629 break;
34631 case IX86_BUILTIN_PEXT32:
34632 case IX86_BUILTIN_PEXT64:
34633 gcc_assert (n_args == 2);
34634 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
34636 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
34637 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
34638 unsigned HOST_WIDE_INT res = 0;
34639 unsigned HOST_WIDE_INT m, k = 1;
34640 for (m = 1; m; m <<= 1)
34641 if ((mask & m) != 0)
34643 if ((src & m) != 0)
34644 res |= k;
34645 k <<= 1;
34647 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
34649 break;
34651 default:
34652 break;
34656 #ifdef SUBTARGET_FOLD_BUILTIN
34657 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
34658 #endif
34660 return NULL_TREE;
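/* Illustrative constant folds performed by ix86_fold_builtin above
   (values chosen for the example; the builtin spellings are assumed
   from the BMI/BMI2 intrinsic headers):

     __builtin_ia32_bextr_u32 (0x12345678, (8 << 8) | 4)
       -> (0x12345678 >> 4) & 0xff  ==  0x67
     __builtin_ia32_bzhi_si (x, 37)     with 37 >= 32
       -> x   (the index already covers the whole operand, so the value
               of x does not need to be known)

   Both mirror the semantics of the BEXTR and BZHI instructions, so the
   folds only fire when the deciding operands are INTEGER_CSTs.  */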
34663 /* Fold an MD builtin (use ix86_fold_builtin for folding into
34664 constant) in GIMPLE. */
34666 bool
34667 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
34669 gimple *stmt = gsi_stmt (*gsi);
34670 tree fndecl = gimple_call_fndecl (stmt);
34671 gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
34672 int n_args = gimple_call_num_args (stmt);
34673 enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
34674 tree decl = NULL_TREE;
34675 tree arg0, arg1;
34677 switch (fn_code)
34679 case IX86_BUILTIN_TZCNT32:
34680 decl = builtin_decl_implicit (BUILT_IN_CTZ);
34681 goto fold_tzcnt_lzcnt;
34683 case IX86_BUILTIN_TZCNT64:
34684 decl = builtin_decl_implicit (BUILT_IN_CTZLL);
34685 goto fold_tzcnt_lzcnt;
34687 case IX86_BUILTIN_LZCNT32:
34688 decl = builtin_decl_implicit (BUILT_IN_CLZ);
34689 goto fold_tzcnt_lzcnt;
34691 case IX86_BUILTIN_LZCNT64:
34692 decl = builtin_decl_implicit (BUILT_IN_CLZLL);
34693 goto fold_tzcnt_lzcnt;
34695 fold_tzcnt_lzcnt:
34696 gcc_assert (n_args == 1);
34697 arg0 = gimple_call_arg (stmt, 0);
34698 if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
34700 int prec = TYPE_PRECISION (TREE_TYPE (arg0));
34701 /* If arg0 is provably non-zero, optimize into generic
34702 __builtin_c[tl]z{,ll} function the middle-end handles
34703 better. */
34704 if (!expr_not_equal_to (arg0, wi::zero (prec)))
34705 return false;
34707 location_t loc = gimple_location (stmt);
34708 gimple *g = gimple_build_call (decl, 1, arg0);
34709 gimple_set_location (g, loc);
34710 tree lhs = make_ssa_name (integer_type_node);
34711 gimple_call_set_lhs (g, lhs);
34712 gsi_insert_before (gsi, g, GSI_SAME_STMT);
34713 g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
34714 gimple_set_location (g, loc);
34715 gsi_replace (gsi, g, false);
34716 return true;
34718 break;
34720 case IX86_BUILTIN_BZHI32:
34721 case IX86_BUILTIN_BZHI64:
34722 gcc_assert (n_args == 2);
34723 arg1 = gimple_call_arg (stmt, 1);
34724 if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
34726 unsigned int idx = tree_to_uhwi (arg1) & 0xff;
34727 arg0 = gimple_call_arg (stmt, 0);
34728 if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
34729 break;
34730 location_t loc = gimple_location (stmt);
34731 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
34732 gimple_set_location (g, loc);
34733 gsi_replace (gsi, g, false);
34734 return true;
34736 break;
34738 case IX86_BUILTIN_PDEP32:
34739 case IX86_BUILTIN_PDEP64:
34740 case IX86_BUILTIN_PEXT32:
34741 case IX86_BUILTIN_PEXT64:
34742 gcc_assert (n_args == 2);
34743 arg1 = gimple_call_arg (stmt, 1);
34744 if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
34746 location_t loc = gimple_location (stmt);
34747 arg0 = gimple_call_arg (stmt, 0);
34748 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
34749 gimple_set_location (g, loc);
34750 gsi_replace (gsi, g, false);
34751 return true;
34753 break;
34755 default:
34756 break;
34759 return false;
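/* Illustrative GIMPLE-level rewrites done by the function above
   (hypothetical user code, shown at the source level for readability;
   builtin spellings assumed from the BMI/BMI2 intrinsic headers):

     unsigned n = x | 1;                       provably non-zero, so
     __builtin_ia32_tzcnt_u32 (n)              becomes __builtin_ctz (n)

     __builtin_ia32_bzhi_si (x, 0x140)         index 0x40 >= 32, becomes x
     __builtin_ia32_pdep_si (x, -1u)           all-ones mask, becomes x

   The call is kept unchanged whenever nothing is known about the
   operands; the rewrite happens only when the result is independent of
   the missing information.  */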
34762 /* Make builtins to detect cpu type and features supported. NAME is
34763 the builtin name, CODE is the builtin code, and FTYPE is the function
34764 type of the builtin. */
34766 static void
34767 make_cpu_type_builtin (const char* name, int code,
34768 enum ix86_builtin_func_type ftype, bool is_const)
34770 tree decl;
34771 tree type;
34773 type = ix86_get_builtin_func_type (ftype);
34774 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
34775 NULL, NULL_TREE);
34776 gcc_assert (decl != NULL_TREE);
34777 ix86_builtins[(int) code] = decl;
34778 TREE_READONLY (decl) = is_const;
34781 /* Make builtins to get CPU type and features supported. The created
34782 builtins are:
34784 __builtin_cpu_init (), to detect cpu type and features,
34785 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
34786 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
34789 static void
34790 ix86_init_platform_type_builtins (void)
34792 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
34793 INT_FTYPE_VOID, false);
34794 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
34795 INT_FTYPE_PCCHAR, true);
34796 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
34797 INT_FTYPE_PCCHAR, true);
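/* Usage sketch for the builtins created above (illustrative only; the
   function name is hypothetical):

     int
     pick_kernel (void)
     {
       __builtin_cpu_init ();
       if (__builtin_cpu_is ("intel") && __builtin_cpu_supports ("avx2"))
         return 2;
       if (__builtin_cpu_supports ("sse4.2"))
         return 1;
       return 0;
     }

   The explicit __builtin_cpu_init call is only required in code that
   can run before constructors (e.g. an IFUNC resolver); ordinary code
   can rely on the libgcc constructor having filled in __cpu_model.  */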
34800 /* Internal method for ix86_init_builtins. */
34802 static void
34803 ix86_init_builtins_va_builtins_abi (void)
34805 tree ms_va_ref, sysv_va_ref;
34806 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
34807 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
34808 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
34809 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
34811 if (!TARGET_64BIT)
34812 return;
34813 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
34814 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
34815 ms_va_ref = build_reference_type (ms_va_list_type_node);
34816 sysv_va_ref =
34817 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
34819 fnvoid_va_end_ms =
34820 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
34821 fnvoid_va_start_ms =
34822 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
34823 fnvoid_va_end_sysv =
34824 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
34825 fnvoid_va_start_sysv =
34826 build_varargs_function_type_list (void_type_node, sysv_va_ref,
34827 NULL_TREE);
34828 fnvoid_va_copy_ms =
34829 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
34830 NULL_TREE);
34831 fnvoid_va_copy_sysv =
34832 build_function_type_list (void_type_node, sysv_va_ref,
34833 sysv_va_ref, NULL_TREE);
34835 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
34836 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
34837 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
34838 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
34839 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
34840 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
34841 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
34842 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
34843 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
34844 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
34845 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
34846 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
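/* Usage sketch for the ABI-crossing varargs builtins registered above
   (illustrative; the function name is hypothetical and argument access
   is shown with __builtin_va_arg for brevity):

     int __attribute__ ((ms_abi))
     sum_ints (int n, ...)
     {
       __builtin_ms_va_list ap;
       int i, s = 0;
       __builtin_ms_va_start (ap, n);
       for (i = 0; i < n; i++)
         s += __builtin_va_arg (ap, int);
       __builtin_ms_va_end (ap);
       return s;
     }

   An ms_abi varargs function compiled for a 64-bit SysV target must use
   the __builtin_ms_va_* family instead of the host va_list; a sysv_abi
   function on an MS target uses the __builtin_sysv_va_* family.  */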
34849 static void
34850 ix86_init_builtin_types (void)
34852 tree float80_type_node, const_string_type_node;
34854 /* The __float80 type. */
34855 float80_type_node = long_double_type_node;
34856 if (TYPE_MODE (float80_type_node) != XFmode)
34858 if (float64x_type_node != NULL_TREE
34859 && TYPE_MODE (float64x_type_node) == XFmode)
34860 float80_type_node = float64x_type_node;
34861 else
34863 /* The __float80 type. */
34864 float80_type_node = make_node (REAL_TYPE);
34866 TYPE_PRECISION (float80_type_node) = 80;
34867 layout_type (float80_type_node);
34870 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
34872 /* The __float128 type. The node has already been created as
34873 _Float128, so we only need to register the __float128 name for
34874 it. */
34875 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
34877 const_string_type_node
34878 = build_pointer_type (build_qualified_type
34879 (char_type_node, TYPE_QUAL_CONST));
34881 /* This macro is built by i386-builtin-types.awk. */
34882 DEFINE_BUILTIN_PRIMITIVE_TYPES;
34885 static void
34886 ix86_init_builtins (void)
34888 tree ftype, decl;
34890 ix86_init_builtin_types ();
34892 /* Builtins to get CPU type and features. */
34893 ix86_init_platform_type_builtins ();
34895 /* TFmode support builtins. */
34896 def_builtin_const (0, "__builtin_infq",
34897 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
34898 def_builtin_const (0, "__builtin_huge_valq",
34899 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
34901 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
34902 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
34903 BUILT_IN_MD, "nanq", NULL_TREE);
34904 TREE_READONLY (decl) = 1;
34905 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
34907 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
34908 BUILT_IN_MD, "nansq", NULL_TREE);
34909 TREE_READONLY (decl) = 1;
34910 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
34912 /* We will expand them to a normal call if SSE isn't available since
34913 they are used by libgcc. */
34914 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
34915 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
34916 BUILT_IN_MD, "__fabstf2", NULL_TREE);
34917 TREE_READONLY (decl) = 1;
34918 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
34920 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
34921 decl = add_builtin_function ("__builtin_copysignq", ftype,
34922 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
34923 "__copysigntf3", NULL_TREE);
34924 TREE_READONLY (decl) = 1;
34925 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
34927 ix86_init_tm_builtins ();
34928 ix86_init_mmx_sse_builtins ();
34929 ix86_init_mpx_builtins ();
34931 if (TARGET_LP64)
34932 ix86_init_builtins_va_builtins_abi ();
34934 #ifdef SUBTARGET_INIT_BUILTINS
34935 SUBTARGET_INIT_BUILTINS;
34936 #endif
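/* Usage sketch for the TFmode helpers registered in ix86_init_builtins
   above (illustrative only):

     __float128 x = __builtin_infq ();
     __float128 n = __builtin_nanq ("");
     __float128 y = __builtin_copysignq (__builtin_fabsq (x), n);

   __builtin_fabsq and __builtin_copysignq expand to calls to __fabstf2
   and __copysigntf3 in libgcc when SSE is unavailable, as noted in the
   comment above their registration.  */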
34939 /* Return the ix86 builtin for CODE. */
34941 static tree
34942 ix86_builtin_decl (unsigned code, bool)
34944 if (code >= IX86_BUILTIN_MAX)
34945 return error_mark_node;
34947 return ix86_builtins[code];
34950 /* Errors in the source file can cause expand_expr to return const0_rtx
34951 where we expect a vector. To avoid crashing, use one of the vector
34952 clear instructions. */
34953 static rtx
34954 safe_vector_operand (rtx x, machine_mode mode)
34956 if (x == const0_rtx)
34957 x = CONST0_RTX (mode);
34958 return x;
34961 /* Fix up modeless constants to fit the required mode. */
34962 static rtx
34963 fixup_modeless_constant (rtx x, machine_mode mode)
34965 if (GET_MODE (x) == VOIDmode)
34966 x = convert_to_mode (mode, x, 1);
34967 return x;
34970 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
34972 static rtx
34973 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
34975 rtx pat;
34976 tree arg0 = CALL_EXPR_ARG (exp, 0);
34977 tree arg1 = CALL_EXPR_ARG (exp, 1);
34978 rtx op0 = expand_normal (arg0);
34979 rtx op1 = expand_normal (arg1);
34980 machine_mode tmode = insn_data[icode].operand[0].mode;
34981 machine_mode mode0 = insn_data[icode].operand[1].mode;
34982 machine_mode mode1 = insn_data[icode].operand[2].mode;
34984 if (VECTOR_MODE_P (mode0))
34985 op0 = safe_vector_operand (op0, mode0);
34986 if (VECTOR_MODE_P (mode1))
34987 op1 = safe_vector_operand (op1, mode1);
34989 if (optimize || !target
34990 || GET_MODE (target) != tmode
34991 || !insn_data[icode].operand[0].predicate (target, tmode))
34992 target = gen_reg_rtx (tmode);
34994 if (GET_MODE (op1) == SImode && mode1 == TImode)
34996 rtx x = gen_reg_rtx (V4SImode);
34997 emit_insn (gen_sse2_loadd (x, op1));
34998 op1 = gen_lowpart (TImode, x);
35001 if (!insn_data[icode].operand[1].predicate (op0, mode0))
35002 op0 = copy_to_mode_reg (mode0, op0);
35003 if (!insn_data[icode].operand[2].predicate (op1, mode1))
35004 op1 = copy_to_mode_reg (mode1, op1);
35006 pat = GEN_FCN (icode) (target, op0, op1);
35007 if (! pat)
35008 return 0;
35010 emit_insn (pat);
35012 return target;
35015 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
35017 static rtx
35018 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
35019 enum ix86_builtin_func_type m_type,
35020 enum rtx_code sub_code)
35022 rtx pat;
35023 int i;
35024 int nargs;
35025 bool comparison_p = false;
35026 bool tf_p = false;
35027 bool last_arg_constant = false;
35028 int num_memory = 0;
35029 struct {
35030 rtx op;
35031 machine_mode mode;
35032 } args[4];
35034 machine_mode tmode = insn_data[icode].operand[0].mode;
35036 switch (m_type)
35038 case MULTI_ARG_4_DF2_DI_I:
35039 case MULTI_ARG_4_DF2_DI_I1:
35040 case MULTI_ARG_4_SF2_SI_I:
35041 case MULTI_ARG_4_SF2_SI_I1:
35042 nargs = 4;
35043 last_arg_constant = true;
35044 break;
35046 case MULTI_ARG_3_SF:
35047 case MULTI_ARG_3_DF:
35048 case MULTI_ARG_3_SF2:
35049 case MULTI_ARG_3_DF2:
35050 case MULTI_ARG_3_DI:
35051 case MULTI_ARG_3_SI:
35052 case MULTI_ARG_3_SI_DI:
35053 case MULTI_ARG_3_HI:
35054 case MULTI_ARG_3_HI_SI:
35055 case MULTI_ARG_3_QI:
35056 case MULTI_ARG_3_DI2:
35057 case MULTI_ARG_3_SI2:
35058 case MULTI_ARG_3_HI2:
35059 case MULTI_ARG_3_QI2:
35060 nargs = 3;
35061 break;
35063 case MULTI_ARG_2_SF:
35064 case MULTI_ARG_2_DF:
35065 case MULTI_ARG_2_DI:
35066 case MULTI_ARG_2_SI:
35067 case MULTI_ARG_2_HI:
35068 case MULTI_ARG_2_QI:
35069 nargs = 2;
35070 break;
35072 case MULTI_ARG_2_DI_IMM:
35073 case MULTI_ARG_2_SI_IMM:
35074 case MULTI_ARG_2_HI_IMM:
35075 case MULTI_ARG_2_QI_IMM:
35076 nargs = 2;
35077 last_arg_constant = true;
35078 break;
35080 case MULTI_ARG_1_SF:
35081 case MULTI_ARG_1_DF:
35082 case MULTI_ARG_1_SF2:
35083 case MULTI_ARG_1_DF2:
35084 case MULTI_ARG_1_DI:
35085 case MULTI_ARG_1_SI:
35086 case MULTI_ARG_1_HI:
35087 case MULTI_ARG_1_QI:
35088 case MULTI_ARG_1_SI_DI:
35089 case MULTI_ARG_1_HI_DI:
35090 case MULTI_ARG_1_HI_SI:
35091 case MULTI_ARG_1_QI_DI:
35092 case MULTI_ARG_1_QI_SI:
35093 case MULTI_ARG_1_QI_HI:
35094 nargs = 1;
35095 break;
35097 case MULTI_ARG_2_DI_CMP:
35098 case MULTI_ARG_2_SI_CMP:
35099 case MULTI_ARG_2_HI_CMP:
35100 case MULTI_ARG_2_QI_CMP:
35101 nargs = 2;
35102 comparison_p = true;
35103 break;
35105 case MULTI_ARG_2_SF_TF:
35106 case MULTI_ARG_2_DF_TF:
35107 case MULTI_ARG_2_DI_TF:
35108 case MULTI_ARG_2_SI_TF:
35109 case MULTI_ARG_2_HI_TF:
35110 case MULTI_ARG_2_QI_TF:
35111 nargs = 2;
35112 tf_p = true;
35113 break;
35115 default:
35116 gcc_unreachable ();
35119 if (optimize || !target
35120 || GET_MODE (target) != tmode
35121 || !insn_data[icode].operand[0].predicate (target, tmode))
35122 target = gen_reg_rtx (tmode);
35123 else if (memory_operand (target, tmode))
35124 num_memory++;
35126 gcc_assert (nargs <= 4);
35128 for (i = 0; i < nargs; i++)
35130 tree arg = CALL_EXPR_ARG (exp, i);
35131 rtx op = expand_normal (arg);
35132 int adjust = (comparison_p) ? 1 : 0;
35133 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
35135 if (last_arg_constant && i == nargs - 1)
35137 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
35139 enum insn_code new_icode = icode;
35140 switch (icode)
35142 case CODE_FOR_xop_vpermil2v2df3:
35143 case CODE_FOR_xop_vpermil2v4sf3:
35144 case CODE_FOR_xop_vpermil2v4df3:
35145 case CODE_FOR_xop_vpermil2v8sf3:
35146 error ("the last argument must be a 2-bit immediate");
35147 return gen_reg_rtx (tmode);
35148 case CODE_FOR_xop_rotlv2di3:
35149 new_icode = CODE_FOR_rotlv2di3;
35150 goto xop_rotl;
35151 case CODE_FOR_xop_rotlv4si3:
35152 new_icode = CODE_FOR_rotlv4si3;
35153 goto xop_rotl;
35154 case CODE_FOR_xop_rotlv8hi3:
35155 new_icode = CODE_FOR_rotlv8hi3;
35156 goto xop_rotl;
35157 case CODE_FOR_xop_rotlv16qi3:
35158 new_icode = CODE_FOR_rotlv16qi3;
35159 xop_rotl:
35160 if (CONST_INT_P (op))
35162 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
35163 op = GEN_INT (INTVAL (op) & mask);
35164 gcc_checking_assert
35165 (insn_data[icode].operand[i + 1].predicate (op, mode));
35167 else
35169 gcc_checking_assert
35170 (nargs == 2
35171 && insn_data[new_icode].operand[0].mode == tmode
35172 && insn_data[new_icode].operand[1].mode == tmode
35173 && insn_data[new_icode].operand[2].mode == mode
35174 && insn_data[new_icode].operand[0].predicate
35175 == insn_data[icode].operand[0].predicate
35176 && insn_data[new_icode].operand[1].predicate
35177 == insn_data[icode].operand[1].predicate);
35178 icode = new_icode;
35179 goto non_constant;
35181 break;
35182 default:
35183 gcc_unreachable ();
35187 else
35189 non_constant:
35190 if (VECTOR_MODE_P (mode))
35191 op = safe_vector_operand (op, mode);
35193 /* If we aren't optimizing, only allow one memory operand to be
35194 generated. */
35195 if (memory_operand (op, mode))
35196 num_memory++;
35198 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
35200 if (optimize
35201 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
35202 || num_memory > 1)
35203 op = force_reg (mode, op);
35206 args[i].op = op;
35207 args[i].mode = mode;
35210 switch (nargs)
35212 case 1:
35213 pat = GEN_FCN (icode) (target, args[0].op);
35214 break;
35216 case 2:
35217 if (tf_p)
35218 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35219 GEN_INT ((int)sub_code));
35220 else if (! comparison_p)
35221 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35222 else
35224 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
35225 args[0].op,
35226 args[1].op);
35228 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
35230 break;
35232 case 3:
35233 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
35234 break;
35236 case 4:
35237 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
35238 break;
35240 default:
35241 gcc_unreachable ();
35244 if (! pat)
35245 return 0;
35247 emit_insn (pat);
35248 return target;
35251 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
35252 insns with vec_merge. */
35254 static rtx
35255 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
35256 rtx target)
35258 rtx pat;
35259 tree arg0 = CALL_EXPR_ARG (exp, 0);
35260 rtx op1, op0 = expand_normal (arg0);
35261 machine_mode tmode = insn_data[icode].operand[0].mode;
35262 machine_mode mode0 = insn_data[icode].operand[1].mode;
35264 if (optimize || !target
35265 || GET_MODE (target) != tmode
35266 || !insn_data[icode].operand[0].predicate (target, tmode))
35267 target = gen_reg_rtx (tmode);
35269 if (VECTOR_MODE_P (mode0))
35270 op0 = safe_vector_operand (op0, mode0);
35272 if ((optimize && !register_operand (op0, mode0))
35273 || !insn_data[icode].operand[1].predicate (op0, mode0))
35274 op0 = copy_to_mode_reg (mode0, op0);
35276 op1 = op0;
35277 if (!insn_data[icode].operand[2].predicate (op1, mode0))
35278 op1 = copy_to_mode_reg (mode0, op1);
35280 pat = GEN_FCN (icode) (target, op0, op1);
35281 if (! pat)
35282 return 0;
35283 emit_insn (pat);
35284 return target;
35287 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
35289 static rtx
35290 ix86_expand_sse_compare (const struct builtin_description *d,
35291 tree exp, rtx target, bool swap)
35293 rtx pat;
35294 tree arg0 = CALL_EXPR_ARG (exp, 0);
35295 tree arg1 = CALL_EXPR_ARG (exp, 1);
35296 rtx op0 = expand_normal (arg0);
35297 rtx op1 = expand_normal (arg1);
35298 rtx op2;
35299 machine_mode tmode = insn_data[d->icode].operand[0].mode;
35300 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
35301 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
35302 enum rtx_code comparison = d->comparison;
35304 if (VECTOR_MODE_P (mode0))
35305 op0 = safe_vector_operand (op0, mode0);
35306 if (VECTOR_MODE_P (mode1))
35307 op1 = safe_vector_operand (op1, mode1);
35309 /* Swap operands if we have a comparison that isn't available in
35310 hardware. */
35311 if (swap)
35312 std::swap (op0, op1);
35314 if (optimize || !target
35315 || GET_MODE (target) != tmode
35316 || !insn_data[d->icode].operand[0].predicate (target, tmode))
35317 target = gen_reg_rtx (tmode);
35319 if ((optimize && !register_operand (op0, mode0))
35320 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
35321 op0 = copy_to_mode_reg (mode0, op0);
35322 if ((optimize && !register_operand (op1, mode1))
35323 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
35324 op1 = copy_to_mode_reg (mode1, op1);
35326 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
35327 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
35328 if (! pat)
35329 return 0;
35330 emit_insn (pat);
35331 return target;
35334 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
35336 static rtx
35337 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
35338 rtx target)
35340 rtx pat;
35341 tree arg0 = CALL_EXPR_ARG (exp, 0);
35342 tree arg1 = CALL_EXPR_ARG (exp, 1);
35343 rtx op0 = expand_normal (arg0);
35344 rtx op1 = expand_normal (arg1);
35345 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
35346 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
35347 enum rtx_code comparison = d->comparison;
35349 if (VECTOR_MODE_P (mode0))
35350 op0 = safe_vector_operand (op0, mode0);
35351 if (VECTOR_MODE_P (mode1))
35352 op1 = safe_vector_operand (op1, mode1);
35354 /* Swap operands if we have a comparison that isn't available in
35355 hardware. */
35356 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
35357 std::swap (op0, op1);
35359 target = gen_reg_rtx (SImode);
35360 emit_move_insn (target, const0_rtx);
35361 target = gen_rtx_SUBREG (QImode, target, 0);
35363 if ((optimize && !register_operand (op0, mode0))
35364 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35365 op0 = copy_to_mode_reg (mode0, op0);
35366 if ((optimize && !register_operand (op1, mode1))
35367 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
35368 op1 = copy_to_mode_reg (mode1, op1);
35370 pat = GEN_FCN (d->icode) (op0, op1);
35371 if (! pat)
35372 return 0;
35373 emit_insn (pat);
35374 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35375 gen_rtx_fmt_ee (comparison, QImode,
35376 SET_DEST (pat),
35377 const0_rtx)));
35379 return SUBREG_REG (target);
35382 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
35384 static rtx
35385 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
35386 rtx target)
35388 rtx pat;
35389 tree arg0 = CALL_EXPR_ARG (exp, 0);
35390 rtx op1, op0 = expand_normal (arg0);
35391 machine_mode tmode = insn_data[d->icode].operand[0].mode;
35392 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
35394 if (optimize || target == 0
35395 || GET_MODE (target) != tmode
35396 || !insn_data[d->icode].operand[0].predicate (target, tmode))
35397 target = gen_reg_rtx (tmode);
35399 if (VECTOR_MODE_P (mode0))
35400 op0 = safe_vector_operand (op0, mode0);
35402 if ((optimize && !register_operand (op0, mode0))
35403 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35404 op0 = copy_to_mode_reg (mode0, op0);
35406 op1 = GEN_INT (d->comparison);
35408 pat = GEN_FCN (d->icode) (target, op0, op1);
35409 if (! pat)
35410 return 0;
35411 emit_insn (pat);
35412 return target;
35415 static rtx
35416 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
35417 tree exp, rtx target)
35419 rtx pat;
35420 tree arg0 = CALL_EXPR_ARG (exp, 0);
35421 tree arg1 = CALL_EXPR_ARG (exp, 1);
35422 rtx op0 = expand_normal (arg0);
35423 rtx op1 = expand_normal (arg1);
35424 rtx op2;
35425 machine_mode tmode = insn_data[d->icode].operand[0].mode;
35426 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
35427 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
35429 if (optimize || target == 0
35430 || GET_MODE (target) != tmode
35431 || !insn_data[d->icode].operand[0].predicate (target, tmode))
35432 target = gen_reg_rtx (tmode);
35434 op0 = safe_vector_operand (op0, mode0);
35435 op1 = safe_vector_operand (op1, mode1);
35437 if ((optimize && !register_operand (op0, mode0))
35438 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35439 op0 = copy_to_mode_reg (mode0, op0);
35440 if ((optimize && !register_operand (op1, mode1))
35441 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
35442 op1 = copy_to_mode_reg (mode1, op1);
35444 op2 = GEN_INT (d->comparison);
35446 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
35447 if (! pat)
35448 return 0;
35449 emit_insn (pat);
35450 return target;
35453 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
35455 static rtx
35456 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
35457 rtx target)
35459 rtx pat;
35460 tree arg0 = CALL_EXPR_ARG (exp, 0);
35461 tree arg1 = CALL_EXPR_ARG (exp, 1);
35462 rtx op0 = expand_normal (arg0);
35463 rtx op1 = expand_normal (arg1);
35464 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
35465 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
35466 enum rtx_code comparison = d->comparison;
35468 if (VECTOR_MODE_P (mode0))
35469 op0 = safe_vector_operand (op0, mode0);
35470 if (VECTOR_MODE_P (mode1))
35471 op1 = safe_vector_operand (op1, mode1);
35473 target = gen_reg_rtx (SImode);
35474 emit_move_insn (target, const0_rtx);
35475 target = gen_rtx_SUBREG (QImode, target, 0);
35477 if ((optimize && !register_operand (op0, mode0))
35478 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
35479 op0 = copy_to_mode_reg (mode0, op0);
35480 if ((optimize && !register_operand (op1, mode1))
35481 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
35482 op1 = copy_to_mode_reg (mode1, op1);
35484 pat = GEN_FCN (d->icode) (op0, op1);
35485 if (! pat)
35486 return 0;
35487 emit_insn (pat);
35488 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35489 gen_rtx_fmt_ee (comparison, QImode,
35490 SET_DEST (pat),
35491 const0_rtx)));
35493 return SUBREG_REG (target);
35496 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
35498 static rtx
35499 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
35500 tree exp, rtx target)
35502 rtx pat;
35503 tree arg0 = CALL_EXPR_ARG (exp, 0);
35504 tree arg1 = CALL_EXPR_ARG (exp, 1);
35505 tree arg2 = CALL_EXPR_ARG (exp, 2);
35506 tree arg3 = CALL_EXPR_ARG (exp, 3);
35507 tree arg4 = CALL_EXPR_ARG (exp, 4);
35508 rtx scratch0, scratch1;
35509 rtx op0 = expand_normal (arg0);
35510 rtx op1 = expand_normal (arg1);
35511 rtx op2 = expand_normal (arg2);
35512 rtx op3 = expand_normal (arg3);
35513 rtx op4 = expand_normal (arg4);
35514 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
35516 tmode0 = insn_data[d->icode].operand[0].mode;
35517 tmode1 = insn_data[d->icode].operand[1].mode;
35518 modev2 = insn_data[d->icode].operand[2].mode;
35519 modei3 = insn_data[d->icode].operand[3].mode;
35520 modev4 = insn_data[d->icode].operand[4].mode;
35521 modei5 = insn_data[d->icode].operand[5].mode;
35522 modeimm = insn_data[d->icode].operand[6].mode;
35524 if (VECTOR_MODE_P (modev2))
35525 op0 = safe_vector_operand (op0, modev2);
35526 if (VECTOR_MODE_P (modev4))
35527 op2 = safe_vector_operand (op2, modev4);
35529 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
35530 op0 = copy_to_mode_reg (modev2, op0);
35531 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
35532 op1 = copy_to_mode_reg (modei3, op1);
35533 if ((optimize && !register_operand (op2, modev4))
35534 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
35535 op2 = copy_to_mode_reg (modev4, op2);
35536 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
35537 op3 = copy_to_mode_reg (modei5, op3);
35539 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
35541 error ("the fifth argument must be an 8-bit immediate");
35542 return const0_rtx;
35545 if (d->code == IX86_BUILTIN_PCMPESTRI128)
35547 if (optimize || !target
35548 || GET_MODE (target) != tmode0
35549 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
35550 target = gen_reg_rtx (tmode0);
35552 scratch1 = gen_reg_rtx (tmode1);
35554 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
35556 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
35558 if (optimize || !target
35559 || GET_MODE (target) != tmode1
35560 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
35561 target = gen_reg_rtx (tmode1);
35563 scratch0 = gen_reg_rtx (tmode0);
35565 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
35567 else
35569 gcc_assert (d->flag);
35571 scratch0 = gen_reg_rtx (tmode0);
35572 scratch1 = gen_reg_rtx (tmode1);
35574 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
35577 if (! pat)
35578 return 0;
35580 emit_insn (pat);
35582 if (d->flag)
35584 target = gen_reg_rtx (SImode);
35585 emit_move_insn (target, const0_rtx);
35586 target = gen_rtx_SUBREG (QImode, target, 0);
35588 emit_insn
35589 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35590 gen_rtx_fmt_ee (EQ, QImode,
35591 gen_rtx_REG ((machine_mode) d->flag,
35592 FLAGS_REG),
35593 const0_rtx)));
35594 return SUBREG_REG (target);
35596 else
35597 return target;
35601 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
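/* Same scheme as for pcmpestr above, minus the two explicit length
   operands: the implicit-length forms take only the two vectors and
   the imm8.  Illustrative caller, not part of this file:
     int i = _mm_cmpistri (a, b, 0x0c);
   is expected to map to IX86_BUILTIN_PCMPISTRI128.  */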
35603 static rtx
35604 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
35605 tree exp, rtx target)
35607 rtx pat;
35608 tree arg0 = CALL_EXPR_ARG (exp, 0);
35609 tree arg1 = CALL_EXPR_ARG (exp, 1);
35610 tree arg2 = CALL_EXPR_ARG (exp, 2);
35611 rtx scratch0, scratch1;
35612 rtx op0 = expand_normal (arg0);
35613 rtx op1 = expand_normal (arg1);
35614 rtx op2 = expand_normal (arg2);
35615 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
35617 tmode0 = insn_data[d->icode].operand[0].mode;
35618 tmode1 = insn_data[d->icode].operand[1].mode;
35619 modev2 = insn_data[d->icode].operand[2].mode;
35620 modev3 = insn_data[d->icode].operand[3].mode;
35621 modeimm = insn_data[d->icode].operand[4].mode;
35623 if (VECTOR_MODE_P (modev2))
35624 op0 = safe_vector_operand (op0, modev2);
35625 if (VECTOR_MODE_P (modev3))
35626 op1 = safe_vector_operand (op1, modev3);
35628 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
35629 op0 = copy_to_mode_reg (modev2, op0);
35630 if ((optimize && !register_operand (op1, modev3))
35631 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
35632 op1 = copy_to_mode_reg (modev3, op1);
35634 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
35636 error ("the third argument must be an 8-bit immediate");
35637 return const0_rtx;
35640 if (d->code == IX86_BUILTIN_PCMPISTRI128)
35642 if (optimize || !target
35643 || GET_MODE (target) != tmode0
35644 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
35645 target = gen_reg_rtx (tmode0);
35647 scratch1 = gen_reg_rtx (tmode1);
35649 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
35651 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
35653 if (optimize || !target
35654 || GET_MODE (target) != tmode1
35655 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
35656 target = gen_reg_rtx (tmode1);
35658 scratch0 = gen_reg_rtx (tmode0);
35660 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
35662 else
35664 gcc_assert (d->flag);
35666 scratch0 = gen_reg_rtx (tmode0);
35667 scratch1 = gen_reg_rtx (tmode1);
35669 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
35672 if (! pat)
35673 return 0;
35675 emit_insn (pat);
35677 if (d->flag)
35679 target = gen_reg_rtx (SImode);
35680 emit_move_insn (target, const0_rtx);
35681 target = gen_rtx_SUBREG (QImode, target, 0);
35683 emit_insn
35684 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35685 gen_rtx_fmt_ee (EQ, QImode,
35686 gen_rtx_REG ((machine_mode) d->flag,
35687 FLAGS_REG),
35688 const0_rtx)));
35689 return SUBREG_REG (target);
35691 else
35692 return target;
35695 /* Subroutine of ix86_expand_builtin to take care of insns with
35696 variable number of operands. */
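/* The switch below classifies each ix86_builtin_func_type: it sets
   nargs (how many call arguments are expanded), nargs_constant (how
   many of them must be immediates), mask_pos (how many operands,
   typically the merge source and write-mask, follow the immediate)
   and related flags, and dispatches a few special shapes to dedicated
   expanders.  The operand loop that follows then copies, converts or
   rejects the arguments accordingly.  */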
35698 static rtx
35699 ix86_expand_args_builtin (const struct builtin_description *d,
35700 tree exp, rtx target)
35702 rtx pat, real_target;
35703 unsigned int i, nargs;
35704 unsigned int nargs_constant = 0;
35705 unsigned int mask_pos = 0;
35706 int num_memory = 0;
35707 struct
35709 rtx op;
35710 machine_mode mode;
35711 } args[6];
35712 bool second_arg_count = false;
35713 enum insn_code icode = d->icode;
35714 const struct insn_data_d *insn_p = &insn_data[icode];
35715 machine_mode tmode = insn_p->operand[0].mode;
35716 machine_mode rmode = VOIDmode;
35717 bool swap = false;
35718 enum rtx_code comparison = d->comparison;
35720 switch ((enum ix86_builtin_func_type) d->flag)
35722 case V2DF_FTYPE_V2DF_ROUND:
35723 case V4DF_FTYPE_V4DF_ROUND:
35724 case V8DF_FTYPE_V8DF_ROUND:
35725 case V4SF_FTYPE_V4SF_ROUND:
35726 case V8SF_FTYPE_V8SF_ROUND:
35727 case V16SF_FTYPE_V16SF_ROUND:
35728 case V4SI_FTYPE_V4SF_ROUND:
35729 case V8SI_FTYPE_V8SF_ROUND:
35730 case V16SI_FTYPE_V16SF_ROUND:
35731 return ix86_expand_sse_round (d, exp, target);
35732 case V4SI_FTYPE_V2DF_V2DF_ROUND:
35733 case V8SI_FTYPE_V4DF_V4DF_ROUND:
35734 case V16SI_FTYPE_V8DF_V8DF_ROUND:
35735 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
35736 case INT_FTYPE_V8SF_V8SF_PTEST:
35737 case INT_FTYPE_V4DI_V4DI_PTEST:
35738 case INT_FTYPE_V4DF_V4DF_PTEST:
35739 case INT_FTYPE_V4SF_V4SF_PTEST:
35740 case INT_FTYPE_V2DI_V2DI_PTEST:
35741 case INT_FTYPE_V2DF_V2DF_PTEST:
35742 return ix86_expand_sse_ptest (d, exp, target);
35743 case FLOAT128_FTYPE_FLOAT128:
35744 case FLOAT_FTYPE_FLOAT:
35745 case INT_FTYPE_INT:
35746 case UINT_FTYPE_UINT:
35747 case UINT16_FTYPE_UINT16:
35748 case UINT64_FTYPE_INT:
35749 case UINT64_FTYPE_UINT64:
35750 case INT64_FTYPE_INT64:
35751 case INT64_FTYPE_V4SF:
35752 case INT64_FTYPE_V2DF:
35753 case INT_FTYPE_V16QI:
35754 case INT_FTYPE_V8QI:
35755 case INT_FTYPE_V8SF:
35756 case INT_FTYPE_V4DF:
35757 case INT_FTYPE_V4SF:
35758 case INT_FTYPE_V2DF:
35759 case INT_FTYPE_V32QI:
35760 case V16QI_FTYPE_V16QI:
35761 case V8SI_FTYPE_V8SF:
35762 case V8SI_FTYPE_V4SI:
35763 case V8HI_FTYPE_V8HI:
35764 case V8HI_FTYPE_V16QI:
35765 case V8QI_FTYPE_V8QI:
35766 case V8SF_FTYPE_V8SF:
35767 case V8SF_FTYPE_V8SI:
35768 case V8SF_FTYPE_V4SF:
35769 case V8SF_FTYPE_V8HI:
35770 case V4SI_FTYPE_V4SI:
35771 case V4SI_FTYPE_V16QI:
35772 case V4SI_FTYPE_V4SF:
35773 case V4SI_FTYPE_V8SI:
35774 case V4SI_FTYPE_V8HI:
35775 case V4SI_FTYPE_V4DF:
35776 case V4SI_FTYPE_V2DF:
35777 case V4HI_FTYPE_V4HI:
35778 case V4DF_FTYPE_V4DF:
35779 case V4DF_FTYPE_V4SI:
35780 case V4DF_FTYPE_V4SF:
35781 case V4DF_FTYPE_V2DF:
35782 case V4SF_FTYPE_V4SF:
35783 case V4SF_FTYPE_V4SI:
35784 case V4SF_FTYPE_V8SF:
35785 case V4SF_FTYPE_V4DF:
35786 case V4SF_FTYPE_V8HI:
35787 case V4SF_FTYPE_V2DF:
35788 case V2DI_FTYPE_V2DI:
35789 case V2DI_FTYPE_V16QI:
35790 case V2DI_FTYPE_V8HI:
35791 case V2DI_FTYPE_V4SI:
35792 case V2DF_FTYPE_V2DF:
35793 case V2DF_FTYPE_V4SI:
35794 case V2DF_FTYPE_V4DF:
35795 case V2DF_FTYPE_V4SF:
35796 case V2DF_FTYPE_V2SI:
35797 case V2SI_FTYPE_V2SI:
35798 case V2SI_FTYPE_V4SF:
35799 case V2SI_FTYPE_V2SF:
35800 case V2SI_FTYPE_V2DF:
35801 case V2SF_FTYPE_V2SF:
35802 case V2SF_FTYPE_V2SI:
35803 case V32QI_FTYPE_V32QI:
35804 case V32QI_FTYPE_V16QI:
35805 case V16HI_FTYPE_V16HI:
35806 case V16HI_FTYPE_V8HI:
35807 case V8SI_FTYPE_V8SI:
35808 case V16HI_FTYPE_V16QI:
35809 case V8SI_FTYPE_V16QI:
35810 case V4DI_FTYPE_V16QI:
35811 case V8SI_FTYPE_V8HI:
35812 case V4DI_FTYPE_V8HI:
35813 case V4DI_FTYPE_V4SI:
35814 case V4DI_FTYPE_V2DI:
35815 case UQI_FTYPE_UQI:
35816 case UHI_FTYPE_UHI:
35817 case USI_FTYPE_USI:
35818 case USI_FTYPE_UQI:
35819 case USI_FTYPE_UHI:
35820 case UDI_FTYPE_UDI:
35821 case UHI_FTYPE_V16QI:
35822 case USI_FTYPE_V32QI:
35823 case UDI_FTYPE_V64QI:
35824 case V16QI_FTYPE_UHI:
35825 case V32QI_FTYPE_USI:
35826 case V64QI_FTYPE_UDI:
35827 case V8HI_FTYPE_UQI:
35828 case V16HI_FTYPE_UHI:
35829 case V32HI_FTYPE_USI:
35830 case V4SI_FTYPE_UQI:
35831 case V8SI_FTYPE_UQI:
35832 case V4SI_FTYPE_UHI:
35833 case V8SI_FTYPE_UHI:
35834 case UQI_FTYPE_V8HI:
35835 case UHI_FTYPE_V16HI:
35836 case USI_FTYPE_V32HI:
35837 case UQI_FTYPE_V4SI:
35838 case UQI_FTYPE_V8SI:
35839 case UHI_FTYPE_V16SI:
35840 case UQI_FTYPE_V2DI:
35841 case UQI_FTYPE_V4DI:
35842 case UQI_FTYPE_V8DI:
35843 case V16SI_FTYPE_UHI:
35844 case V2DI_FTYPE_UQI:
35845 case V4DI_FTYPE_UQI:
35846 case V16SI_FTYPE_INT:
35847 case V16SF_FTYPE_V8SF:
35848 case V16SI_FTYPE_V8SI:
35849 case V16SF_FTYPE_V4SF:
35850 case V16SI_FTYPE_V4SI:
35851 case V16SI_FTYPE_V16SF:
35852 case V16SI_FTYPE_V16SI:
35853 case V16SF_FTYPE_V16SF:
35854 case V8DI_FTYPE_UQI:
35855 case V8DI_FTYPE_V8DI:
35856 case V8DF_FTYPE_V4DF:
35857 case V8DF_FTYPE_V2DF:
35858 case V8DF_FTYPE_V8DF:
35859 nargs = 1;
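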
35860 break;
35861 case V4SF_FTYPE_V4SF_VEC_MERGE:
35862 case V2DF_FTYPE_V2DF_VEC_MERGE:
35863 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
35864 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
35865 case V16QI_FTYPE_V16QI_V16QI:
35866 case V16QI_FTYPE_V8HI_V8HI:
35867 case V16SF_FTYPE_V16SF_V16SF:
35868 case V8QI_FTYPE_V8QI_V8QI:
35869 case V8QI_FTYPE_V4HI_V4HI:
35870 case V8HI_FTYPE_V8HI_V8HI:
35871 case V8HI_FTYPE_V16QI_V16QI:
35872 case V8HI_FTYPE_V4SI_V4SI:
35873 case V8SF_FTYPE_V8SF_V8SF:
35874 case V8SF_FTYPE_V8SF_V8SI:
35875 case V8DF_FTYPE_V8DF_V8DF:
35876 case V4SI_FTYPE_V4SI_V4SI:
35877 case V4SI_FTYPE_V8HI_V8HI:
35878 case V4SI_FTYPE_V2DF_V2DF:
35879 case V4HI_FTYPE_V4HI_V4HI:
35880 case V4HI_FTYPE_V8QI_V8QI:
35881 case V4HI_FTYPE_V2SI_V2SI:
35882 case V4DF_FTYPE_V4DF_V4DF:
35883 case V4DF_FTYPE_V4DF_V4DI:
35884 case V4SF_FTYPE_V4SF_V4SF:
35885 case V4SF_FTYPE_V4SF_V4SI:
35886 case V4SF_FTYPE_V4SF_V2SI:
35887 case V4SF_FTYPE_V4SF_V2DF:
35888 case V4SF_FTYPE_V4SF_UINT:
35889 case V4SF_FTYPE_V4SF_DI:
35890 case V4SF_FTYPE_V4SF_SI:
35891 case V2DI_FTYPE_V2DI_V2DI:
35892 case V2DI_FTYPE_V16QI_V16QI:
35893 case V2DI_FTYPE_V4SI_V4SI:
35894 case V2DI_FTYPE_V2DI_V16QI:
35895 case V2SI_FTYPE_V2SI_V2SI:
35896 case V2SI_FTYPE_V4HI_V4HI:
35897 case V2SI_FTYPE_V2SF_V2SF:
35898 case V2DF_FTYPE_V2DF_V2DF:
35899 case V2DF_FTYPE_V2DF_V4SF:
35900 case V2DF_FTYPE_V2DF_V2DI:
35901 case V2DF_FTYPE_V2DF_DI:
35902 case V2DF_FTYPE_V2DF_SI:
35903 case V2DF_FTYPE_V2DF_UINT:
35904 case V2SF_FTYPE_V2SF_V2SF:
35905 case V1DI_FTYPE_V1DI_V1DI:
35906 case V1DI_FTYPE_V8QI_V8QI:
35907 case V1DI_FTYPE_V2SI_V2SI:
35908 case V32QI_FTYPE_V16HI_V16HI:
35909 case V16HI_FTYPE_V8SI_V8SI:
35910 case V32QI_FTYPE_V32QI_V32QI:
35911 case V16HI_FTYPE_V32QI_V32QI:
35912 case V16HI_FTYPE_V16HI_V16HI:
35913 case V8SI_FTYPE_V4DF_V4DF:
35914 case V8SI_FTYPE_V8SI_V8SI:
35915 case V8SI_FTYPE_V16HI_V16HI:
35916 case V4DI_FTYPE_V4DI_V4DI:
35917 case V4DI_FTYPE_V8SI_V8SI:
35918 case V8DI_FTYPE_V64QI_V64QI:
35919 if (comparison == UNKNOWN)
35920 return ix86_expand_binop_builtin (icode, exp, target);
35921 nargs = 2;
35922 break;
35923 case V4SF_FTYPE_V4SF_V4SF_SWAP:
35924 case V2DF_FTYPE_V2DF_V2DF_SWAP:
35925 gcc_assert (comparison != UNKNOWN);
35926 nargs = 2;
35927 swap = true;
35928 break;
35929 case V16HI_FTYPE_V16HI_V8HI_COUNT:
35930 case V16HI_FTYPE_V16HI_SI_COUNT:
35931 case V8SI_FTYPE_V8SI_V4SI_COUNT:
35932 case V8SI_FTYPE_V8SI_SI_COUNT:
35933 case V4DI_FTYPE_V4DI_V2DI_COUNT:
35934 case V4DI_FTYPE_V4DI_INT_COUNT:
35935 case V8HI_FTYPE_V8HI_V8HI_COUNT:
35936 case V8HI_FTYPE_V8HI_SI_COUNT:
35937 case V4SI_FTYPE_V4SI_V4SI_COUNT:
35938 case V4SI_FTYPE_V4SI_SI_COUNT:
35939 case V4HI_FTYPE_V4HI_V4HI_COUNT:
35940 case V4HI_FTYPE_V4HI_SI_COUNT:
35941 case V2DI_FTYPE_V2DI_V2DI_COUNT:
35942 case V2DI_FTYPE_V2DI_SI_COUNT:
35943 case V2SI_FTYPE_V2SI_V2SI_COUNT:
35944 case V2SI_FTYPE_V2SI_SI_COUNT:
35945 case V1DI_FTYPE_V1DI_V1DI_COUNT:
35946 case V1DI_FTYPE_V1DI_SI_COUNT:
35947 nargs = 2;
35948 second_arg_count = true;
35949 break;
35950 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
35951 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
35952 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
35953 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
35954 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
35955 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
35956 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
35957 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
35958 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
35959 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
35960 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
35961 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
35962 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
35963 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
35964 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
35965 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
35966 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
35967 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
35968 nargs = 4;
35969 second_arg_count = true;
35970 break;
35971 case UINT64_FTYPE_UINT64_UINT64:
35972 case UINT_FTYPE_UINT_UINT:
35973 case UINT_FTYPE_UINT_USHORT:
35974 case UINT_FTYPE_UINT_UCHAR:
35975 case UINT16_FTYPE_UINT16_INT:
35976 case UINT8_FTYPE_UINT8_INT:
35977 case UQI_FTYPE_UQI_UQI:
35978 case UHI_FTYPE_UHI_UHI:
35979 case USI_FTYPE_USI_USI:
35980 case UDI_FTYPE_UDI_UDI:
35981 case V16SI_FTYPE_V8DF_V8DF:
35982 nargs = 2;
35983 break;
35984 case V2DI_FTYPE_V2DI_INT_CONVERT:
35985 nargs = 2;
35986 rmode = V1TImode;
35987 nargs_constant = 1;
35988 break;
35989 case V4DI_FTYPE_V4DI_INT_CONVERT:
35990 nargs = 2;
35991 rmode = V2TImode;
35992 nargs_constant = 1;
35993 break;
35994 case V8DI_FTYPE_V8DI_INT_CONVERT:
35995 nargs = 2;
35996 rmode = V4TImode;
35997 nargs_constant = 1;
35998 break;
35999 case V8HI_FTYPE_V8HI_INT:
36000 case V8HI_FTYPE_V8SF_INT:
36001 case V16HI_FTYPE_V16SF_INT:
36002 case V8HI_FTYPE_V4SF_INT:
36003 case V8SF_FTYPE_V8SF_INT:
36004 case V4SF_FTYPE_V16SF_INT:
36005 case V16SF_FTYPE_V16SF_INT:
36006 case V4SI_FTYPE_V4SI_INT:
36007 case V4SI_FTYPE_V8SI_INT:
36008 case V4HI_FTYPE_V4HI_INT:
36009 case V4DF_FTYPE_V4DF_INT:
36010 case V4DF_FTYPE_V8DF_INT:
36011 case V4SF_FTYPE_V4SF_INT:
36012 case V4SF_FTYPE_V8SF_INT:
36013 case V2DI_FTYPE_V2DI_INT:
36014 case V2DF_FTYPE_V2DF_INT:
36015 case V2DF_FTYPE_V4DF_INT:
36016 case V16HI_FTYPE_V16HI_INT:
36017 case V8SI_FTYPE_V8SI_INT:
36018 case V16SI_FTYPE_V16SI_INT:
36019 case V4SI_FTYPE_V16SI_INT:
36020 case V4DI_FTYPE_V4DI_INT:
36021 case V2DI_FTYPE_V4DI_INT:
36022 case V4DI_FTYPE_V8DI_INT:
36023 case QI_FTYPE_V4SF_INT:
36024 case QI_FTYPE_V2DF_INT:
36025 case UQI_FTYPE_UQI_UQI_CONST:
36026 case UHI_FTYPE_UHI_UQI:
36027 case USI_FTYPE_USI_UQI:
36028 case UDI_FTYPE_UDI_UQI:
36029 nargs = 2;
36030 nargs_constant = 1;
36031 break;
36032 case V16QI_FTYPE_V16QI_V16QI_V16QI:
36033 case V8SF_FTYPE_V8SF_V8SF_V8SF:
36034 case V4DF_FTYPE_V4DF_V4DF_V4DF:
36035 case V4SF_FTYPE_V4SF_V4SF_V4SF:
36036 case V2DF_FTYPE_V2DF_V2DF_V2DF:
36037 case V32QI_FTYPE_V32QI_V32QI_V32QI:
36038 case UHI_FTYPE_V16SI_V16SI_UHI:
36039 case UQI_FTYPE_V8DI_V8DI_UQI:
36040 case V16HI_FTYPE_V16SI_V16HI_UHI:
36041 case V16QI_FTYPE_V16SI_V16QI_UHI:
36042 case V16QI_FTYPE_V8DI_V16QI_UQI:
36043 case V16SF_FTYPE_V16SF_V16SF_UHI:
36044 case V16SF_FTYPE_V4SF_V16SF_UHI:
36045 case V16SI_FTYPE_SI_V16SI_UHI:
36046 case V16SI_FTYPE_V16HI_V16SI_UHI:
36047 case V16SI_FTYPE_V16QI_V16SI_UHI:
36048 case V8SF_FTYPE_V4SF_V8SF_UQI:
36049 case V4DF_FTYPE_V2DF_V4DF_UQI:
36050 case V8SI_FTYPE_V4SI_V8SI_UQI:
36051 case V8SI_FTYPE_SI_V8SI_UQI:
36052 case V4SI_FTYPE_V4SI_V4SI_UQI:
36053 case V4SI_FTYPE_SI_V4SI_UQI:
36054 case V4DI_FTYPE_V2DI_V4DI_UQI:
36055 case V4DI_FTYPE_DI_V4DI_UQI:
36056 case V2DI_FTYPE_V2DI_V2DI_UQI:
36057 case V2DI_FTYPE_DI_V2DI_UQI:
36058 case V64QI_FTYPE_V64QI_V64QI_UDI:
36059 case V64QI_FTYPE_V16QI_V64QI_UDI:
36060 case V64QI_FTYPE_QI_V64QI_UDI:
36061 case V32QI_FTYPE_V32QI_V32QI_USI:
36062 case V32QI_FTYPE_V16QI_V32QI_USI:
36063 case V32QI_FTYPE_QI_V32QI_USI:
36064 case V16QI_FTYPE_V16QI_V16QI_UHI:
36065 case V16QI_FTYPE_QI_V16QI_UHI:
36066 case V32HI_FTYPE_V8HI_V32HI_USI:
36067 case V32HI_FTYPE_HI_V32HI_USI:
36068 case V16HI_FTYPE_V8HI_V16HI_UHI:
36069 case V16HI_FTYPE_HI_V16HI_UHI:
36070 case V8HI_FTYPE_V8HI_V8HI_UQI:
36071 case V8HI_FTYPE_HI_V8HI_UQI:
36072 case V8SF_FTYPE_V8HI_V8SF_UQI:
36073 case V4SF_FTYPE_V8HI_V4SF_UQI:
36074 case V8SI_FTYPE_V8SF_V8SI_UQI:
36075 case V4SI_FTYPE_V4SF_V4SI_UQI:
36076 case V4DI_FTYPE_V4SF_V4DI_UQI:
36077 case V2DI_FTYPE_V4SF_V2DI_UQI:
36078 case V4SF_FTYPE_V4DI_V4SF_UQI:
36079 case V4SF_FTYPE_V2DI_V4SF_UQI:
36080 case V4DF_FTYPE_V4DI_V4DF_UQI:
36081 case V2DF_FTYPE_V2DI_V2DF_UQI:
36082 case V16QI_FTYPE_V8HI_V16QI_UQI:
36083 case V16QI_FTYPE_V16HI_V16QI_UHI:
36084 case V16QI_FTYPE_V4SI_V16QI_UQI:
36085 case V16QI_FTYPE_V8SI_V16QI_UQI:
36086 case V8HI_FTYPE_V4SI_V8HI_UQI:
36087 case V8HI_FTYPE_V8SI_V8HI_UQI:
36088 case V16QI_FTYPE_V2DI_V16QI_UQI:
36089 case V16QI_FTYPE_V4DI_V16QI_UQI:
36090 case V8HI_FTYPE_V2DI_V8HI_UQI:
36091 case V8HI_FTYPE_V4DI_V8HI_UQI:
36092 case V4SI_FTYPE_V2DI_V4SI_UQI:
36093 case V4SI_FTYPE_V4DI_V4SI_UQI:
36094 case V32QI_FTYPE_V32HI_V32QI_USI:
36095 case UHI_FTYPE_V16QI_V16QI_UHI:
36096 case USI_FTYPE_V32QI_V32QI_USI:
36097 case UDI_FTYPE_V64QI_V64QI_UDI:
36098 case UQI_FTYPE_V8HI_V8HI_UQI:
36099 case UHI_FTYPE_V16HI_V16HI_UHI:
36100 case USI_FTYPE_V32HI_V32HI_USI:
36101 case UQI_FTYPE_V4SI_V4SI_UQI:
36102 case UQI_FTYPE_V8SI_V8SI_UQI:
36103 case UQI_FTYPE_V2DI_V2DI_UQI:
36104 case UQI_FTYPE_V4DI_V4DI_UQI:
36105 case V4SF_FTYPE_V2DF_V4SF_UQI:
36106 case V4SF_FTYPE_V4DF_V4SF_UQI:
36107 case V16SI_FTYPE_V16SI_V16SI_UHI:
36108 case V16SI_FTYPE_V4SI_V16SI_UHI:
36109 case V2DI_FTYPE_V4SI_V2DI_UQI:
36110 case V2DI_FTYPE_V8HI_V2DI_UQI:
36111 case V2DI_FTYPE_V16QI_V2DI_UQI:
36112 case V4DI_FTYPE_V4DI_V4DI_UQI:
36113 case V4DI_FTYPE_V4SI_V4DI_UQI:
36114 case V4DI_FTYPE_V8HI_V4DI_UQI:
36115 case V4DI_FTYPE_V16QI_V4DI_UQI:
36116 case V4DI_FTYPE_V4DF_V4DI_UQI:
36117 case V2DI_FTYPE_V2DF_V2DI_UQI:
36118 case V4SI_FTYPE_V4DF_V4SI_UQI:
36119 case V4SI_FTYPE_V2DF_V4SI_UQI:
36120 case V4SI_FTYPE_V8HI_V4SI_UQI:
36121 case V4SI_FTYPE_V16QI_V4SI_UQI:
36122 case V4DI_FTYPE_V4DI_V4DI_V4DI:
36123 case V8DF_FTYPE_V2DF_V8DF_UQI:
36124 case V8DF_FTYPE_V4DF_V8DF_UQI:
36125 case V8DF_FTYPE_V8DF_V8DF_UQI:
36126 case V8SF_FTYPE_V8SF_V8SF_UQI:
36127 case V8SF_FTYPE_V8SI_V8SF_UQI:
36128 case V4DF_FTYPE_V4DF_V4DF_UQI:
36129 case V4SF_FTYPE_V4SF_V4SF_UQI:
36130 case V2DF_FTYPE_V2DF_V2DF_UQI:
36131 case V2DF_FTYPE_V4SF_V2DF_UQI:
36132 case V2DF_FTYPE_V4SI_V2DF_UQI:
36133 case V4SF_FTYPE_V4SI_V4SF_UQI:
36134 case V4DF_FTYPE_V4SF_V4DF_UQI:
36135 case V4DF_FTYPE_V4SI_V4DF_UQI:
36136 case V8SI_FTYPE_V8SI_V8SI_UQI:
36137 case V8SI_FTYPE_V8HI_V8SI_UQI:
36138 case V8SI_FTYPE_V16QI_V8SI_UQI:
36139 case V8DF_FTYPE_V8SI_V8DF_UQI:
36140 case V8DI_FTYPE_DI_V8DI_UQI:
36141 case V16SF_FTYPE_V8SF_V16SF_UHI:
36142 case V16SI_FTYPE_V8SI_V16SI_UHI:
36143 case V16HI_FTYPE_V16HI_V16HI_UHI:
36144 case V8HI_FTYPE_V16QI_V8HI_UQI:
36145 case V16HI_FTYPE_V16QI_V16HI_UHI:
36146 case V32HI_FTYPE_V32HI_V32HI_USI:
36147 case V32HI_FTYPE_V32QI_V32HI_USI:
36148 case V8DI_FTYPE_V16QI_V8DI_UQI:
36149 case V8DI_FTYPE_V2DI_V8DI_UQI:
36150 case V8DI_FTYPE_V4DI_V8DI_UQI:
36151 case V8DI_FTYPE_V8DI_V8DI_UQI:
36152 case V8DI_FTYPE_V8HI_V8DI_UQI:
36153 case V8DI_FTYPE_V8SI_V8DI_UQI:
36154 case V8HI_FTYPE_V8DI_V8HI_UQI:
36155 case V8SI_FTYPE_V8DI_V8SI_UQI:
36156 case V4SI_FTYPE_V4SI_V4SI_V4SI:
36157 nargs = 3;
36158 break;
36159 case V32QI_FTYPE_V32QI_V32QI_INT:
36160 case V16HI_FTYPE_V16HI_V16HI_INT:
36161 case V16QI_FTYPE_V16QI_V16QI_INT:
36162 case V4DI_FTYPE_V4DI_V4DI_INT:
36163 case V8HI_FTYPE_V8HI_V8HI_INT:
36164 case V8SI_FTYPE_V8SI_V8SI_INT:
36165 case V8SI_FTYPE_V8SI_V4SI_INT:
36166 case V8SF_FTYPE_V8SF_V8SF_INT:
36167 case V8SF_FTYPE_V8SF_V4SF_INT:
36168 case V4SI_FTYPE_V4SI_V4SI_INT:
36169 case V4DF_FTYPE_V4DF_V4DF_INT:
36170 case V16SF_FTYPE_V16SF_V16SF_INT:
36171 case V16SF_FTYPE_V16SF_V4SF_INT:
36172 case V16SI_FTYPE_V16SI_V4SI_INT:
36173 case V4DF_FTYPE_V4DF_V2DF_INT:
36174 case V4SF_FTYPE_V4SF_V4SF_INT:
36175 case V2DI_FTYPE_V2DI_V2DI_INT:
36176 case V4DI_FTYPE_V4DI_V2DI_INT:
36177 case V2DF_FTYPE_V2DF_V2DF_INT:
36178 case UQI_FTYPE_V8DI_V8UDI_INT:
36179 case UQI_FTYPE_V8DF_V8DF_INT:
36180 case UQI_FTYPE_V2DF_V2DF_INT:
36181 case UQI_FTYPE_V4SF_V4SF_INT:
36182 case UHI_FTYPE_V16SI_V16SI_INT:
36183 case UHI_FTYPE_V16SF_V16SF_INT:
36184 nargs = 3;
36185 nargs_constant = 1;
36186 break;
36187 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
36188 nargs = 3;
36189 rmode = V4DImode;
36190 nargs_constant = 1;
36191 break;
36192 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
36193 nargs = 3;
36194 rmode = V2DImode;
36195 nargs_constant = 1;
36196 break;
36197 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
36198 nargs = 3;
36199 rmode = DImode;
36200 nargs_constant = 1;
36201 break;
36202 case V2DI_FTYPE_V2DI_UINT_UINT:
36203 nargs = 3;
36204 nargs_constant = 2;
36205 break;
36206 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
36207 nargs = 3;
36208 rmode = V8DImode;
36209 nargs_constant = 1;
36210 break;
36211 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
36212 nargs = 5;
36213 rmode = V8DImode;
36214 mask_pos = 2;
36215 nargs_constant = 1;
36216 break;
36217 case QI_FTYPE_V8DF_INT_UQI:
36218 case QI_FTYPE_V4DF_INT_UQI:
36219 case QI_FTYPE_V2DF_INT_UQI:
36220 case HI_FTYPE_V16SF_INT_UHI:
36221 case QI_FTYPE_V8SF_INT_UQI:
36222 case QI_FTYPE_V4SF_INT_UQI:
36223 nargs = 3;
36224 mask_pos = 1;
36225 nargs_constant = 1;
36226 break;
36227 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
36228 nargs = 5;
36229 rmode = V4DImode;
36230 mask_pos = 2;
36231 nargs_constant = 1;
36232 break;
36233 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
36234 nargs = 5;
36235 rmode = V2DImode;
36236 mask_pos = 2;
36237 nargs_constant = 1;
36238 break;
36239 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
36240 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
36241 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
36242 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
36243 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
36244 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
36245 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
36246 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
36247 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
36248 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
36249 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
36250 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
36251 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
36252 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
36253 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
36254 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
36255 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
36256 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
36257 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
36258 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
36259 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
36260 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
36261 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
36262 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
36263 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
36264 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
36265 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
36266 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
36267 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
36268 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
36269 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
36270 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
36271 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
36272 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
36273 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
36274 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
36275 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
36276 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
36277 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
36278 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
36279 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
36280 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
36281 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
36282 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
36283 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
36284 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
36285 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
36286 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
36287 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
36288 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
36289 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
36290 nargs = 4;
36291 break;
36292 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
36293 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
36294 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
36295 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
36296 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
36297 nargs = 4;
36298 nargs_constant = 1;
36299 break;
36300 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
36301 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
36302 case QI_FTYPE_V4DF_V4DF_INT_UQI:
36303 case QI_FTYPE_V8SF_V8SF_INT_UQI:
36304 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
36305 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
36306 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
36307 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
36308 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
36309 case USI_FTYPE_V32QI_V32QI_INT_USI:
36310 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
36311 case USI_FTYPE_V32HI_V32HI_INT_USI:
36312 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
36313 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
36314 nargs = 4;
36315 mask_pos = 1;
36316 nargs_constant = 1;
36317 break;
36318 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
36319 nargs = 4;
36320 nargs_constant = 2;
36321 break;
36322 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
36323 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
36324 nargs = 4;
36325 break;
36326 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
36327 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
36328 mask_pos = 1;
36329 nargs = 4;
36330 nargs_constant = 1;
36331 break;
36332 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
36333 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
36334 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
36335 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
36336 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
36337 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
36338 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
36339 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
36340 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
36341 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
36342 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
36343 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
36344 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
36345 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
36346 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
36347 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
36348 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
36349 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
36350 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
36351 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
36352 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
36353 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
36354 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
36355 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
36356 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
36357 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
36358 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
36359 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
36360 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
36361 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
36362 nargs = 4;
36363 mask_pos = 2;
36364 nargs_constant = 1;
36365 break;
36366 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
36367 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
36368 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
36369 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
36370 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
36371 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
36372 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
36373 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
36374 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
36375 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
36376 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
36377 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
36378 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
36379 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
36380 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
36381 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
36382 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
36383 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
36384 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
36385 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
36386 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
36387 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
36388 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
36389 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
36390 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
36391 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
36392 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
36393 nargs = 5;
36394 mask_pos = 2;
36395 nargs_constant = 1;
36396 break;
36397 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
36398 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
36399 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
36400 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
36401 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
36402 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
36403 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
36404 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
36405 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
36406 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
36407 nargs = 5;
36408 mask_pos = 1;
36409 nargs_constant = 1;
36410 break;
36412 default:
36413 gcc_unreachable ();
36416 gcc_assert (nargs <= ARRAY_SIZE (args));
36418 if (comparison != UNKNOWN)
36420 gcc_assert (nargs == 2);
36421 return ix86_expand_sse_compare (d, exp, target, swap);
36424 if (rmode == VOIDmode || rmode == tmode)
36426 if (optimize
36427 || target == 0
36428 || GET_MODE (target) != tmode
36429 || !insn_p->operand[0].predicate (target, tmode))
36430 target = gen_reg_rtx (tmode);
36431 else if (memory_operand (target, tmode))
36432 num_memory++;
36433 real_target = target;
36435 else
36437 real_target = gen_reg_rtx (tmode);
36438 target = lowpart_subreg (rmode, real_target, tmode);
36441 for (i = 0; i < nargs; i++)
36443 tree arg = CALL_EXPR_ARG (exp, i);
36444 rtx op = expand_normal (arg);
36445 machine_mode mode = insn_p->operand[i + 1].mode;
36446 bool match = insn_p->operand[i + 1].predicate (op, mode);
36448 if (second_arg_count && i == 1)
36450 /* SIMD shift insns take either an 8-bit immediate or a
36451 register as the count.  But the builtin functions take an int
36452 as the count.  If the count doesn't match, put it in a register.
36453 The instructions use a 64-bit count; if op is only
36454 32-bit, zero-extend it, since negative shift counts
36455 are undefined behavior and zero-extension is more
36456 efficient.  */
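/* Illustration: a 32-bit int count passed to one of the *_COUNT
   builtins is widened by the convert_modes call below with
   unsignedp == 1, i.e. zero-extended into the count mode declared by
   the insn pattern, never sign-extended.  */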
36457 if (!match)
36459 if (SCALAR_INT_MODE_P (GET_MODE (op)))
36460 op = convert_modes (mode, GET_MODE (op), op, 1);
36461 else
36462 op = lowpart_subreg (mode, op, GET_MODE (op));
36463 if (!insn_p->operand[i + 1].predicate (op, mode))
36464 op = copy_to_reg (op);
36467 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
36468 || (!mask_pos && (nargs - i) <= nargs_constant))
36470 if (!match)
36471 switch (icode)
36473 case CODE_FOR_avx_vinsertf128v4di:
36474 case CODE_FOR_avx_vextractf128v4di:
36475 error ("the last argument must be a 1-bit immediate");
36476 return const0_rtx;
36478 case CODE_FOR_avx512f_cmpv8di3_mask:
36479 case CODE_FOR_avx512f_cmpv16si3_mask:
36480 case CODE_FOR_avx512f_ucmpv8di3_mask:
36481 case CODE_FOR_avx512f_ucmpv16si3_mask:
36482 case CODE_FOR_avx512vl_cmpv4di3_mask:
36483 case CODE_FOR_avx512vl_cmpv8si3_mask:
36484 case CODE_FOR_avx512vl_ucmpv4di3_mask:
36485 case CODE_FOR_avx512vl_ucmpv8si3_mask:
36486 case CODE_FOR_avx512vl_cmpv2di3_mask:
36487 case CODE_FOR_avx512vl_cmpv4si3_mask:
36488 case CODE_FOR_avx512vl_ucmpv2di3_mask:
36489 case CODE_FOR_avx512vl_ucmpv4si3_mask:
36490 error ("the last argument must be a 3-bit immediate");
36491 return const0_rtx;
36493 case CODE_FOR_sse4_1_roundsd:
36494 case CODE_FOR_sse4_1_roundss:
36496 case CODE_FOR_sse4_1_roundpd:
36497 case CODE_FOR_sse4_1_roundps:
36498 case CODE_FOR_avx_roundpd256:
36499 case CODE_FOR_avx_roundps256:
36501 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
36502 case CODE_FOR_sse4_1_roundps_sfix:
36503 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
36504 case CODE_FOR_avx_roundps_sfix256:
36506 case CODE_FOR_sse4_1_blendps:
36507 case CODE_FOR_avx_blendpd256:
36508 case CODE_FOR_avx_vpermilv4df:
36509 case CODE_FOR_avx_vpermilv4df_mask:
36510 case CODE_FOR_avx512f_getmantv8df_mask:
36511 case CODE_FOR_avx512f_getmantv16sf_mask:
36512 case CODE_FOR_avx512vl_getmantv8sf_mask:
36513 case CODE_FOR_avx512vl_getmantv4df_mask:
36514 case CODE_FOR_avx512vl_getmantv4sf_mask:
36515 case CODE_FOR_avx512vl_getmantv2df_mask:
36516 case CODE_FOR_avx512dq_rangepv8df_mask_round:
36517 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
36518 case CODE_FOR_avx512dq_rangepv4df_mask:
36519 case CODE_FOR_avx512dq_rangepv8sf_mask:
36520 case CODE_FOR_avx512dq_rangepv2df_mask:
36521 case CODE_FOR_avx512dq_rangepv4sf_mask:
36522 case CODE_FOR_avx_shufpd256_mask:
36523 error ("the last argument must be a 4-bit immediate");
36524 return const0_rtx;
36526 case CODE_FOR_sha1rnds4:
36527 case CODE_FOR_sse4_1_blendpd:
36528 case CODE_FOR_avx_vpermilv2df:
36529 case CODE_FOR_avx_vpermilv2df_mask:
36530 case CODE_FOR_xop_vpermil2v2df3:
36531 case CODE_FOR_xop_vpermil2v4sf3:
36532 case CODE_FOR_xop_vpermil2v4df3:
36533 case CODE_FOR_xop_vpermil2v8sf3:
36534 case CODE_FOR_avx512f_vinsertf32x4_mask:
36535 case CODE_FOR_avx512f_vinserti32x4_mask:
36536 case CODE_FOR_avx512f_vextractf32x4_mask:
36537 case CODE_FOR_avx512f_vextracti32x4_mask:
36538 case CODE_FOR_sse2_shufpd:
36539 case CODE_FOR_sse2_shufpd_mask:
36540 case CODE_FOR_avx512dq_shuf_f64x2_mask:
36541 case CODE_FOR_avx512dq_shuf_i64x2_mask:
36542 case CODE_FOR_avx512vl_shuf_i32x4_mask:
36543 case CODE_FOR_avx512vl_shuf_f32x4_mask:
36544 error ("the last argument must be a 2-bit immediate");
36545 return const0_rtx;
36547 case CODE_FOR_avx_vextractf128v4df:
36548 case CODE_FOR_avx_vextractf128v8sf:
36549 case CODE_FOR_avx_vextractf128v8si:
36550 case CODE_FOR_avx_vinsertf128v4df:
36551 case CODE_FOR_avx_vinsertf128v8sf:
36552 case CODE_FOR_avx_vinsertf128v8si:
36553 case CODE_FOR_avx512f_vinsertf64x4_mask:
36554 case CODE_FOR_avx512f_vinserti64x4_mask:
36555 case CODE_FOR_avx512f_vextractf64x4_mask:
36556 case CODE_FOR_avx512f_vextracti64x4_mask:
36557 case CODE_FOR_avx512dq_vinsertf32x8_mask:
36558 case CODE_FOR_avx512dq_vinserti32x8_mask:
36559 case CODE_FOR_avx512vl_vinsertv4df:
36560 case CODE_FOR_avx512vl_vinsertv4di:
36561 case CODE_FOR_avx512vl_vinsertv8sf:
36562 case CODE_FOR_avx512vl_vinsertv8si:
36563 error ("the last argument must be a 1-bit immediate");
36564 return const0_rtx;
36566 case CODE_FOR_avx_vmcmpv2df3:
36567 case CODE_FOR_avx_vmcmpv4sf3:
36568 case CODE_FOR_avx_cmpv2df3:
36569 case CODE_FOR_avx_cmpv4sf3:
36570 case CODE_FOR_avx_cmpv4df3:
36571 case CODE_FOR_avx_cmpv8sf3:
36572 case CODE_FOR_avx512f_cmpv8df3_mask:
36573 case CODE_FOR_avx512f_cmpv16sf3_mask:
36574 case CODE_FOR_avx512f_vmcmpv2df3_mask:
36575 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
36576 error ("the last argument must be a 5-bit immediate");
36577 return const0_rtx;
36579 default:
36580 switch (nargs_constant)
36582 case 2:
36583 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
36584 || (!mask_pos && (nargs - i) == nargs_constant))
36586 error ("the next to last argument must be an 8-bit immediate");
36587 break;
36589 /* FALLTHRU */
36590 case 1:
36591 error ("the last argument must be an 8-bit immediate");
36592 break;
36593 default:
36594 gcc_unreachable ();
36596 return const0_rtx;
36599 else
36601 if (VECTOR_MODE_P (mode))
36602 op = safe_vector_operand (op, mode);
36604 /* If we aren't optimizing, only allow one memory operand to
36605 be generated. */
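/* num_memory feeds the copy_to_mode_reg test below: once more than
   one memory operand has been seen, the extra ones are copied into
   registers so the emitted insn keeps at most one MEM.  */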
36606 if (memory_operand (op, mode))
36607 num_memory++;
36609 op = fixup_modeless_constant (op, mode);
36611 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36613 if (optimize || !match || num_memory > 1)
36614 op = copy_to_mode_reg (mode, op);
36616 else
36618 op = copy_to_reg (op);
36619 op = lowpart_subreg (mode, op, GET_MODE (op));
36623 args[i].op = op;
36624 args[i].mode = mode;
36627 switch (nargs)
36629 case 1:
36630 pat = GEN_FCN (icode) (real_target, args[0].op);
36631 break;
36632 case 2:
36633 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
36634 break;
36635 case 3:
36636 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36637 args[2].op);
36638 break;
36639 case 4:
36640 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36641 args[2].op, args[3].op);
36642 break;
36643 case 5:
36644 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36645 args[2].op, args[3].op, args[4].op);
36646 break;
36647 case 6:
36648 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
36649 args[2].op, args[3].op, args[4].op,
36650 args[5].op);
36651 break;
36652 default:
36653 gcc_unreachable ();
36656 if (! pat)
36657 return 0;
36659 emit_insn (pat);
36660 return target;
36663 /* Transform a pattern of the following layout:
36664 (set A
36665 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
36667 into:
36668 (set A B)  */
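/* The callers use this when the rounding immediate turns out to be
   NO_ROUND: the UNSPEC_EMBEDDED_ROUNDING wrapper is then redundant,
   so the insn is rebuilt as a plain SET of the first unspec operand
   (see ix86_expand_sse_comi_round and ix86_expand_round_builtin).  */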
36670 static rtx
36671 ix86_erase_embedded_rounding (rtx pat)
36673 if (GET_CODE (pat) == INSN)
36674 pat = PATTERN (pat);
36676 gcc_assert (GET_CODE (pat) == SET);
36677 rtx src = SET_SRC (pat);
36678 gcc_assert (XVECLEN (src, 0) == 2);
36679 rtx p0 = XVECEXP (src, 0, 0);
36680 gcc_assert (GET_CODE (src) == UNSPEC
36681 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
36682 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
36683 return res;
36686 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
36687 with rounding. */
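/* The third argument selects one of the 32 comparison predicates (the
   _CMP_* values in avxintrin.h); the tables below translate it into
   an rtx comparison code and decide whether the quiet (ucomi) rather
   than the signaling (comi) pattern has to be used.  The fourth
   argument is the embedded rounding/SAE operand, which is dropped
   again when it is NO_ROUND.  */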
36688 static rtx
36689 ix86_expand_sse_comi_round (const struct builtin_description *d,
36690 tree exp, rtx target)
36692 rtx pat, set_dst;
36693 tree arg0 = CALL_EXPR_ARG (exp, 0);
36694 tree arg1 = CALL_EXPR_ARG (exp, 1);
36695 tree arg2 = CALL_EXPR_ARG (exp, 2);
36696 tree arg3 = CALL_EXPR_ARG (exp, 3);
36697 rtx op0 = expand_normal (arg0);
36698 rtx op1 = expand_normal (arg1);
36699 rtx op2 = expand_normal (arg2);
36700 rtx op3 = expand_normal (arg3);
36701 enum insn_code icode = d->icode;
36702 const struct insn_data_d *insn_p = &insn_data[icode];
36703 machine_mode mode0 = insn_p->operand[0].mode;
36704 machine_mode mode1 = insn_p->operand[1].mode;
36705 enum rtx_code comparison = UNEQ;
36706 bool need_ucomi = false;
36708 /* See avxintrin.h for values. */
36709 enum rtx_code comi_comparisons[32] =
36711 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
36712 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
36713 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
36715 bool need_ucomi_values[32] =
36717 true, false, false, true, true, false, false, true,
36718 true, false, false, true, true, false, false, true,
36719 false, true, true, false, false, true, true, false,
36720 false, true, true, false, false, true, true, false
36723 if (!CONST_INT_P (op2))
36725 error ("the third argument must be a comparison constant");
36726 return const0_rtx;
36728 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
36730 error ("incorrect comparison mode");
36731 return const0_rtx;
36734 if (!insn_p->operand[2].predicate (op3, SImode))
36736 error ("incorrect rounding operand");
36737 return const0_rtx;
36740 comparison = comi_comparisons[INTVAL (op2)];
36741 need_ucomi = need_ucomi_values[INTVAL (op2)];
36743 if (VECTOR_MODE_P (mode0))
36744 op0 = safe_vector_operand (op0, mode0);
36745 if (VECTOR_MODE_P (mode1))
36746 op1 = safe_vector_operand (op1, mode1);
36748 target = gen_reg_rtx (SImode);
36749 emit_move_insn (target, const0_rtx);
36750 target = gen_rtx_SUBREG (QImode, target, 0);
36752 if ((optimize && !register_operand (op0, mode0))
36753 || !insn_p->operand[0].predicate (op0, mode0))
36754 op0 = copy_to_mode_reg (mode0, op0);
36755 if ((optimize && !register_operand (op1, mode1))
36756 || !insn_p->operand[1].predicate (op1, mode1))
36757 op1 = copy_to_mode_reg (mode1, op1);
36759 if (need_ucomi)
36760 icode = icode == CODE_FOR_sse_comi_round
36761 ? CODE_FOR_sse_ucomi_round
36762 : CODE_FOR_sse2_ucomi_round;
36764 pat = GEN_FCN (icode) (op0, op1, op3);
36765 if (! pat)
36766 return 0;
36768 /* The rounding operand can be either NO_ROUND or ROUND_SAE at this point.  */
36769 if (INTVAL (op3) == NO_ROUND)
36771 pat = ix86_erase_embedded_rounding (pat);
36772 if (! pat)
36773 return 0;
36775 set_dst = SET_DEST (pat);
36777 else
36779 gcc_assert (GET_CODE (pat) == SET);
36780 set_dst = SET_DEST (pat);
36783 emit_insn (pat);
36784 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
36785 gen_rtx_fmt_ee (comparison, QImode,
36786 set_dst,
36787 const0_rtx)));
36789 return SUBREG_REG (target);
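/* Subroutine of ix86_expand_builtin to take care of insns whose last
   operand is an embedded rounding / SAE immediate (the "_round"
   builtin variants).  When that operand is NO_ROUND, the embedded
   rounding is erased again before the insn is emitted.  */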
36792 static rtx
36793 ix86_expand_round_builtin (const struct builtin_description *d,
36794 tree exp, rtx target)
36796 rtx pat;
36797 unsigned int i, nargs;
36798 struct
36800 rtx op;
36801 machine_mode mode;
36802 } args[6];
36803 enum insn_code icode = d->icode;
36804 const struct insn_data_d *insn_p = &insn_data[icode];
36805 machine_mode tmode = insn_p->operand[0].mode;
36806 unsigned int nargs_constant = 0;
36807 unsigned int redundant_embed_rnd = 0;
36809 switch ((enum ix86_builtin_func_type) d->flag)
36811 case UINT64_FTYPE_V2DF_INT:
36812 case UINT64_FTYPE_V4SF_INT:
36813 case UINT_FTYPE_V2DF_INT:
36814 case UINT_FTYPE_V4SF_INT:
36815 case INT64_FTYPE_V2DF_INT:
36816 case INT64_FTYPE_V4SF_INT:
36817 case INT_FTYPE_V2DF_INT:
36818 case INT_FTYPE_V4SF_INT:
36819 nargs = 2;
36820 break;
36821 case V4SF_FTYPE_V4SF_UINT_INT:
36822 case V4SF_FTYPE_V4SF_UINT64_INT:
36823 case V2DF_FTYPE_V2DF_UINT64_INT:
36824 case V4SF_FTYPE_V4SF_INT_INT:
36825 case V4SF_FTYPE_V4SF_INT64_INT:
36826 case V2DF_FTYPE_V2DF_INT64_INT:
36827 case V4SF_FTYPE_V4SF_V4SF_INT:
36828 case V2DF_FTYPE_V2DF_V2DF_INT:
36829 case V4SF_FTYPE_V4SF_V2DF_INT:
36830 case V2DF_FTYPE_V2DF_V4SF_INT:
36831 nargs = 3;
36832 break;
36833 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
36834 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
36835 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
36836 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
36837 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
36838 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
36839 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
36840 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
36841 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
36842 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
36843 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
36844 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
36845 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
36846 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
36847 nargs = 4;
36848 break;
36849 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
36850 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
36851 nargs_constant = 2;
36852 nargs = 4;
36853 break;
36854 case INT_FTYPE_V4SF_V4SF_INT_INT:
36855 case INT_FTYPE_V2DF_V2DF_INT_INT:
36856 return ix86_expand_sse_comi_round (d, exp, target);
36857 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
36858 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
36859 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
36860 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
36861 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
36862 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
36863 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
36864 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
36865 nargs = 5;
36866 break;
36867 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
36868 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
36869 nargs_constant = 4;
36870 nargs = 5;
36871 break;
36872 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
36873 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
36874 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
36875 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
36876 nargs_constant = 3;
36877 nargs = 5;
36878 break;
36879 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
36880 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
36881 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
36882 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
36883 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
36884 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
36885 nargs = 6;
36886 nargs_constant = 4;
36887 break;
36888 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
36889 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
36890 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
36891 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
36892 nargs = 6;
36893 nargs_constant = 3;
36894 break;
36895 default:
36896 gcc_unreachable ();
36898 gcc_assert (nargs <= ARRAY_SIZE (args));
36900 if (optimize
36901 || target == 0
36902 || GET_MODE (target) != tmode
36903 || !insn_p->operand[0].predicate (target, tmode))
36904 target = gen_reg_rtx (tmode);
36906 for (i = 0; i < nargs; i++)
36908 tree arg = CALL_EXPR_ARG (exp, i);
36909 rtx op = expand_normal (arg);
36910 machine_mode mode = insn_p->operand[i + 1].mode;
36911 bool match = insn_p->operand[i + 1].predicate (op, mode);
36913 if (i == nargs - nargs_constant)
36915 if (!match)
36917 switch (icode)
36919 case CODE_FOR_avx512f_getmantv8df_mask_round:
36920 case CODE_FOR_avx512f_getmantv16sf_mask_round:
36921 case CODE_FOR_avx512f_vgetmantv2df_round:
36922 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
36923 case CODE_FOR_avx512f_vgetmantv4sf_round:
36924 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
36925 error ("the immediate argument must be a 4-bit immediate");
36926 return const0_rtx;
36927 case CODE_FOR_avx512f_cmpv8df3_mask_round:
36928 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
36929 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
36930 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
36931 error ("the immediate argument must be a 5-bit immediate");
36932 return const0_rtx;
36933 default:
36934 error ("the immediate argument must be an 8-bit immediate");
36935 return const0_rtx;
36939 else if (i == nargs - 1)
36941 if (!insn_p->operand[nargs].predicate (op, SImode))
36943 error ("incorrect rounding operand");
36944 return const0_rtx;
36947 /* If there is no rounding, use the normal version of the pattern.  */
36948 if (INTVAL (op) == NO_ROUND)
36949 redundant_embed_rnd = 1;
36951 else
36953 if (VECTOR_MODE_P (mode))
36954 op = safe_vector_operand (op, mode);
36956 op = fixup_modeless_constant (op, mode);
36958 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36960 if (optimize || !match)
36961 op = copy_to_mode_reg (mode, op);
36963 else
36965 op = copy_to_reg (op);
36966 op = lowpart_subreg (mode, op, GET_MODE (op));
36970 args[i].op = op;
36971 args[i].mode = mode;
36974 switch (nargs)
36976 case 1:
36977 pat = GEN_FCN (icode) (target, args[0].op);
36978 break;
36979 case 2:
36980 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
36981 break;
36982 case 3:
36983 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36984 args[2].op);
36985 break;
36986 case 4:
36987 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36988 args[2].op, args[3].op);
36989 break;
36990 case 5:
36991 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36992 args[2].op, args[3].op, args[4].op);
36993 break;
36994 case 6:
36995 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36996 args[2].op, args[3].op, args[4].op,
36997 args[5].op);
36998 break;
36999 default:
37000 gcc_unreachable ();
37003 if (!pat)
37004 return 0;
37006 if (redundant_embed_rnd)
37007 pat = ix86_erase_embedded_rounding (pat);
37009 emit_insn (pat);
37010 return target;
37013 /* Subroutine of ix86_expand_builtin to take care of special insns
37014 with variable number of operands. */
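/* "Special" builtins are chiefly loads, stores and other insns with a
   memory operand.  klass selects load vs. store handling, "memory"
   records which operand (if any) has to be wrapped in a MEM, and
   aligned_mem marks the instructions that require strictly aligned
   memory so that the MEM's alignment is raised below via
   set_mem_align.  */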
37016 static rtx
37017 ix86_expand_special_args_builtin (const struct builtin_description *d,
37018 tree exp, rtx target)
37020 tree arg;
37021 rtx pat, op;
37022 unsigned int i, nargs, arg_adjust, memory;
37023 bool aligned_mem = false;
37024 struct
37026 rtx op;
37027 machine_mode mode;
37028 } args[3];
37029 enum insn_code icode = d->icode;
37030 bool last_arg_constant = false;
37031 const struct insn_data_d *insn_p = &insn_data[icode];
37032 machine_mode tmode = insn_p->operand[0].mode;
37033 enum { load, store } klass;
37035 switch ((enum ix86_builtin_func_type) d->flag)
37037 case VOID_FTYPE_VOID:
37038 emit_insn (GEN_FCN (icode) (target));
37039 return 0;
37040 case VOID_FTYPE_UINT64:
37041 case VOID_FTYPE_UNSIGNED:
37042 nargs = 0;
37043 klass = store;
37044 memory = 0;
37045 break;
37047 case INT_FTYPE_VOID:
37048 case USHORT_FTYPE_VOID:
37049 case UINT64_FTYPE_VOID:
37050 case UNSIGNED_FTYPE_VOID:
37051 nargs = 0;
37052 klass = load;
37053 memory = 0;
37054 break;
37055 case UINT64_FTYPE_PUNSIGNED:
37056 case V2DI_FTYPE_PV2DI:
37057 case V4DI_FTYPE_PV4DI:
37058 case V32QI_FTYPE_PCCHAR:
37059 case V16QI_FTYPE_PCCHAR:
37060 case V8SF_FTYPE_PCV4SF:
37061 case V8SF_FTYPE_PCFLOAT:
37062 case V4SF_FTYPE_PCFLOAT:
37063 case V4DF_FTYPE_PCV2DF:
37064 case V4DF_FTYPE_PCDOUBLE:
37065 case V2DF_FTYPE_PCDOUBLE:
37066 case VOID_FTYPE_PVOID:
37067 case V8DI_FTYPE_PV8DI:
37068 nargs = 1;
37069 klass = load;
37070 memory = 0;
37071 switch (icode)
37073 case CODE_FOR_sse4_1_movntdqa:
37074 case CODE_FOR_avx2_movntdqa:
37075 case CODE_FOR_avx512f_movntdqa:
37076 aligned_mem = true;
37077 break;
37078 default:
37079 break;
37081 break;
37082 case VOID_FTYPE_PV2SF_V4SF:
37083 case VOID_FTYPE_PV8DI_V8DI:
37084 case VOID_FTYPE_PV4DI_V4DI:
37085 case VOID_FTYPE_PV2DI_V2DI:
37086 case VOID_FTYPE_PCHAR_V32QI:
37087 case VOID_FTYPE_PCHAR_V16QI:
37088 case VOID_FTYPE_PFLOAT_V16SF:
37089 case VOID_FTYPE_PFLOAT_V8SF:
37090 case VOID_FTYPE_PFLOAT_V4SF:
37091 case VOID_FTYPE_PDOUBLE_V8DF:
37092 case VOID_FTYPE_PDOUBLE_V4DF:
37093 case VOID_FTYPE_PDOUBLE_V2DF:
37094 case VOID_FTYPE_PLONGLONG_LONGLONG:
37095 case VOID_FTYPE_PULONGLONG_ULONGLONG:
37096 case VOID_FTYPE_PINT_INT:
37097 nargs = 1;
37098 klass = store;
37099 /* Reserve memory operand for target. */
37100 memory = ARRAY_SIZE (args);
37101 switch (icode)
37103 /* These builtins and instructions require the memory
37104 to be properly aligned. */
37105 case CODE_FOR_avx_movntv4di:
37106 case CODE_FOR_sse2_movntv2di:
37107 case CODE_FOR_avx_movntv8sf:
37108 case CODE_FOR_sse_movntv4sf:
37109 case CODE_FOR_sse4a_vmmovntv4sf:
37110 case CODE_FOR_avx_movntv4df:
37111 case CODE_FOR_sse2_movntv2df:
37112 case CODE_FOR_sse4a_vmmovntv2df:
37113 case CODE_FOR_sse2_movntidi:
37114 case CODE_FOR_sse_movntq:
37115 case CODE_FOR_sse2_movntisi:
37116 case CODE_FOR_avx512f_movntv16sf:
37117 case CODE_FOR_avx512f_movntv8df:
37118 case CODE_FOR_avx512f_movntv8di:
37119 aligned_mem = true;
37120 break;
37121 default:
37122 break;
37124 break;
37125 case V4SF_FTYPE_V4SF_PCV2SF:
37126 case V2DF_FTYPE_V2DF_PCDOUBLE:
37127 nargs = 2;
37128 klass = load;
37129 memory = 1;
37130 break;
37131 case V8SF_FTYPE_PCV8SF_V8SI:
37132 case V4DF_FTYPE_PCV4DF_V4DI:
37133 case V4SF_FTYPE_PCV4SF_V4SI:
37134 case V2DF_FTYPE_PCV2DF_V2DI:
37135 case V8SI_FTYPE_PCV8SI_V8SI:
37136 case V4DI_FTYPE_PCV4DI_V4DI:
37137 case V4SI_FTYPE_PCV4SI_V4SI:
37138 case V2DI_FTYPE_PCV2DI_V2DI:
37139 case VOID_FTYPE_INT_INT64:
37140 nargs = 2;
37141 klass = load;
37142 memory = 0;
37143 break;
37144 case VOID_FTYPE_PV8DF_V8DF_UQI:
37145 case VOID_FTYPE_PV4DF_V4DF_UQI:
37146 case VOID_FTYPE_PV2DF_V2DF_UQI:
37147 case VOID_FTYPE_PV16SF_V16SF_UHI:
37148 case VOID_FTYPE_PV8SF_V8SF_UQI:
37149 case VOID_FTYPE_PV4SF_V4SF_UQI:
37150 case VOID_FTYPE_PV8DI_V8DI_UQI:
37151 case VOID_FTYPE_PV4DI_V4DI_UQI:
37152 case VOID_FTYPE_PV2DI_V2DI_UQI:
37153 case VOID_FTYPE_PV16SI_V16SI_UHI:
37154 case VOID_FTYPE_PV8SI_V8SI_UQI:
37155 case VOID_FTYPE_PV4SI_V4SI_UQI:
37156 switch (icode)
37158 /* These builtins and instructions require the memory
37159 to be properly aligned. */
37160 case CODE_FOR_avx512f_storev16sf_mask:
37161 case CODE_FOR_avx512f_storev16si_mask:
37162 case CODE_FOR_avx512f_storev8df_mask:
37163 case CODE_FOR_avx512f_storev8di_mask:
37164 case CODE_FOR_avx512vl_storev8sf_mask:
37165 case CODE_FOR_avx512vl_storev8si_mask:
37166 case CODE_FOR_avx512vl_storev4df_mask:
37167 case CODE_FOR_avx512vl_storev4di_mask:
37168 case CODE_FOR_avx512vl_storev4sf_mask:
37169 case CODE_FOR_avx512vl_storev4si_mask:
37170 case CODE_FOR_avx512vl_storev2df_mask:
37171 case CODE_FOR_avx512vl_storev2di_mask:
37172 aligned_mem = true;
37173 break;
37174 default:
37175 break;
37177 /* FALLTHRU */
37178 case VOID_FTYPE_PV8SF_V8SI_V8SF:
37179 case VOID_FTYPE_PV4DF_V4DI_V4DF:
37180 case VOID_FTYPE_PV4SF_V4SI_V4SF:
37181 case VOID_FTYPE_PV2DF_V2DI_V2DF:
37182 case VOID_FTYPE_PV8SI_V8SI_V8SI:
37183 case VOID_FTYPE_PV4DI_V4DI_V4DI:
37184 case VOID_FTYPE_PV4SI_V4SI_V4SI:
37185 case VOID_FTYPE_PV2DI_V2DI_V2DI:
37186 case VOID_FTYPE_PV8SI_V8DI_UQI:
37187 case VOID_FTYPE_PV8HI_V8DI_UQI:
37188 case VOID_FTYPE_PV16HI_V16SI_UHI:
37189 case VOID_FTYPE_PV16QI_V8DI_UQI:
37190 case VOID_FTYPE_PV16QI_V16SI_UHI:
37191 case VOID_FTYPE_PV4SI_V4DI_UQI:
37192 case VOID_FTYPE_PV4SI_V2DI_UQI:
37193 case VOID_FTYPE_PV8HI_V4DI_UQI:
37194 case VOID_FTYPE_PV8HI_V2DI_UQI:
37195 case VOID_FTYPE_PV8HI_V8SI_UQI:
37196 case VOID_FTYPE_PV8HI_V4SI_UQI:
37197 case VOID_FTYPE_PV16QI_V4DI_UQI:
37198 case VOID_FTYPE_PV16QI_V2DI_UQI:
37199 case VOID_FTYPE_PV16QI_V8SI_UQI:
37200 case VOID_FTYPE_PV16QI_V4SI_UQI:
37201 case VOID_FTYPE_PCHAR_V64QI_UDI:
37202 case VOID_FTYPE_PCHAR_V32QI_USI:
37203 case VOID_FTYPE_PCHAR_V16QI_UHI:
37204 case VOID_FTYPE_PSHORT_V32HI_USI:
37205 case VOID_FTYPE_PSHORT_V16HI_UHI:
37206 case VOID_FTYPE_PSHORT_V8HI_UQI:
37207 case VOID_FTYPE_PINT_V16SI_UHI:
37208 case VOID_FTYPE_PINT_V8SI_UQI:
37209 case VOID_FTYPE_PINT_V4SI_UQI:
37210 case VOID_FTYPE_PINT64_V8DI_UQI:
37211 case VOID_FTYPE_PINT64_V4DI_UQI:
37212 case VOID_FTYPE_PINT64_V2DI_UQI:
37213 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
37214 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
37215 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
37216 case VOID_FTYPE_PFLOAT_V16SF_UHI:
37217 case VOID_FTYPE_PFLOAT_V8SF_UQI:
37218 case VOID_FTYPE_PFLOAT_V4SF_UQI:
37219 case VOID_FTYPE_PV32QI_V32HI_USI:
37220 case VOID_FTYPE_PV16QI_V16HI_UHI:
37221 case VOID_FTYPE_PV8QI_V8HI_UQI:
37222 nargs = 2;
37223 klass = store;
37224 /* Reserve memory operand for target. */
37225 memory = ARRAY_SIZE (args);
37226 break;
37227 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
37228 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
37229 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
37230 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
37231 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
37232 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
37233 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
37234 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
37235 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
37236 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
37237 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
37238 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
37239 switch (icode)
37241 /* These builtins and instructions require the memory
37242 to be properly aligned. */
37243 case CODE_FOR_avx512f_loadv16sf_mask:
37244 case CODE_FOR_avx512f_loadv16si_mask:
37245 case CODE_FOR_avx512f_loadv8df_mask:
37246 case CODE_FOR_avx512f_loadv8di_mask:
37247 case CODE_FOR_avx512vl_loadv8sf_mask:
37248 case CODE_FOR_avx512vl_loadv8si_mask:
37249 case CODE_FOR_avx512vl_loadv4df_mask:
37250 case CODE_FOR_avx512vl_loadv4di_mask:
37251 case CODE_FOR_avx512vl_loadv4sf_mask:
37252 case CODE_FOR_avx512vl_loadv4si_mask:
37253 case CODE_FOR_avx512vl_loadv2df_mask:
37254 case CODE_FOR_avx512vl_loadv2di_mask:
37255 case CODE_FOR_avx512bw_loadv64qi_mask:
37256 case CODE_FOR_avx512vl_loadv32qi_mask:
37257 case CODE_FOR_avx512vl_loadv16qi_mask:
37258 case CODE_FOR_avx512bw_loadv32hi_mask:
37259 case CODE_FOR_avx512vl_loadv16hi_mask:
37260 case CODE_FOR_avx512vl_loadv8hi_mask:
37261 aligned_mem = true;
37262 break;
37263 default:
37264 break;
37266 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
37267 case V32QI_FTYPE_PCCHAR_V32QI_USI:
37268 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
37269 case V32HI_FTYPE_PCSHORT_V32HI_USI:
37270 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
37271 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
37272 case V16SI_FTYPE_PCINT_V16SI_UHI:
37273 case V8SI_FTYPE_PCINT_V8SI_UQI:
37274 case V4SI_FTYPE_PCINT_V4SI_UQI:
37275 case V8DI_FTYPE_PCINT64_V8DI_UQI:
37276 case V4DI_FTYPE_PCINT64_V4DI_UQI:
37277 case V2DI_FTYPE_PCINT64_V2DI_UQI:
37278 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
37279 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
37280 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
37281 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
37282 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
37283 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
37284 nargs = 3;
37285 klass = load;
37286 memory = 0;
37287 break;
37288 case VOID_FTYPE_UINT_UINT_UINT:
37289 case VOID_FTYPE_UINT64_UINT_UINT:
37290 case UCHAR_FTYPE_UINT_UINT_UINT:
37291 case UCHAR_FTYPE_UINT64_UINT_UINT:
37292 nargs = 3;
37293 klass = load;
37294 memory = ARRAY_SIZE (args);
37295 last_arg_constant = true;
37296 break;
37297 default:
37298 gcc_unreachable ();
37301 gcc_assert (nargs <= ARRAY_SIZE (args));
37303 if (klass == store)
37305 arg = CALL_EXPR_ARG (exp, 0);
37306 op = expand_normal (arg);
37307 gcc_assert (target == 0);
37308 if (memory)
37310 op = ix86_zero_extend_to_Pmode (op);
37311 target = gen_rtx_MEM (tmode, op);
37312 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
37313 on it. Try to improve it using get_pointer_alignment,
37314 and if the special builtin is one that requires strict
37315 mode alignment, also from its GET_MODE_ALIGNMENT.
37316 Failure to do so could lead to ix86_legitimate_combined_insn
37317 rejecting all changes to such insns. */
37318 unsigned int align = get_pointer_alignment (arg);
37319 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
37320 align = GET_MODE_ALIGNMENT (tmode);
37321 if (MEM_ALIGN (target) < align)
37322 set_mem_align (target, align);
37324 else
37325 target = force_reg (tmode, op);
37326 arg_adjust = 1;
37328 else
37330 arg_adjust = 0;
37331 if (optimize
37332 || target == 0
37333 || !register_operand (target, tmode)
37334 || GET_MODE (target) != tmode)
37335 target = gen_reg_rtx (tmode);
37338 for (i = 0; i < nargs; i++)
37340 machine_mode mode = insn_p->operand[i + 1].mode;
37341 bool match;
37343 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
37344 op = expand_normal (arg);
37345 match = insn_p->operand[i + 1].predicate (op, mode);
37347 if (last_arg_constant && (i + 1) == nargs)
37349 if (!match)
37351 if (icode == CODE_FOR_lwp_lwpvalsi3
37352 || icode == CODE_FOR_lwp_lwpinssi3
37353 || icode == CODE_FOR_lwp_lwpvaldi3
37354 || icode == CODE_FOR_lwp_lwpinsdi3)
37355 error ("the last argument must be a 32-bit immediate");
37356 else
37357 error ("the last argument must be an 8-bit immediate");
37358 return const0_rtx;
37361 else
37363 if (i == memory)
37365 /* This must be the memory operand. */
37366 op = ix86_zero_extend_to_Pmode (op);
37367 op = gen_rtx_MEM (mode, op);
37368 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
37369 on it. Try to improve it using get_pointer_alignment,
37370 and if the special builtin is one that requires strict
37371 mode alignment, also from its GET_MODE_ALIGNMENT.
37372 Failure to do so could lead to ix86_legitimate_combined_insn
37373 rejecting all changes to such insns. */
37374 unsigned int align = get_pointer_alignment (arg);
37375 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
37376 align = GET_MODE_ALIGNMENT (mode);
37377 if (MEM_ALIGN (op) < align)
37378 set_mem_align (op, align);
37380 else
37382 /* This must be a register. */
37383 if (VECTOR_MODE_P (mode))
37384 op = safe_vector_operand (op, mode);
37386 op = fixup_modeless_constant (op, mode);
37388 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
37389 op = copy_to_mode_reg (mode, op);
37390 else
37392 op = copy_to_reg (op);
37393 op = lowpart_subreg (mode, op, GET_MODE (op));
37398 args[i].op = op;
37399 args[i].mode = mode;
37402 switch (nargs)
37404 case 0:
37405 pat = GEN_FCN (icode) (target);
37406 break;
37407 case 1:
37408 pat = GEN_FCN (icode) (target, args[0].op);
37409 break;
37410 case 2:
37411 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
37412 break;
37413 case 3:
37414 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
37415 break;
37416 default:
37417 gcc_unreachable ();
37420 if (! pat)
37421 return 0;
37422 emit_insn (pat);
37423 return klass == store ? 0 : target;
37426 /* Return the integer constant in ARG. Constrain it to be in the range
37427 of the subparts of VEC_TYPE; issue an error if not. */
37429 static int
37430 get_element_number (tree vec_type, tree arg)
37432 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
37434 if (!tree_fits_uhwi_p (arg)
37435 || (elt = tree_to_uhwi (arg), elt > max))
37437 error ("selector must be an integer constant in the range 0..%wi", max);
37438 return 0;
37441 return elt;
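/* For example, a V4SF vector has TYPE_VECTOR_SUBPARTS of 4, so MAX is 3
   and a constant selector of 4 is rejected with the "range 0..3" error
   above.  */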
37444 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
37445 ix86_expand_vector_init. We DO have language-level syntax for this, in
37446 the form of (type){ init-list }. Except that since we can't place emms
37447 instructions from inside the compiler, we can't allow the use of MMX
37448 registers unless the user explicitly asks for it. So we do *not* define
37449 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
37450 we have builtins invoked by mmintrin.h that give us license to emit
37451 these sorts of instructions. */
37453 static rtx
37454 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
37456 machine_mode tmode = TYPE_MODE (type);
37457 machine_mode inner_mode = GET_MODE_INNER (tmode);
37458 int i, n_elt = GET_MODE_NUNITS (tmode);
37459 rtvec v = rtvec_alloc (n_elt);
37461 gcc_assert (VECTOR_MODE_P (tmode));
37462 gcc_assert (call_expr_nargs (exp) == n_elt);
37464 for (i = 0; i < n_elt; ++i)
37466 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
37467 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
37470 if (!target || !register_operand (target, tmode))
37471 target = gen_reg_rtx (tmode);
37473 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
37474 return target;
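/* For illustration, assuming the usual mmintrin.h wrappers, the MMX set
   intrinsics are defined roughly as

     __m64 _mm_set_pi32 (int __i1, int __i0)
     { return (__m64) __builtin_ia32_vec_init_v2si (__i1, __i0); }

   and that builtin (IX86_BUILTIN_VEC_INIT_V2SI) is what ends up being
   expanded by the function above.  */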
37477 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
37478 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
37479 had a language-level syntax for referencing vector elements. */
37481 static rtx
37482 ix86_expand_vec_ext_builtin (tree exp, rtx target)
37484 machine_mode tmode, mode0;
37485 tree arg0, arg1;
37486 int elt;
37487 rtx op0;
37489 arg0 = CALL_EXPR_ARG (exp, 0);
37490 arg1 = CALL_EXPR_ARG (exp, 1);
37492 op0 = expand_normal (arg0);
37493 elt = get_element_number (TREE_TYPE (arg0), arg1);
37495 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
37496 mode0 = TYPE_MODE (TREE_TYPE (arg0));
37497 gcc_assert (VECTOR_MODE_P (mode0));
37499 op0 = force_reg (mode0, op0);
37501 if (optimize || !target || !register_operand (target, tmode))
37502 target = gen_reg_rtx (tmode);
37504 ix86_expand_vector_extract (true, target, op0, elt);
37506 return target;
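/* For illustration, assuming the usual emmintrin.h wrapper,
   _mm_extract_epi16 is defined roughly as

     int _mm_extract_epi16 (__m128i __A, int __N)
     { return (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, __N); }

   so __N must satisfy the get_element_number range check above.  */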
37509 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
37510 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
37511 a language-level syntax for referencing vector elements. */
37513 static rtx
37514 ix86_expand_vec_set_builtin (tree exp)
37516 machine_mode tmode, mode1;
37517 tree arg0, arg1, arg2;
37518 int elt;
37519 rtx op0, op1, target;
37521 arg0 = CALL_EXPR_ARG (exp, 0);
37522 arg1 = CALL_EXPR_ARG (exp, 1);
37523 arg2 = CALL_EXPR_ARG (exp, 2);
37525 tmode = TYPE_MODE (TREE_TYPE (arg0));
37526 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
37527 gcc_assert (VECTOR_MODE_P (tmode));
37529 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
37530 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
37531 elt = get_element_number (TREE_TYPE (arg0), arg2);
37533 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
37534 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
37536 op0 = force_reg (tmode, op0);
37537 op1 = force_reg (mode1, op1);
37539 /* OP0 is the source of these builtin functions and shouldn't be
37540 modified. Create a copy, use it and return it as target. */
37541 target = gen_reg_rtx (tmode);
37542 emit_move_insn (target, op0);
37543 ix86_expand_vector_set (true, target, op1, elt);
37545 return target;
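/* For illustration, assuming the usual emmintrin.h wrapper,
   _mm_insert_epi16 is defined roughly as

     __m128i _mm_insert_epi16 (__m128i __A, int __D, int __N)
     { return (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)__A, __D, __N); }

   Note that __A itself is left unmodified; the copy made above is what
   gets the element stored into it and is returned.  */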
37548 /* Emit conditional move of SRC to DST with condition
37549 OP1 CODE OP2. */
37550 static void
37551 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
37553 rtx t;
37555 if (TARGET_CMOVE)
37557 t = ix86_expand_compare (code, op1, op2);
37558 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
37559 src, dst)));
37561 else
37563 rtx_code_label *nomove = gen_label_rtx ();
37564 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
37565 const0_rtx, GET_MODE (op1), 1, nomove);
37566 emit_move_insn (dst, src);
37567 emit_label (nomove);
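/* Without TARGET_CMOVE the same effect is obtained by branching around an
   unconditional move, roughly:

     cmp   op1, op2
     jcc   .Lnomove        ; jump on the reversed condition
     mov   dst, src
   .Lnomove:

   i.e. DST is overwritten only when OP1 CODE OP2 holds.  */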
37571 /* Choose the (unsigned) max of DST and SRC and put it in DST. */
37572 static void
37573 ix86_emit_move_max (rtx dst, rtx src)
37575 ix86_emit_cmove (dst, src, LTU, dst, src);
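/* With the unsigned (LTU) comparison this is equivalent to
   "if (dst < src) dst = src;", i.e. DST becomes the unsigned maximum of
   the two values.  */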
37578 /* Expand an expression EXP that calls a built-in function,
37579 with result going to TARGET if that's convenient
37580 (and in mode MODE if that's convenient).
37581 SUBTARGET may be used as the target for computing one of EXP's operands.
37582 IGNORE is nonzero if the value is to be ignored. */
37584 static rtx
37585 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
37586 machine_mode mode, int ignore)
37588 size_t i;
37589 enum insn_code icode;
37590 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
37591 tree arg0, arg1, arg2, arg3, arg4;
37592 rtx op0, op1, op2, op3, op4, pat, insn;
37593 machine_mode mode0, mode1, mode2, mode3, mode4;
37594 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
37596 /* For CPU builtins that can be folded, fold first and expand the fold. */
37597 switch (fcode)
37599 case IX86_BUILTIN_CPU_INIT:
37601 /* Make it call __cpu_indicator_init in libgcc. */
37602 tree call_expr, fndecl, type;
37603 type = build_function_type_list (integer_type_node, NULL_TREE);
37604 fndecl = build_fn_decl ("__cpu_indicator_init", type);
37605 call_expr = build_call_expr (fndecl, 0);
37606 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
37608 case IX86_BUILTIN_CPU_IS:
37609 case IX86_BUILTIN_CPU_SUPPORTS:
37611 tree arg0 = CALL_EXPR_ARG (exp, 0);
37612 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
37613 gcc_assert (fold_expr != NULL_TREE);
37614 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
37618 /* Determine whether the builtin function is available under the current ISA.
37619 Originally the builtin was not created if it wasn't applicable to the
37620 current ISA based on the command line switches. With function specific
37621 options, we need to check in the context of the function making the call
37622 whether it is supported. Treat AVX512VL specially. For other flags,
37623 if isa includes more than one ISA bit, treat those as requiring any
37624 of them. For AVX512VL, require both AVX512VL and the non-AVX512VL
37625 ISAs. Similarly for 64BIT, but we shouldn't be building such builtins
37626 at all, -m64 is a whole TU option. */
37627 if (((ix86_builtins_isa[fcode].isa
37628 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_64BIT))
37629 && !(ix86_builtins_isa[fcode].isa
37630 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_64BIT)
37631 & ix86_isa_flags))
37632 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_AVX512VL)
37633 && !(ix86_isa_flags & OPTION_MASK_ISA_AVX512VL))
37634 || (ix86_builtins_isa[fcode].isa2
37635 && !(ix86_builtins_isa[fcode].isa2 & ix86_isa_flags2)))
37637 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa,
37638 ix86_builtins_isa[fcode].isa2, 0, 0,
37639 NULL, NULL, (enum fpmath_unit) 0,
37640 false);
37641 if (!opts)
37642 error ("%qE needs unknown isa option", fndecl);
37643 else
37645 gcc_assert (opts != NULL);
37646 error ("%qE needs isa option %s", fndecl, opts);
37647 free (opts);
37649 return expand_call (exp, target, ignore);
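/* For example, calling an AVX512VL-masked builtin from a function whose
   target options lack the required ISA is diagnosed here via the
   "%qE needs isa option %s" error above, rather than the builtin simply
   not existing as it did before function-specific options.  */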
37652 switch (fcode)
37654 case IX86_BUILTIN_BNDMK:
37655 if (!target
37656 || GET_MODE (target) != BNDmode
37657 || !register_operand (target, BNDmode))
37658 target = gen_reg_rtx (BNDmode);
37660 arg0 = CALL_EXPR_ARG (exp, 0);
37661 arg1 = CALL_EXPR_ARG (exp, 1);
37663 op0 = expand_normal (arg0);
37664 op1 = expand_normal (arg1);
37666 if (!register_operand (op0, Pmode))
37667 op0 = ix86_zero_extend_to_Pmode (op0);
37668 if (!register_operand (op1, Pmode))
37669 op1 = ix86_zero_extend_to_Pmode (op1);
37671 /* Builtin arg1 is the size of the block, but instruction op1 should
37672 be (size - 1); e.g. a 16-byte block yields an upper-bound offset of 15. */
37673 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
37674 NULL_RTX, 1, OPTAB_DIRECT);
37676 emit_insn (BNDmode == BND64mode
37677 ? gen_bnd64_mk (target, op0, op1)
37678 : gen_bnd32_mk (target, op0, op1));
37679 return target;
37681 case IX86_BUILTIN_BNDSTX:
37682 arg0 = CALL_EXPR_ARG (exp, 0);
37683 arg1 = CALL_EXPR_ARG (exp, 1);
37684 arg2 = CALL_EXPR_ARG (exp, 2);
37686 op0 = expand_normal (arg0);
37687 op1 = expand_normal (arg1);
37688 op2 = expand_normal (arg2);
37690 if (!register_operand (op0, Pmode))
37691 op0 = ix86_zero_extend_to_Pmode (op0);
37692 if (!register_operand (op1, BNDmode))
37693 op1 = copy_to_mode_reg (BNDmode, op1);
37694 if (!register_operand (op2, Pmode))
37695 op2 = ix86_zero_extend_to_Pmode (op2);
37697 emit_insn (BNDmode == BND64mode
37698 ? gen_bnd64_stx (op2, op0, op1)
37699 : gen_bnd32_stx (op2, op0, op1));
37700 return 0;
37702 case IX86_BUILTIN_BNDLDX:
37703 if (!target
37704 || GET_MODE (target) != BNDmode
37705 || !register_operand (target, BNDmode))
37706 target = gen_reg_rtx (BNDmode);
37708 arg0 = CALL_EXPR_ARG (exp, 0);
37709 arg1 = CALL_EXPR_ARG (exp, 1);
37711 op0 = expand_normal (arg0);
37712 op1 = expand_normal (arg1);
37714 if (!register_operand (op0, Pmode))
37715 op0 = ix86_zero_extend_to_Pmode (op0);
37716 if (!register_operand (op1, Pmode))
37717 op1 = ix86_zero_extend_to_Pmode (op1);
37719 emit_insn (BNDmode == BND64mode
37720 ? gen_bnd64_ldx (target, op0, op1)
37721 : gen_bnd32_ldx (target, op0, op1));
37722 return target;
37724 case IX86_BUILTIN_BNDCL:
37725 arg0 = CALL_EXPR_ARG (exp, 0);
37726 arg1 = CALL_EXPR_ARG (exp, 1);
37728 op0 = expand_normal (arg0);
37729 op1 = expand_normal (arg1);
37731 if (!register_operand (op0, Pmode))
37732 op0 = ix86_zero_extend_to_Pmode (op0);
37733 if (!register_operand (op1, BNDmode))
37734 op1 = copy_to_mode_reg (BNDmode, op1);
37736 emit_insn (BNDmode == BND64mode
37737 ? gen_bnd64_cl (op1, op0)
37738 : gen_bnd32_cl (op1, op0));
37739 return 0;
37741 case IX86_BUILTIN_BNDCU:
37742 arg0 = CALL_EXPR_ARG (exp, 0);
37743 arg1 = CALL_EXPR_ARG (exp, 1);
37745 op0 = expand_normal (arg0);
37746 op1 = expand_normal (arg1);
37748 if (!register_operand (op0, Pmode))
37749 op0 = ix86_zero_extend_to_Pmode (op0);
37750 if (!register_operand (op1, BNDmode))
37751 op1 = copy_to_mode_reg (BNDmode, op1);
37753 emit_insn (BNDmode == BND64mode
37754 ? gen_bnd64_cu (op1, op0)
37755 : gen_bnd32_cu (op1, op0));
37756 return 0;
37758 case IX86_BUILTIN_BNDRET:
37759 arg0 = CALL_EXPR_ARG (exp, 0);
37760 target = chkp_get_rtl_bounds (arg0);
37762 /* If no bounds were specified for the returned value,
37763 then use INIT bounds. This usually happens when
37764 some built-in function is expanded. */
37765 if (!target)
37767 rtx t1 = gen_reg_rtx (Pmode);
37768 rtx t2 = gen_reg_rtx (Pmode);
37769 target = gen_reg_rtx (BNDmode);
37770 emit_move_insn (t1, const0_rtx);
37771 emit_move_insn (t2, constm1_rtx);
37772 emit_insn (BNDmode == BND64mode
37773 ? gen_bnd64_mk (target, t1, t2)
37774 : gen_bnd32_mk (target, t1, t2));
37777 gcc_assert (target && REG_P (target));
37778 return target;
37780 case IX86_BUILTIN_BNDNARROW:
37782 rtx m1, m1h1, m1h2, lb, ub, t1;
37784 /* Return value and lb. */
37785 arg0 = CALL_EXPR_ARG (exp, 0);
37786 /* Bounds. */
37787 arg1 = CALL_EXPR_ARG (exp, 1);
37788 /* Size. */
37789 arg2 = CALL_EXPR_ARG (exp, 2);
37791 lb = expand_normal (arg0);
37792 op1 = expand_normal (arg1);
37793 op2 = expand_normal (arg2);
37795 /* Size was passed but we need to use (size - 1) as for bndmk. */
37796 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
37797 NULL_RTX, 1, OPTAB_DIRECT);
37799 /* Add LB to size and invert to get UB. */
37800 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
37801 op2, 1, OPTAB_DIRECT);
37802 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
37804 if (!register_operand (lb, Pmode))
37805 lb = ix86_zero_extend_to_Pmode (lb);
37806 if (!register_operand (ub, Pmode))
37807 ub = ix86_zero_extend_to_Pmode (ub);
37809 /* We need to move bounds to memory before any computations. */
37810 if (MEM_P (op1))
37811 m1 = op1;
37812 else
37814 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
37815 emit_move_insn (m1, op1);
37818 /* Generate mem expression to be used for access to LB and UB. */
37819 m1h1 = adjust_address (m1, Pmode, 0);
37820 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
37822 t1 = gen_reg_rtx (Pmode);
37824 /* Compute LB. */
37825 emit_move_insn (t1, m1h1);
37826 ix86_emit_move_max (t1, lb);
37827 emit_move_insn (m1h1, t1);
37829 /* Compute UB. UB is stored in 1's complement form. Therefore
37830 we also use max here. */
37831 emit_move_insn (t1, m1h2);
37832 ix86_emit_move_max (t1, ub);
37833 emit_move_insn (m1h2, t1);
37835 op2 = gen_reg_rtx (BNDmode);
37836 emit_move_insn (op2, m1);
37838 return chkp_join_splitted_slot (lb, op2);
37841 case IX86_BUILTIN_BNDINT:
37843 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
37845 if (!target
37846 || GET_MODE (target) != BNDmode
37847 || !register_operand (target, BNDmode))
37848 target = gen_reg_rtx (BNDmode);
37850 arg0 = CALL_EXPR_ARG (exp, 0);
37851 arg1 = CALL_EXPR_ARG (exp, 1);
37853 op0 = expand_normal (arg0);
37854 op1 = expand_normal (arg1);
37856 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
37857 rh1 = adjust_address (res, Pmode, 0);
37858 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
37860 /* Put first bounds to temporaries. */
37861 lb1 = gen_reg_rtx (Pmode);
37862 ub1 = gen_reg_rtx (Pmode);
37863 if (MEM_P (op0))
37865 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
37866 emit_move_insn (ub1, adjust_address (op0, Pmode,
37867 GET_MODE_SIZE (Pmode)));
37869 else
37871 emit_move_insn (res, op0);
37872 emit_move_insn (lb1, rh1);
37873 emit_move_insn (ub1, rh2);
37876 /* Put second bounds to temporaries. */
37877 lb2 = gen_reg_rtx (Pmode);
37878 ub2 = gen_reg_rtx (Pmode);
37879 if (MEM_P (op1))
37881 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
37882 emit_move_insn (ub2, adjust_address (op1, Pmode,
37883 GET_MODE_SIZE (Pmode)));
37885 else
37887 emit_move_insn (res, op1);
37888 emit_move_insn (lb2, rh1);
37889 emit_move_insn (ub2, rh2);
37892 /* Compute LB. */
37893 ix86_emit_move_max (lb1, lb2);
37894 emit_move_insn (rh1, lb1);
37896 /* Compute UB. UB is stored in 1's complement form. Therefore
37897 we also use max here. */
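/* E.g. intersecting [0x1000, 0x10ff] with [0x1040, 0x107f]:
   max (0x1000, 0x1040) gives LB 0x1040, and since the stored UBs are
   ~0x10ff and ~0x107f, max (~0x10ff, ~0x107f) == ~0x107f, i.e. the
   smaller (tighter) upper bound, so a single max works for both
   halves.  */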
37898 ix86_emit_move_max (ub1, ub2);
37899 emit_move_insn (rh2, ub1);
37901 emit_move_insn (target, res);
37903 return target;
37906 case IX86_BUILTIN_SIZEOF:
37908 tree name;
37909 rtx symbol;
37911 if (!target
37912 || GET_MODE (target) != Pmode
37913 || !register_operand (target, Pmode))
37914 target = gen_reg_rtx (Pmode);
37916 arg0 = CALL_EXPR_ARG (exp, 0);
37917 gcc_assert (VAR_P (arg0));
37919 name = DECL_ASSEMBLER_NAME (arg0);
37920 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
37922 emit_insn (Pmode == SImode
37923 ? gen_move_size_reloc_si (target, symbol)
37924 : gen_move_size_reloc_di (target, symbol));
37926 return target;
37929 case IX86_BUILTIN_BNDLOWER:
37931 rtx mem, hmem;
37933 if (!target
37934 || GET_MODE (target) != Pmode
37935 || !register_operand (target, Pmode))
37936 target = gen_reg_rtx (Pmode);
37938 arg0 = CALL_EXPR_ARG (exp, 0);
37939 op0 = expand_normal (arg0);
37941 /* We need to move bounds to memory first. */
37942 if (MEM_P (op0))
37943 mem = op0;
37944 else
37946 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
37947 emit_move_insn (mem, op0);
37950 /* Generate mem expression to access LB and load it. */
37951 hmem = adjust_address (mem, Pmode, 0);
37952 emit_move_insn (target, hmem);
37954 return target;
37957 case IX86_BUILTIN_BNDUPPER:
37959 rtx mem, hmem, res;
37961 if (!target
37962 || GET_MODE (target) != Pmode
37963 || !register_operand (target, Pmode))
37964 target = gen_reg_rtx (Pmode);
37966 arg0 = CALL_EXPR_ARG (exp, 0);
37967 op0 = expand_normal (arg0);
37969 /* We need to move bounds to memory first. */
37970 if (MEM_P (op0))
37971 mem = op0;
37972 else
37974 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
37975 emit_move_insn (mem, op0);
37978 /* Generate mem expression to access UB. */
37979 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
37981 /* We need to invert all bits of UB. */
37982 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
37984 if (res != target)
37985 emit_move_insn (target, res);
37987 return target;
37990 case IX86_BUILTIN_MASKMOVQ:
37991 case IX86_BUILTIN_MASKMOVDQU:
37992 icode = (fcode == IX86_BUILTIN_MASKMOVQ
37993 ? CODE_FOR_mmx_maskmovq
37994 : CODE_FOR_sse2_maskmovdqu);
37995 /* Note the arg order is different from the operand order. */
37996 arg1 = CALL_EXPR_ARG (exp, 0);
37997 arg2 = CALL_EXPR_ARG (exp, 1);
37998 arg0 = CALL_EXPR_ARG (exp, 2);
37999 op0 = expand_normal (arg0);
38000 op1 = expand_normal (arg1);
38001 op2 = expand_normal (arg2);
38002 mode0 = insn_data[icode].operand[0].mode;
38003 mode1 = insn_data[icode].operand[1].mode;
38004 mode2 = insn_data[icode].operand[2].mode;
38006 op0 = ix86_zero_extend_to_Pmode (op0);
38007 op0 = gen_rtx_MEM (mode1, op0);
38009 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38010 op0 = copy_to_mode_reg (mode0, op0);
38011 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38012 op1 = copy_to_mode_reg (mode1, op1);
38013 if (!insn_data[icode].operand[2].predicate (op2, mode2))
38014 op2 = copy_to_mode_reg (mode2, op2);
38015 pat = GEN_FCN (icode) (op0, op1, op2);
38016 if (! pat)
38017 return 0;
38018 emit_insn (pat);
38019 return 0;
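/* For illustration, assuming the usual emmintrin.h wrapper,
   _mm_maskmoveu_si128 is defined roughly as

     void _mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
     { __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C); }

   so the address is the last builtin argument but the first insn
   operand, hence the arg/operand reordering above.  */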
38021 case IX86_BUILTIN_LDMXCSR:
38022 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
38023 target = assign_386_stack_local (SImode, SLOT_TEMP);
38024 emit_move_insn (target, op0);
38025 emit_insn (gen_sse_ldmxcsr (target));
38026 return 0;
38028 case IX86_BUILTIN_STMXCSR:
38029 target = assign_386_stack_local (SImode, SLOT_TEMP);
38030 emit_insn (gen_sse_stmxcsr (target));
38031 return copy_to_mode_reg (SImode, target);
38033 case IX86_BUILTIN_CLFLUSH:
38034 arg0 = CALL_EXPR_ARG (exp, 0);
38035 op0 = expand_normal (arg0);
38036 icode = CODE_FOR_sse2_clflush;
38037 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38038 op0 = ix86_zero_extend_to_Pmode (op0);
38040 emit_insn (gen_sse2_clflush (op0));
38041 return 0;
38043 case IX86_BUILTIN_CLWB:
38044 arg0 = CALL_EXPR_ARG (exp, 0);
38045 op0 = expand_normal (arg0);
38046 icode = CODE_FOR_clwb;
38047 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38048 op0 = ix86_zero_extend_to_Pmode (op0);
38050 emit_insn (gen_clwb (op0));
38051 return 0;
38053 case IX86_BUILTIN_CLFLUSHOPT:
38054 arg0 = CALL_EXPR_ARG (exp, 0);
38055 op0 = expand_normal (arg0);
38056 icode = CODE_FOR_clflushopt;
38057 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38058 op0 = ix86_zero_extend_to_Pmode (op0);
38060 emit_insn (gen_clflushopt (op0));
38061 return 0;
38063 case IX86_BUILTIN_MONITOR:
38064 case IX86_BUILTIN_MONITORX:
38065 arg0 = CALL_EXPR_ARG (exp, 0);
38066 arg1 = CALL_EXPR_ARG (exp, 1);
38067 arg2 = CALL_EXPR_ARG (exp, 2);
38068 op0 = expand_normal (arg0);
38069 op1 = expand_normal (arg1);
38070 op2 = expand_normal (arg2);
38071 if (!REG_P (op0))
38072 op0 = ix86_zero_extend_to_Pmode (op0);
38073 if (!REG_P (op1))
38074 op1 = copy_to_mode_reg (SImode, op1);
38075 if (!REG_P (op2))
38076 op2 = copy_to_mode_reg (SImode, op2);
38078 emit_insn (fcode == IX86_BUILTIN_MONITOR
38079 ? ix86_gen_monitor (op0, op1, op2)
38080 : ix86_gen_monitorx (op0, op1, op2));
38081 return 0;
38083 case IX86_BUILTIN_MWAIT:
38084 arg0 = CALL_EXPR_ARG (exp, 0);
38085 arg1 = CALL_EXPR_ARG (exp, 1);
38086 op0 = expand_normal (arg0);
38087 op1 = expand_normal (arg1);
38088 if (!REG_P (op0))
38089 op0 = copy_to_mode_reg (SImode, op0);
38090 if (!REG_P (op1))
38091 op1 = copy_to_mode_reg (SImode, op1);
38092 emit_insn (gen_sse3_mwait (op0, op1));
38093 return 0;
38095 case IX86_BUILTIN_MWAITX:
38096 arg0 = CALL_EXPR_ARG (exp, 0);
38097 arg1 = CALL_EXPR_ARG (exp, 1);
38098 arg2 = CALL_EXPR_ARG (exp, 2);
38099 op0 = expand_normal (arg0);
38100 op1 = expand_normal (arg1);
38101 op2 = expand_normal (arg2);
38102 if (!REG_P (op0))
38103 op0 = copy_to_mode_reg (SImode, op0);
38104 if (!REG_P (op1))
38105 op1 = copy_to_mode_reg (SImode, op1);
38106 if (!REG_P (op2))
38107 op2 = copy_to_mode_reg (SImode, op2);
38108 emit_insn (gen_mwaitx (op0, op1, op2));
38109 return 0;
38111 case IX86_BUILTIN_CLZERO:
38112 arg0 = CALL_EXPR_ARG (exp, 0);
38113 op0 = expand_normal (arg0);
38114 if (!REG_P (op0))
38115 op0 = ix86_zero_extend_to_Pmode (op0);
38116 emit_insn (ix86_gen_clzero (op0));
38117 return 0;
38119 case IX86_BUILTIN_VEC_INIT_V2SI:
38120 case IX86_BUILTIN_VEC_INIT_V4HI:
38121 case IX86_BUILTIN_VEC_INIT_V8QI:
38122 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
38124 case IX86_BUILTIN_VEC_EXT_V2DF:
38125 case IX86_BUILTIN_VEC_EXT_V2DI:
38126 case IX86_BUILTIN_VEC_EXT_V4SF:
38127 case IX86_BUILTIN_VEC_EXT_V4SI:
38128 case IX86_BUILTIN_VEC_EXT_V8HI:
38129 case IX86_BUILTIN_VEC_EXT_V2SI:
38130 case IX86_BUILTIN_VEC_EXT_V4HI:
38131 case IX86_BUILTIN_VEC_EXT_V16QI:
38132 return ix86_expand_vec_ext_builtin (exp, target);
38134 case IX86_BUILTIN_VEC_SET_V2DI:
38135 case IX86_BUILTIN_VEC_SET_V4SF:
38136 case IX86_BUILTIN_VEC_SET_V4SI:
38137 case IX86_BUILTIN_VEC_SET_V8HI:
38138 case IX86_BUILTIN_VEC_SET_V4HI:
38139 case IX86_BUILTIN_VEC_SET_V16QI:
38140 return ix86_expand_vec_set_builtin (exp);
38142 case IX86_BUILTIN_NANQ:
38143 case IX86_BUILTIN_NANSQ:
38144 return expand_call (exp, target, ignore);
38146 case IX86_BUILTIN_RDPMC:
38147 case IX86_BUILTIN_RDTSC:
38148 case IX86_BUILTIN_RDTSCP:
38149 case IX86_BUILTIN_XGETBV:
38151 op0 = gen_reg_rtx (DImode);
38152 op1 = gen_reg_rtx (DImode);
38154 if (fcode == IX86_BUILTIN_RDPMC)
38156 arg0 = CALL_EXPR_ARG (exp, 0);
38157 op2 = expand_normal (arg0);
38158 if (!register_operand (op2, SImode))
38159 op2 = copy_to_mode_reg (SImode, op2);
38161 insn = (TARGET_64BIT
38162 ? gen_rdpmc_rex64 (op0, op1, op2)
38163 : gen_rdpmc (op0, op2));
38164 emit_insn (insn);
38166 else if (fcode == IX86_BUILTIN_XGETBV)
38168 arg0 = CALL_EXPR_ARG (exp, 0);
38169 op2 = expand_normal (arg0);
38170 if (!register_operand (op2, SImode))
38171 op2 = copy_to_mode_reg (SImode, op2);
38173 insn = (TARGET_64BIT
38174 ? gen_xgetbv_rex64 (op0, op1, op2)
38175 : gen_xgetbv (op0, op2));
38176 emit_insn (insn);
38178 else if (fcode == IX86_BUILTIN_RDTSC)
38180 insn = (TARGET_64BIT
38181 ? gen_rdtsc_rex64 (op0, op1)
38182 : gen_rdtsc (op0));
38183 emit_insn (insn);
38185 else
38187 op2 = gen_reg_rtx (SImode);
38189 insn = (TARGET_64BIT
38190 ? gen_rdtscp_rex64 (op0, op1, op2)
38191 : gen_rdtscp (op0, op2));
38192 emit_insn (insn);
38194 arg0 = CALL_EXPR_ARG (exp, 0);
38195 op4 = expand_normal (arg0);
38196 if (!address_operand (op4, VOIDmode))
38198 op4 = convert_memory_address (Pmode, op4);
38199 op4 = copy_addr_to_reg (op4);
38201 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
38204 if (target == 0)
38206 /* mode is VOIDmode if __builtin_rd* has been called
38207 without an lhs. */
38208 if (mode == VOIDmode)
38209 return target;
38210 target = gen_reg_rtx (mode);
38213 if (TARGET_64BIT)
38215 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
38216 op1, 1, OPTAB_DIRECT);
38217 op0 = expand_simple_binop (DImode, IOR, op0, op1,
38218 op0, 1, OPTAB_DIRECT);
38221 emit_move_insn (target, op0);
38222 return target;
38224 case IX86_BUILTIN_FXSAVE:
38225 case IX86_BUILTIN_FXRSTOR:
38226 case IX86_BUILTIN_FXSAVE64:
38227 case IX86_BUILTIN_FXRSTOR64:
38228 case IX86_BUILTIN_FNSTENV:
38229 case IX86_BUILTIN_FLDENV:
38230 mode0 = BLKmode;
38231 switch (fcode)
38233 case IX86_BUILTIN_FXSAVE:
38234 icode = CODE_FOR_fxsave;
38235 break;
38236 case IX86_BUILTIN_FXRSTOR:
38237 icode = CODE_FOR_fxrstor;
38238 break;
38239 case IX86_BUILTIN_FXSAVE64:
38240 icode = CODE_FOR_fxsave64;
38241 break;
38242 case IX86_BUILTIN_FXRSTOR64:
38243 icode = CODE_FOR_fxrstor64;
38244 break;
38245 case IX86_BUILTIN_FNSTENV:
38246 icode = CODE_FOR_fnstenv;
38247 break;
38248 case IX86_BUILTIN_FLDENV:
38249 icode = CODE_FOR_fldenv;
38250 break;
38251 default:
38252 gcc_unreachable ();
38255 arg0 = CALL_EXPR_ARG (exp, 0);
38256 op0 = expand_normal (arg0);
38258 if (!address_operand (op0, VOIDmode))
38260 op0 = convert_memory_address (Pmode, op0);
38261 op0 = copy_addr_to_reg (op0);
38263 op0 = gen_rtx_MEM (mode0, op0);
38265 pat = GEN_FCN (icode) (op0);
38266 if (pat)
38267 emit_insn (pat);
38268 return 0;
38270 case IX86_BUILTIN_XSETBV:
38271 arg0 = CALL_EXPR_ARG (exp, 0);
38272 arg1 = CALL_EXPR_ARG (exp, 1);
38273 op0 = expand_normal (arg0);
38274 op1 = expand_normal (arg1);
38276 if (!REG_P (op0))
38277 op0 = copy_to_mode_reg (SImode, op0);
38279 if (TARGET_64BIT)
38281 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
38282 NULL, 1, OPTAB_DIRECT);
38284 op2 = gen_lowpart (SImode, op2);
38285 op1 = gen_lowpart (SImode, op1);
38286 if (!REG_P (op1))
38287 op1 = copy_to_mode_reg (SImode, op1);
38288 if (!REG_P (op2))
38289 op2 = copy_to_mode_reg (SImode, op2);
38290 icode = CODE_FOR_xsetbv_rex64;
38291 pat = GEN_FCN (icode) (op0, op1, op2);
38293 else
38295 if (!REG_P (op1))
38296 op1 = copy_to_mode_reg (DImode, op1);
38297 icode = CODE_FOR_xsetbv;
38298 pat = GEN_FCN (icode) (op0, op1);
38300 if (pat)
38301 emit_insn (pat);
38302 return 0;
38304 case IX86_BUILTIN_XSAVE:
38305 case IX86_BUILTIN_XRSTOR:
38306 case IX86_BUILTIN_XSAVE64:
38307 case IX86_BUILTIN_XRSTOR64:
38308 case IX86_BUILTIN_XSAVEOPT:
38309 case IX86_BUILTIN_XSAVEOPT64:
38310 case IX86_BUILTIN_XSAVES:
38311 case IX86_BUILTIN_XRSTORS:
38312 case IX86_BUILTIN_XSAVES64:
38313 case IX86_BUILTIN_XRSTORS64:
38314 case IX86_BUILTIN_XSAVEC:
38315 case IX86_BUILTIN_XSAVEC64:
38316 arg0 = CALL_EXPR_ARG (exp, 0);
38317 arg1 = CALL_EXPR_ARG (exp, 1);
38318 op0 = expand_normal (arg0);
38319 op1 = expand_normal (arg1);
38321 if (!address_operand (op0, VOIDmode))
38323 op0 = convert_memory_address (Pmode, op0);
38324 op0 = copy_addr_to_reg (op0);
38326 op0 = gen_rtx_MEM (BLKmode, op0);
38328 op1 = force_reg (DImode, op1);
38330 if (TARGET_64BIT)
38332 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
38333 NULL, 1, OPTAB_DIRECT);
38334 switch (fcode)
38336 case IX86_BUILTIN_XSAVE:
38337 icode = CODE_FOR_xsave_rex64;
38338 break;
38339 case IX86_BUILTIN_XRSTOR:
38340 icode = CODE_FOR_xrstor_rex64;
38341 break;
38342 case IX86_BUILTIN_XSAVE64:
38343 icode = CODE_FOR_xsave64;
38344 break;
38345 case IX86_BUILTIN_XRSTOR64:
38346 icode = CODE_FOR_xrstor64;
38347 break;
38348 case IX86_BUILTIN_XSAVEOPT:
38349 icode = CODE_FOR_xsaveopt_rex64;
38350 break;
38351 case IX86_BUILTIN_XSAVEOPT64:
38352 icode = CODE_FOR_xsaveopt64;
38353 break;
38354 case IX86_BUILTIN_XSAVES:
38355 icode = CODE_FOR_xsaves_rex64;
38356 break;
38357 case IX86_BUILTIN_XRSTORS:
38358 icode = CODE_FOR_xrstors_rex64;
38359 break;
38360 case IX86_BUILTIN_XSAVES64:
38361 icode = CODE_FOR_xsaves64;
38362 break;
38363 case IX86_BUILTIN_XRSTORS64:
38364 icode = CODE_FOR_xrstors64;
38365 break;
38366 case IX86_BUILTIN_XSAVEC:
38367 icode = CODE_FOR_xsavec_rex64;
38368 break;
38369 case IX86_BUILTIN_XSAVEC64:
38370 icode = CODE_FOR_xsavec64;
38371 break;
38372 default:
38373 gcc_unreachable ();
38376 op2 = gen_lowpart (SImode, op2);
38377 op1 = gen_lowpart (SImode, op1);
38378 pat = GEN_FCN (icode) (op0, op1, op2);
38380 else
38382 switch (fcode)
38384 case IX86_BUILTIN_XSAVE:
38385 icode = CODE_FOR_xsave;
38386 break;
38387 case IX86_BUILTIN_XRSTOR:
38388 icode = CODE_FOR_xrstor;
38389 break;
38390 case IX86_BUILTIN_XSAVEOPT:
38391 icode = CODE_FOR_xsaveopt;
38392 break;
38393 case IX86_BUILTIN_XSAVES:
38394 icode = CODE_FOR_xsaves;
38395 break;
38396 case IX86_BUILTIN_XRSTORS:
38397 icode = CODE_FOR_xrstors;
38398 break;
38399 case IX86_BUILTIN_XSAVEC:
38400 icode = CODE_FOR_xsavec;
38401 break;
38402 default:
38403 gcc_unreachable ();
38405 pat = GEN_FCN (icode) (op0, op1);
38408 if (pat)
38409 emit_insn (pat);
38410 return 0;
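/* For illustration, assuming the usual xsaveintrin.h wrapper,
   _xsave is defined roughly as

     void _xsave (void *__P, long long __M)
     { __builtin_ia32_xsave (__P, __M); }

   and the 64-bit feature mask __M is what the TARGET_64BIT branch above
   splits into two 32-bit halves (the lshiftrt/lowpart code) for the
   *_rex64 patterns.  */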
38412 case IX86_BUILTIN_LLWPCB:
38413 arg0 = CALL_EXPR_ARG (exp, 0);
38414 op0 = expand_normal (arg0);
38415 icode = CODE_FOR_lwp_llwpcb;
38416 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38417 op0 = ix86_zero_extend_to_Pmode (op0);
38418 emit_insn (gen_lwp_llwpcb (op0));
38419 return 0;
38421 case IX86_BUILTIN_SLWPCB:
38422 icode = CODE_FOR_lwp_slwpcb;
38423 if (!target
38424 || !insn_data[icode].operand[0].predicate (target, Pmode))
38425 target = gen_reg_rtx (Pmode);
38426 emit_insn (gen_lwp_slwpcb (target));
38427 return target;
38429 case IX86_BUILTIN_BEXTRI32:
38430 case IX86_BUILTIN_BEXTRI64:
38431 arg0 = CALL_EXPR_ARG (exp, 0);
38432 arg1 = CALL_EXPR_ARG (exp, 1);
38433 op0 = expand_normal (arg0);
38434 op1 = expand_normal (arg1);
38435 icode = (fcode == IX86_BUILTIN_BEXTRI32
38436 ? CODE_FOR_tbm_bextri_si
38437 : CODE_FOR_tbm_bextri_di);
38438 if (!CONST_INT_P (op1))
38440 error ("last argument must be an immediate");
38441 return const0_rtx;
38443 else
38445 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
38446 unsigned char lsb_index = INTVAL (op1) & 0xFF;
38447 op1 = GEN_INT (length);
38448 op2 = GEN_INT (lsb_index);
38449 pat = GEN_FCN (icode) (target, op0, op1, op2);
38450 if (pat)
38451 emit_insn (pat);
38452 return target;
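/* The control immediate packs the field length in bits 15:8 and the
   starting bit index in bits 7:0, e.g. 0x0408 extracts a 4-bit field
   starting at bit 8.  */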
38455 case IX86_BUILTIN_RDRAND16_STEP:
38456 icode = CODE_FOR_rdrandhi_1;
38457 mode0 = HImode;
38458 goto rdrand_step;
38460 case IX86_BUILTIN_RDRAND32_STEP:
38461 icode = CODE_FOR_rdrandsi_1;
38462 mode0 = SImode;
38463 goto rdrand_step;
38465 case IX86_BUILTIN_RDRAND64_STEP:
38466 icode = CODE_FOR_rdranddi_1;
38467 mode0 = DImode;
38469 rdrand_step:
38470 arg0 = CALL_EXPR_ARG (exp, 0);
38471 op1 = expand_normal (arg0);
38472 if (!address_operand (op1, VOIDmode))
38474 op1 = convert_memory_address (Pmode, op1);
38475 op1 = copy_addr_to_reg (op1);
38478 op0 = gen_reg_rtx (mode0);
38479 emit_insn (GEN_FCN (icode) (op0));
38481 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
38483 op1 = gen_reg_rtx (SImode);
38484 emit_move_insn (op1, CONST1_RTX (SImode));
38486 /* Emit SImode conditional move. */
38487 if (mode0 == HImode)
38489 if (TARGET_ZERO_EXTEND_WITH_AND
38490 && optimize_function_for_speed_p (cfun))
38492 op2 = force_reg (SImode, const0_rtx);
38494 emit_insn (gen_movstricthi
38495 (gen_lowpart (HImode, op2), op0));
38497 else
38499 op2 = gen_reg_rtx (SImode);
38501 emit_insn (gen_zero_extendhisi2 (op2, op0));
38504 else if (mode0 == SImode)
38505 op2 = op0;
38506 else
38507 op2 = gen_rtx_SUBREG (SImode, op0, 0);
38509 if (target == 0
38510 || !register_operand (target, SImode))
38511 target = gen_reg_rtx (SImode);
38513 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
38514 const0_rtx);
38515 emit_insn (gen_rtx_SET (target,
38516 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
38517 return target;
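/* For illustration, assuming the usual immintrin.h wrapper,
   _rdrand32_step is defined roughly as

     int _rdrand32_step (unsigned int *__P)
     { return __builtin_ia32_rdrand32_step (__P); }

   The random value is stored through the pointer, and the conditional
   move above yields 1 when the carry flag signals success and 0
   otherwise (the hardware zeroes the destination register on
   failure).  */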
38519 case IX86_BUILTIN_RDSEED16_STEP:
38520 icode = CODE_FOR_rdseedhi_1;
38521 mode0 = HImode;
38522 goto rdseed_step;
38524 case IX86_BUILTIN_RDSEED32_STEP:
38525 icode = CODE_FOR_rdseedsi_1;
38526 mode0 = SImode;
38527 goto rdseed_step;
38529 case IX86_BUILTIN_RDSEED64_STEP:
38530 icode = CODE_FOR_rdseeddi_1;
38531 mode0 = DImode;
38533 rdseed_step:
38534 arg0 = CALL_EXPR_ARG (exp, 0);
38535 op1 = expand_normal (arg0);
38536 if (!address_operand (op1, VOIDmode))
38538 op1 = convert_memory_address (Pmode, op1);
38539 op1 = copy_addr_to_reg (op1);
38542 op0 = gen_reg_rtx (mode0);
38543 emit_insn (GEN_FCN (icode) (op0));
38545 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
38547 op2 = gen_reg_rtx (QImode);
38549 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
38550 const0_rtx);
38551 emit_insn (gen_rtx_SET (op2, pat));
38553 if (target == 0
38554 || !register_operand (target, SImode))
38555 target = gen_reg_rtx (SImode);
38557 emit_insn (gen_zero_extendqisi2 (target, op2));
38558 return target;
38560 case IX86_BUILTIN_SBB32:
38561 icode = CODE_FOR_subborrowsi;
38562 mode0 = SImode;
38563 goto handlecarry;
38565 case IX86_BUILTIN_SBB64:
38566 icode = CODE_FOR_subborrowdi;
38567 mode0 = DImode;
38568 goto handlecarry;
38570 case IX86_BUILTIN_ADDCARRYX32:
38571 icode = CODE_FOR_addcarrysi;
38572 mode0 = SImode;
38573 goto handlecarry;
38575 case IX86_BUILTIN_ADDCARRYX64:
38576 icode = CODE_FOR_addcarrydi;
38577 mode0 = DImode;
38579 handlecarry:
38580 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
38581 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
38582 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
38583 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
38585 op1 = expand_normal (arg0);
38586 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
38588 op2 = expand_normal (arg1);
38589 if (!register_operand (op2, mode0))
38590 op2 = copy_to_mode_reg (mode0, op2);
38592 op3 = expand_normal (arg2);
38593 if (!register_operand (op3, mode0))
38594 op3 = copy_to_mode_reg (mode0, op3);
38596 op4 = expand_normal (arg3);
38597 if (!address_operand (op4, VOIDmode))
38599 op4 = convert_memory_address (Pmode, op4);
38600 op4 = copy_addr_to_reg (op4);
38603 /* Generate CF from input operand. */
38604 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
38606 /* Generate instruction that consumes CF. */
38607 op0 = gen_reg_rtx (mode0);
38609 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
38610 pat = gen_rtx_LTU (mode0, op1, const0_rtx);
38611 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat));
38613 /* Return current CF value. */
38614 if (target == 0)
38615 target = gen_reg_rtx (QImode);
38617 PUT_MODE (pat, QImode);
38618 emit_insn (gen_rtx_SET (target, pat));
38620 /* Store the result. */
38621 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
38623 return target;
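/* For illustration, assuming the usual adxintrin.h wrapper, a multi-word
   addition can be written as

     unsigned int lo, hi;
     unsigned char c = _addcarry_u32 (0, a0, b0, &lo);
     c = _addcarry_u32 (c, a1, b1, &hi);

   The incoming carry is materialized in CF by the addqi3_cconly_overflow
   insn above, the add consumes it, the returned value is the new CF and
   the sum is stored through the last pointer argument.  */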
38625 case IX86_BUILTIN_READ_FLAGS:
38626 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
38628 if (optimize
38629 || target == NULL_RTX
38630 || !nonimmediate_operand (target, word_mode)
38631 || GET_MODE (target) != word_mode)
38632 target = gen_reg_rtx (word_mode);
38634 emit_insn (gen_pop (target));
38635 return target;
38637 case IX86_BUILTIN_WRITE_FLAGS:
38639 arg0 = CALL_EXPR_ARG (exp, 0);
38640 op0 = expand_normal (arg0);
38641 if (!general_no_elim_operand (op0, word_mode))
38642 op0 = copy_to_mode_reg (word_mode, op0);
38644 emit_insn (gen_push (op0));
38645 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
38646 return 0;
38648 case IX86_BUILTIN_KTESTC8:
38649 icode = CODE_FOR_ktestqi;
38650 mode3 = CCCmode;
38651 goto kortest;
38653 case IX86_BUILTIN_KTESTZ8:
38654 icode = CODE_FOR_ktestqi;
38655 mode3 = CCZmode;
38656 goto kortest;
38658 case IX86_BUILTIN_KTESTC16:
38659 icode = CODE_FOR_ktesthi;
38660 mode3 = CCCmode;
38661 goto kortest;
38663 case IX86_BUILTIN_KTESTZ16:
38664 icode = CODE_FOR_ktesthi;
38665 mode3 = CCZmode;
38666 goto kortest;
38668 case IX86_BUILTIN_KTESTC32:
38669 icode = CODE_FOR_ktestsi;
38670 mode3 = CCCmode;
38671 goto kortest;
38673 case IX86_BUILTIN_KTESTZ32:
38674 icode = CODE_FOR_ktestsi;
38675 mode3 = CCZmode;
38676 goto kortest;
38678 case IX86_BUILTIN_KTESTC64:
38679 icode = CODE_FOR_ktestdi;
38680 mode3 = CCCmode;
38681 goto kortest;
38683 case IX86_BUILTIN_KTESTZ64:
38684 icode = CODE_FOR_ktestdi;
38685 mode3 = CCZmode;
38686 goto kortest;
38688 case IX86_BUILTIN_KORTESTC8:
38689 icode = CODE_FOR_kortestqi;
38690 mode3 = CCCmode;
38691 goto kortest;
38693 case IX86_BUILTIN_KORTESTZ8:
38694 icode = CODE_FOR_kortestqi;
38695 mode3 = CCZmode;
38696 goto kortest;
38698 case IX86_BUILTIN_KORTESTC16:
38699 icode = CODE_FOR_kortesthi;
38700 mode3 = CCCmode;
38701 goto kortest;
38703 case IX86_BUILTIN_KORTESTZ16:
38704 icode = CODE_FOR_kortesthi;
38705 mode3 = CCZmode;
38706 goto kortest;
38708 case IX86_BUILTIN_KORTESTC32:
38709 icode = CODE_FOR_kortestsi;
38710 mode3 = CCCmode;
38711 goto kortest;
38713 case IX86_BUILTIN_KORTESTZ32:
38714 icode = CODE_FOR_kortestsi;
38715 mode3 = CCZmode;
38716 goto kortest;
38718 case IX86_BUILTIN_KORTESTC64:
38719 icode = CODE_FOR_kortestdi;
38720 mode3 = CCCmode;
38721 goto kortest;
38723 case IX86_BUILTIN_KORTESTZ64:
38724 icode = CODE_FOR_kortestdi;
38725 mode3 = CCZmode;
38727 kortest:
38728 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
38729 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
38730 op0 = expand_normal (arg0);
38731 op1 = expand_normal (arg1);
38733 mode0 = insn_data[icode].operand[0].mode;
38734 mode1 = insn_data[icode].operand[1].mode;
38736 if (GET_MODE (op0) != VOIDmode)
38737 op0 = force_reg (GET_MODE (op0), op0);
38739 op0 = gen_lowpart (mode0, op0);
38741 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38742 op0 = copy_to_mode_reg (mode0, op0);
38744 if (GET_MODE (op1) != VOIDmode)
38745 op1 = force_reg (GET_MODE (op1), op1);
38747 op1 = gen_lowpart (mode1, op1);
38749 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38750 op1 = copy_to_mode_reg (mode1, op1);
38752 target = gen_reg_rtx (QImode);
38754 /* Emit kortest. */
38755 emit_insn (GEN_FCN (icode) (op0, op1));
38756 /* And use setcc to return result from flags. */
38757 ix86_expand_setcc (target, EQ,
38758 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
38759 return target;
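/* E.g. with __mmask16 operands 0x00ff and 0xff00 the Z variants return 0
   (the OR is nonzero, so ZF is clear) while the C variants return 1 (the
   OR is all ones, so CF is set); the setcc above just reads the chosen
   flag (CCZmode or CCCmode) back into a QImode value.  */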
38761 case IX86_BUILTIN_GATHERSIV2DF:
38762 icode = CODE_FOR_avx2_gathersiv2df;
38763 goto gather_gen;
38764 case IX86_BUILTIN_GATHERSIV4DF:
38765 icode = CODE_FOR_avx2_gathersiv4df;
38766 goto gather_gen;
38767 case IX86_BUILTIN_GATHERDIV2DF:
38768 icode = CODE_FOR_avx2_gatherdiv2df;
38769 goto gather_gen;
38770 case IX86_BUILTIN_GATHERDIV4DF:
38771 icode = CODE_FOR_avx2_gatherdiv4df;
38772 goto gather_gen;
38773 case IX86_BUILTIN_GATHERSIV4SF:
38774 icode = CODE_FOR_avx2_gathersiv4sf;
38775 goto gather_gen;
38776 case IX86_BUILTIN_GATHERSIV8SF:
38777 icode = CODE_FOR_avx2_gathersiv8sf;
38778 goto gather_gen;
38779 case IX86_BUILTIN_GATHERDIV4SF:
38780 icode = CODE_FOR_avx2_gatherdiv4sf;
38781 goto gather_gen;
38782 case IX86_BUILTIN_GATHERDIV8SF:
38783 icode = CODE_FOR_avx2_gatherdiv8sf;
38784 goto gather_gen;
38785 case IX86_BUILTIN_GATHERSIV2DI:
38786 icode = CODE_FOR_avx2_gathersiv2di;
38787 goto gather_gen;
38788 case IX86_BUILTIN_GATHERSIV4DI:
38789 icode = CODE_FOR_avx2_gathersiv4di;
38790 goto gather_gen;
38791 case IX86_BUILTIN_GATHERDIV2DI:
38792 icode = CODE_FOR_avx2_gatherdiv2di;
38793 goto gather_gen;
38794 case IX86_BUILTIN_GATHERDIV4DI:
38795 icode = CODE_FOR_avx2_gatherdiv4di;
38796 goto gather_gen;
38797 case IX86_BUILTIN_GATHERSIV4SI:
38798 icode = CODE_FOR_avx2_gathersiv4si;
38799 goto gather_gen;
38800 case IX86_BUILTIN_GATHERSIV8SI:
38801 icode = CODE_FOR_avx2_gathersiv8si;
38802 goto gather_gen;
38803 case IX86_BUILTIN_GATHERDIV4SI:
38804 icode = CODE_FOR_avx2_gatherdiv4si;
38805 goto gather_gen;
38806 case IX86_BUILTIN_GATHERDIV8SI:
38807 icode = CODE_FOR_avx2_gatherdiv8si;
38808 goto gather_gen;
38809 case IX86_BUILTIN_GATHERALTSIV4DF:
38810 icode = CODE_FOR_avx2_gathersiv4df;
38811 goto gather_gen;
38812 case IX86_BUILTIN_GATHERALTDIV8SF:
38813 icode = CODE_FOR_avx2_gatherdiv8sf;
38814 goto gather_gen;
38815 case IX86_BUILTIN_GATHERALTSIV4DI:
38816 icode = CODE_FOR_avx2_gathersiv4di;
38817 goto gather_gen;
38818 case IX86_BUILTIN_GATHERALTDIV8SI:
38819 icode = CODE_FOR_avx2_gatherdiv8si;
38820 goto gather_gen;
38821 case IX86_BUILTIN_GATHER3SIV16SF:
38822 icode = CODE_FOR_avx512f_gathersiv16sf;
38823 goto gather_gen;
38824 case IX86_BUILTIN_GATHER3SIV8DF:
38825 icode = CODE_FOR_avx512f_gathersiv8df;
38826 goto gather_gen;
38827 case IX86_BUILTIN_GATHER3DIV16SF:
38828 icode = CODE_FOR_avx512f_gatherdiv16sf;
38829 goto gather_gen;
38830 case IX86_BUILTIN_GATHER3DIV8DF:
38831 icode = CODE_FOR_avx512f_gatherdiv8df;
38832 goto gather_gen;
38833 case IX86_BUILTIN_GATHER3SIV16SI:
38834 icode = CODE_FOR_avx512f_gathersiv16si;
38835 goto gather_gen;
38836 case IX86_BUILTIN_GATHER3SIV8DI:
38837 icode = CODE_FOR_avx512f_gathersiv8di;
38838 goto gather_gen;
38839 case IX86_BUILTIN_GATHER3DIV16SI:
38840 icode = CODE_FOR_avx512f_gatherdiv16si;
38841 goto gather_gen;
38842 case IX86_BUILTIN_GATHER3DIV8DI:
38843 icode = CODE_FOR_avx512f_gatherdiv8di;
38844 goto gather_gen;
38845 case IX86_BUILTIN_GATHER3ALTSIV8DF:
38846 icode = CODE_FOR_avx512f_gathersiv8df;
38847 goto gather_gen;
38848 case IX86_BUILTIN_GATHER3ALTDIV16SF:
38849 icode = CODE_FOR_avx512f_gatherdiv16sf;
38850 goto gather_gen;
38851 case IX86_BUILTIN_GATHER3ALTSIV8DI:
38852 icode = CODE_FOR_avx512f_gathersiv8di;
38853 goto gather_gen;
38854 case IX86_BUILTIN_GATHER3ALTDIV16SI:
38855 icode = CODE_FOR_avx512f_gatherdiv16si;
38856 goto gather_gen;
38857 case IX86_BUILTIN_GATHER3SIV2DF:
38858 icode = CODE_FOR_avx512vl_gathersiv2df;
38859 goto gather_gen;
38860 case IX86_BUILTIN_GATHER3SIV4DF:
38861 icode = CODE_FOR_avx512vl_gathersiv4df;
38862 goto gather_gen;
38863 case IX86_BUILTIN_GATHER3DIV2DF:
38864 icode = CODE_FOR_avx512vl_gatherdiv2df;
38865 goto gather_gen;
38866 case IX86_BUILTIN_GATHER3DIV4DF:
38867 icode = CODE_FOR_avx512vl_gatherdiv4df;
38868 goto gather_gen;
38869 case IX86_BUILTIN_GATHER3SIV4SF:
38870 icode = CODE_FOR_avx512vl_gathersiv4sf;
38871 goto gather_gen;
38872 case IX86_BUILTIN_GATHER3SIV8SF:
38873 icode = CODE_FOR_avx512vl_gathersiv8sf;
38874 goto gather_gen;
38875 case IX86_BUILTIN_GATHER3DIV4SF:
38876 icode = CODE_FOR_avx512vl_gatherdiv4sf;
38877 goto gather_gen;
38878 case IX86_BUILTIN_GATHER3DIV8SF:
38879 icode = CODE_FOR_avx512vl_gatherdiv8sf;
38880 goto gather_gen;
38881 case IX86_BUILTIN_GATHER3SIV2DI:
38882 icode = CODE_FOR_avx512vl_gathersiv2di;
38883 goto gather_gen;
38884 case IX86_BUILTIN_GATHER3SIV4DI:
38885 icode = CODE_FOR_avx512vl_gathersiv4di;
38886 goto gather_gen;
38887 case IX86_BUILTIN_GATHER3DIV2DI:
38888 icode = CODE_FOR_avx512vl_gatherdiv2di;
38889 goto gather_gen;
38890 case IX86_BUILTIN_GATHER3DIV4DI:
38891 icode = CODE_FOR_avx512vl_gatherdiv4di;
38892 goto gather_gen;
38893 case IX86_BUILTIN_GATHER3SIV4SI:
38894 icode = CODE_FOR_avx512vl_gathersiv4si;
38895 goto gather_gen;
38896 case IX86_BUILTIN_GATHER3SIV8SI:
38897 icode = CODE_FOR_avx512vl_gathersiv8si;
38898 goto gather_gen;
38899 case IX86_BUILTIN_GATHER3DIV4SI:
38900 icode = CODE_FOR_avx512vl_gatherdiv4si;
38901 goto gather_gen;
38902 case IX86_BUILTIN_GATHER3DIV8SI:
38903 icode = CODE_FOR_avx512vl_gatherdiv8si;
38904 goto gather_gen;
38905 case IX86_BUILTIN_GATHER3ALTSIV4DF:
38906 icode = CODE_FOR_avx512vl_gathersiv4df;
38907 goto gather_gen;
38908 case IX86_BUILTIN_GATHER3ALTDIV8SF:
38909 icode = CODE_FOR_avx512vl_gatherdiv8sf;
38910 goto gather_gen;
38911 case IX86_BUILTIN_GATHER3ALTSIV4DI:
38912 icode = CODE_FOR_avx512vl_gathersiv4di;
38913 goto gather_gen;
38914 case IX86_BUILTIN_GATHER3ALTDIV8SI:
38915 icode = CODE_FOR_avx512vl_gatherdiv8si;
38916 goto gather_gen;
38917 case IX86_BUILTIN_SCATTERSIV16SF:
38918 icode = CODE_FOR_avx512f_scattersiv16sf;
38919 goto scatter_gen;
38920 case IX86_BUILTIN_SCATTERSIV8DF:
38921 icode = CODE_FOR_avx512f_scattersiv8df;
38922 goto scatter_gen;
38923 case IX86_BUILTIN_SCATTERDIV16SF:
38924 icode = CODE_FOR_avx512f_scatterdiv16sf;
38925 goto scatter_gen;
38926 case IX86_BUILTIN_SCATTERDIV8DF:
38927 icode = CODE_FOR_avx512f_scatterdiv8df;
38928 goto scatter_gen;
38929 case IX86_BUILTIN_SCATTERSIV16SI:
38930 icode = CODE_FOR_avx512f_scattersiv16si;
38931 goto scatter_gen;
38932 case IX86_BUILTIN_SCATTERSIV8DI:
38933 icode = CODE_FOR_avx512f_scattersiv8di;
38934 goto scatter_gen;
38935 case IX86_BUILTIN_SCATTERDIV16SI:
38936 icode = CODE_FOR_avx512f_scatterdiv16si;
38937 goto scatter_gen;
38938 case IX86_BUILTIN_SCATTERDIV8DI:
38939 icode = CODE_FOR_avx512f_scatterdiv8di;
38940 goto scatter_gen;
38941 case IX86_BUILTIN_SCATTERSIV8SF:
38942 icode = CODE_FOR_avx512vl_scattersiv8sf;
38943 goto scatter_gen;
38944 case IX86_BUILTIN_SCATTERSIV4SF:
38945 icode = CODE_FOR_avx512vl_scattersiv4sf;
38946 goto scatter_gen;
38947 case IX86_BUILTIN_SCATTERSIV4DF:
38948 icode = CODE_FOR_avx512vl_scattersiv4df;
38949 goto scatter_gen;
38950 case IX86_BUILTIN_SCATTERSIV2DF:
38951 icode = CODE_FOR_avx512vl_scattersiv2df;
38952 goto scatter_gen;
38953 case IX86_BUILTIN_SCATTERDIV8SF:
38954 icode = CODE_FOR_avx512vl_scatterdiv8sf;
38955 goto scatter_gen;
38956 case IX86_BUILTIN_SCATTERDIV4SF:
38957 icode = CODE_FOR_avx512vl_scatterdiv4sf;
38958 goto scatter_gen;
38959 case IX86_BUILTIN_SCATTERDIV4DF:
38960 icode = CODE_FOR_avx512vl_scatterdiv4df;
38961 goto scatter_gen;
38962 case IX86_BUILTIN_SCATTERDIV2DF:
38963 icode = CODE_FOR_avx512vl_scatterdiv2df;
38964 goto scatter_gen;
38965 case IX86_BUILTIN_SCATTERSIV8SI:
38966 icode = CODE_FOR_avx512vl_scattersiv8si;
38967 goto scatter_gen;
38968 case IX86_BUILTIN_SCATTERSIV4SI:
38969 icode = CODE_FOR_avx512vl_scattersiv4si;
38970 goto scatter_gen;
38971 case IX86_BUILTIN_SCATTERSIV4DI:
38972 icode = CODE_FOR_avx512vl_scattersiv4di;
38973 goto scatter_gen;
38974 case IX86_BUILTIN_SCATTERSIV2DI:
38975 icode = CODE_FOR_avx512vl_scattersiv2di;
38976 goto scatter_gen;
38977 case IX86_BUILTIN_SCATTERDIV8SI:
38978 icode = CODE_FOR_avx512vl_scatterdiv8si;
38979 goto scatter_gen;
38980 case IX86_BUILTIN_SCATTERDIV4SI:
38981 icode = CODE_FOR_avx512vl_scatterdiv4si;
38982 goto scatter_gen;
38983 case IX86_BUILTIN_SCATTERDIV4DI:
38984 icode = CODE_FOR_avx512vl_scatterdiv4di;
38985 goto scatter_gen;
38986 case IX86_BUILTIN_SCATTERDIV2DI:
38987 icode = CODE_FOR_avx512vl_scatterdiv2di;
38988 goto scatter_gen;
38989 case IX86_BUILTIN_GATHERPFDPD:
38990 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
38991 goto vec_prefetch_gen;
38992 case IX86_BUILTIN_SCATTERALTSIV8DF:
38993 icode = CODE_FOR_avx512f_scattersiv8df;
38994 goto scatter_gen;
38995 case IX86_BUILTIN_SCATTERALTDIV16SF:
38996 icode = CODE_FOR_avx512f_scatterdiv16sf;
38997 goto scatter_gen;
38998 case IX86_BUILTIN_SCATTERALTSIV8DI:
38999 icode = CODE_FOR_avx512f_scattersiv8di;
39000 goto scatter_gen;
39001 case IX86_BUILTIN_SCATTERALTDIV16SI:
39002 icode = CODE_FOR_avx512f_scatterdiv16si;
39003 goto scatter_gen;
39004 case IX86_BUILTIN_GATHERPFDPS:
39005 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
39006 goto vec_prefetch_gen;
39007 case IX86_BUILTIN_GATHERPFQPD:
39008 icode = CODE_FOR_avx512pf_gatherpfv8didf;
39009 goto vec_prefetch_gen;
39010 case IX86_BUILTIN_GATHERPFQPS:
39011 icode = CODE_FOR_avx512pf_gatherpfv8disf;
39012 goto vec_prefetch_gen;
39013 case IX86_BUILTIN_SCATTERPFDPD:
39014 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
39015 goto vec_prefetch_gen;
39016 case IX86_BUILTIN_SCATTERPFDPS:
39017 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
39018 goto vec_prefetch_gen;
39019 case IX86_BUILTIN_SCATTERPFQPD:
39020 icode = CODE_FOR_avx512pf_scatterpfv8didf;
39021 goto vec_prefetch_gen;
39022 case IX86_BUILTIN_SCATTERPFQPS:
39023 icode = CODE_FOR_avx512pf_scatterpfv8disf;
39024 goto vec_prefetch_gen;
39026 gather_gen:
39027 rtx half;
39028 rtx (*gen) (rtx, rtx);
39030 arg0 = CALL_EXPR_ARG (exp, 0);
39031 arg1 = CALL_EXPR_ARG (exp, 1);
39032 arg2 = CALL_EXPR_ARG (exp, 2);
39033 arg3 = CALL_EXPR_ARG (exp, 3);
39034 arg4 = CALL_EXPR_ARG (exp, 4);
39035 op0 = expand_normal (arg0);
39036 op1 = expand_normal (arg1);
39037 op2 = expand_normal (arg2);
39038 op3 = expand_normal (arg3);
39039 op4 = expand_normal (arg4);
39040 /* Note the arg order is different from the operand order. */
39041 mode0 = insn_data[icode].operand[1].mode;
39042 mode2 = insn_data[icode].operand[3].mode;
39043 mode3 = insn_data[icode].operand[4].mode;
39044 mode4 = insn_data[icode].operand[5].mode;
39046 if (target == NULL_RTX
39047 || GET_MODE (target) != insn_data[icode].operand[0].mode
39048 || !insn_data[icode].operand[0].predicate (target,
39049 GET_MODE (target)))
39050 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
39051 else
39052 subtarget = target;
39054 switch (fcode)
39056 case IX86_BUILTIN_GATHER3ALTSIV8DF:
39057 case IX86_BUILTIN_GATHER3ALTSIV8DI:
39058 half = gen_reg_rtx (V8SImode);
39059 if (!nonimmediate_operand (op2, V16SImode))
39060 op2 = copy_to_mode_reg (V16SImode, op2);
39061 emit_insn (gen_vec_extract_lo_v16si (half, op2));
39062 op2 = half;
39063 break;
39064 case IX86_BUILTIN_GATHER3ALTSIV4DF:
39065 case IX86_BUILTIN_GATHER3ALTSIV4DI:
39066 case IX86_BUILTIN_GATHERALTSIV4DF:
39067 case IX86_BUILTIN_GATHERALTSIV4DI:
39068 half = gen_reg_rtx (V4SImode);
39069 if (!nonimmediate_operand (op2, V8SImode))
39070 op2 = copy_to_mode_reg (V8SImode, op2);
39071 emit_insn (gen_vec_extract_lo_v8si (half, op2));
39072 op2 = half;
39073 break;
39074 case IX86_BUILTIN_GATHER3ALTDIV16SF:
39075 case IX86_BUILTIN_GATHER3ALTDIV16SI:
39076 half = gen_reg_rtx (mode0);
39077 if (mode0 == V8SFmode)
39078 gen = gen_vec_extract_lo_v16sf;
39079 else
39080 gen = gen_vec_extract_lo_v16si;
39081 if (!nonimmediate_operand (op0, GET_MODE (op0)))
39082 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
39083 emit_insn (gen (half, op0));
39084 op0 = half;
39085 if (GET_MODE (op3) != VOIDmode)
39087 if (!nonimmediate_operand (op3, GET_MODE (op3)))
39088 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
39089 emit_insn (gen (half, op3));
39090 op3 = half;
39092 break;
39093 case IX86_BUILTIN_GATHER3ALTDIV8SF:
39094 case IX86_BUILTIN_GATHER3ALTDIV8SI:
39095 case IX86_BUILTIN_GATHERALTDIV8SF:
39096 case IX86_BUILTIN_GATHERALTDIV8SI:
39097 half = gen_reg_rtx (mode0);
39098 if (mode0 == V4SFmode)
39099 gen = gen_vec_extract_lo_v8sf;
39100 else
39101 gen = gen_vec_extract_lo_v8si;
39102 if (!nonimmediate_operand (op0, GET_MODE (op0)))
39103 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
39104 emit_insn (gen (half, op0));
39105 op0 = half;
39106 if (GET_MODE (op3) != VOIDmode)
39108 if (!nonimmediate_operand (op3, GET_MODE (op3)))
39109 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
39110 emit_insn (gen (half, op3));
39111 op3 = half;
39113 break;
39114 default:
39115 break;
39118 /* Force the memory operand to use only a base register here. We
39119 don't want to do this for the memory operands of other builtin
39120 functions. */
39121 op1 = ix86_zero_extend_to_Pmode (op1);
39123 if (!insn_data[icode].operand[1].predicate (op0, mode0))
39124 op0 = copy_to_mode_reg (mode0, op0);
39125 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
39126 op1 = copy_to_mode_reg (Pmode, op1);
39127 if (!insn_data[icode].operand[3].predicate (op2, mode2))
39128 op2 = copy_to_mode_reg (mode2, op2);
39130 op3 = fixup_modeless_constant (op3, mode3);
39132 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
39134 if (!insn_data[icode].operand[4].predicate (op3, mode3))
39135 op3 = copy_to_mode_reg (mode3, op3);
39137 else
39139 op3 = copy_to_reg (op3);
39140 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
39142 if (!insn_data[icode].operand[5].predicate (op4, mode4))
39144 error ("the last argument must be scale 1, 2, 4, 8");
39145 return const0_rtx;
39148 /* Optimize. If mask is known to have all high bits set,
39149 replace op0 with pc_rtx to signal that the instruction
39150 overwrites the whole destination and doesn't use its
39151 previous contents. */
39152 if (optimize)
39154 if (TREE_CODE (arg3) == INTEGER_CST)
39156 if (integer_all_onesp (arg3))
39157 op0 = pc_rtx;
39159 else if (TREE_CODE (arg3) == VECTOR_CST)
39161 unsigned int negative = 0;
39162 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
39164 tree cst = VECTOR_CST_ELT (arg3, i);
39165 if (TREE_CODE (cst) == INTEGER_CST
39166 && tree_int_cst_sign_bit (cst))
39167 negative++;
39168 else if (TREE_CODE (cst) == REAL_CST
39169 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
39170 negative++;
39172 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
39173 op0 = pc_rtx;
39175 else if (TREE_CODE (arg3) == SSA_NAME
39176 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
39178 /* Recognize also when mask is like:
39179 __v2df src = _mm_setzero_pd ();
39180 __v2df mask = _mm_cmpeq_pd (src, src);
39182 __v8sf src = _mm256_setzero_ps ();
39183 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
39184 as that is a cheaper way to load all ones into
39185 a register than having to load a constant from
39186 memory. */
39187 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
39188 if (is_gimple_call (def_stmt))
39190 tree fndecl = gimple_call_fndecl (def_stmt);
39191 if (fndecl
39192 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
39193 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
39195 case IX86_BUILTIN_CMPPD:
39196 case IX86_BUILTIN_CMPPS:
39197 case IX86_BUILTIN_CMPPD256:
39198 case IX86_BUILTIN_CMPPS256:
39199 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
39200 break;
39201 /* FALLTHRU */
39202 case IX86_BUILTIN_CMPEQPD:
39203 case IX86_BUILTIN_CMPEQPS:
39204 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
39205 && initializer_zerop (gimple_call_arg (def_stmt,
39206 1)))
39207 op0 = pc_rtx;
39208 break;
39209 default:
39210 break;
39216 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
39217 if (! pat)
39218 return const0_rtx;
39219 emit_insn (pat);
39221 switch (fcode)
39223 case IX86_BUILTIN_GATHER3DIV16SF:
39224 if (target == NULL_RTX)
39225 target = gen_reg_rtx (V8SFmode);
39226 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
39227 break;
39228 case IX86_BUILTIN_GATHER3DIV16SI:
39229 if (target == NULL_RTX)
39230 target = gen_reg_rtx (V8SImode);
39231 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
39232 break;
39233 case IX86_BUILTIN_GATHER3DIV8SF:
39234 case IX86_BUILTIN_GATHERDIV8SF:
39235 if (target == NULL_RTX)
39236 target = gen_reg_rtx (V4SFmode);
39237 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
39238 break;
39239 case IX86_BUILTIN_GATHER3DIV8SI:
39240 case IX86_BUILTIN_GATHERDIV8SI:
39241 if (target == NULL_RTX)
39242 target = gen_reg_rtx (V4SImode);
39243 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
39244 break;
39245 default:
39246 target = subtarget;
39247 break;
39249 return target;
39251 scatter_gen:
39252 arg0 = CALL_EXPR_ARG (exp, 0);
39253 arg1 = CALL_EXPR_ARG (exp, 1);
39254 arg2 = CALL_EXPR_ARG (exp, 2);
39255 arg3 = CALL_EXPR_ARG (exp, 3);
39256 arg4 = CALL_EXPR_ARG (exp, 4);
39257 op0 = expand_normal (arg0);
39258 op1 = expand_normal (arg1);
39259 op2 = expand_normal (arg2);
39260 op3 = expand_normal (arg3);
39261 op4 = expand_normal (arg4);
39262 mode1 = insn_data[icode].operand[1].mode;
39263 mode2 = insn_data[icode].operand[2].mode;
39264 mode3 = insn_data[icode].operand[3].mode;
39265 mode4 = insn_data[icode].operand[4].mode;
39267 /* Scatter instruction stores operand op3 to memory with
39268 indices from op2 and scale from op4 under writemask op1.
39269 If index operand op2 has more elements than source operand
39270 op3, one needs to use only its low half. And vice versa. */
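/* Illustrative sketch (annotation, not part of the original source):
   for IX86_BUILTIN_SCATTERALTSIV8DF the data operand op3 holds 8
   doubles (V8DF) while the index operand op2 arrives as V16SI, so only
   the low 8 indices matter and vec_extract_lo_v16si below picks them
   out; the SCATTERALTDIV16* cases instead halve the data operand op3. */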
39271 switch (fcode)
39273 case IX86_BUILTIN_SCATTERALTSIV8DF:
39274 case IX86_BUILTIN_SCATTERALTSIV8DI:
39275 half = gen_reg_rtx (V8SImode);
39276 if (!nonimmediate_operand (op2, V16SImode))
39277 op2 = copy_to_mode_reg (V16SImode, op2);
39278 emit_insn (gen_vec_extract_lo_v16si (half, op2));
39279 op2 = half;
39280 break;
39281 case IX86_BUILTIN_SCATTERALTDIV16SF:
39282 case IX86_BUILTIN_SCATTERALTDIV16SI:
39283 half = gen_reg_rtx (mode3);
39284 if (mode3 == V8SFmode)
39285 gen = gen_vec_extract_lo_v16sf;
39286 else
39287 gen = gen_vec_extract_lo_v16si;
39288 if (!nonimmediate_operand (op3, GET_MODE (op3)))
39289 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
39290 emit_insn (gen (half, op3));
39291 op3 = half;
39292 break;
39293 default:
39294 break;
39297 /* Force the memory operand to use only a base register here. We
39298 don't want to do this to the memory operand of other builtin
39299 functions. */
39300 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
39302 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
39303 op0 = copy_to_mode_reg (Pmode, op0);
39305 op1 = fixup_modeless_constant (op1, mode1);
39307 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
39309 if (!insn_data[icode].operand[1].predicate (op1, mode1))
39310 op1 = copy_to_mode_reg (mode1, op1);
39312 else
39314 op1 = copy_to_reg (op1);
39315 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
39318 if (!insn_data[icode].operand[2].predicate (op2, mode2))
39319 op2 = copy_to_mode_reg (mode2, op2);
39321 if (!insn_data[icode].operand[3].predicate (op3, mode3))
39322 op3 = copy_to_mode_reg (mode3, op3);
39324 if (!insn_data[icode].operand[4].predicate (op4, mode4))
39326 error ("the last argument must be scale 1, 2, 4, 8");
39327 return const0_rtx;
39330 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
39331 if (! pat)
39332 return const0_rtx;
39334 emit_insn (pat);
39335 return 0;
39337 vec_prefetch_gen:
39338 arg0 = CALL_EXPR_ARG (exp, 0);
39339 arg1 = CALL_EXPR_ARG (exp, 1);
39340 arg2 = CALL_EXPR_ARG (exp, 2);
39341 arg3 = CALL_EXPR_ARG (exp, 3);
39342 arg4 = CALL_EXPR_ARG (exp, 4);
39343 op0 = expand_normal (arg0);
39344 op1 = expand_normal (arg1);
39345 op2 = expand_normal (arg2);
39346 op3 = expand_normal (arg3);
39347 op4 = expand_normal (arg4);
39348 mode0 = insn_data[icode].operand[0].mode;
39349 mode1 = insn_data[icode].operand[1].mode;
39350 mode3 = insn_data[icode].operand[3].mode;
39351 mode4 = insn_data[icode].operand[4].mode;
39353 op0 = fixup_modeless_constant (op0, mode0);
39355 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
39357 if (!insn_data[icode].operand[0].predicate (op0, mode0))
39358 op0 = copy_to_mode_reg (mode0, op0);
39360 else
39362 op0 = copy_to_reg (op0);
39363 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
39366 if (!insn_data[icode].operand[1].predicate (op1, mode1))
39367 op1 = copy_to_mode_reg (mode1, op1);
39369 /* Force the memory operand to use only a base register here. We
39370 don't want to do this to the memory operand of other builtin
39371 functions. */
39372 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
39374 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
39375 op2 = copy_to_mode_reg (Pmode, op2);
39377 if (!insn_data[icode].operand[3].predicate (op3, mode3))
39379 error ("the fourth argument must be scale 1, 2, 4, 8");
39380 return const0_rtx;
39383 if (!insn_data[icode].operand[4].predicate (op4, mode4))
39385 error ("incorrect hint operand");
39386 return const0_rtx;
39389 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
39390 if (! pat)
39391 return const0_rtx;
39393 emit_insn (pat);
39395 return 0;
39397 case IX86_BUILTIN_XABORT:
39398 icode = CODE_FOR_xabort;
39399 arg0 = CALL_EXPR_ARG (exp, 0);
39400 op0 = expand_normal (arg0);
39401 mode0 = insn_data[icode].operand[0].mode;
39402 if (!insn_data[icode].operand[0].predicate (op0, mode0))
39404 error ("the xabort's argument must be an 8-bit immediate");
39405 return const0_rtx;
39407 emit_insn (gen_xabort (op0));
39408 return 0;
39410 default:
39411 break;
39414 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
39415 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
39417 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
39418 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
39419 target);
39422 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
39423 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
39425 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
39426 switch (fcode)
39428 case IX86_BUILTIN_FABSQ:
39429 case IX86_BUILTIN_COPYSIGNQ:
39430 if (!TARGET_SSE)
39431 /* Emit a normal call if SSE isn't available. */
39432 return expand_call (exp, target, ignore);
39433 /* FALLTHRU */
39434 default:
39435 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
39439 if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
39440 && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
39442 i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
39443 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
39444 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
39445 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
39446 int masked = 1;
39447 machine_mode mode, wide_mode, nar_mode;
39449 nar_mode = V4SFmode;
39450 mode = V16SFmode;
39451 wide_mode = V64SFmode;
39452 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
39453 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
39455 switch (fcode)
39457 case IX86_BUILTIN_4FMAPS:
39458 fcn = gen_avx5124fmaddps_4fmaddps;
39459 masked = 0;
39460 goto v4fma_expand;
39462 case IX86_BUILTIN_4DPWSSD:
39463 nar_mode = V4SImode;
39464 mode = V16SImode;
39465 wide_mode = V64SImode;
39466 fcn = gen_avx5124vnniw_vp4dpwssd;
39467 masked = 0;
39468 goto v4fma_expand;
39470 case IX86_BUILTIN_4DPWSSDS:
39471 nar_mode = V4SImode;
39472 mode = V16SImode;
39473 wide_mode = V64SImode;
39474 fcn = gen_avx5124vnniw_vp4dpwssds;
39475 masked = 0;
39476 goto v4fma_expand;
39478 case IX86_BUILTIN_4FNMAPS:
39479 fcn = gen_avx5124fmaddps_4fnmaddps;
39480 masked = 0;
39481 goto v4fma_expand;
39483 case IX86_BUILTIN_4FNMAPS_MASK:
39484 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
39485 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
39486 goto v4fma_expand;
39488 case IX86_BUILTIN_4DPWSSD_MASK:
39489 nar_mode = V4SImode;
39490 mode = V16SImode;
39491 wide_mode = V64SImode;
39492 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
39493 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
39494 goto v4fma_expand;
39496 case IX86_BUILTIN_4DPWSSDS_MASK:
39497 nar_mode = V4SImode;
39498 mode = V16SImode;
39499 wide_mode = V64SImode;
39500 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
39501 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
39502 goto v4fma_expand;
39504 case IX86_BUILTIN_4FMAPS_MASK:
39506 tree args[4];
39507 rtx ops[4];
39508 rtx wide_reg;
39509 rtx accum;
39510 rtx addr;
39511 rtx mem;
39513 v4fma_expand:
39514 wide_reg = gen_reg_rtx (wide_mode);
39515 for (i = 0; i < 4; i++)
39517 args[i] = CALL_EXPR_ARG (exp, i);
39518 ops[i] = expand_normal (args[i]);
39520 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
39521 ops[i]);
39524 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
39525 accum = force_reg (mode, accum);
39527 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
39528 addr = force_reg (Pmode, addr);
39530 mem = gen_rtx_MEM (nar_mode, addr);
39532 target = gen_reg_rtx (mode);
39534 emit_move_insn (target, accum);
39536 if (! masked)
39537 emit_insn (fcn (target, accum, wide_reg, mem));
39538 else
39540 rtx merge, mask;
39541 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
39543 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
39545 if (CONST_INT_P (mask))
39546 mask = fixup_modeless_constant (mask, HImode);
39548 mask = force_reg (HImode, mask);
39550 if (GET_MODE (mask) != HImode)
39551 mask = gen_rtx_SUBREG (HImode, mask, 0);
39553 /* If merge is 0 then we're about to emit z-masked variant. */
39554 if (const0_operand (merge, mode))
39555 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
39556 /* If merge is the same as accum then emit merge-masked variant. */
39557 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
39559 merge = force_reg (mode, merge);
39560 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
39562 /* Merge with something unknown might happen if we z-mask w/ -O0. */
39563 else
39565 target = gen_reg_rtx (mode);
39566 emit_move_insn (target, merge);
39567 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
39570 return target;
39573 case IX86_BUILTIN_4FNMASS:
39574 fcn = gen_avx5124fmaddps_4fnmaddss;
39575 masked = 0;
39576 goto s4fma_expand;
39578 case IX86_BUILTIN_4FMASS:
39579 fcn = gen_avx5124fmaddps_4fmaddss;
39580 masked = 0;
39581 goto s4fma_expand;
39583 case IX86_BUILTIN_4FNMASS_MASK:
39584 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
39585 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
39586 goto s4fma_expand;
39588 case IX86_BUILTIN_4FMASS_MASK:
39590 tree args[4];
39591 rtx ops[4];
39592 rtx wide_reg;
39593 rtx accum;
39594 rtx addr;
39595 rtx mem;
39597 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
39598 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
39600 s4fma_expand:
39601 mode = V4SFmode;
39602 wide_reg = gen_reg_rtx (V64SFmode);
39603 for (i = 0; i < 4; i++)
39605 rtx tmp;
39606 args[i] = CALL_EXPR_ARG (exp, i);
39607 ops[i] = expand_normal (args[i]);
39609 tmp = gen_reg_rtx (SFmode);
39610 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
39612 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
39613 gen_rtx_SUBREG (V16SFmode, tmp, 0));
39616 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
39617 accum = force_reg (V4SFmode, accum);
39619 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
39620 addr = force_reg (Pmode, addr);
39622 mem = gen_rtx_MEM (V4SFmode, addr);
39624 target = gen_reg_rtx (V4SFmode);
39626 emit_move_insn (target, accum);
39628 if (! masked)
39629 emit_insn (fcn (target, accum, wide_reg, mem));
39630 else
39632 rtx merge, mask;
39633 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
39635 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
39637 if (CONST_INT_P (mask))
39638 mask = fixup_modeless_constant (mask, QImode);
39640 mask = force_reg (QImode, mask);
39642 if (GET_MODE (mask) != QImode)
39643 mask = gen_rtx_SUBREG (QImode, mask, 0);
39645 /* If merge is 0 then we're about to emit z-masked variant. */
39646 if (const0_operand (merge, mode))
39647 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
39648 /* If merge is the same as accum then emit merge-masked
39649 variant. */
39650 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
39652 merge = force_reg (mode, merge);
39653 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
39655 /* Merge with something unknown might happen if we z-mask
39656 w/ -O0. */
39657 else
39659 target = gen_reg_rtx (mode);
39660 emit_move_insn (target, merge);
39661 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
39664 return target;
39666 case IX86_BUILTIN_RDPID:
39667 return ix86_expand_special_args_builtin (bdesc_args2 + i, exp,
39668 target);
39669 default:
39670 return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
39674 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
39675 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
39677 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
39678 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
39681 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
39682 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
39684 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
39685 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
39688 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
39689 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
39691 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
39692 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
39695 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
39696 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
39698 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
39699 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
39702 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
39703 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
39705 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
39706 const struct builtin_description *d = bdesc_multi_arg + i;
39707 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
39708 (enum ix86_builtin_func_type)
39709 d->flag, d->comparison);
39712 gcc_unreachable ();
39715 /* This returns the target-specific builtin with code CODE if
39716 current_function_decl has visibility on this builtin, which is checked
39717 using isa flags. Returns NULL_TREE otherwise. */
39719 static tree ix86_get_builtin (enum ix86_builtins code)
39721 struct cl_target_option *opts;
39722 tree target_tree = NULL_TREE;
39724 /* Determine the isa flags of current_function_decl. */
39726 if (current_function_decl)
39727 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
39729 if (target_tree == NULL)
39730 target_tree = target_option_default_node;
39732 opts = TREE_TARGET_OPTION (target_tree);
39734 if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
39735 || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
39736 return ix86_builtin_decl (code, true);
39737 else
39738 return NULL_TREE;
39741 /* Return the function decl of the target-specific builtin
39742 for a given MPX builtin passed in FCODE. */
39743 static tree
39744 ix86_builtin_mpx_function (unsigned fcode)
39746 switch (fcode)
39748 case BUILT_IN_CHKP_BNDMK:
39749 return ix86_builtins[IX86_BUILTIN_BNDMK];
39751 case BUILT_IN_CHKP_BNDSTX:
39752 return ix86_builtins[IX86_BUILTIN_BNDSTX];
39754 case BUILT_IN_CHKP_BNDLDX:
39755 return ix86_builtins[IX86_BUILTIN_BNDLDX];
39757 case BUILT_IN_CHKP_BNDCL:
39758 return ix86_builtins[IX86_BUILTIN_BNDCL];
39760 case BUILT_IN_CHKP_BNDCU:
39761 return ix86_builtins[IX86_BUILTIN_BNDCU];
39763 case BUILT_IN_CHKP_BNDRET:
39764 return ix86_builtins[IX86_BUILTIN_BNDRET];
39766 case BUILT_IN_CHKP_INTERSECT:
39767 return ix86_builtins[IX86_BUILTIN_BNDINT];
39769 case BUILT_IN_CHKP_NARROW:
39770 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
39772 case BUILT_IN_CHKP_SIZEOF:
39773 return ix86_builtins[IX86_BUILTIN_SIZEOF];
39775 case BUILT_IN_CHKP_EXTRACT_LOWER:
39776 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
39778 case BUILT_IN_CHKP_EXTRACT_UPPER:
39779 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
39781 default:
39782 return NULL_TREE;
39785 gcc_unreachable ();
39788 /* Helper function for ix86_load_bounds and ix86_store_bounds.
39790 Return an address to be used to load/store bounds for pointer
39791 passed in SLOT.
39793 SLOT_NO is an integer constant holding the number of a target
39794 dependent special slot to be used in case SLOT is not a memory.
39796 SPECIAL_BASE is a pointer to be used as a base of fake address
39797 to access special slots in Bounds Table. SPECIAL_BASE[-1],
39798 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
39800 static rtx
39801 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
39803 rtx addr = NULL;
39805 /* A NULL slot means we pass bounds for a pointer not passed to the
39806 function at all. A register slot means we pass the pointer in a
39807 register. In both these cases bounds are passed via the Bounds
39808 Table. Since we do not have an actual pointer stored in memory,
39809 we have to use fake addresses to access the Bounds Table. We
39810 start with (special_base - sizeof (void*)) and decrease this
39811 address by pointer size to get addresses for other slots. */
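/* Worked example (annotation, not part of the original source): on
   x86-64, Pmode is DImode and GET_MODE_SIZE (Pmode) is 8, so special
   slot 0 maps to (special_base - 8), slot 1 to (special_base - 16),
   and so on, following the plus_constant computation below. */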
39812 if (!slot || REG_P (slot))
39814 gcc_assert (CONST_INT_P (slot_no));
39815 addr = plus_constant (Pmode, special_base,
39816 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
39818 /* If the pointer is passed in memory then its address is used to
39819 access the Bounds Table. */
39820 else if (MEM_P (slot))
39822 addr = XEXP (slot, 0);
39823 if (!register_operand (addr, Pmode))
39824 addr = copy_addr_to_reg (addr);
39826 else
39827 gcc_unreachable ();
39829 return addr;
39832 /* Expand pass uses this hook to load bounds for function parameter
39833 PTR passed in SLOT in case its bounds are not passed in a register.
39835 If SLOT is a memory, then bounds are loaded as for a regular pointer
39836 loaded from memory. PTR may be NULL in case SLOT is a memory.
39837 In such case the value of PTR (if required) may be loaded from SLOT.
39839 If SLOT is NULL or a register then SLOT_NO is an integer constant
39840 holding the number of the target dependent special slot which should be
39841 used to obtain bounds.
39843 Return loaded bounds. */
39845 static rtx
39846 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
39848 rtx reg = gen_reg_rtx (BNDmode);
39849 rtx addr;
39851 /* Get address to be used to access Bounds Table. Special slots start
39852 at the location of return address of the current function. */
39853 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
39855 /* Load pointer value from a memory if we don't have it. */
39856 if (!ptr)
39858 gcc_assert (MEM_P (slot));
39859 ptr = copy_addr_to_reg (slot);
39862 if (!register_operand (ptr, Pmode))
39863 ptr = ix86_zero_extend_to_Pmode (ptr);
39865 emit_insn (BNDmode == BND64mode
39866 ? gen_bnd64_ldx (reg, addr, ptr)
39867 : gen_bnd32_ldx (reg, addr, ptr));
39869 return reg;
39872 /* Expand pass uses this hook to store BOUNDS for call argument PTR
39873 passed in SLOT in case BOUNDS are not passed in a register.
39875 If SLOT is a memory, then BOUNDS are stored as for a regular pointer
39876 stored in memory. PTR may be NULL in case SLOT is a memory.
39877 In such case the value of PTR (if required) may be loaded from SLOT.
39879 If SLOT is NULL or a register then SLOT_NO is an integer constant
39880 holding the number of the target dependent special slot which should be
39881 used to store BOUNDS. */
39883 static void
39884 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
39886 rtx addr;
39888 /* Get address to be used to access Bounds Table. Special slots start
39889 at the location of return address of a called function. */
39890 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
39892 /* Load pointer value from a memory if we don't have it. */
39893 if (!ptr)
39895 gcc_assert (MEM_P (slot));
39896 ptr = copy_addr_to_reg (slot);
39899 if (!register_operand (ptr, Pmode))
39900 ptr = ix86_zero_extend_to_Pmode (ptr);
39902 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
39903 if (!register_operand (bounds, BNDmode))
39904 bounds = copy_to_mode_reg (BNDmode, bounds);
39906 emit_insn (BNDmode == BND64mode
39907 ? gen_bnd64_stx (addr, ptr, bounds)
39908 : gen_bnd32_stx (addr, ptr, bounds));
39911 /* Load and return bounds returned by function in SLOT. */
39913 static rtx
39914 ix86_load_returned_bounds (rtx slot)
39916 rtx res;
39918 gcc_assert (REG_P (slot));
39919 res = gen_reg_rtx (BNDmode);
39920 emit_move_insn (res, slot);
39922 return res;
39925 /* Store BOUNDS returned by function into SLOT. */
39927 static void
39928 ix86_store_returned_bounds (rtx slot, rtx bounds)
39930 gcc_assert (REG_P (slot));
39931 emit_move_insn (slot, bounds);
39934 /* Returns a function decl for a vectorized version of the combined function
39935 with combined_fn code FN and the result vector type TYPE, or NULL_TREE
39936 if it is not available. */
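/* Illustrative example (annotation, not part of the original source):
   asking for a vectorized __builtin_floor with TYPE_OUT == TYPE_IN ==
   V4DF on a target with AVX (and hence SSE4.1) is handled by
   CASE_CFN_FLOOR below and returns the decl of IX86_BUILTIN_FLOORPD256. */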
39938 static tree
39939 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
39940 tree type_in)
39942 machine_mode in_mode, out_mode;
39943 int in_n, out_n;
39945 if (TREE_CODE (type_out) != VECTOR_TYPE
39946 || TREE_CODE (type_in) != VECTOR_TYPE)
39947 return NULL_TREE;
39949 out_mode = TYPE_MODE (TREE_TYPE (type_out));
39950 out_n = TYPE_VECTOR_SUBPARTS (type_out);
39951 in_mode = TYPE_MODE (TREE_TYPE (type_in));
39952 in_n = TYPE_VECTOR_SUBPARTS (type_in);
39954 switch (fn)
39956 CASE_CFN_EXP2:
39957 if (out_mode == SFmode && in_mode == SFmode)
39959 if (out_n == 16 && in_n == 16)
39960 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
39962 break;
39964 CASE_CFN_IFLOOR:
39965 CASE_CFN_LFLOOR:
39966 CASE_CFN_LLFLOOR:
39967 /* The round insn does not trap on denormals. */
39968 if (flag_trapping_math || !TARGET_SSE4_1)
39969 break;
39971 if (out_mode == SImode && in_mode == DFmode)
39973 if (out_n == 4 && in_n == 2)
39974 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
39975 else if (out_n == 8 && in_n == 4)
39976 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
39977 else if (out_n == 16 && in_n == 8)
39978 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
39980 if (out_mode == SImode && in_mode == SFmode)
39982 if (out_n == 4 && in_n == 4)
39983 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
39984 else if (out_n == 8 && in_n == 8)
39985 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
39986 else if (out_n == 16 && in_n == 16)
39987 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
39989 break;
39991 CASE_CFN_ICEIL:
39992 CASE_CFN_LCEIL:
39993 CASE_CFN_LLCEIL:
39994 /* The round insn does not trap on denormals. */
39995 if (flag_trapping_math || !TARGET_SSE4_1)
39996 break;
39998 if (out_mode == SImode && in_mode == DFmode)
40000 if (out_n == 4 && in_n == 2)
40001 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
40002 else if (out_n == 8 && in_n == 4)
40003 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
40004 else if (out_n == 16 && in_n == 8)
40005 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
40007 if (out_mode == SImode && in_mode == SFmode)
40009 if (out_n == 4 && in_n == 4)
40010 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
40011 else if (out_n == 8 && in_n == 8)
40012 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
40013 else if (out_n == 16 && in_n == 16)
40014 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
40016 break;
40018 CASE_CFN_IRINT:
40019 CASE_CFN_LRINT:
40020 CASE_CFN_LLRINT:
40021 if (out_mode == SImode && in_mode == DFmode)
40023 if (out_n == 4 && in_n == 2)
40024 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
40025 else if (out_n == 8 && in_n == 4)
40026 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
40027 else if (out_n == 16 && in_n == 8)
40028 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
40030 if (out_mode == SImode && in_mode == SFmode)
40032 if (out_n == 4 && in_n == 4)
40033 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
40034 else if (out_n == 8 && in_n == 8)
40035 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
40036 else if (out_n == 16 && in_n == 16)
40037 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
40039 break;
40041 CASE_CFN_IROUND:
40042 CASE_CFN_LROUND:
40043 CASE_CFN_LLROUND:
40044 /* The round insn does not trap on denormals. */
40045 if (flag_trapping_math || !TARGET_SSE4_1)
40046 break;
40048 if (out_mode == SImode && in_mode == DFmode)
40050 if (out_n == 4 && in_n == 2)
40051 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
40052 else if (out_n == 8 && in_n == 4)
40053 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
40054 else if (out_n == 16 && in_n == 8)
40055 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
40057 if (out_mode == SImode && in_mode == SFmode)
40059 if (out_n == 4 && in_n == 4)
40060 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
40061 else if (out_n == 8 && in_n == 8)
40062 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
40063 else if (out_n == 16 && in_n == 16)
40064 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
40066 break;
40068 CASE_CFN_FLOOR:
40069 /* The round insn does not trap on denormals. */
40070 if (flag_trapping_math || !TARGET_SSE4_1)
40071 break;
40073 if (out_mode == DFmode && in_mode == DFmode)
40075 if (out_n == 2 && in_n == 2)
40076 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
40077 else if (out_n == 4 && in_n == 4)
40078 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
40079 else if (out_n == 8 && in_n == 8)
40080 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
40082 if (out_mode == SFmode && in_mode == SFmode)
40084 if (out_n == 4 && in_n == 4)
40085 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
40086 else if (out_n == 8 && in_n == 8)
40087 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
40088 else if (out_n == 16 && in_n == 16)
40089 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
40091 break;
40093 CASE_CFN_CEIL:
40094 /* The round insn does not trap on denormals. */
40095 if (flag_trapping_math || !TARGET_SSE4_1)
40096 break;
40098 if (out_mode == DFmode && in_mode == DFmode)
40100 if (out_n == 2 && in_n == 2)
40101 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
40102 else if (out_n == 4 && in_n == 4)
40103 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
40104 else if (out_n == 8 && in_n == 8)
40105 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
40107 if (out_mode == SFmode && in_mode == SFmode)
40109 if (out_n == 4 && in_n == 4)
40110 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
40111 else if (out_n == 8 && in_n == 8)
40112 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
40113 else if (out_n == 16 && in_n == 16)
40114 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
40116 break;
40118 CASE_CFN_TRUNC:
40119 /* The round insn does not trap on denormals. */
40120 if (flag_trapping_math || !TARGET_SSE4_1)
40121 break;
40123 if (out_mode == DFmode && in_mode == DFmode)
40125 if (out_n == 2 && in_n == 2)
40126 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
40127 else if (out_n == 4 && in_n == 4)
40128 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
40129 else if (out_n == 8 && in_n == 8)
40130 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
40132 if (out_mode == SFmode && in_mode == SFmode)
40134 if (out_n == 4 && in_n == 4)
40135 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
40136 else if (out_n == 8 && in_n == 8)
40137 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
40138 else if (out_n == 16 && in_n == 16)
40139 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
40141 break;
40143 CASE_CFN_RINT:
40144 /* The round insn does not trap on denormals. */
40145 if (flag_trapping_math || !TARGET_SSE4_1)
40146 break;
40148 if (out_mode == DFmode && in_mode == DFmode)
40150 if (out_n == 2 && in_n == 2)
40151 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
40152 else if (out_n == 4 && in_n == 4)
40153 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
40155 if (out_mode == SFmode && in_mode == SFmode)
40157 if (out_n == 4 && in_n == 4)
40158 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
40159 else if (out_n == 8 && in_n == 8)
40160 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
40162 break;
40164 CASE_CFN_FMA:
40165 if (out_mode == DFmode && in_mode == DFmode)
40167 if (out_n == 2 && in_n == 2)
40168 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
40169 if (out_n == 4 && in_n == 4)
40170 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
40172 if (out_mode == SFmode && in_mode == SFmode)
40174 if (out_n == 4 && in_n == 4)
40175 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
40176 if (out_n == 8 && in_n == 8)
40177 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
40179 break;
40181 default:
40182 break;
40185 /* Dispatch to a handler for a vectorization library. */
40186 if (ix86_veclib_handler)
40187 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
40189 return NULL_TREE;
40192 /* Handler for an SVML-style interface to
40193 a library with vectorized intrinsics. */
40195 static tree
40196 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
40198 char name[20];
40199 tree fntype, new_fndecl, args;
40200 unsigned arity;
40201 const char *bname;
40202 machine_mode el_mode, in_mode;
40203 int n, in_n;
40205 /* The SVML is suitable for unsafe math only. */
40206 if (!flag_unsafe_math_optimizations)
40207 return NULL_TREE;
40209 el_mode = TYPE_MODE (TREE_TYPE (type_out));
40210 n = TYPE_VECTOR_SUBPARTS (type_out);
40211 in_mode = TYPE_MODE (TREE_TYPE (type_in));
40212 in_n = TYPE_VECTOR_SUBPARTS (type_in);
40213 if (el_mode != in_mode
40214 || n != in_n)
40215 return NULL_TREE;
40217 switch (fn)
40219 CASE_CFN_EXP:
40220 CASE_CFN_LOG:
40221 CASE_CFN_LOG10:
40222 CASE_CFN_POW:
40223 CASE_CFN_TANH:
40224 CASE_CFN_TAN:
40225 CASE_CFN_ATAN:
40226 CASE_CFN_ATAN2:
40227 CASE_CFN_ATANH:
40228 CASE_CFN_CBRT:
40229 CASE_CFN_SINH:
40230 CASE_CFN_SIN:
40231 CASE_CFN_ASINH:
40232 CASE_CFN_ASIN:
40233 CASE_CFN_COSH:
40234 CASE_CFN_COS:
40235 CASE_CFN_ACOSH:
40236 CASE_CFN_ACOS:
40237 if ((el_mode != DFmode || n != 2)
40238 && (el_mode != SFmode || n != 4))
40239 return NULL_TREE;
40240 break;
40242 default:
40243 return NULL_TREE;
40246 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
40247 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
40249 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
40250 strcpy (name, "vmlsLn4");
40251 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
40252 strcpy (name, "vmldLn2");
40253 else if (n == 4)
40255 sprintf (name, "vmls%s", bname+10);
40256 name[strlen (name)-1] = '4';
40258 else
40259 sprintf (name, "vmld%s2", bname+10);
40261 /* Convert to uppercase. */
40262 name[4] &= ~0x20;
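/* Worked example (annotation, not part of the original source): for
   __builtin_sinf with n == 4, bname + 10 is "sinf", so the buffer
   becomes "vmlssinf", the trailing character is replaced to give
   "vmlssin4", and clearing bit 5 of name[4] uppercases it to
   "vmlsSin4". __builtin_sin with n == 2 takes the else branch and the
   same uppercase step, giving "vmldSin2". */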
40264 arity = 0;
40265 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
40266 arity++;
40268 if (arity == 1)
40269 fntype = build_function_type_list (type_out, type_in, NULL);
40270 else
40271 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
40273 /* Build a function declaration for the vectorized function. */
40274 new_fndecl = build_decl (BUILTINS_LOCATION,
40275 FUNCTION_DECL, get_identifier (name), fntype);
40276 TREE_PUBLIC (new_fndecl) = 1;
40277 DECL_EXTERNAL (new_fndecl) = 1;
40278 DECL_IS_NOVOPS (new_fndecl) = 1;
40279 TREE_READONLY (new_fndecl) = 1;
40281 return new_fndecl;
40284 /* Handler for an ACML-style interface to
40285 a library with vectorized intrinsics. */
40287 static tree
40288 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
40290 char name[20] = "__vr.._";
40291 tree fntype, new_fndecl, args;
40292 unsigned arity;
40293 const char *bname;
40294 machine_mode el_mode, in_mode;
40295 int n, in_n;
40297 /* The ACML is 64-bit only and suitable for unsafe math only, as
40298 it does not correctly support parts of IEEE with the required
40299 precision such as denormals. */
40300 if (!TARGET_64BIT
40301 || !flag_unsafe_math_optimizations)
40302 return NULL_TREE;
40304 el_mode = TYPE_MODE (TREE_TYPE (type_out));
40305 n = TYPE_VECTOR_SUBPARTS (type_out);
40306 in_mode = TYPE_MODE (TREE_TYPE (type_in));
40307 in_n = TYPE_VECTOR_SUBPARTS (type_in);
40308 if (el_mode != in_mode
40309 || n != in_n)
40310 return NULL_TREE;
40312 switch (fn)
40314 CASE_CFN_SIN:
40315 CASE_CFN_COS:
40316 CASE_CFN_EXP:
40317 CASE_CFN_LOG:
40318 CASE_CFN_LOG2:
40319 CASE_CFN_LOG10:
40320 if (el_mode == DFmode && n == 2)
40322 name[4] = 'd';
40323 name[5] = '2';
40325 else if (el_mode == SFmode && n == 4)
40327 name[4] = 's';
40328 name[5] = '4';
40330 else
40331 return NULL_TREE;
40332 break;
40334 default:
40335 return NULL_TREE;
40338 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
40339 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
40340 sprintf (name + 7, "%s", bname+10);
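/* Worked example (annotation, not part of the original source): for
   sin over two doubles the template "__vr.._" becomes "__vrd2_" above,
   and appending bname + 10 ("sin") yields "__vrd2_sin"; sinf over four
   floats yields "__vrs4_sinf" the same way. */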
40342 arity = 0;
40343 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
40344 arity++;
40346 if (arity == 1)
40347 fntype = build_function_type_list (type_out, type_in, NULL);
40348 else
40349 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
40351 /* Build a function declaration for the vectorized function. */
40352 new_fndecl = build_decl (BUILTINS_LOCATION,
40353 FUNCTION_DECL, get_identifier (name), fntype);
40354 TREE_PUBLIC (new_fndecl) = 1;
40355 DECL_EXTERNAL (new_fndecl) = 1;
40356 DECL_IS_NOVOPS (new_fndecl) = 1;
40357 TREE_READONLY (new_fndecl) = 1;
40359 return new_fndecl;
40362 /* Returns a decl of a function that implements gather load with
40363 memory type MEM_VECTYPE and index type INDEX_TYPE and SCALE.
40364 Return NULL_TREE if it is not available. */
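/* Illustrative example (annotation, not part of the original source):
   a V4DF gather with an SImode index and scale 8 on a plain AVX2
   target (no AVX512VL) maps to IX86_BUILTIN_GATHERALTSIV4DF via the
   E_V4DFmode case below. */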
40366 static tree
40367 ix86_vectorize_builtin_gather (const_tree mem_vectype,
40368 const_tree index_type, int scale)
40370 bool si;
40371 enum ix86_builtins code;
40373 if (! TARGET_AVX2)
40374 return NULL_TREE;
40376 if ((TREE_CODE (index_type) != INTEGER_TYPE
40377 && !POINTER_TYPE_P (index_type))
40378 || (TYPE_MODE (index_type) != SImode
40379 && TYPE_MODE (index_type) != DImode))
40380 return NULL_TREE;
40382 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
40383 return NULL_TREE;
40385 /* v*gather* insn sign extends index to pointer mode. */
40386 if (TYPE_PRECISION (index_type) < POINTER_SIZE
40387 && TYPE_UNSIGNED (index_type))
40388 return NULL_TREE;
40390 if (scale <= 0
40391 || scale > 8
40392 || (scale & (scale - 1)) != 0)
40393 return NULL_TREE;
40395 si = TYPE_MODE (index_type) == SImode;
40396 switch (TYPE_MODE (mem_vectype))
40398 case E_V2DFmode:
40399 if (TARGET_AVX512VL)
40400 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
40401 else
40402 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
40403 break;
40404 case E_V4DFmode:
40405 if (TARGET_AVX512VL)
40406 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
40407 else
40408 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
40409 break;
40410 case E_V2DImode:
40411 if (TARGET_AVX512VL)
40412 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
40413 else
40414 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
40415 break;
40416 case E_V4DImode:
40417 if (TARGET_AVX512VL)
40418 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
40419 else
40420 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
40421 break;
40422 case E_V4SFmode:
40423 if (TARGET_AVX512VL)
40424 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
40425 else
40426 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
40427 break;
40428 case E_V8SFmode:
40429 if (TARGET_AVX512VL)
40430 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
40431 else
40432 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
40433 break;
40434 case E_V4SImode:
40435 if (TARGET_AVX512VL)
40436 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
40437 else
40438 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
40439 break;
40440 case E_V8SImode:
40441 if (TARGET_AVX512VL)
40442 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
40443 else
40444 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
40445 break;
40446 case E_V8DFmode:
40447 if (TARGET_AVX512F)
40448 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
40449 else
40450 return NULL_TREE;
40451 break;
40452 case E_V8DImode:
40453 if (TARGET_AVX512F)
40454 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
40455 else
40456 return NULL_TREE;
40457 break;
40458 case E_V16SFmode:
40459 if (TARGET_AVX512F)
40460 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
40461 else
40462 return NULL_TREE;
40463 break;
40464 case E_V16SImode:
40465 if (TARGET_AVX512F)
40466 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
40467 else
40468 return NULL_TREE;
40469 break;
40470 default:
40471 return NULL_TREE;
40474 return ix86_get_builtin (code);
40477 /* Returns a decl of a function that implements scatter store with
40478 register type VECTYPE and index type INDEX_TYPE and SCALE.
40479 Return NULL_TREE if it is not available. */
40481 static tree
40482 ix86_vectorize_builtin_scatter (const_tree vectype,
40483 const_tree index_type, int scale)
40485 bool si;
40486 enum ix86_builtins code;
40488 if (!TARGET_AVX512F)
40489 return NULL_TREE;
40491 if ((TREE_CODE (index_type) != INTEGER_TYPE
40492 && !POINTER_TYPE_P (index_type))
40493 || (TYPE_MODE (index_type) != SImode
40494 && TYPE_MODE (index_type) != DImode))
40495 return NULL_TREE;
40497 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
40498 return NULL_TREE;
40500 /* v*scatter* insn sign extends index to pointer mode. */
40501 if (TYPE_PRECISION (index_type) < POINTER_SIZE
40502 && TYPE_UNSIGNED (index_type))
40503 return NULL_TREE;
40505 /* Scale can be 1, 2, 4 or 8. */
40506 if (scale <= 0
40507 || scale > 8
40508 || (scale & (scale - 1)) != 0)
40509 return NULL_TREE;
40511 si = TYPE_MODE (index_type) == SImode;
40512 switch (TYPE_MODE (vectype))
40514 case E_V8DFmode:
40515 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
40516 break;
40517 case E_V8DImode:
40518 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
40519 break;
40520 case E_V16SFmode:
40521 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
40522 break;
40523 case E_V16SImode:
40524 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
40525 break;
40526 default:
40527 return NULL_TREE;
40530 return ix86_builtins[code];
40533 /* Return true if it is safe to use the rsqrt optabs to optimize
40534 1.0/sqrt. */
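/* Annotation (not part of the original source): all of these flags are
   enabled by -ffast-math, so e.g. 1.0f / sqrtf (x) compiled with
   -ffast-math and SSE math may be expanded through the rsqrt optab
   (rsqrtss plus a Newton-Raphson step) instead of a full square root
   and divide. */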
40536 static bool
40537 use_rsqrt_p ()
40539 return (TARGET_SSE_MATH
40540 && flag_finite_math_only
40541 && !flag_trapping_math
40542 && flag_unsafe_math_optimizations);
40545 /* Returns a code for a target-specific builtin that implements
40546 reciprocal of the function, or NULL_TREE if not available. */
40548 static tree
40549 ix86_builtin_reciprocal (tree fndecl)
40551 switch (DECL_FUNCTION_CODE (fndecl))
40553 /* Vectorized version of sqrt to rsqrt conversion. */
40554 case IX86_BUILTIN_SQRTPS_NR:
40555 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
40557 case IX86_BUILTIN_SQRTPS_NR256:
40558 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
40560 default:
40561 return NULL_TREE;
40565 /* Helper for avx_vpermilps256_operand et al. This is also used by
40566 the expansion functions to turn the parallel back into a mask.
40567 The return value is 0 for no match and the imm8+1 for a match. */
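/* Worked example (annotation, not part of the original source): for
   V4SFmode a parallel of (1 0 3 2), i.e. swapping within each pair,
   gives mask = 1<<0 | 0<<2 | 3<<4 | 2<<6 = 0xb1, so the function
   returns 0xb2 and the caller recovers imm8 0xb1. */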
40570 avx_vpermilp_parallel (rtx par, machine_mode mode)
40572 unsigned i, nelt = GET_MODE_NUNITS (mode);
40573 unsigned mask = 0;
40574 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
40576 if (XVECLEN (par, 0) != (int) nelt)
40577 return 0;
40579 /* Validate that all of the elements are constants, and not totally
40580 out of range. Copy the data into an integral array to make the
40581 subsequent checks easier. */
40582 for (i = 0; i < nelt; ++i)
40584 rtx er = XVECEXP (par, 0, i);
40585 unsigned HOST_WIDE_INT ei;
40587 if (!CONST_INT_P (er))
40588 return 0;
40589 ei = INTVAL (er);
40590 if (ei >= nelt)
40591 return 0;
40592 ipar[i] = ei;
40595 switch (mode)
40597 case E_V8DFmode:
40598 /* In the 512-bit DFmode case, we can only move elements within
40599 a 128-bit lane. First fill the second part of the mask,
40600 then fallthru. */
40601 for (i = 4; i < 6; ++i)
40603 if (ipar[i] < 4 || ipar[i] >= 6)
40604 return 0;
40605 mask |= (ipar[i] - 4) << i;
40607 for (i = 6; i < 8; ++i)
40609 if (ipar[i] < 6)
40610 return 0;
40611 mask |= (ipar[i] - 6) << i;
40613 /* FALLTHRU */
40615 case E_V4DFmode:
40616 /* In the 256-bit DFmode case, we can only move elements within
40617 a 128-bit lane. */
40618 for (i = 0; i < 2; ++i)
40620 if (ipar[i] >= 2)
40621 return 0;
40622 mask |= ipar[i] << i;
40624 for (i = 2; i < 4; ++i)
40626 if (ipar[i] < 2)
40627 return 0;
40628 mask |= (ipar[i] - 2) << i;
40630 break;
40632 case E_V16SFmode:
40633 /* In the 512-bit SFmode case, the permutation in the upper 256 bits
40634 must mirror the permutation in the lower 256 bits. */
40635 for (i = 0; i < 8; ++i)
40636 if (ipar[i] + 8 != ipar[i + 8])
40637 return 0;
40638 /* FALLTHRU */
40640 case E_V8SFmode:
40641 /* In the 256-bit SFmode case, we have full freedom of
40642 movement within the low 128-bit lane, but the high 128-bit
40643 lane must mirror the exact same pattern. */
40644 for (i = 0; i < 4; ++i)
40645 if (ipar[i] + 4 != ipar[i + 4])
40646 return 0;
40647 nelt = 4;
40648 /* FALLTHRU */
40650 case E_V2DFmode:
40651 case E_V4SFmode:
40652 /* In the 128-bit case, we've full freedom in the placement of
40653 the elements from the source operand. */
40654 for (i = 0; i < nelt; ++i)
40655 mask |= ipar[i] << (i * (nelt / 2));
40656 break;
40658 default:
40659 gcc_unreachable ();
40662 /* Make sure success has a non-zero value by adding one. */
40663 return mask + 1;
40666 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
40667 the expansion functions to turn the parallel back into a mask.
40668 The return value is 0 for no match and the imm8+1 for a match. */
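/* Worked example (annotation, not part of the original source): for
   V4DFmode a parallel of (0 1 4 5), i.e. the low lane of each operand,
   passes the halves check, reconstructs e = 0 and e = 2 for the two
   lanes, gives mask = 0x20 and returns 0x21 (imm8 0x20 plus one). */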
40671 avx_vperm2f128_parallel (rtx par, machine_mode mode)
40673 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
40674 unsigned mask = 0;
40675 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
40677 if (XVECLEN (par, 0) != (int) nelt)
40678 return 0;
40680 /* Validate that all of the elements are constants, and not totally
40681 out of range. Copy the data into an integral array to make the
40682 subsequent checks easier. */
40683 for (i = 0; i < nelt; ++i)
40685 rtx er = XVECEXP (par, 0, i);
40686 unsigned HOST_WIDE_INT ei;
40688 if (!CONST_INT_P (er))
40689 return 0;
40690 ei = INTVAL (er);
40691 if (ei >= 2 * nelt)
40692 return 0;
40693 ipar[i] = ei;
40696 /* Validate that the halves of the permute are halves. */
40697 for (i = 0; i < nelt2 - 1; ++i)
40698 if (ipar[i] + 1 != ipar[i + 1])
40699 return 0;
40700 for (i = nelt2; i < nelt - 1; ++i)
40701 if (ipar[i] + 1 != ipar[i + 1])
40702 return 0;
40704 /* Reconstruct the mask. */
40705 for (i = 0; i < 2; ++i)
40707 unsigned e = ipar[i * nelt2];
40708 if (e % nelt2)
40709 return 0;
40710 e /= nelt2;
40711 mask |= e << (i * 4);
40714 /* Make sure success has a non-zero value by adding one. */
40715 return mask + 1;
40718 /* Return a register priority for hard reg REGNO. */
40719 static int
40720 ix86_register_priority (int hard_regno)
40722 /* ebp and r13 as the base always want a displacement, r12 as the
40723 base always wants an index. So discourage their usage in an
40724 address. */
40725 if (hard_regno == R12_REG || hard_regno == R13_REG)
40726 return 0;
40727 if (hard_regno == BP_REG)
40728 return 1;
40729 /* New x86-64 int registers result in bigger code size. Discourage
40730 them. */
40731 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
40732 return 2;
40733 /* New x86-64 SSE registers result in bigger code size. Discourage
40734 them. */
40735 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
40736 return 2;
40737 /* Usage of AX register results in smaller code. Prefer it. */
40738 if (hard_regno == AX_REG)
40739 return 4;
40740 return 3;
40743 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
40745 Put float CONST_DOUBLE in the constant pool instead of fp regs.
40746 QImode must go into class Q_REGS.
40747 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
40748 movdf to do mem-to-mem moves through integer regs. */
40750 static reg_class_t
40751 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
40753 machine_mode mode = GET_MODE (x);
40755 /* We're only allowed to return a subclass of CLASS. Many of the
40756 following checks fail for NO_REGS, so eliminate that early. */
40757 if (regclass == NO_REGS)
40758 return NO_REGS;
40760 /* All classes can load zeros. */
40761 if (x == CONST0_RTX (mode))
40762 return regclass;
40764 /* Force constants into memory if we are loading a (nonzero) constant into
40765 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
40766 instructions to load from a constant. */
40767 if (CONSTANT_P (x)
40768 && (MAYBE_MMX_CLASS_P (regclass)
40769 || MAYBE_SSE_CLASS_P (regclass)
40770 || MAYBE_MASK_CLASS_P (regclass)))
40771 return NO_REGS;
40773 /* Floating-point constants need more complex checks. */
40774 if (CONST_DOUBLE_P (x))
40776 /* General regs can load everything. */
40777 if (INTEGER_CLASS_P (regclass))
40778 return regclass;
40780 /* Floats can load 0 and 1 plus some others. Note that we eliminated
40781 zero above. We only want to wind up preferring 80387 registers if
40782 we plan on doing computation with them. */
40783 if (IS_STACK_MODE (mode)
40784 && standard_80387_constant_p (x) > 0)
40786 /* Limit class to FP regs. */
40787 if (FLOAT_CLASS_P (regclass))
40788 return FLOAT_REGS;
40789 else if (regclass == FP_TOP_SSE_REGS)
40790 return FP_TOP_REG;
40791 else if (regclass == FP_SECOND_SSE_REGS)
40792 return FP_SECOND_REG;
40795 return NO_REGS;
40798 /* Prefer SSE regs only, if we can use them for math. */
40799 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40800 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
40802 /* Generally when we see PLUS here, it's the function invariant
40803 (plus soft-fp const_int), which can only be computed into general
40804 regs. */
40805 if (GET_CODE (x) == PLUS)
40806 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
40808 /* QImode constants are easy to load, but non-constant QImode data
40809 must go into Q_REGS. */
40810 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
40812 if (Q_CLASS_P (regclass))
40813 return regclass;
40814 else if (reg_class_subset_p (Q_REGS, regclass))
40815 return Q_REGS;
40816 else
40817 return NO_REGS;
40820 return regclass;
40823 /* Discourage putting floating-point values in SSE registers unless
40824 SSE math is being used, and likewise for the 387 registers. */
40825 static reg_class_t
40826 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
40828 machine_mode mode = GET_MODE (x);
40830 /* Restrict the output reload class to the register bank that we are doing
40831 math on. If we would like not to return a subset of CLASS, reject this
40832 alternative: if reload cannot do this, it will still use its choice. */
40833 mode = GET_MODE (x);
40834 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40835 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
40837 if (IS_STACK_MODE (mode))
40839 if (regclass == FP_TOP_SSE_REGS)
40840 return FP_TOP_REG;
40841 else if (regclass == FP_SECOND_SSE_REGS)
40842 return FP_SECOND_REG;
40843 else
40844 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
40847 return regclass;
40850 static reg_class_t
40851 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
40852 machine_mode mode, secondary_reload_info *sri)
40854 /* Double-word spills from general registers to non-offsettable memory
40855 references (zero-extended addresses) require special handling. */
40856 if (TARGET_64BIT
40857 && MEM_P (x)
40858 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
40859 && INTEGER_CLASS_P (rclass)
40860 && !offsettable_memref_p (x))
40862 sri->icode = (in_p
40863 ? CODE_FOR_reload_noff_load
40864 : CODE_FOR_reload_noff_store);
40865 /* Add the cost of moving address to a temporary. */
40866 sri->extra_cost = 1;
40868 return NO_REGS;
40871 /* QImode spills from non-QI registers require an
40872 intermediate register on 32-bit targets. */
40873 if (mode == QImode
40874 && ((!TARGET_64BIT && !in_p
40875 && INTEGER_CLASS_P (rclass)
40876 && MAYBE_NON_Q_CLASS_P (rclass))
40877 || (!TARGET_AVX512DQ
40878 && MAYBE_MASK_CLASS_P (rclass))))
40880 int regno = true_regnum (x);
40882 /* Return Q_REGS if the operand is in memory. */
40883 if (regno == -1)
40884 return Q_REGS;
40886 return NO_REGS;
40889 /* This condition handles corner case where an expression involving
40890 pointers gets vectorized. We're trying to use the address of a
40891 stack slot as a vector initializer.
40893 (set (reg:V2DI 74 [ vect_cst_.2 ])
40894 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
40896 Eventually frame gets turned into sp+offset like this:
40898 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40899 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
40900 (const_int 392 [0x188]))))
40902 That later gets turned into:
40904 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40905 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
40906 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
40908 We'll have the following reload recorded:
40910 Reload 0: reload_in (DI) =
40911 (plus:DI (reg/f:DI 7 sp)
40912 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
40913 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40914 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
40915 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
40916 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
40917 reload_reg_rtx: (reg:V2DI 22 xmm1)
40919 Which isn't going to work since SSE instructions can't handle scalar
40920 additions. Returning GENERAL_REGS forces the addition into integer
40921 register and reload can handle subsequent reloads without problems. */
40923 if (in_p && GET_CODE (x) == PLUS
40924 && SSE_CLASS_P (rclass)
40925 && SCALAR_INT_MODE_P (mode))
40926 return GENERAL_REGS;
40928 return NO_REGS;
40931 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
40933 static bool
40934 ix86_class_likely_spilled_p (reg_class_t rclass)
40936 switch (rclass)
40938 case AREG:
40939 case DREG:
40940 case CREG:
40941 case BREG:
40942 case AD_REGS:
40943 case SIREG:
40944 case DIREG:
40945 case SSE_FIRST_REG:
40946 case FP_TOP_REG:
40947 case FP_SECOND_REG:
40948 case BND_REGS:
40949 return true;
40951 default:
40952 break;
40955 return false;
40958 /* If we are copying between registers from different register sets
40959 (e.g. FP and integer), we may need a memory location.
40961 The function can't work reliably when one of the CLASSES is a class
40962 containing registers from multiple sets. We avoid this by never combining
40963 different sets in a single alternative in the machine description.
40964 Ensure that this constraint holds to avoid unexpected surprises.
40966 When STRICT is false, we are being called from REGISTER_MOVE_COST,
40967 so do not enforce these sanity checks.
40969 To optimize register_move_cost performance, define inline variant. */
40971 static inline bool
40972 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
40973 machine_mode mode, int strict)
40975 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
40976 return false;
40978 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
40979 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
40980 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
40981 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
40982 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
40983 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
40984 || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
40985 || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
40987 gcc_assert (!strict || lra_in_progress);
40988 return true;
40991 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
40992 return true;
40994 /* Between mask and general, we have moves no larger than word size. */
40995 if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
40996 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
40997 return true;
40999 /* ??? This is a lie. We do have moves between mmx/general, and between
41000 mmx/sse2. But by saying we need secondary memory we discourage the
41001 register allocator from using the mmx registers unless needed. */
41002 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
41003 return true;
41005 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
41007 /* SSE1 doesn't have any direct moves from other classes. */
41008 if (!TARGET_SSE2)
41009 return true;
41011 /* If the target says that inter-unit moves are more expensive
41012 than moving through memory, then don't generate them. */
41013 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
41014 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
41015 return true;
41017 /* Between SSE and general, we have moves no larger than word size. */
41018 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41019 return true;
41022 return false;
41025 bool
41026 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
41027 machine_mode mode, int strict)
41029 return inline_secondary_memory_needed (class1, class2, mode, strict);
41032 /* Implement the TARGET_CLASS_MAX_NREGS hook.
41034 On the 80386, this is the size of MODE in words,
41035 except in the FP regs, where a single reg is always enough. */
41037 static unsigned char
41038 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
41040 if (MAYBE_INTEGER_CLASS_P (rclass))
41042 if (mode == XFmode)
41043 return (TARGET_64BIT ? 2 : 3);
41044 else if (mode == XCmode)
41045 return (TARGET_64BIT ? 4 : 6);
41046 else
41047 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
41049 else
41051 if (COMPLEX_MODE_P (mode))
41052 return 2;
41053 else
41054 return 1;
41058 /* Return true if the registers in CLASS cannot represent the change from
41059 modes FROM to TO. */
41061 bool
41062 ix86_cannot_change_mode_class (machine_mode from, machine_mode to,
41063 enum reg_class regclass)
41065 if (from == to)
41066 return false;
41068 /* x87 registers can't do subreg at all, as all values are reformatted
41069 to extended precision. */
41070 if (MAYBE_FLOAT_CLASS_P (regclass))
41071 return true;
41073 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
41075 /* Vector registers do not support QI or HImode loads. If we don't
41076 disallow a change to these modes, reload will assume it's ok to
41077 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
41078 the vec_dupv4hi pattern. */
41079 if (GET_MODE_SIZE (from) < 4)
41080 return true;
41083 return false;
41086 /* Return the cost of moving data of mode M between a
41087 register and memory. A value of 2 is the default; this cost is
41088 relative to those in `REGISTER_MOVE_COST'.
41090 This function is used extensively by register_move_cost that is used to
41091 build tables at startup. Make it inline in this case.
41092 When IN is 2, return maximum of in and out move cost.
41094 If moving between registers and memory is more expensive than
41095 between two registers, you should define this macro to express the
41096 relative cost.
41098 Also model the increased moving costs of QImode registers in non
41099 Q_REGS classes.
41101 static inline int
41102 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
41103 int in)
41105 int cost;
41106 if (FLOAT_CLASS_P (regclass))
41108 int index;
41109 switch (mode)
41111 case E_SFmode:
41112 index = 0;
41113 break;
41114 case E_DFmode:
41115 index = 1;
41116 break;
41117 case E_XFmode:
41118 index = 2;
41119 break;
41120 default:
41121 return 100;
41123 if (in == 2)
41124 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
41125 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
41127 if (SSE_CLASS_P (regclass))
41129 int index;
41130 switch (GET_MODE_SIZE (mode))
41132 case 4:
41133 index = 0;
41134 break;
41135 case 8:
41136 index = 1;
41137 break;
41138 case 16:
41139 index = 2;
41140 break;
41141 default:
41142 return 100;
41144 if (in == 2)
41145 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
41146 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
41148 if (MMX_CLASS_P (regclass))
41150 int index;
41151 switch (GET_MODE_SIZE (mode))
41153 case 4:
41154 index = 0;
41155 break;
41156 case 8:
41157 index = 1;
41158 break;
41159 default:
41160 return 100;
41162 if (in == 2)
41163 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
41164 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
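/* Any other class moves through the integer registers; use the integer
   load/store costs below. */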
41166 switch (GET_MODE_SIZE (mode))
41168 case 1:
41169 if (Q_CLASS_P (regclass) || TARGET_64BIT)
41171 if (!in)
41172 return ix86_cost->int_store[0];
41173 if (TARGET_PARTIAL_REG_DEPENDENCY
41174 && optimize_function_for_speed_p (cfun))
41175 cost = ix86_cost->movzbl_load;
41176 else
41177 cost = ix86_cost->int_load[0];
41178 if (in == 2)
41179 return MAX (cost, ix86_cost->int_store[0]);
41180 return cost;
41182 else
41184 if (in == 2)
41185 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
41186 if (in)
41187 return ix86_cost->movzbl_load;
41188 else
41189 return ix86_cost->int_store[0] + 4;
41191 break;
41192 case 2:
41193 if (in == 2)
41194 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
41195 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
41196 default:
41197 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
41198 if (mode == TFmode)
41199 mode = XFmode;
41200 if (in == 2)
41201 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
41202 else if (in)
41203 cost = ix86_cost->int_load[2];
41204 else
41205 cost = ix86_cost->int_store[2];
41206 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
41210 static int
41211 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
41212 bool in)
41214 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
41218 /* Return the cost of moving data from a register in class CLASS1 to
41219 one in class CLASS2.
41221 It is not required that the cost always equal 2 when FROM is the same as TO;
41222 on some machines it is expensive to move between registers if they are not
41223 general registers. */
41225 static int
41226 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
41227 reg_class_t class2_i)
41229 enum reg_class class1 = (enum reg_class) class1_i;
41230 enum reg_class class2 = (enum reg_class) class2_i;
41232 /* In case we require secondary memory, compute cost of the store followed
41233 by load. In order to avoid bad register allocation choices, we need
41234 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
41236 if (inline_secondary_memory_needed (class1, class2, mode, 0))
41238 int cost = 1;
41240 cost += inline_memory_move_cost (mode, class1, 2);
41241 cost += inline_memory_move_cost (mode, class2, 2);
41243 /* When copying from a general-purpose register we may emit multiple
41244 stores followed by a single load, causing a memory size mismatch stall.
41245 Count this as an arbitrarily high extra cost of 20. */
41246 if (targetm.class_max_nregs (class1, mode)
41247 > targetm.class_max_nregs (class2, mode))
41248 cost += 20;
41250 /* In the case of FP/MMX moves, the registers actually overlap, and we
41251 have to switch modes in order to treat them differently. */
41252 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
41253 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
41254 cost += 20;
41256 return cost;
41259 /* Moves between SSE/MMX and integer unit are expensive. */
41260 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
41261 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
41263 /* ??? By keeping the returned value relatively high, we limit the number
41264 of moves between integer and MMX/SSE registers for all targets.
41265 Additionally, the high value prevents a problem with x86_modes_tieable_p(),
41266 where integer modes in MMX/SSE registers are not tieable
41267 because of missing QImode and HImode moves to, from or between
41268 MMX/SSE registers. */
41269 return MAX (8, ix86_cost->mmxsse_to_integer);
41271 if (MAYBE_FLOAT_CLASS_P (class1))
41272 return ix86_cost->fp_move;
41273 if (MAYBE_SSE_CLASS_P (class1))
41274 return ix86_cost->sse_move;
41275 if (MAYBE_MMX_CLASS_P (class1))
41276 return ix86_cost->mmx_move;
41277 return 2;
41280 /* Return TRUE if hard register REGNO can hold a value of machine-mode
41281 MODE. */
41283 bool
41284 ix86_hard_regno_mode_ok (int regno, machine_mode mode)
41287 /* Only the flags register can hold CCmode values, and it can hold nothing else. */
41287 if (CC_REGNO_P (regno))
41288 return GET_MODE_CLASS (mode) == MODE_CC;
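/* No other register can hold MODE_CC values, and MODE_RANDOM and
   MODE_PARTIAL_INT values are not supported in any register. */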
41289 if (GET_MODE_CLASS (mode) == MODE_CC
41290 || GET_MODE_CLASS (mode) == MODE_RANDOM
41291 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
41292 return false;
41293 if (STACK_REGNO_P (regno))
41294 return VALID_FP_MODE_P (mode);
41295 if (MASK_REGNO_P (regno))
41296 return (VALID_MASK_REG_MODE (mode)
41297 || (TARGET_AVX512BW
41298 && VALID_MASK_AVX512BW_MODE (mode)));
41299 if (BND_REGNO_P (regno))
41300 return VALID_BND_REG_MODE (mode);
41301 if (SSE_REGNO_P (regno))
41303 /* We implement the move patterns for all vector modes into and
41304 out of SSE registers, even when no operation instructions
41305 are available. */
41307 /* For AVX-512 we allow, regardless of regno:
41308 - XImode
41309 - any 512-bit wide vector mode
41310 - any scalar mode. */
41311 if (TARGET_AVX512F
41312 && (mode == XImode
41313 || VALID_AVX512F_REG_MODE (mode)
41314 || VALID_AVX512F_SCALAR_MODE (mode)))
41315 return true;
41317 /* For AVX-5124FMAPS allow V64SFmode for special regnos. */
41318 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
41319 && MOD4_SSE_REGNO_P (regno)
41320 && mode == V64SFmode)
41321 return true;
41323 /* For AVX-5124VNNIW allow V64SImode for special regnos. */
41324 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
41325 && MOD4_SSE_REGNO_P (regno)
41326 && mode == V64SImode)
41327 return true;
41329 /* TODO: check for QI/HI scalars. */
41330 /* AVX512VL allows SSE registers 16+ for 128/256-bit modes. */
41331 if (TARGET_AVX512VL
41332 && (mode == OImode
41333 || mode == TImode
41334 || VALID_AVX256_REG_MODE (mode)
41335 || VALID_AVX512VL_128_REG_MODE (mode)))
41336 return true;
41338 /* xmm16-xmm31 are only available for AVX-512. */
41339 if (EXT_REX_SSE_REGNO_P (regno))
41340 return false;
41342 /* OImode and AVX modes are available only when AVX is enabled. */
41343 return ((TARGET_AVX
41344 && VALID_AVX256_REG_OR_OI_MODE (mode))
41345 || VALID_SSE_REG_MODE (mode)
41346 || VALID_SSE2_REG_MODE (mode)
41347 || VALID_MMX_REG_MODE (mode)
41348 || VALID_MMX_REG_MODE_3DNOW (mode));
41350 if (MMX_REGNO_P (regno))
41352 /* We implement the move patterns for 3DNOW modes even in MMX mode,
41353 so if the register is available at all, then we can move data of
41354 the given mode into or out of it. */
41355 return (VALID_MMX_REG_MODE (mode)
41356 || VALID_MMX_REG_MODE_3DNOW (mode));
41359 if (mode == QImode)
41361 /* Take care with QImode values - they can live in non-QI regs,
41362 but then they do cause partial register stalls. */
41363 if (ANY_QI_REGNO_P (regno))
41364 return true;
41365 if (!TARGET_PARTIAL_REG_STALL)
41366 return true;
41367 /* LRA checks if the hard register is OK for the given mode.
41368 QImode values can live in non-QI regs, so we allow all
41369 registers here. */
41370 if (lra_in_progress)
41371 return true;
41372 return !can_create_pseudo_p ();
41374 /* We handle both integer and floats in the general purpose registers. */
41375 else if (VALID_INT_MODE_P (mode))
41376 return true;
41377 else if (VALID_FP_MODE_P (mode))
41378 return true;
41379 else if (VALID_DFP_MODE_P (mode))
41380 return true;
41381 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
41382 on to use that value in smaller contexts, this can easily force a
41383 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
41384 supporting DImode, allow it. */
41385 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
41386 return true;
41388 return false;
41391 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
41392 tieable integer mode. */
41394 static bool
41395 ix86_tieable_integer_mode_p (machine_mode mode)
41397 switch (mode)
41399 case E_HImode:
41400 case E_SImode:
41401 return true;
41403 case E_QImode:
41404 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
41406 case E_DImode:
41407 return TARGET_64BIT;
41409 default:
41410 return false;
41414 /* Return true if MODE1 is accessible in a register that can hold MODE2
41415 without copying. That is, all register classes that can hold MODE2
41416 can also hold MODE1. */
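/* For example, SFmode and DFmode are tieable with XFmode because every
   class that can hold XFmode (the x87 stack or the general registers) can
   also hold the smaller float modes; see the XFmode case below. */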
41418 bool
41419 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
41421 if (mode1 == mode2)
41422 return true;
41424 if (ix86_tieable_integer_mode_p (mode1)
41425 && ix86_tieable_integer_mode_p (mode2))
41426 return true;
41428 /* MODE2 being XFmode implies fp stack or general regs, which means we
41429 can tie any smaller floating point modes to it. Note that we do not
41430 tie this with TFmode. */
41431 if (mode2 == XFmode)
41432 return mode1 == SFmode || mode1 == DFmode;
41434 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
41435 that we can tie it with SFmode. */
41436 if (mode2 == DFmode)
41437 return mode1 == SFmode;
41439 /* If MODE2 is only appropriate for an SSE register, then tie with
41440 any other mode acceptable to SSE registers. */
41441 if (GET_MODE_SIZE (mode2) == 32
41442 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
41443 return (GET_MODE_SIZE (mode1) == 32
41444 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
41445 if (GET_MODE_SIZE (mode2) == 16
41446 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
41447 return (GET_MODE_SIZE (mode1) == 16
41448 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
41450 /* If MODE2 is appropriate for an MMX register, then tie
41451 with any other mode acceptable to MMX registers. */
41452 if (GET_MODE_SIZE (mode2) == 8
41453 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
41454 return (GET_MODE_SIZE (mode1) == 8
41455 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
41457 return false;
41460 /* Return the cost of moving between two registers of mode MODE. */
41462 static int
41463 ix86_set_reg_reg_cost (machine_mode mode)
41465 unsigned int units = UNITS_PER_WORD;
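/* By default assume the value is moved through the integer registers one
   word at a time; the cases below widen UNITS when a register file can
   hold the whole mode directly. */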
41467 switch (GET_MODE_CLASS (mode))
41469 default:
41470 break;
41472 case MODE_CC:
41473 units = GET_MODE_SIZE (CCmode);
41474 break;
41476 case MODE_FLOAT:
41477 if ((TARGET_SSE && mode == TFmode)
41478 || (TARGET_80387 && mode == XFmode)
41479 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
41480 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
41481 units = GET_MODE_SIZE (mode);
41482 break;
41484 case MODE_COMPLEX_FLOAT:
41485 if ((TARGET_SSE && mode == TCmode)
41486 || (TARGET_80387 && mode == XCmode)
41487 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
41488 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
41489 units = GET_MODE_SIZE (mode);
41490 break;
41492 case MODE_VECTOR_INT:
41493 case MODE_VECTOR_FLOAT:
41494 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
41495 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
41496 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
41497 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
41498 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
41499 units = GET_MODE_SIZE (mode);
41502 /* Return the cost of moving between two registers of mode MODE,
41503 assuming that the move will be in pieces of at most UNITS bytes. */
41504 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
41507 /* Compute a (partial) cost for rtx X. Return true if the complete
41508 cost has been computed, and false if subexpressions should be
41509 scanned. In either case, *TOTAL contains the cost result. */
41511 static bool
41512 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
41513 int *total, bool speed)
41515 rtx mask;
41516 enum rtx_code code = GET_CODE (x);
41517 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
41518 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
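/* Pick the cost table: the tuning-specific table when optimizing for
   speed, the size-oriented table when optimizing for size. */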
41519 int src_cost;
41521 switch (code)
41523 case SET:
41524 if (register_operand (SET_DEST (x), VOIDmode)
41525 && reg_or_0_operand (SET_SRC (x), VOIDmode))
41527 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
41528 return true;
41531 if (register_operand (SET_SRC (x), VOIDmode))
41532 /* Avoid potentially incorrect high cost from rtx_costs
41533 for non-tieable SUBREGs. */
41534 src_cost = 0;
41535 else
41537 src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
41539 if (CONSTANT_P (SET_SRC (x)))
41540 /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
41541 a small value, possibly zero for cheap constants. */
41542 src_cost += COSTS_N_INSNS (1);
41545 *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
41546 return true;
41548 case CONST_INT:
41549 case CONST:
41550 case LABEL_REF:
41551 case SYMBOL_REF:
41552 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
41553 *total = 3;
41554 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
41555 *total = 2;
41556 else if (flag_pic && SYMBOLIC_CONST (x)
41557 && !(TARGET_64BIT
41558 && (GET_CODE (x) == LABEL_REF
41559 || (GET_CODE (x) == SYMBOL_REF
41560 && SYMBOL_REF_LOCAL_P (x))))
41561 /* Use 0 cost for CONST to improve its propagation. */
41562 && (TARGET_64BIT || GET_CODE (x) != CONST))
41563 *total = 1;
41564 else
41565 *total = 0;
41566 return true;
41568 case CONST_DOUBLE:
41569 if (IS_STACK_MODE (mode))
41570 switch (standard_80387_constant_p (x))
41572 case -1:
41573 case 0:
41574 break;
41575 case 1: /* 0.0 */
41576 *total = 1;
41577 return true;
41578 default: /* Other constants */
41579 *total = 2;
41580 return true;
41582 /* FALLTHRU */
41584 case CONST_VECTOR:
41585 switch (standard_sse_constant_p (x, mode))
41587 case 0:
41588 break;
41589 case 1: /* 0: xor eliminates false dependency */
41590 *total = 0;
41591 return true;
41592 default: /* -1: cmp contains false dependency */
41593 *total = 1;
41594 return true;
41596 /* FALLTHRU */
41598 case CONST_WIDE_INT:
41599 /* Fall back to (MEM (SYMBOL_REF)), since that's where
41600 it'll probably end up. Add a penalty for size. */
41601 *total = (COSTS_N_INSNS (1)
41602 + (!TARGET_64BIT && flag_pic)
41603 + (GET_MODE_SIZE (mode) <= 4
41604 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
41605 return true;
41607 case ZERO_EXTEND:
41608 /* The zero extension is often completely free on x86_64, so make
41609 it as cheap as possible. */
41610 if (TARGET_64BIT && mode == DImode
41611 && GET_MODE (XEXP (x, 0)) == SImode)
41612 *total = 1;
41613 else if (TARGET_ZERO_EXTEND_WITH_AND)
41614 *total = cost->add;
41615 else
41616 *total = cost->movzx;
41617 return false;
41619 case SIGN_EXTEND:
41620 *total = cost->movsx;
41621 return false;
41623 case ASHIFT:
41624 if (SCALAR_INT_MODE_P (mode)
41625 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
41626 && CONST_INT_P (XEXP (x, 1)))
41628 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
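/* A shift left by 1 is as cheap as an add, and shifts by 2 or 3 can be
   done with an lea when lea is no more expensive than a constant shift. */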
41629 if (value == 1)
41631 *total = cost->add;
41632 return false;
41634 if ((value == 2 || value == 3)
41635 && cost->lea <= cost->shift_const)
41637 *total = cost->lea;
41638 return false;
41641 /* FALLTHRU */
41643 case ROTATE:
41644 case ASHIFTRT:
41645 case LSHIFTRT:
41646 case ROTATERT:
41647 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41649 /* ??? Should be SSE vector operation cost. */
41650 /* At least for published AMD latencies, this really is the same
41651 as the latency for a simple fpu operation like fabs. */
41652 /* V*QImode is emulated with 1-11 insns. */
41653 if (mode == V16QImode || mode == V32QImode)
41655 int count = 11;
41656 if (TARGET_XOP && mode == V16QImode)
41658 /* For XOP we use vpshab, which requires the shift count to be
41659 broadcast to the variable shift insn. For constants this
41660 means a V16QImode constant in memory; even when we can perform
41661 the shift with one insn, set the cost so as to prefer paddb. */
41662 if (CONSTANT_P (XEXP (x, 1)))
41664 *total = (cost->fabs
41665 + rtx_cost (XEXP (x, 0), mode, code, 0, speed)
41666 + (speed ? 2 : COSTS_N_BYTES (16)));
41667 return true;
41669 count = 3;
41671 else if (TARGET_SSSE3)
41672 count = 7;
41673 *total = cost->fabs * count;
41675 else
41676 *total = cost->fabs;
41678 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41680 if (CONST_INT_P (XEXP (x, 1)))
41682 if (INTVAL (XEXP (x, 1)) > 32)
41683 *total = cost->shift_const + COSTS_N_INSNS (2);
41684 else
41685 *total = cost->shift_const * 2;
41687 else
41689 if (GET_CODE (XEXP (x, 1)) == AND)
41690 *total = cost->shift_var * 2;
41691 else
41692 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
41695 else
41697 if (CONST_INT_P (XEXP (x, 1)))
41698 *total = cost->shift_const;
41699 else if (SUBREG_P (XEXP (x, 1))
41700 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
41702 /* Return the cost after shift-and truncation. */
41703 *total = cost->shift_var;
41704 return true;
41706 else
41707 *total = cost->shift_var;
41709 return false;
41711 case FMA:
41713 rtx sub;
41715 gcc_assert (FLOAT_MODE_P (mode));
41716 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
41718 /* ??? SSE scalar/vector cost should be used here. */
41719 /* ??? Bald assumption that fma has the same cost as fmul. */
41720 *total = cost->fmul;
41721 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
41723 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
41724 sub = XEXP (x, 0);
41725 if (GET_CODE (sub) == NEG)
41726 sub = XEXP (sub, 0);
41727 *total += rtx_cost (sub, mode, FMA, 0, speed);
41729 sub = XEXP (x, 2);
41730 if (GET_CODE (sub) == NEG)
41731 sub = XEXP (sub, 0);
41732 *total += rtx_cost (sub, mode, FMA, 2, speed);
41733 return true;
41736 case MULT:
41737 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41739 /* ??? SSE scalar cost should be used here. */
41740 *total = cost->fmul;
41741 return false;
41743 else if (X87_FLOAT_MODE_P (mode))
41745 *total = cost->fmul;
41746 return false;
41748 else if (FLOAT_MODE_P (mode))
41750 /* ??? SSE vector cost should be used here. */
41751 *total = cost->fmul;
41752 return false;
41754 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41756 /* V*QImode is emulated with 7-13 insns. */
41757 if (mode == V16QImode || mode == V32QImode)
41759 int extra = 11;
41760 if (TARGET_XOP && mode == V16QImode)
41761 extra = 5;
41762 else if (TARGET_SSSE3)
41763 extra = 6;
41764 *total = cost->fmul * 2 + cost->fabs * extra;
41766 /* V*DImode is emulated with 5-8 insns. */
41767 else if (mode == V2DImode || mode == V4DImode)
41769 if (TARGET_XOP && mode == V2DImode)
41770 *total = cost->fmul * 2 + cost->fabs * 3;
41771 else
41772 *total = cost->fmul * 3 + cost->fabs * 5;
41774 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
41775 insns, including two PMULUDQ. */
41776 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
41777 *total = cost->fmul * 2 + cost->fabs * 5;
41778 else
41779 *total = cost->fmul;
41780 return false;
41782 else
41784 rtx op0 = XEXP (x, 0);
41785 rtx op1 = XEXP (x, 1);
41786 int nbits;
41787 if (CONST_INT_P (XEXP (x, 1)))
41789 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
41790 for (nbits = 0; value != 0; value &= value - 1)
41791 nbits++;
41793 else
41794 /* This is arbitrary. */
41795 nbits = 7;
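/* nbits now approximates the number of shift-and-add steps: the population
   count of a constant multiplier (e.g. 2 for a multiply by 10, binary 1010),
   or an arbitrary 7 for a non-constant multiplier. */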
41797 /* Compute costs correctly for widening multiplication. */
41798 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
41799 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
41800 == GET_MODE_SIZE (mode))
41802 int is_mulwiden = 0;
41803 machine_mode inner_mode = GET_MODE (op0);
41805 if (GET_CODE (op0) == GET_CODE (op1))
41806 is_mulwiden = 1, op1 = XEXP (op1, 0);
41807 else if (CONST_INT_P (op1))
41809 if (GET_CODE (op0) == SIGN_EXTEND)
41810 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
41811 == INTVAL (op1);
41812 else
41813 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
41816 if (is_mulwiden)
41817 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
41820 *total = (cost->mult_init[MODE_INDEX (mode)]
41821 + nbits * cost->mult_bit
41822 + rtx_cost (op0, mode, outer_code, opno, speed)
41823 + rtx_cost (op1, mode, outer_code, opno, speed));
41825 return true;
41828 case DIV:
41829 case UDIV:
41830 case MOD:
41831 case UMOD:
41832 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41833 /* ??? SSE cost should be used here. */
41834 *total = cost->fdiv;
41835 else if (X87_FLOAT_MODE_P (mode))
41836 *total = cost->fdiv;
41837 else if (FLOAT_MODE_P (mode))
41838 /* ??? SSE vector cost should be used here. */
41839 *total = cost->fdiv;
41840 else
41841 *total = cost->divide[MODE_INDEX (mode)];
41842 return false;
41844 case PLUS:
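/* First try to recognize address-style computations of the form
   base + index * {2,4,8} + displacement, which a single lea can perform. */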
41845 if (GET_MODE_CLASS (mode) == MODE_INT
41846 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
41848 if (GET_CODE (XEXP (x, 0)) == PLUS
41849 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
41850 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
41851 && CONSTANT_P (XEXP (x, 1)))
41853 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
41854 if (val == 2 || val == 4 || val == 8)
41856 *total = cost->lea;
41857 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
41858 outer_code, opno, speed);
41859 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
41860 outer_code, opno, speed);
41861 *total += rtx_cost (XEXP (x, 1), mode,
41862 outer_code, opno, speed);
41863 return true;
41866 else if (GET_CODE (XEXP (x, 0)) == MULT
41867 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
41869 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
41870 if (val == 2 || val == 4 || val == 8)
41872 *total = cost->lea;
41873 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41874 outer_code, opno, speed);
41875 *total += rtx_cost (XEXP (x, 1), mode,
41876 outer_code, opno, speed);
41877 return true;
41880 else if (GET_CODE (XEXP (x, 0)) == PLUS)
41882 /* Add with carry, ignore the cost of adding a carry flag. */
41883 if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode))
41884 *total = cost->add;
41885 else
41887 *total = cost->lea;
41888 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41889 outer_code, opno, speed);
41892 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
41893 outer_code, opno, speed);
41894 *total += rtx_cost (XEXP (x, 1), mode,
41895 outer_code, opno, speed);
41896 return true;
41899 /* FALLTHRU */
41901 case MINUS:
41902 /* Subtract with borrow, ignore the cost of subtracting a carry flag. */
41903 if (GET_MODE_CLASS (mode) == MODE_INT
41904 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD
41905 && GET_CODE (XEXP (x, 0)) == MINUS
41906 && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode))
41908 *total = cost->add;
41909 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
41910 outer_code, opno, speed);
41911 *total += rtx_cost (XEXP (x, 1), mode,
41912 outer_code, opno, speed);
41913 return true;
41916 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41918 /* ??? SSE cost should be used here. */
41919 *total = cost->fadd;
41920 return false;
41922 else if (X87_FLOAT_MODE_P (mode))
41924 *total = cost->fadd;
41925 return false;
41927 else if (FLOAT_MODE_P (mode))
41929 /* ??? SSE vector cost should be used here. */
41930 *total = cost->fadd;
41931 return false;
41933 /* FALLTHRU */
41935 case AND:
41936 case IOR:
41937 case XOR:
41938 if (GET_MODE_CLASS (mode) == MODE_INT
41939 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41941 *total = (cost->add * 2
41942 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
41943 << (GET_MODE (XEXP (x, 0)) != DImode))
41944 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
41945 << (GET_MODE (XEXP (x, 1)) != DImode)));
41946 return true;
41948 /* FALLTHRU */
41950 case NEG:
41951 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41953 /* ??? SSE cost should be used here. */
41954 *total = cost->fchs;
41955 return false;
41957 else if (X87_FLOAT_MODE_P (mode))
41959 *total = cost->fchs;
41960 return false;
41962 else if (FLOAT_MODE_P (mode))
41964 /* ??? SSE vector cost should be used here. */
41965 *total = cost->fchs;
41966 return false;
41968 /* FALLTHRU */
41970 case NOT:
41971 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41973 /* ??? Should be SSE vector operation cost. */
41974 /* At least for published AMD latencies, this really is the same
41975 as the latency for a simple fpu operation like fabs. */
41976 *total = cost->fabs;
41978 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41979 *total = cost->add * 2;
41980 else
41981 *total = cost->add;
41982 return false;
41984 case COMPARE:
41985 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
41986 && XEXP (XEXP (x, 0), 1) == const1_rtx
41987 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
41988 && XEXP (x, 1) == const0_rtx)
41990 /* This kind of construct is implemented using test[bwl].
41991 Treat it as if we had an AND. */
41992 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
41993 *total = (cost->add
41994 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
41995 opno, speed)
41996 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
41997 return true;
42000 /* The embedded comparison operand is completely free. */
42001 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
42002 && XEXP (x, 1) == const0_rtx)
42003 *total = 0;
42005 return false;
42007 case FLOAT_EXTEND:
42008 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
42009 *total = 0;
42010 return false;
42012 case ABS:
42013 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
42014 /* ??? SSE cost should be used here. */
42015 *total = cost->fabs;
42016 else if (X87_FLOAT_MODE_P (mode))
42017 *total = cost->fabs;
42018 else if (FLOAT_MODE_P (mode))
42019 /* ??? SSE vector cost should be used here. */
42020 *total = cost->fabs;
42021 return false;
42023 case SQRT:
42024 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
42025 /* ??? SSE cost should be used here. */
42026 *total = cost->fsqrt;
42027 else if (X87_FLOAT_MODE_P (mode))
42028 *total = cost->fsqrt;
42029 else if (FLOAT_MODE_P (mode))
42030 /* ??? SSE vector cost should be used here. */
42031 *total = cost->fsqrt;
42032 return false;
42034 case UNSPEC:
42035 if (XINT (x, 1) == UNSPEC_TP)
42036 *total = 0;
42037 return false;
42039 case VEC_SELECT:
42040 case VEC_CONCAT:
42041 case VEC_DUPLICATE:
42042 /* ??? Assume all of these vector manipulation patterns are
42043 recognizable, in which case they all have pretty much the
42044 same cost. */
42045 *total = cost->fabs;
42046 return true;
42047 case VEC_MERGE:
42048 mask = XEXP (x, 2);
42049 /* This is a masked instruction; assume the same cost
42050 as the nonmasked variant. */
42051 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
42052 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
42053 else
42054 *total = cost->fabs;
42055 return true;
42057 default:
42058 return false;
42062 #if TARGET_MACHO
42064 static int current_machopic_label_num;
42066 /* Given a symbol name and its associated stub, write out the
42067 definition of the stub. */
42069 void
42070 machopic_output_stub (FILE *file, const char *symb, const char *stub)
42072 unsigned int length;
42073 char *binder_name, *symbol_name, lazy_ptr_name[32];
42074 int label = ++current_machopic_label_num;
42076 /* For 64-bit we shouldn't get here. */
42077 gcc_assert (!TARGET_64BIT);
42079 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
42080 symb = targetm.strip_name_encoding (symb);
42082 length = strlen (stub);
42083 binder_name = XALLOCAVEC (char, length + 32);
42084 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
42086 length = strlen (symb);
42087 symbol_name = XALLOCAVEC (char, length + 32);
42088 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
42090 sprintf (lazy_ptr_name, "L%d$lz", label);
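/* Pick the Mach-O stub section matching the flavour of stub being emitted. */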
42092 if (MACHOPIC_ATT_STUB)
42093 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
42094 else if (MACHOPIC_PURE)
42095 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
42096 else
42097 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
42099 fprintf (file, "%s:\n", stub);
42100 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
42102 if (MACHOPIC_ATT_STUB)
42104 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
42106 else if (MACHOPIC_PURE)
42108 /* PIC stub. */
42109 /* 25-byte PIC stub using "CALL get_pc_thunk". */
42110 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
42111 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
42112 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
42113 label, lazy_ptr_name, label);
42114 fprintf (file, "\tjmp\t*%%ecx\n");
42116 else
42117 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
42119 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
42120 it needs no stub-binding-helper. */
42121 if (MACHOPIC_ATT_STUB)
42122 return;
42124 fprintf (file, "%s:\n", binder_name);
42126 if (MACHOPIC_PURE)
42128 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
42129 fprintf (file, "\tpushl\t%%ecx\n");
42131 else
42132 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
42134 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
42136 /* N.B. Keep the correspondence of these
42137 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
42138 old-pic/new-pic/non-pic stubs; altering this will break
42139 compatibility with existing dylibs. */
42140 if (MACHOPIC_PURE)
42142 /* 25-byte PIC stub using "CALL get_pc_thunk". */
42143 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
42145 else
42146 /* 16-byte -mdynamic-no-pic stub. */
42147 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
42149 fprintf (file, "%s:\n", lazy_ptr_name);
42150 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
42151 fprintf (file, ASM_LONG "%s\n", binder_name);
42153 #endif /* TARGET_MACHO */
42155 /* Order the registers for register allocator. */
42157 void
42158 x86_order_regs_for_local_alloc (void)
42160 int pos = 0;
42161 int i;
42163 /* First allocate the local general purpose registers. */
42164 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
42165 if (GENERAL_REGNO_P (i) && call_used_regs[i])
42166 reg_alloc_order [pos++] = i;
42168 /* Global general purpose registers. */
42169 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
42170 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
42171 reg_alloc_order [pos++] = i;
42173 /* x87 registers come first in case we are doing FP math
42174 using them. */
42175 if (!TARGET_SSE_MATH)
42176 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
42177 reg_alloc_order [pos++] = i;
42179 /* SSE registers. */
42180 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
42181 reg_alloc_order [pos++] = i;
42182 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
42183 reg_alloc_order [pos++] = i;
42185 /* Extended REX SSE registers. */
42186 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
42187 reg_alloc_order [pos++] = i;
42189 /* Mask register. */
42190 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
42191 reg_alloc_order [pos++] = i;
42193 /* MPX bound registers. */
42194 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
42195 reg_alloc_order [pos++] = i;
42197 /* x87 registers. */
42198 if (TARGET_SSE_MATH)
42199 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
42200 reg_alloc_order [pos++] = i;
42202 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
42203 reg_alloc_order [pos++] = i;
42205 /* Initialize the rest of the array, as we do not allocate some registers
42206 at all. */
42207 while (pos < FIRST_PSEUDO_REGISTER)
42208 reg_alloc_order [pos++] = 0;
42211 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
42212 in struct attribute_spec handler. */
42213 static tree
42214 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
42215 tree args,
42216 int,
42217 bool *no_add_attrs)
42219 if (TREE_CODE (*node) != FUNCTION_TYPE
42220 && TREE_CODE (*node) != METHOD_TYPE
42221 && TREE_CODE (*node) != FIELD_DECL
42222 && TREE_CODE (*node) != TYPE_DECL)
42224 warning (OPT_Wattributes, "%qE attribute only applies to functions",
42225 name);
42226 *no_add_attrs = true;
42227 return NULL_TREE;
42229 if (TARGET_64BIT)
42231 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
42232 name);
42233 *no_add_attrs = true;
42234 return NULL_TREE;
42236 if (is_attribute_p ("callee_pop_aggregate_return", name))
42238 tree cst;
42240 cst = TREE_VALUE (args);
42241 if (TREE_CODE (cst) != INTEGER_CST)
42243 warning (OPT_Wattributes,
42244 "%qE attribute requires an integer constant argument",
42245 name);
42246 *no_add_attrs = true;
42248 else if (compare_tree_int (cst, 0) != 0
42249 && compare_tree_int (cst, 1) != 0)
42251 warning (OPT_Wattributes,
42252 "argument to %qE attribute is neither zero, nor one",
42253 name);
42254 *no_add_attrs = true;
42257 return NULL_TREE;
42260 return NULL_TREE;
42263 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
42264 struct attribute_spec.handler. */
42265 static tree
42266 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
42267 bool *no_add_attrs)
42269 if (TREE_CODE (*node) != FUNCTION_TYPE
42270 && TREE_CODE (*node) != METHOD_TYPE
42271 && TREE_CODE (*node) != FIELD_DECL
42272 && TREE_CODE (*node) != TYPE_DECL)
42274 warning (OPT_Wattributes, "%qE attribute only applies to functions",
42275 name);
42276 *no_add_attrs = true;
42277 return NULL_TREE;
42280 /* Can combine regparm with all attributes but fastcall. */
42281 if (is_attribute_p ("ms_abi", name))
42283 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
42285 error ("ms_abi and sysv_abi attributes are not compatible");
42288 return NULL_TREE;
42290 else if (is_attribute_p ("sysv_abi", name))
42292 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
42294 error ("ms_abi and sysv_abi attributes are not compatible");
42297 return NULL_TREE;
42300 return NULL_TREE;
42303 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
42304 struct attribute_spec.handler. */
42305 static tree
42306 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
42307 bool *no_add_attrs)
42309 tree *type = NULL;
42310 if (DECL_P (*node))
42312 if (TREE_CODE (*node) == TYPE_DECL)
42313 type = &TREE_TYPE (*node);
42315 else
42316 type = node;
42318 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
42320 warning (OPT_Wattributes, "%qE attribute ignored",
42321 name);
42322 *no_add_attrs = true;
42325 else if ((is_attribute_p ("ms_struct", name)
42326 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
42327 || ((is_attribute_p ("gcc_struct", name)
42328 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
42330 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
42331 name);
42332 *no_add_attrs = true;
42335 return NULL_TREE;
42338 static tree
42339 ix86_handle_fndecl_attribute (tree *node, tree name, tree, int,
42340 bool *no_add_attrs)
42342 if (TREE_CODE (*node) != FUNCTION_DECL)
42344 warning (OPT_Wattributes, "%qE attribute only applies to functions",
42345 name);
42346 *no_add_attrs = true;
42348 return NULL_TREE;
42351 static tree
42352 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
42353 int, bool *)
42355 return NULL_TREE;
42358 static tree
42359 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
42361 /* DECL_RESULT and DECL_ARGUMENTS do not exist there yet,
42362 but the function type contains args and return type data. */
42363 tree func_type = *node;
42364 tree return_type = TREE_TYPE (func_type);
42366 int nargs = 0;
42367 tree current_arg_type = TYPE_ARG_TYPES (func_type);
42368 while (current_arg_type
42369 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
42371 if (nargs == 0)
42373 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
42374 error ("interrupt service routine should have a pointer "
42375 "as the first argument");
42377 else if (nargs == 1)
42379 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
42380 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
42381 error ("interrupt service routine should have unsigned %s"
42382 "int as the second argument",
42383 TARGET_64BIT
42384 ? (TARGET_X32 ? "long long " : "long ")
42385 : "");
42387 nargs++;
42388 current_arg_type = TREE_CHAIN (current_arg_type);
42390 if (!nargs || nargs > 2)
42391 error ("interrupt service routine can only have a pointer argument "
42392 "and an optional integer argument");
42393 if (! VOID_TYPE_P (return_type))
42394 error ("interrupt service routine can't have non-void return value");
42396 return NULL_TREE;
42399 static bool
42400 ix86_ms_bitfield_layout_p (const_tree record_type)
42402 return ((TARGET_MS_BITFIELD_LAYOUT
42403 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
42404 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
42407 /* Returns an expression indicating where the this parameter is
42408 located on entry to the FUNCTION. */
42410 static rtx
42411 x86_this_parameter (tree function)
42413 tree type = TREE_TYPE (function);
42414 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
42415 int nregs;
42417 if (TARGET_64BIT)
42419 const int *parm_regs;
42421 if (ix86_function_type_abi (type) == MS_ABI)
42422 parm_regs = x86_64_ms_abi_int_parameter_registers;
42423 else
42424 parm_regs = x86_64_int_parameter_registers;
42425 return gen_rtx_REG (Pmode, parm_regs[aggr]);
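/* In 32-bit mode the location of the this pointer depends on the calling
   convention and on how many register parameters the function takes. */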
42428 nregs = ix86_function_regparm (type, function);
42430 if (nregs > 0 && !stdarg_p (type))
42432 int regno;
42433 unsigned int ccvt = ix86_get_callcvt (type);
42435 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
42436 regno = aggr ? DX_REG : CX_REG;
42437 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
42439 regno = CX_REG;
42440 if (aggr)
42441 return gen_rtx_MEM (SImode,
42442 plus_constant (Pmode, stack_pointer_rtx, 4));
42444 else
42446 regno = AX_REG;
42447 if (aggr)
42449 regno = DX_REG;
42450 if (nregs == 1)
42451 return gen_rtx_MEM (SImode,
42452 plus_constant (Pmode,
42453 stack_pointer_rtx, 4));
42456 return gen_rtx_REG (SImode, regno);
42459 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
42460 aggr ? 8 : 4));
42463 /* Determine whether x86_output_mi_thunk can succeed. */
42465 static bool
42466 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
42467 const_tree function)
42469 /* 64-bit can handle anything. */
42470 if (TARGET_64BIT)
42471 return true;
42473 /* For 32-bit, everything's fine if we have one free register. */
42474 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
42475 return true;
42477 /* Need a free register for vcall_offset. */
42478 if (vcall_offset)
42479 return false;
42481 /* Need a free register for GOT references. */
42482 if (flag_pic && !targetm.binds_local_p (function))
42483 return false;
42485 /* Otherwise ok. */
42486 return true;
42489 /* Output the assembler code for a thunk function. THUNK_DECL is the
42490 declaration for the thunk function itself, FUNCTION is the decl for
42491 the target function. DELTA is an immediate constant offset to be
42492 added to THIS. If VCALL_OFFSET is nonzero, the word at
42493 *(*this + vcall_offset) should be added to THIS. */
42495 static void
42496 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
42497 HOST_WIDE_INT vcall_offset, tree function)
42499 rtx this_param = x86_this_parameter (function);
42500 rtx this_reg, tmp, fnaddr;
42501 unsigned int tmp_regno;
42502 rtx_insn *insn;
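/* Pick a scratch register that the calling convention of FUNCTION does not
   already need for passing arguments (see x86_can_output_mi_thunk above). */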
42504 if (TARGET_64BIT)
42505 tmp_regno = R10_REG;
42506 else
42508 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
42509 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
42510 tmp_regno = AX_REG;
42511 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
42512 tmp_regno = DX_REG;
42513 else
42514 tmp_regno = CX_REG;
42517 emit_note (NOTE_INSN_PROLOGUE_END);
42519 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
42520 pull it in now and let DELTA benefit. */
42521 if (REG_P (this_param))
42522 this_reg = this_param;
42523 else if (vcall_offset)
42525 /* Put the this parameter into %eax. */
42526 this_reg = gen_rtx_REG (Pmode, AX_REG);
42527 emit_move_insn (this_reg, this_param);
42529 else
42530 this_reg = NULL_RTX;
42532 /* Adjust the this parameter by a fixed constant. */
42533 if (delta)
42535 rtx delta_rtx = GEN_INT (delta);
42536 rtx delta_dst = this_reg ? this_reg : this_param;
42538 if (TARGET_64BIT)
42540 if (!x86_64_general_operand (delta_rtx, Pmode))
42542 tmp = gen_rtx_REG (Pmode, tmp_regno);
42543 emit_move_insn (tmp, delta_rtx);
42544 delta_rtx = tmp;
42548 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
42551 /* Adjust the this parameter by a value stored in the vtable. */
42552 if (vcall_offset)
42554 rtx vcall_addr, vcall_mem, this_mem;
42556 tmp = gen_rtx_REG (Pmode, tmp_regno);
42558 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
42559 if (Pmode != ptr_mode)
42560 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
42561 emit_move_insn (tmp, this_mem);
42563 /* Adjust the this parameter. */
42564 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
42565 if (TARGET_64BIT
42566 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
42568 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
42569 emit_move_insn (tmp2, GEN_INT (vcall_offset));
42570 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
42573 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
42574 if (Pmode != ptr_mode)
42575 emit_insn (gen_addsi_1_zext (this_reg,
42576 gen_rtx_REG (ptr_mode,
42577 REGNO (this_reg)),
42578 vcall_mem));
42579 else
42580 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
42583 /* If necessary, drop THIS back to its stack slot. */
42584 if (this_reg && this_reg != this_param)
42585 emit_move_insn (this_param, this_reg);
42587 fnaddr = XEXP (DECL_RTL (function), 0);
42588 if (TARGET_64BIT)
42590 if (!flag_pic || targetm.binds_local_p (function)
42591 || TARGET_PECOFF)
42593 else
42595 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
42596 tmp = gen_rtx_CONST (Pmode, tmp);
42597 fnaddr = gen_const_mem (Pmode, tmp);
42600 else
42602 if (!flag_pic || targetm.binds_local_p (function))
42604 #if TARGET_MACHO
42605 else if (TARGET_MACHO)
42607 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
42608 fnaddr = XEXP (fnaddr, 0);
42610 #endif /* TARGET_MACHO */
42611 else
42613 tmp = gen_rtx_REG (Pmode, CX_REG);
42614 output_set_got (tmp, NULL_RTX);
42616 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
42617 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
42618 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
42619 fnaddr = gen_const_mem (Pmode, fnaddr);
42623 /* Our sibling call patterns do not allow memories, because we have no
42624 predicate that can distinguish between frame and non-frame memory.
42625 For our purposes here, we can get away with (ab)using a jump pattern,
42626 because we're going to do no optimization. */
42627 if (MEM_P (fnaddr))
42629 if (sibcall_insn_operand (fnaddr, word_mode))
42631 fnaddr = XEXP (DECL_RTL (function), 0);
42632 tmp = gen_rtx_MEM (QImode, fnaddr);
42633 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
42634 tmp = emit_call_insn (tmp);
42635 SIBLING_CALL_P (tmp) = 1;
42637 else
42638 emit_jump_insn (gen_indirect_jump (fnaddr));
42640 else
42642 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
42644 // CM_LARGE_PIC always uses a pseudo PIC register, which is
42645 // uninitialized. Since FUNCTION is local and calling it
42646 // doesn't go through the PLT, we use scratch register %r11 as
42647 // the PIC register and initialize it here.
42648 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
42649 ix86_init_large_pic_reg (tmp_regno);
42650 fnaddr = legitimize_pic_address (fnaddr,
42651 gen_rtx_REG (Pmode, tmp_regno));
42654 if (!sibcall_insn_operand (fnaddr, word_mode))
42656 tmp = gen_rtx_REG (word_mode, tmp_regno);
42657 if (GET_MODE (fnaddr) != word_mode)
42658 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
42659 emit_move_insn (tmp, fnaddr);
42660 fnaddr = tmp;
42663 tmp = gen_rtx_MEM (QImode, fnaddr);
42664 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
42665 tmp = emit_call_insn (tmp);
42666 SIBLING_CALL_P (tmp) = 1;
42668 emit_barrier ();
42670 /* Emit just enough of rest_of_compilation to get the insns emitted.
42671 Note that use_thunk calls assemble_start_function et al. */
42672 insn = get_insns ();
42673 shorten_branches (insn);
42674 final_start_function (insn, file, 1);
42675 final (insn, file, 1);
42676 final_end_function ();
42679 static void
42680 x86_file_start (void)
42682 default_file_start ();
42683 if (TARGET_16BIT)
42684 fputs ("\t.code16gcc\n", asm_out_file);
42685 #if TARGET_MACHO
42686 darwin_file_start ();
42687 #endif
42688 if (X86_FILE_START_VERSION_DIRECTIVE)
42689 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
42690 if (X86_FILE_START_FLTUSED)
42691 fputs ("\t.global\t__fltused\n", asm_out_file);
42692 if (ix86_asm_dialect == ASM_INTEL)
42693 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
42697 x86_field_alignment (tree type, int computed)
42699 machine_mode mode;
42701 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
42702 return computed;
42703 if (TARGET_IAMCU)
42704 return iamcu_alignment (type, computed);
42705 mode = TYPE_MODE (strip_array_types (type));
42706 if (mode == DFmode || mode == DCmode
42707 || GET_MODE_CLASS (mode) == MODE_INT
42708 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
42709 return MIN (32, computed);
42710 return computed;
42713 /* Print call to TARGET to FILE. */
42715 static void
42716 x86_print_call_or_nop (FILE *file, const char *target)
42718 if (flag_nop_mcount)
42719 fprintf (file, "1:\tnopl 0x00(%%eax,%%eax,1)\n"); /* 5 byte nop. */
42720 else
42721 fprintf (file, "1:\tcall\t%s\n", target);
42724 /* Output assembler code to FILE to increment profiler label # LABELNO
42725 for profiling a function entry. */
42726 void
42727 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
42729 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
42730 : MCOUNT_NAME);
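/* With -mfentry the profiling call is emitted before the prologue rather
   than after it, so the corresponding mcount entry point is used. */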
42731 if (TARGET_64BIT)
42733 #ifndef NO_PROFILE_COUNTERS
42734 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
42735 #endif
42737 if (!TARGET_PECOFF && flag_pic)
42738 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
42739 else
42740 x86_print_call_or_nop (file, mcount_name);
42742 else if (flag_pic)
42744 #ifndef NO_PROFILE_COUNTERS
42745 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
42746 LPREFIX, labelno);
42747 #endif
42748 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
42750 else
42752 #ifndef NO_PROFILE_COUNTERS
42753 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
42754 LPREFIX, labelno);
42755 #endif
42756 x86_print_call_or_nop (file, mcount_name);
42759 if (flag_record_mcount)
42761 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
42762 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
42763 fprintf (file, "\t.previous\n");
42767 /* We don't have exact information about the insn sizes, but we may assume
42768 quite safely that we are informed about all 1 byte insns and memory
42769 address sizes. This is enough to eliminate unnecessary padding in
42770 99% of cases. */
42772 static int
42773 min_insn_size (rtx_insn *insn)
42775 int l = 0, len;
42777 if (!INSN_P (insn) || !active_insn_p (insn))
42778 return 0;
42780 /* Discard alignments we've emitted and jump instructions. */
42781 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
42782 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
42783 return 0;
42785 /* Important case - calls are always 5 bytes.
42786 It is common to have many calls in a row. */
42787 if (CALL_P (insn)
42788 && symbolic_reference_mentioned_p (PATTERN (insn))
42789 && !SIBLING_CALL_P (insn))
42790 return 5;
42791 len = get_attr_length (insn);
42792 if (len <= 1)
42793 return 1;
42795 /* For normal instructions we rely on get_attr_length being exact,
42796 with a few exceptions. */
42797 if (!JUMP_P (insn))
42799 enum attr_type type = get_attr_type (insn);
42801 switch (type)
42803 case TYPE_MULTI:
42804 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
42805 || asm_noperands (PATTERN (insn)) >= 0)
42806 return 0;
42807 break;
42808 case TYPE_OTHER:
42809 case TYPE_FCMP:
42810 break;
42811 default:
42812 /* Otherwise trust get_attr_length. */
42813 return len;
42816 l = get_attr_length_address (insn);
42817 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
42818 l = 4;
42820 if (l)
42821 return 1+l;
42822 else
42823 return 2;
42826 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
42828 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
42829 window. */
42831 static void
42832 ix86_avoid_jump_mispredicts (void)
42834 rtx_insn *insn, *start = get_insns ();
42835 int nbytes = 0, njumps = 0;
42836 bool isjump = false;
42838 /* Look for all minimal intervals of instructions containing 4 jumps.
42839 The intervals are bounded by START and INSN. NBYTES is the total
42840 size of instructions in the interval including INSN and not including
42841 START. When NBYTES is smaller than 16 bytes, it is possible
42842 that the end of START and INSN end up in the same 16 byte page.
42844 The smallest offset in the page at which INSN can start is the case where
42845 START ends at offset 0. The offset of INSN is then NBYTES - sizeof (INSN).
42846 We add a p2align to the 16 byte window with max skip 15 - NBYTES + sizeof (INSN).
42848 Don't consider asm goto as jump, while it can contain a jump, it doesn't
42849 have to, control transfer to label(s) can be performed through other
42850 means, and also we estimate minimum length of all asm stmts as 0. */
42851 for (insn = start; insn; insn = NEXT_INSN (insn))
42853 int min_size;
42855 if (LABEL_P (insn))
42857 int align = label_to_alignment (insn);
42858 int max_skip = label_to_max_skip (insn);
42860 if (max_skip > 15)
42861 max_skip = 15;
42862 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
42863 already in the current 16 byte page, because otherwise
42864 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
42865 bytes to reach 16 byte boundary. */
42866 if (align <= 0
42867 || (align <= 3 && max_skip != (1 << align) - 1))
42868 max_skip = 0;
42869 if (dump_file)
42870 fprintf (dump_file, "Label %i with max_skip %i\n",
42871 INSN_UID (insn), max_skip);
42872 if (max_skip)
42874 while (nbytes + max_skip >= 16)
42876 start = NEXT_INSN (start);
42877 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
42878 || CALL_P (start))
42879 njumps--, isjump = true;
42880 else
42881 isjump = false;
42882 nbytes -= min_insn_size (start);
42885 continue;
42888 min_size = min_insn_size (insn);
42889 nbytes += min_size;
42890 if (dump_file)
42891 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
42892 INSN_UID (insn), min_size);
42893 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
42894 || CALL_P (insn))
42895 njumps++;
42896 else
42897 continue;
42899 while (njumps > 3)
42901 start = NEXT_INSN (start);
42902 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
42903 || CALL_P (start))
42904 njumps--, isjump = true;
42905 else
42906 isjump = false;
42907 nbytes -= min_insn_size (start);
42909 gcc_assert (njumps >= 0);
42910 if (dump_file)
42911 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
42912 INSN_UID (start), INSN_UID (insn), nbytes);
42914 if (njumps == 3 && isjump && nbytes < 16)
42916 int padsize = 15 - nbytes + min_insn_size (insn);
42918 if (dump_file)
42919 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
42920 INSN_UID (insn), padsize);
42921 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
42925 #endif
42927 /* AMD Athlon works faster
42928 when RET is not the destination of a conditional jump or directly preceded
42929 by another jump instruction. We avoid the penalty by inserting a NOP just
42930 before the RET instruction in such cases. */
42931 static void
42932 ix86_pad_returns (void)
42934 edge e;
42935 edge_iterator ei;
42937 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42939 basic_block bb = e->src;
42940 rtx_insn *ret = BB_END (bb);
42941 rtx_insn *prev;
42942 bool replace = false;
42944 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
42945 || optimize_bb_for_size_p (bb))
42946 continue;
42947 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
42948 if (active_insn_p (prev) || LABEL_P (prev))
42949 break;
42950 if (prev && LABEL_P (prev))
42952 edge e;
42953 edge_iterator ei;
42955 FOR_EACH_EDGE (e, ei, bb->preds)
42956 if (EDGE_FREQUENCY (e) && e->src->index >= 0
42957 && !(e->flags & EDGE_FALLTHRU))
42959 replace = true;
42960 break;
42963 if (!replace)
42965 prev = prev_active_insn (ret);
42966 if (prev
42967 && ((JUMP_P (prev) && any_condjump_p (prev))
42968 || CALL_P (prev)))
42969 replace = true;
42970 /* Empty functions get a branch mispredict even when
42971 the jump destination is not visible to us. */
42972 if (!prev && !optimize_function_for_size_p (cfun))
42973 replace = true;
42975 if (replace)
42977 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
42978 delete_insn (ret);
42983 /* Count the minimum number of instructions in BB. Return 4 if the
42984 number of instructions >= 4. */
42986 static int
42987 ix86_count_insn_bb (basic_block bb)
42989 rtx_insn *insn;
42990 int insn_count = 0;
42992 /* Count number of instructions in this block. Return 4 if the number
42993 of instructions >= 4. */
42994 FOR_BB_INSNS (bb, insn)
42996 /* This only happens in exit blocks. */
42997 if (JUMP_P (insn)
42998 && ANY_RETURN_P (PATTERN (insn)))
42999 break;
43001 if (NONDEBUG_INSN_P (insn)
43002 && GET_CODE (PATTERN (insn)) != USE
43003 && GET_CODE (PATTERN (insn)) != CLOBBER)
43005 insn_count++;
43006 if (insn_count >= 4)
43007 return insn_count;
43011 return insn_count;
43015 /* Count the minimum number of instructions in code path in BB.
43016 Return 4 if the number of instructions >= 4. */
43018 static int
43019 ix86_count_insn (basic_block bb)
43021 edge e;
43022 edge_iterator ei;
43023 int min_prev_count;
43025 /* Only bother counting instructions along paths with no
43026 more than 2 basic blocks between entry and exit. Given
43027 that BB has an edge to exit, determine if a predecessor
43028 of BB has an edge from entry. If so, compute the number
43029 of instructions in the predecessor block. If there
43030 happen to be multiple such blocks, compute the minimum. */
43031 min_prev_count = 4;
43032 FOR_EACH_EDGE (e, ei, bb->preds)
43034 edge prev_e;
43035 edge_iterator prev_ei;
43037 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
43039 min_prev_count = 0;
43040 break;
43042 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
43044 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
43046 int count = ix86_count_insn_bb (e->src);
43047 if (count < min_prev_count)
43048 min_prev_count = count;
43049 break;
43054 if (min_prev_count < 4)
43055 min_prev_count += ix86_count_insn_bb (bb);
43057 return min_prev_count;
43060 /* Pad short function to 4 instructions. */
43062 static void
43063 ix86_pad_short_function (void)
43065 edge e;
43066 edge_iterator ei;
43068 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
43070 rtx_insn *ret = BB_END (e->src);
43071 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
43073 int insn_count = ix86_count_insn (e->src);
43075 /* Pad short function. */
43076 if (insn_count < 4)
43078 rtx_insn *insn = ret;
43080 /* Find epilogue. */
43081 while (insn
43082 && (!NOTE_P (insn)
43083 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
43084 insn = PREV_INSN (insn);
43086 if (!insn)
43087 insn = ret;
43089 /* Two NOPs count as one instruction. */
43090 insn_count = 2 * (4 - insn_count);
43091 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
43097 /* Fix up a Windows system unwinder issue. If an EH region falls through into
43098 the epilogue, the Windows system unwinder will apply epilogue logic and
43099 produce incorrect offsets. This can be avoided by adding a nop between
43100 the last insn that can throw and the first insn of the epilogue. */
43102 static void
43103 ix86_seh_fixup_eh_fallthru (void)
43105 edge e;
43106 edge_iterator ei;
43108 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
43110 rtx_insn *insn, *next;
43112 /* Find the beginning of the epilogue. */
43113 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
43114 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
43115 break;
43116 if (insn == NULL)
43117 continue;
43119 /* We only care about preceding insns that can throw. */
43120 insn = prev_active_insn (insn);
43121 if (insn == NULL || !can_throw_internal (insn))
43122 continue;
43124 /* Do not separate calls from their debug information. */
43125 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
43126 if (NOTE_P (next)
43127 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
43128 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
43129 insn = next;
43130 else
43131 break;
43133 emit_insn_after (gen_nops (const1_rtx), insn);
43137 /* Given a register number BASE, the lowest of a group of registers, update
43138 regsets IN and OUT with the registers that should be avoided in input
43139 and output operands respectively when trying to avoid generating a modr/m
43140 byte for -fmitigate-rop. */
43142 static void
43143 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
43145 SET_HARD_REG_BIT (out, base);
43146 SET_HARD_REG_BIT (out, base + 1);
43147 SET_HARD_REG_BIT (in, base + 2);
43148 SET_HARD_REG_BIT (in, base + 3);
43151 /* Called if -fmitigate-rop is in effect.  Try to rewrite instructions so
43152 that certain encodings of modr/m bytes do not occur. */
43153 static void
43154 ix86_mitigate_rop (void)
43156 HARD_REG_SET input_risky;
43157 HARD_REG_SET output_risky;
43158 HARD_REG_SET inout_risky;
43160 CLEAR_HARD_REG_SET (output_risky);
43161 CLEAR_HARD_REG_SET (input_risky);
43162 SET_HARD_REG_BIT (output_risky, AX_REG);
43163 SET_HARD_REG_BIT (output_risky, CX_REG);
43164 SET_HARD_REG_BIT (input_risky, BX_REG);
43165 SET_HARD_REG_BIT (input_risky, DX_REG);
43166 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
43167 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
43168 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
43169 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
43170 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
43171 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
43172 COPY_HARD_REG_SET (inout_risky, input_risky);
43173 IOR_HARD_REG_SET (inout_risky, output_risky);
43175 df_note_add_problem ();
43176 /* Fix up what stack-regs did. */
43177 df_insn_rescan_all ();
43178 df_analyze ();
43180 regrename_init (true);
43181 regrename_analyze (NULL);
43183 auto_vec<du_head_p> cands;
43185 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
43187 if (!NONDEBUG_INSN_P (insn))
43188 continue;
43190 if (GET_CODE (PATTERN (insn)) == USE
43191 || GET_CODE (PATTERN (insn)) == CLOBBER)
43192 continue;
43194 extract_insn (insn);
43196 int opno0, opno1;
43197 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
43198 recog_data.n_operands, &opno0,
43199 &opno1);
43201 if (!ix86_rop_should_change_byte_p (modrm))
43202 continue;
43204 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
43206 /* This happens when regrename has to fail a block. */
43207 if (!info->op_info)
43208 continue;
43210 if (info->op_info[opno0].n_chains != 0)
43212 gcc_assert (info->op_info[opno0].n_chains == 1);
43213 du_head_p op0c;
43214 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
43215 if (op0c->target_data_1 + op0c->target_data_2 == 0
43216 && !op0c->cannot_rename)
43217 cands.safe_push (op0c);
43219 op0c->target_data_1++;
43221 if (info->op_info[opno1].n_chains != 0)
43223 gcc_assert (info->op_info[opno1].n_chains == 1);
43224 du_head_p op1c;
43225 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
43226 if (op1c->target_data_1 + op1c->target_data_2 == 0
43227 && !op1c->cannot_rename)
43228 cands.safe_push (op1c);
43230 op1c->target_data_2++;
43234 int i;
43235 du_head_p head;
43236 FOR_EACH_VEC_ELT (cands, i, head)
43238 int old_reg, best_reg;
43239 HARD_REG_SET unavailable;
43241 CLEAR_HARD_REG_SET (unavailable);
43242 if (head->target_data_1)
43243 IOR_HARD_REG_SET (unavailable, output_risky);
43244 if (head->target_data_2)
43245 IOR_HARD_REG_SET (unavailable, input_risky);
43247 int n_uses;
43248 reg_class superclass = regrename_find_superclass (head, &n_uses,
43249 &unavailable);
43250 old_reg = head->regno;
43251 best_reg = find_rename_reg (head, superclass, &unavailable,
43252 old_reg, false);
43253 bool ok = regrename_do_replace (head, best_reg);
43254 gcc_assert (ok);
43255 if (dump_file)
43256 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
43257 reg_names[best_reg], reg_class_names[superclass]);
43261 regrename_finish ();
43263 df_analyze ();
43265 basic_block bb;
43266 regset_head live;
43268 INIT_REG_SET (&live);
43270 FOR_EACH_BB_FN (bb, cfun)
43272 rtx_insn *insn;
43274 COPY_REG_SET (&live, DF_LR_OUT (bb));
43275 df_simulate_initialize_backwards (bb, &live);
43277 FOR_BB_INSNS_REVERSE (bb, insn)
43279 if (!NONDEBUG_INSN_P (insn))
43280 continue;
43282 df_simulate_one_insn_backwards (bb, insn, &live);
43284 if (GET_CODE (PATTERN (insn)) == USE
43285 || GET_CODE (PATTERN (insn)) == CLOBBER)
43286 continue;
43288 extract_insn (insn);
43289 constrain_operands_cached (insn, reload_completed);
43290 int opno0, opno1;
43291 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
43292 recog_data.n_operands, &opno0,
43293 &opno1);
43294 if (modrm < 0
43295 || !ix86_rop_should_change_byte_p (modrm)
43296 || opno0 == opno1)
43297 continue;
43299 rtx oldreg = recog_data.operand[opno1];
43300 preprocess_constraints (insn);
43301 const operand_alternative *alt = which_op_alt ();
43303 int i;
43304 for (i = 0; i < recog_data.n_operands; i++)
43305 if (i != opno1
43306 && alt[i].earlyclobber
43307 && reg_overlap_mentioned_p (recog_data.operand[i],
43308 oldreg))
43309 break;
43311 if (i < recog_data.n_operands)
43312 continue;
43314 if (dump_file)
43315 fprintf (dump_file,
43316 "attempting to fix modrm byte in insn %d:"
43317 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
43318 reg_class_names[alt[opno1].cl]);
43320 HARD_REG_SET unavailable;
43321 REG_SET_TO_HARD_REG_SET (unavailable, &live);
43322 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
43323 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
43324 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
43325 IOR_HARD_REG_SET (unavailable, output_risky);
43326 IOR_COMPL_HARD_REG_SET (unavailable,
43327 reg_class_contents[alt[opno1].cl]);
43329 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
43330 if (!TEST_HARD_REG_BIT (unavailable, i))
43331 break;
43332 if (i == FIRST_PSEUDO_REGISTER)
43334 if (dump_file)
43335 fprintf (dump_file, ", none available\n");
43336 continue;
43338 if (dump_file)
43339 fprintf (dump_file, " -> %d\n", i);
43340 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
43341 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
43342 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
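
/* A standalone illustrative sketch of why the register sets above are
   treated as risky: with mod == 3 (register-direct addressing) the
   modr/m byte is mod:2 | reg:3 | rm:3, and it equals one of the return
   opcodes 0xC2/0xC3/0xCA/0xCB exactly when the reg field is 0 or 1
   (eax/ecx, the output_risky set) and the rm field is 2 or 3 (edx/ebx,
   the input_risky set).  */
#include <stdio.h>

static unsigned char modrm_byte (unsigned mod, unsigned reg, unsigned rm)
{
  return (unsigned char) ((mod << 6) | (reg << 3) | rm);
}

int main (void)
{
  for (unsigned reg = 0; reg < 8; reg++)
    for (unsigned rm = 0; rm < 8; rm++)
      {
        unsigned char b = modrm_byte (3, reg, rm);
        if (b == 0xC2 || b == 0xC3 || b == 0xCA || b == 0xCB)
          printf ("reg=%u rm=%u -> modr/m 0x%02X (ret opcode)\n", reg, rm, b);
      }
  return 0;
}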
43347 /* Implement machine specific optimizations. We implement padding of returns
43348 for K8 CPUs and a pass to avoid placing 4 jumps in a single 16 byte window.  */
43349 static void
43350 ix86_reorg (void)
43352 /* We are freeing block_for_insn in the toplev to keep compatibility
43353 with old MDEP_REORGS that are not CFG based. Recompute it now. */
43354 compute_bb_for_insn ();
43356 if (flag_mitigate_rop)
43357 ix86_mitigate_rop ();
43359 if (TARGET_SEH && current_function_has_exception_handlers ())
43360 ix86_seh_fixup_eh_fallthru ();
43362 if (optimize && optimize_function_for_speed_p (cfun))
43364 if (TARGET_PAD_SHORT_FUNCTION)
43365 ix86_pad_short_function ();
43366 else if (TARGET_PAD_RETURNS)
43367 ix86_pad_returns ();
43368 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
43369 if (TARGET_FOUR_JUMP_LIMIT)
43370 ix86_avoid_jump_mispredicts ();
43371 #endif
43375 /* Return nonzero when a QImode register that must be represented via a REX
43376 prefix is used.  */
43377 bool
43378 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
43380 int i;
43381 extract_insn_cached (insn);
43382 for (i = 0; i < recog_data.n_operands; i++)
43383 if (GENERAL_REG_P (recog_data.operand[i])
43384 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
43385 return true;
43386 return false;
43389 /* Return true when INSN mentions a register that must be encoded using a REX
43390 prefix.  */
43391 bool
43392 x86_extended_reg_mentioned_p (rtx insn)
43394 subrtx_iterator::array_type array;
43395 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
43397 const_rtx x = *iter;
43398 if (REG_P (x)
43399 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
43400 return true;
43402 return false;
43405 /* If profitable, negate (without causing overflow) integer constant
43406 of mode MODE at location LOC. Return true in this case. */
43407 bool
43408 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
43410 HOST_WIDE_INT val;
43412 if (!CONST_INT_P (*loc))
43413 return false;
43415 switch (mode)
43417 case E_DImode:
43418 /* DImode x86_64 constants must fit in 32 bits. */
43419 gcc_assert (x86_64_immediate_operand (*loc, mode));
43421 mode = SImode;
43422 break;
43424 case E_SImode:
43425 case E_HImode:
43426 case E_QImode:
43427 break;
43429 default:
43430 gcc_unreachable ();
43433 /* Avoid overflows. */
43434 if (mode_signbit_p (mode, *loc))
43435 return false;
43437 val = INTVAL (*loc);
43439 /* Make things pretty by preferring `subl $4,%eax' over `addl $-4,%eax'.
43440 Exceptions: -128 encodes smaller than 128, so swap the sign and the operation.  */
43441 if ((val < 0 && val != -128)
43442 || val == 128)
43444 *loc = GEN_INT (-val);
43445 return true;
43448 return false;
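
/* A standalone illustrative sketch of the profitability rule above
   (the overflow check on the mode's sign bit is omitted): negative
   immediates are negated so the caller can flip add to sub, except
   that -128 is kept and 128 becomes -128, since -128 fits the
   sign-extended 8-bit immediate encoding while 128 does not.  */
#include <stdbool.h>
#include <stdio.h>

static bool maybe_negate (long *val)
{
  if ((*val < 0 && *val != -128) || *val == 128)
    {
      *val = -*val;
      return true;
    }
  return false;
}

int main (void)
{
  long tests[] = { -4, -128, 4, 128 };
  for (unsigned i = 0; i < sizeof tests / sizeof tests[0]; i++)
    {
      long v = tests[i];
      printf ("%4ld -> %s (%ld)\n", tests[i],
              maybe_negate (&v) ? "negated" : "kept", v);
    }
  return 0;
}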
43451 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
43452 optabs would emit if we didn't have TFmode patterns. */
43454 void
43455 x86_emit_floatuns (rtx operands[2])
43457 rtx_code_label *neglab, *donelab;
43458 rtx i0, i1, f0, in, out;
43459 machine_mode mode, inmode;
43461 inmode = GET_MODE (operands[1]);
43462 gcc_assert (inmode == SImode || inmode == DImode);
43464 out = operands[0];
43465 in = force_reg (inmode, operands[1]);
43466 mode = GET_MODE (out);
43467 neglab = gen_label_rtx ();
43468 donelab = gen_label_rtx ();
43469 f0 = gen_reg_rtx (mode);
43471 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
43473 expand_float (out, in, 0);
43475 emit_jump_insn (gen_jump (donelab));
43476 emit_barrier ();
43478 emit_label (neglab);
43480 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
43481 1, OPTAB_DIRECT);
43482 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
43483 1, OPTAB_DIRECT);
43484 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
43486 expand_float (f0, i0, 0);
43488 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
43490 emit_label (donelab);
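
/* A standalone illustrative sketch of the same conversion strategy in
   plain C, for uint64_t -> double: when the value is too large for a
   signed conversion, it is halved with the low bit folded back in (so
   rounding still sees it), converted, and then doubled.  */
#include <stdint.h>
#include <stdio.h>

static double floatuns (uint64_t in)
{
  if ((int64_t) in >= 0)
    return (double) (int64_t) in;            /* Plain signed conversion.  */
  uint64_t half = (in >> 1) | (in & 1);      /* Halve, keep sticky low bit.  */
  double f = (double) (int64_t) half;
  return f + f;                              /* Double the result.  */
}

int main (void)
{
  uint64_t v = 0xFFFFFFFFFFFFFFFFull;
  printf ("%.1f\n", floatuns (v));           /* 18446744073709551616.0 */
  return 0;
}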
43493 static bool canonicalize_perm (struct expand_vec_perm_d *d);
43494 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
43495 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
43496 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
43498 /* Get a vector mode of the same size as the original but with elements
43499 twice as wide. This is only guaranteed to apply to integral vectors. */
43501 static inline machine_mode
43502 get_mode_wider_vector (machine_mode o)
43504 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
43505 machine_mode n = GET_MODE_WIDER_MODE (o);
43506 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
43507 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
43508 return n;
43511 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
43512 fill target with val via vec_duplicate. */
43514 static bool
43515 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
43517 bool ok;
43518 rtx_insn *insn;
43519 rtx dup;
43521 /* First attempt to recognize VAL as-is. */
43522 dup = gen_rtx_VEC_DUPLICATE (mode, val);
43523 insn = emit_insn (gen_rtx_SET (target, dup));
43524 if (recog_memoized (insn) < 0)
43526 rtx_insn *seq;
43527 machine_mode innermode = GET_MODE_INNER (mode);
43528 rtx reg;
43530 /* If that fails, force VAL into a register. */
43532 start_sequence ();
43533 reg = force_reg (innermode, val);
43534 if (GET_MODE (reg) != innermode)
43535 reg = gen_lowpart (innermode, reg);
43536 XEXP (dup, 0) = reg;
43537 seq = get_insns ();
43538 end_sequence ();
43539 if (seq)
43540 emit_insn_before (seq, insn);
43542 ok = recog_memoized (insn) >= 0;
43543 gcc_assert (ok);
43545 return true;
43548 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
43549 with all elements equal to VAR. Return true if successful. */
43551 static bool
43552 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
43553 rtx target, rtx val)
43555 bool ok;
43557 switch (mode)
43559 case E_V2SImode:
43560 case E_V2SFmode:
43561 if (!mmx_ok)
43562 return false;
43563 /* FALLTHRU */
43565 case E_V4DFmode:
43566 case E_V4DImode:
43567 case E_V8SFmode:
43568 case E_V8SImode:
43569 case E_V2DFmode:
43570 case E_V2DImode:
43571 case E_V4SFmode:
43572 case E_V4SImode:
43573 case E_V16SImode:
43574 case E_V8DImode:
43575 case E_V16SFmode:
43576 case E_V8DFmode:
43577 return ix86_vector_duplicate_value (mode, target, val);
43579 case E_V4HImode:
43580 if (!mmx_ok)
43581 return false;
43582 if (TARGET_SSE || TARGET_3DNOW_A)
43584 rtx x;
43586 val = gen_lowpart (SImode, val);
43587 x = gen_rtx_TRUNCATE (HImode, val);
43588 x = gen_rtx_VEC_DUPLICATE (mode, x);
43589 emit_insn (gen_rtx_SET (target, x));
43590 return true;
43592 goto widen;
43594 case E_V8QImode:
43595 if (!mmx_ok)
43596 return false;
43597 goto widen;
43599 case E_V8HImode:
43600 if (TARGET_AVX2)
43601 return ix86_vector_duplicate_value (mode, target, val);
43603 if (TARGET_SSE2)
43605 struct expand_vec_perm_d dperm;
43606 rtx tmp1, tmp2;
43608 permute:
43609 memset (&dperm, 0, sizeof (dperm));
43610 dperm.target = target;
43611 dperm.vmode = mode;
43612 dperm.nelt = GET_MODE_NUNITS (mode);
43613 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
43614 dperm.one_operand_p = true;
43616 /* Extend to SImode using a paradoxical SUBREG. */
43617 tmp1 = gen_reg_rtx (SImode);
43618 emit_move_insn (tmp1, gen_lowpart (SImode, val));
43620 /* Insert the SImode value as low element of a V4SImode vector. */
43621 tmp2 = gen_reg_rtx (V4SImode);
43622 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
43623 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
43625 ok = (expand_vec_perm_1 (&dperm)
43626 || expand_vec_perm_broadcast_1 (&dperm));
43627 gcc_assert (ok);
43628 return ok;
43630 goto widen;
43632 case E_V16QImode:
43633 if (TARGET_AVX2)
43634 return ix86_vector_duplicate_value (mode, target, val);
43636 if (TARGET_SSE2)
43637 goto permute;
43638 goto widen;
43640 widen:
43641 /* Replicate the value once into the next wider mode and recurse. */
43643 machine_mode smode, wsmode, wvmode;
43644 rtx x;
43646 smode = GET_MODE_INNER (mode);
43647 wvmode = get_mode_wider_vector (mode);
43648 wsmode = GET_MODE_INNER (wvmode);
43650 val = convert_modes (wsmode, smode, val, true);
43651 x = expand_simple_binop (wsmode, ASHIFT, val,
43652 GEN_INT (GET_MODE_BITSIZE (smode)),
43653 NULL_RTX, 1, OPTAB_LIB_WIDEN);
43654 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
43656 x = gen_reg_rtx (wvmode);
43657 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
43658 gcc_assert (ok);
43659 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
43660 return ok;
43663 case E_V16HImode:
43664 case E_V32QImode:
43665 if (TARGET_AVX2)
43666 return ix86_vector_duplicate_value (mode, target, val);
43667 else
43669 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
43670 rtx x = gen_reg_rtx (hvmode);
43672 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
43673 gcc_assert (ok);
43675 x = gen_rtx_VEC_CONCAT (mode, x, x);
43676 emit_insn (gen_rtx_SET (target, x));
43678 return true;
43680 case E_V64QImode:
43681 case E_V32HImode:
43682 if (TARGET_AVX512BW)
43683 return ix86_vector_duplicate_value (mode, target, val);
43684 else
43686 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
43687 rtx x = gen_reg_rtx (hvmode);
43689 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
43690 gcc_assert (ok);
43692 x = gen_rtx_VEC_CONCAT (mode, x, x);
43693 emit_insn (gen_rtx_SET (target, x));
43695 return true;
43697 default:
43698 return false;
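
/* A standalone illustrative sketch of the `widen' step above: a narrow
   scalar is replicated into the next wider scalar with a shift and an
   IOR, so the broadcast can then be done in the wider vector mode.
   Shown here for QImode -> HImode.  */
#include <stdint.h>
#include <stdio.h>

static uint16_t widen_byte (uint8_t v)
{
  uint16_t w = v;                     /* Zero-extend to the wider scalar.  */
  return (uint16_t) ((w << 8) | w);   /* Shift left and IOR the copy in.  */
}

int main (void)
{
  printf ("0x%02X -> 0x%04X\n", 0xAB, widen_byte (0xAB));  /* 0xAB -> 0xABAB */
  return 0;
}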
43702 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
43703 whose ONE_VAR element is VAR, and other elements are zero. Return true
43704 if successful. */
43706 static bool
43707 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
43708 rtx target, rtx var, int one_var)
43710 machine_mode vsimode;
43711 rtx new_target;
43712 rtx x, tmp;
43713 bool use_vector_set = false;
43715 switch (mode)
43717 case E_V2DImode:
43718 /* For SSE4.1, we normally use vector set. But if the second
43719 element is zero and inter-unit moves are OK, we use movq
43720 instead. */
43721 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
43722 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
43723 && one_var == 0));
43724 break;
43725 case E_V16QImode:
43726 case E_V4SImode:
43727 case E_V4SFmode:
43728 use_vector_set = TARGET_SSE4_1;
43729 break;
43730 case E_V8HImode:
43731 use_vector_set = TARGET_SSE2;
43732 break;
43733 case E_V4HImode:
43734 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
43735 break;
43736 case E_V32QImode:
43737 case E_V16HImode:
43738 case E_V8SImode:
43739 case E_V8SFmode:
43740 case E_V4DFmode:
43741 use_vector_set = TARGET_AVX;
43742 break;
43743 case E_V4DImode:
43744 /* Use ix86_expand_vector_set in 64bit mode only. */
43745 use_vector_set = TARGET_AVX && TARGET_64BIT;
43746 break;
43747 default:
43748 break;
43751 if (use_vector_set)
43753 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
43754 var = force_reg (GET_MODE_INNER (mode), var);
43755 ix86_expand_vector_set (mmx_ok, target, var, one_var);
43756 return true;
43759 switch (mode)
43761 case E_V2SFmode:
43762 case E_V2SImode:
43763 if (!mmx_ok)
43764 return false;
43765 /* FALLTHRU */
43767 case E_V2DFmode:
43768 case E_V2DImode:
43769 if (one_var != 0)
43770 return false;
43771 var = force_reg (GET_MODE_INNER (mode), var);
43772 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
43773 emit_insn (gen_rtx_SET (target, x));
43774 return true;
43776 case E_V4SFmode:
43777 case E_V4SImode:
43778 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
43779 new_target = gen_reg_rtx (mode);
43780 else
43781 new_target = target;
43782 var = force_reg (GET_MODE_INNER (mode), var);
43783 x = gen_rtx_VEC_DUPLICATE (mode, var);
43784 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
43785 emit_insn (gen_rtx_SET (new_target, x));
43786 if (one_var != 0)
43788 /* We need to shuffle the value to the correct position, so
43789 create a new pseudo to store the intermediate result. */
43791 /* With SSE2, we can use the integer shuffle insns. */
43792 if (mode != V4SFmode && TARGET_SSE2)
43794 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
43795 const1_rtx,
43796 GEN_INT (one_var == 1 ? 0 : 1),
43797 GEN_INT (one_var == 2 ? 0 : 1),
43798 GEN_INT (one_var == 3 ? 0 : 1)));
43799 if (target != new_target)
43800 emit_move_insn (target, new_target);
43801 return true;
43804 /* Otherwise convert the intermediate result to V4SFmode and
43805 use the SSE1 shuffle instructions. */
43806 if (mode != V4SFmode)
43808 tmp = gen_reg_rtx (V4SFmode);
43809 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
43811 else
43812 tmp = new_target;
43814 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
43815 const1_rtx,
43816 GEN_INT (one_var == 1 ? 0 : 1),
43817 GEN_INT (one_var == 2 ? 0+4 : 1+4),
43818 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
43820 if (mode != V4SFmode)
43821 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
43822 else if (tmp != target)
43823 emit_move_insn (target, tmp);
43825 else if (target != new_target)
43826 emit_move_insn (target, new_target);
43827 return true;
43829 case E_V8HImode:
43830 case E_V16QImode:
43831 vsimode = V4SImode;
43832 goto widen;
43833 case E_V4HImode:
43834 case E_V8QImode:
43835 if (!mmx_ok)
43836 return false;
43837 vsimode = V2SImode;
43838 goto widen;
43839 widen:
43840 if (one_var != 0)
43841 return false;
43843 /* Zero extend the variable element to SImode and recurse. */
43844 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
43846 x = gen_reg_rtx (vsimode);
43847 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
43848 var, one_var))
43849 gcc_unreachable ();
43851 emit_move_insn (target, gen_lowpart (mode, x));
43852 return true;
43854 default:
43855 return false;
43859 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
43860 consisting of the values in VALS. It is known that all elements
43861 except ONE_VAR are constants. Return true if successful. */
43863 static bool
43864 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
43865 rtx target, rtx vals, int one_var)
43867 rtx var = XVECEXP (vals, 0, one_var);
43868 machine_mode wmode;
43869 rtx const_vec, x;
43871 const_vec = copy_rtx (vals);
43872 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
43873 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
43875 switch (mode)
43877 case E_V2DFmode:
43878 case E_V2DImode:
43879 case E_V2SFmode:
43880 case E_V2SImode:
43881 /* For the two-element vectors, it's just as easy to use
43882 the general case.  */
43883 return false;
43885 case E_V4DImode:
43886 /* Use ix86_expand_vector_set in 64bit mode only. */
43887 if (!TARGET_64BIT)
43888 return false;
43889 /* FALLTHRU */
43890 case E_V4DFmode:
43891 case E_V8SFmode:
43892 case E_V8SImode:
43893 case E_V16HImode:
43894 case E_V32QImode:
43895 case E_V4SFmode:
43896 case E_V4SImode:
43897 case E_V8HImode:
43898 case E_V4HImode:
43899 break;
43901 case E_V16QImode:
43902 if (TARGET_SSE4_1)
43903 break;
43904 wmode = V8HImode;
43905 goto widen;
43906 case E_V8QImode:
43907 wmode = V4HImode;
43908 goto widen;
43909 widen:
43910 /* There's no way to set one QImode entry easily. Combine
43911 the variable value with its adjacent constant value, and
43912 promote to an HImode set. */
43913 x = XVECEXP (vals, 0, one_var ^ 1);
43914 if (one_var & 1)
43916 var = convert_modes (HImode, QImode, var, true);
43917 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
43918 NULL_RTX, 1, OPTAB_LIB_WIDEN);
43919 x = GEN_INT (INTVAL (x) & 0xff);
43921 else
43923 var = convert_modes (HImode, QImode, var, true);
43924 x = gen_int_mode (INTVAL (x) << 8, HImode);
43926 if (x != const0_rtx)
43927 var = expand_simple_binop (HImode, IOR, var, x, var,
43928 1, OPTAB_LIB_WIDEN);
43930 x = gen_reg_rtx (wmode);
43931 emit_move_insn (x, gen_lowpart (wmode, const_vec));
43932 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
43934 emit_move_insn (target, gen_lowpart (mode, x));
43935 return true;
43937 default:
43938 return false;
43941 emit_move_insn (target, const_vec);
43942 ix86_expand_vector_set (mmx_ok, target, var, one_var);
43943 return true;
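
/* A standalone illustrative sketch of the QImode widening above: the
   variable byte and its adjacent constant byte are packed into one
   16-bit element (little-endian element layout assumed, as on x86), so
   a single HImode vector-set suffices.  */
#include <stdint.h>
#include <stdio.h>

static uint16_t pack_pair (uint8_t var, uint8_t adjacent, int one_var)
{
  if (one_var & 1)        /* Variable element has the odd index: high byte.  */
    return (uint16_t) ((var << 8) | adjacent);
  else                    /* Even index: low byte.  */
    return (uint16_t) ((adjacent << 8) | var);
}

int main (void)
{
  printf ("0x%04X\n", pack_pair (0xAA, 0x11, 3));  /* 0xAA11 */
  printf ("0x%04X\n", pack_pair (0xAA, 0x11, 2));  /* 0x11AA */
  return 0;
}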
43946 /* A subroutine of ix86_expand_vector_init_general. Use vector
43947 concatenate to handle the most general case: all values variable,
43948 and none identical. */
43950 static void
43951 ix86_expand_vector_init_concat (machine_mode mode,
43952 rtx target, rtx *ops, int n)
43954 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
43955 rtx first[16], second[8], third[4];
43956 rtvec v;
43957 int i, j;
43959 switch (n)
43961 case 2:
43962 switch (mode)
43964 case E_V16SImode:
43965 cmode = V8SImode;
43966 break;
43967 case E_V16SFmode:
43968 cmode = V8SFmode;
43969 break;
43970 case E_V8DImode:
43971 cmode = V4DImode;
43972 break;
43973 case E_V8DFmode:
43974 cmode = V4DFmode;
43975 break;
43976 case E_V8SImode:
43977 cmode = V4SImode;
43978 break;
43979 case E_V8SFmode:
43980 cmode = V4SFmode;
43981 break;
43982 case E_V4DImode:
43983 cmode = V2DImode;
43984 break;
43985 case E_V4DFmode:
43986 cmode = V2DFmode;
43987 break;
43988 case E_V4SImode:
43989 cmode = V2SImode;
43990 break;
43991 case E_V4SFmode:
43992 cmode = V2SFmode;
43993 break;
43994 case E_V2DImode:
43995 cmode = DImode;
43996 break;
43997 case E_V2SImode:
43998 cmode = SImode;
43999 break;
44000 case E_V2DFmode:
44001 cmode = DFmode;
44002 break;
44003 case E_V2SFmode:
44004 cmode = SFmode;
44005 break;
44006 default:
44007 gcc_unreachable ();
44010 if (!register_operand (ops[1], cmode))
44011 ops[1] = force_reg (cmode, ops[1]);
44012 if (!register_operand (ops[0], cmode))
44013 ops[0] = force_reg (cmode, ops[0]);
44014 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
44015 ops[1])));
44016 break;
44018 case 4:
44019 switch (mode)
44021 case E_V4DImode:
44022 cmode = V2DImode;
44023 break;
44024 case E_V4DFmode:
44025 cmode = V2DFmode;
44026 break;
44027 case E_V4SImode:
44028 cmode = V2SImode;
44029 break;
44030 case E_V4SFmode:
44031 cmode = V2SFmode;
44032 break;
44033 default:
44034 gcc_unreachable ();
44036 goto half;
44038 case 8:
44039 switch (mode)
44041 case E_V8DImode:
44042 cmode = V2DImode;
44043 hmode = V4DImode;
44044 break;
44045 case E_V8DFmode:
44046 cmode = V2DFmode;
44047 hmode = V4DFmode;
44048 break;
44049 case E_V8SImode:
44050 cmode = V2SImode;
44051 hmode = V4SImode;
44052 break;
44053 case E_V8SFmode:
44054 cmode = V2SFmode;
44055 hmode = V4SFmode;
44056 break;
44057 default:
44058 gcc_unreachable ();
44060 goto half;
44062 case 16:
44063 switch (mode)
44065 case E_V16SImode:
44066 cmode = V2SImode;
44067 hmode = V4SImode;
44068 gmode = V8SImode;
44069 break;
44070 case E_V16SFmode:
44071 cmode = V2SFmode;
44072 hmode = V4SFmode;
44073 gmode = V8SFmode;
44074 break;
44075 default:
44076 gcc_unreachable ();
44078 goto half;
44080 half:
44081 /* FIXME: We process inputs backward to help RA. PR 36222. */
44082 i = n - 1;
44083 j = (n >> 1) - 1;
44084 for (; i > 0; i -= 2, j--)
44086 first[j] = gen_reg_rtx (cmode);
44087 v = gen_rtvec (2, ops[i - 1], ops[i]);
44088 ix86_expand_vector_init (false, first[j],
44089 gen_rtx_PARALLEL (cmode, v));
44092 n >>= 1;
44093 if (n > 4)
44095 gcc_assert (hmode != VOIDmode);
44096 gcc_assert (gmode != VOIDmode);
44097 for (i = j = 0; i < n; i += 2, j++)
44099 second[j] = gen_reg_rtx (hmode);
44100 ix86_expand_vector_init_concat (hmode, second [j],
44101 &first [i], 2);
44103 n >>= 1;
44104 for (i = j = 0; i < n; i += 2, j++)
44106 third[j] = gen_reg_rtx (gmode);
44107 ix86_expand_vector_init_concat (gmode, third[j],
44108 &second[i], 2);
44110 n >>= 1;
44111 ix86_expand_vector_init_concat (mode, target, third, n);
44113 else if (n > 2)
44115 gcc_assert (hmode != VOIDmode);
44116 for (i = j = 0; i < n; i += 2, j++)
44118 second[j] = gen_reg_rtx (hmode);
44119 ix86_expand_vector_init_concat (hmode, second [j],
44120 &first [i], 2);
44122 n >>= 1;
44123 ix86_expand_vector_init_concat (mode, target, second, n);
44125 else
44126 ix86_expand_vector_init_concat (mode, target, first, n);
44127 break;
44129 default:
44130 gcc_unreachable ();
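
/* A standalone illustrative sketch of the recursive concatenation
   scheme above: N scalars are paired into N/2 two-element vectors,
   which are then concatenated pairwise again until one full-width
   vector remains.  */
#include <stdio.h>

int main (void)
{
  int n = 16, width = 1;                 /* e.g. a 16-element vector.  */
  while (n > 1)
    {
      printf ("%2d pieces of %2d element(s) -> %2d of %2d\n",
              n, width, n / 2, width * 2);
      n /= 2;
      width *= 2;
    }
  return 0;
}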
44134 /* A subroutine of ix86_expand_vector_init_general. Use vector
44135 interleave to handle the most general case: all values variable,
44136 and none identical. */
44138 static void
44139 ix86_expand_vector_init_interleave (machine_mode mode,
44140 rtx target, rtx *ops, int n)
44142 machine_mode first_imode, second_imode, third_imode, inner_mode;
44143 int i, j;
44144 rtx op0, op1;
44145 rtx (*gen_load_even) (rtx, rtx, rtx);
44146 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
44147 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
44149 switch (mode)
44151 case E_V8HImode:
44152 gen_load_even = gen_vec_setv8hi;
44153 gen_interleave_first_low = gen_vec_interleave_lowv4si;
44154 gen_interleave_second_low = gen_vec_interleave_lowv2di;
44155 inner_mode = HImode;
44156 first_imode = V4SImode;
44157 second_imode = V2DImode;
44158 third_imode = VOIDmode;
44159 break;
44160 case E_V16QImode:
44161 gen_load_even = gen_vec_setv16qi;
44162 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
44163 gen_interleave_second_low = gen_vec_interleave_lowv4si;
44164 inner_mode = QImode;
44165 first_imode = V8HImode;
44166 second_imode = V4SImode;
44167 third_imode = V2DImode;
44168 break;
44169 default:
44170 gcc_unreachable ();
44173 for (i = 0; i < n; i++)
44175 /* Extend the odd element to SImode using a paradoxical SUBREG.  */
44176 op0 = gen_reg_rtx (SImode);
44177 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
44179 /* Insert the SImode value as low element of V4SImode vector. */
44180 op1 = gen_reg_rtx (V4SImode);
44181 op0 = gen_rtx_VEC_MERGE (V4SImode,
44182 gen_rtx_VEC_DUPLICATE (V4SImode,
44183 op0),
44184 CONST0_RTX (V4SImode),
44185 const1_rtx);
44186 emit_insn (gen_rtx_SET (op1, op0));
44188 /* Cast the V4SImode vector back to a vector in the original mode.  */
44189 op0 = gen_reg_rtx (mode);
44190 emit_move_insn (op0, gen_lowpart (mode, op1));
44192 /* Load even elements into the second position. */
44193 emit_insn (gen_load_even (op0,
44194 force_reg (inner_mode,
44195 ops [i + i + 1]),
44196 const1_rtx));
44198 /* Cast vector to FIRST_IMODE vector. */
44199 ops[i] = gen_reg_rtx (first_imode);
44200 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
44203 /* Interleave low FIRST_IMODE vectors. */
44204 for (i = j = 0; i < n; i += 2, j++)
44206 op0 = gen_reg_rtx (first_imode);
44207 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
44209 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
44210 ops[j] = gen_reg_rtx (second_imode);
44211 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
44214 /* Interleave low SECOND_IMODE vectors. */
44215 switch (second_imode)
44217 case E_V4SImode:
44218 for (i = j = 0; i < n / 2; i += 2, j++)
44220 op0 = gen_reg_rtx (second_imode);
44221 emit_insn (gen_interleave_second_low (op0, ops[i],
44222 ops[i + 1]));
44224 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
44225 vector. */
44226 ops[j] = gen_reg_rtx (third_imode);
44227 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
44229 second_imode = V2DImode;
44230 gen_interleave_second_low = gen_vec_interleave_lowv2di;
44231 /* FALLTHRU */
44233 case E_V2DImode:
44234 op0 = gen_reg_rtx (second_imode);
44235 emit_insn (gen_interleave_second_low (op0, ops[0],
44236 ops[1]));
44238 /* Cast the SECOND_IMODE vector back to a vector in the original
44239 mode.  */
44240 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
44241 break;
44243 default:
44244 gcc_unreachable ();
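
/* A standalone illustrative sketch of one `interleave low' step used
   above (in the spirit of punpckl*): the low halves of two N-element
   vectors are interleaved into a single N-element result.  */
#include <stdio.h>

static void interleave_low (const int *a, const int *b, int *out, int n)
{
  for (int i = 0; i < n / 2; i++)
    {
      out[2 * i]     = a[i];
      out[2 * i + 1] = b[i];
    }
}

int main (void)
{
  int a[4] = { 0, 1, 2, 3 }, b[4] = { 4, 5, 6, 7 }, out[4];
  interleave_low (a, b, out, 4);
  for (int i = 0; i < 4; i++)
    printf ("%d ", out[i]);                /* 0 4 1 5 */
  printf ("\n");
  return 0;
}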
44248 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
44249 all values variable, and none identical. */
44251 static void
44252 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
44253 rtx target, rtx vals)
44255 rtx ops[64], op0, op1, op2, op3, op4, op5;
44256 machine_mode half_mode = VOIDmode;
44257 machine_mode quarter_mode = VOIDmode;
44258 int n, i;
44260 switch (mode)
44262 case E_V2SFmode:
44263 case E_V2SImode:
44264 if (!mmx_ok && !TARGET_SSE)
44265 break;
44266 /* FALLTHRU */
44268 case E_V16SImode:
44269 case E_V16SFmode:
44270 case E_V8DFmode:
44271 case E_V8DImode:
44272 case E_V8SFmode:
44273 case E_V8SImode:
44274 case E_V4DFmode:
44275 case E_V4DImode:
44276 case E_V4SFmode:
44277 case E_V4SImode:
44278 case E_V2DFmode:
44279 case E_V2DImode:
44280 n = GET_MODE_NUNITS (mode);
44281 for (i = 0; i < n; i++)
44282 ops[i] = XVECEXP (vals, 0, i);
44283 ix86_expand_vector_init_concat (mode, target, ops, n);
44284 return;
44286 case E_V2TImode:
44287 for (i = 0; i < 2; i++)
44288 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
44289 op0 = gen_reg_rtx (V4DImode);
44290 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
44291 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
44292 return;
44294 case E_V4TImode:
44295 for (i = 0; i < 4; i++)
44296 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
44297 ops[4] = gen_reg_rtx (V4DImode);
44298 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
44299 ops[5] = gen_reg_rtx (V4DImode);
44300 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
44301 op0 = gen_reg_rtx (V8DImode);
44302 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
44303 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
44304 return;
44306 case E_V32QImode:
44307 half_mode = V16QImode;
44308 goto half;
44310 case E_V16HImode:
44311 half_mode = V8HImode;
44312 goto half;
44314 half:
44315 n = GET_MODE_NUNITS (mode);
44316 for (i = 0; i < n; i++)
44317 ops[i] = XVECEXP (vals, 0, i);
44318 op0 = gen_reg_rtx (half_mode);
44319 op1 = gen_reg_rtx (half_mode);
44320 ix86_expand_vector_init_interleave (half_mode, op0, ops,
44321 n >> 2);
44322 ix86_expand_vector_init_interleave (half_mode, op1,
44323 &ops [n >> 1], n >> 2);
44324 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
44325 return;
44327 case E_V64QImode:
44328 quarter_mode = V16QImode;
44329 half_mode = V32QImode;
44330 goto quarter;
44332 case E_V32HImode:
44333 quarter_mode = V8HImode;
44334 half_mode = V16HImode;
44335 goto quarter;
44337 quarter:
44338 n = GET_MODE_NUNITS (mode);
44339 for (i = 0; i < n; i++)
44340 ops[i] = XVECEXP (vals, 0, i);
44341 op0 = gen_reg_rtx (quarter_mode);
44342 op1 = gen_reg_rtx (quarter_mode);
44343 op2 = gen_reg_rtx (quarter_mode);
44344 op3 = gen_reg_rtx (quarter_mode);
44345 op4 = gen_reg_rtx (half_mode);
44346 op5 = gen_reg_rtx (half_mode);
44347 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
44348 n >> 3);
44349 ix86_expand_vector_init_interleave (quarter_mode, op1,
44350 &ops [n >> 2], n >> 3);
44351 ix86_expand_vector_init_interleave (quarter_mode, op2,
44352 &ops [n >> 1], n >> 3);
44353 ix86_expand_vector_init_interleave (quarter_mode, op3,
44354 &ops [(n >> 1) | (n >> 2)], n >> 3);
44355 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
44356 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
44357 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
44358 return;
44360 case E_V16QImode:
44361 if (!TARGET_SSE4_1)
44362 break;
44363 /* FALLTHRU */
44365 case E_V8HImode:
44366 if (!TARGET_SSE2)
44367 break;
44369 /* Don't use ix86_expand_vector_init_interleave if we can't
44370 move from GPR to SSE register directly. */
44371 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
44372 break;
44374 n = GET_MODE_NUNITS (mode);
44375 for (i = 0; i < n; i++)
44376 ops[i] = XVECEXP (vals, 0, i);
44377 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
44378 return;
44380 case E_V4HImode:
44381 case E_V8QImode:
44382 break;
44384 default:
44385 gcc_unreachable ();
44389 int i, j, n_elts, n_words, n_elt_per_word;
44390 machine_mode inner_mode;
44391 rtx words[4], shift;
44393 inner_mode = GET_MODE_INNER (mode);
44394 n_elts = GET_MODE_NUNITS (mode);
44395 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
44396 n_elt_per_word = n_elts / n_words;
44397 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
44399 for (i = 0; i < n_words; ++i)
44401 rtx word = NULL_RTX;
44403 for (j = 0; j < n_elt_per_word; ++j)
44405 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
44406 elt = convert_modes (word_mode, inner_mode, elt, true);
44408 if (j == 0)
44409 word = elt;
44410 else
44412 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
44413 word, 1, OPTAB_LIB_WIDEN);
44414 word = expand_simple_binop (word_mode, IOR, word, elt,
44415 word, 1, OPTAB_LIB_WIDEN);
44419 words[i] = word;
44422 if (n_words == 1)
44423 emit_move_insn (target, gen_lowpart (mode, words[0]));
44424 else if (n_words == 2)
44426 rtx tmp = gen_reg_rtx (mode);
44427 emit_clobber (tmp);
44428 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
44429 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
44430 emit_move_insn (target, tmp);
44432 else if (n_words == 4)
44434 rtx tmp = gen_reg_rtx (V4SImode);
44435 gcc_assert (word_mode == SImode);
44436 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
44437 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
44438 emit_move_insn (target, gen_lowpart (mode, tmp));
44440 else
44441 gcc_unreachable ();
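
/* A standalone illustrative sketch of the word-packing fallback above:
   elements are folded into a word_mode integer with shift/IOR, highest
   index first, so the element with index 0 ends up in the low bits
   (little-endian layout, as on x86).  Shown for four QImode elements
   packed into SImode.  */
#include <stdint.h>
#include <stdio.h>

static uint32_t pack_word (const uint8_t *elts, int n_elt_per_word)
{
  uint32_t word = 0;
  for (int j = 0; j < n_elt_per_word; j++)
    {
      uint8_t elt = elts[n_elt_per_word - j - 1];
      word = j == 0 ? elt : (uint32_t) ((word << 8) | elt);
    }
  return word;
}

int main (void)
{
  uint8_t elts[4] = { 0x11, 0x22, 0x33, 0x44 };
  printf ("0x%08X\n", pack_word (elts, 4));   /* 0x44332211 */
  return 0;
}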
44445 /* Initialize vector TARGET via VALS. Suppress the use of MMX
44446 instructions unless MMX_OK is true. */
44448 void
44449 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
44451 machine_mode mode = GET_MODE (target);
44452 machine_mode inner_mode = GET_MODE_INNER (mode);
44453 int n_elts = GET_MODE_NUNITS (mode);
44454 int n_var = 0, one_var = -1;
44455 bool all_same = true, all_const_zero = true;
44456 int i;
44457 rtx x;
44459 /* First, handle initialization from vector elts.  */
44460 if (n_elts != XVECLEN (vals, 0))
44462 rtx subtarget = target;
44463 x = XVECEXP (vals, 0, 0);
44464 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
44465 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
44467 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
44468 if (inner_mode == QImode || inner_mode == HImode)
44470 mode = mode_for_vector (SImode,
44471 n_elts * GET_MODE_SIZE (inner_mode) / 4);
44472 inner_mode
44473 = mode_for_vector (SImode,
44474 n_elts * GET_MODE_SIZE (inner_mode) / 8);
44475 ops[0] = gen_lowpart (inner_mode, ops[0]);
44476 ops[1] = gen_lowpart (inner_mode, ops[1]);
44477 subtarget = gen_reg_rtx (mode);
44479 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
44480 if (subtarget != target)
44481 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
44482 return;
44484 gcc_unreachable ();
44487 for (i = 0; i < n_elts; ++i)
44489 x = XVECEXP (vals, 0, i);
44490 if (!(CONST_SCALAR_INT_P (x)
44491 || CONST_DOUBLE_P (x)
44492 || CONST_FIXED_P (x)))
44493 n_var++, one_var = i;
44494 else if (x != CONST0_RTX (inner_mode))
44495 all_const_zero = false;
44496 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
44497 all_same = false;
44500 /* Constants are best loaded from the constant pool. */
44501 if (n_var == 0)
44503 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
44504 return;
44507 /* If all values are identical, broadcast the value. */
44508 if (all_same
44509 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
44510 XVECEXP (vals, 0, 0)))
44511 return;
44513 /* Values where only one field is non-constant are best loaded from
44514 the pool and overwritten via move later. */
44515 if (n_var == 1)
44517 if (all_const_zero
44518 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
44519 XVECEXP (vals, 0, one_var),
44520 one_var))
44521 return;
44523 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
44524 return;
44527 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
44530 void
44531 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
44533 machine_mode mode = GET_MODE (target);
44534 machine_mode inner_mode = GET_MODE_INNER (mode);
44535 machine_mode half_mode;
44536 bool use_vec_merge = false;
44537 rtx tmp;
44538 static rtx (*gen_extract[6][2]) (rtx, rtx)
44540 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
44541 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
44542 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
44543 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
44544 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
44545 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
44547 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
44549 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
44550 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
44551 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
44552 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
44553 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
44554 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
44556 int i, j, n;
44557 machine_mode mmode = VOIDmode;
44558 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
44560 switch (mode)
44562 case E_V2SFmode:
44563 case E_V2SImode:
44564 if (mmx_ok)
44566 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
44567 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
44568 if (elt == 0)
44569 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
44570 else
44571 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
44572 emit_insn (gen_rtx_SET (target, tmp));
44573 return;
44575 break;
44577 case E_V2DImode:
44578 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
44579 if (use_vec_merge)
44580 break;
44582 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
44583 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
44584 if (elt == 0)
44585 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
44586 else
44587 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
44588 emit_insn (gen_rtx_SET (target, tmp));
44589 return;
44591 case E_V2DFmode:
44593 rtx op0, op1;
44595 /* For the two element vectors, we implement a VEC_CONCAT with
44596 the extraction of the other element. */
44598 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
44599 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
44601 if (elt == 0)
44602 op0 = val, op1 = tmp;
44603 else
44604 op0 = tmp, op1 = val;
44606 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
44607 emit_insn (gen_rtx_SET (target, tmp));
44609 return;
44611 case E_V4SFmode:
44612 use_vec_merge = TARGET_SSE4_1;
44613 if (use_vec_merge)
44614 break;
44616 switch (elt)
44618 case 0:
44619 use_vec_merge = true;
44620 break;
44622 case 1:
44623 /* tmp = target = A B C D */
44624 tmp = copy_to_reg (target);
44625 /* target = A A B B */
44626 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
44627 /* target = X A B B */
44628 ix86_expand_vector_set (false, target, val, 0);
44629 /* target = A X C D */
44630 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
44631 const1_rtx, const0_rtx,
44632 GEN_INT (2+4), GEN_INT (3+4)));
44633 return;
44635 case 2:
44636 /* tmp = target = A B C D */
44637 tmp = copy_to_reg (target);
44638 /* tmp = X B C D */
44639 ix86_expand_vector_set (false, tmp, val, 0);
44640 /* target = A B X D */
44641 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
44642 const0_rtx, const1_rtx,
44643 GEN_INT (0+4), GEN_INT (3+4)));
44644 return;
44646 case 3:
44647 /* tmp = target = A B C D */
44648 tmp = copy_to_reg (target);
44649 /* tmp = X B C D */
44650 ix86_expand_vector_set (false, tmp, val, 0);
44651 /* target = A B C X */
44652 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
44653 const0_rtx, const1_rtx,
44654 GEN_INT (2+4), GEN_INT (0+4)));
44655 return;
44657 default:
44658 gcc_unreachable ();
44660 break;
44662 case E_V4SImode:
44663 use_vec_merge = TARGET_SSE4_1;
44664 if (use_vec_merge)
44665 break;
44667 /* Element 0 handled by vec_merge below. */
44668 if (elt == 0)
44670 use_vec_merge = true;
44671 break;
44674 if (TARGET_SSE2)
44676 /* With SSE2, use integer shuffles to swap element 0 and ELT,
44677 store into element 0, then shuffle them back. */
44679 rtx order[4];
44681 order[0] = GEN_INT (elt);
44682 order[1] = const1_rtx;
44683 order[2] = const2_rtx;
44684 order[3] = GEN_INT (3);
44685 order[elt] = const0_rtx;
44687 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
44688 order[1], order[2], order[3]));
44690 ix86_expand_vector_set (false, target, val, 0);
44692 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
44693 order[1], order[2], order[3]));
44695 else
44697 /* For SSE1, we have to reuse the V4SF code. */
44698 rtx t = gen_reg_rtx (V4SFmode);
44699 emit_move_insn (t, gen_lowpart (V4SFmode, target));
44700 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
44701 emit_move_insn (target, gen_lowpart (mode, t));
44703 return;
44705 case E_V8HImode:
44706 use_vec_merge = TARGET_SSE2;
44707 break;
44708 case E_V4HImode:
44709 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
44710 break;
44712 case E_V16QImode:
44713 use_vec_merge = TARGET_SSE4_1;
44714 break;
44716 case E_V8QImode:
44717 break;
44719 case E_V32QImode:
44720 half_mode = V16QImode;
44721 j = 0;
44722 n = 16;
44723 goto half;
44725 case E_V16HImode:
44726 half_mode = V8HImode;
44727 j = 1;
44728 n = 8;
44729 goto half;
44731 case E_V8SImode:
44732 half_mode = V4SImode;
44733 j = 2;
44734 n = 4;
44735 goto half;
44737 case E_V4DImode:
44738 half_mode = V2DImode;
44739 j = 3;
44740 n = 2;
44741 goto half;
44743 case E_V8SFmode:
44744 half_mode = V4SFmode;
44745 j = 4;
44746 n = 4;
44747 goto half;
44749 case E_V4DFmode:
44750 half_mode = V2DFmode;
44751 j = 5;
44752 n = 2;
44753 goto half;
44755 half:
44756 /* Compute offset. */
44757 i = elt / n;
44758 elt %= n;
44760 gcc_assert (i <= 1);
44762 /* Extract the half. */
44763 tmp = gen_reg_rtx (half_mode);
44764 emit_insn (gen_extract[j][i] (tmp, target));
44766 /* Put val in tmp at elt. */
44767 ix86_expand_vector_set (false, tmp, val, elt);
44769 /* Put it back. */
44770 emit_insn (gen_insert[j][i] (target, target, tmp));
44771 return;
44773 case E_V8DFmode:
44774 if (TARGET_AVX512F)
44776 mmode = QImode;
44777 gen_blendm = gen_avx512f_blendmv8df;
44779 break;
44781 case E_V8DImode:
44782 if (TARGET_AVX512F)
44784 mmode = QImode;
44785 gen_blendm = gen_avx512f_blendmv8di;
44787 break;
44789 case E_V16SFmode:
44790 if (TARGET_AVX512F)
44792 mmode = HImode;
44793 gen_blendm = gen_avx512f_blendmv16sf;
44795 break;
44797 case E_V16SImode:
44798 if (TARGET_AVX512F)
44800 mmode = HImode;
44801 gen_blendm = gen_avx512f_blendmv16si;
44803 break;
44805 case E_V32HImode:
44806 if (TARGET_AVX512F && TARGET_AVX512BW)
44808 mmode = SImode;
44809 gen_blendm = gen_avx512bw_blendmv32hi;
44811 break;
44813 case E_V64QImode:
44814 if (TARGET_AVX512F && TARGET_AVX512BW)
44816 mmode = DImode;
44817 gen_blendm = gen_avx512bw_blendmv64qi;
44819 break;
44821 default:
44822 break;
44825 if (mmode != VOIDmode)
44827 tmp = gen_reg_rtx (mode);
44828 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
44829 /* The avx512*_blendm<mode> expanders have a different operand order
44830 from VEC_MERGE.  In VEC_MERGE, the first input operand is used for
44831 elements where the mask is set and the second input operand otherwise;
44832 in {sse,avx}*_*blend* the first input operand is used for elements
44833 where the mask is clear and the second input operand otherwise.  */
44834 emit_insn (gen_blendm (target, target, tmp,
44835 force_reg (mmode,
44836 gen_int_mode (1 << elt, mmode))));
44838 else if (use_vec_merge)
44840 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
44841 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
44842 emit_insn (gen_rtx_SET (target, tmp));
44844 else
44846 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
44848 emit_move_insn (mem, target);
44850 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
44851 emit_move_insn (tmp, val);
44853 emit_move_insn (target, mem);
44857 void
44858 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
44860 machine_mode mode = GET_MODE (vec);
44861 machine_mode inner_mode = GET_MODE_INNER (mode);
44862 bool use_vec_extr = false;
44863 rtx tmp;
44865 switch (mode)
44867 case E_V2SImode:
44868 case E_V2SFmode:
44869 if (!mmx_ok)
44870 break;
44871 /* FALLTHRU */
44873 case E_V2DFmode:
44874 case E_V2DImode:
44875 case E_V2TImode:
44876 case E_V4TImode:
44877 use_vec_extr = true;
44878 break;
44880 case E_V4SFmode:
44881 use_vec_extr = TARGET_SSE4_1;
44882 if (use_vec_extr)
44883 break;
44885 switch (elt)
44887 case 0:
44888 tmp = vec;
44889 break;
44891 case 1:
44892 case 3:
44893 tmp = gen_reg_rtx (mode);
44894 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
44895 GEN_INT (elt), GEN_INT (elt),
44896 GEN_INT (elt+4), GEN_INT (elt+4)));
44897 break;
44899 case 2:
44900 tmp = gen_reg_rtx (mode);
44901 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
44902 break;
44904 default:
44905 gcc_unreachable ();
44907 vec = tmp;
44908 use_vec_extr = true;
44909 elt = 0;
44910 break;
44912 case E_V4SImode:
44913 use_vec_extr = TARGET_SSE4_1;
44914 if (use_vec_extr)
44915 break;
44917 if (TARGET_SSE2)
44919 switch (elt)
44921 case 0:
44922 tmp = vec;
44923 break;
44925 case 1:
44926 case 3:
44927 tmp = gen_reg_rtx (mode);
44928 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
44929 GEN_INT (elt), GEN_INT (elt),
44930 GEN_INT (elt), GEN_INT (elt)));
44931 break;
44933 case 2:
44934 tmp = gen_reg_rtx (mode);
44935 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
44936 break;
44938 default:
44939 gcc_unreachable ();
44941 vec = tmp;
44942 use_vec_extr = true;
44943 elt = 0;
44945 else
44947 /* For SSE1, we have to reuse the V4SF code. */
44948 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
44949 gen_lowpart (V4SFmode, vec), elt);
44950 return;
44952 break;
44954 case E_V8HImode:
44955 use_vec_extr = TARGET_SSE2;
44956 break;
44957 case E_V4HImode:
44958 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
44959 break;
44961 case E_V16QImode:
44962 use_vec_extr = TARGET_SSE4_1;
44963 break;
44965 case E_V8SFmode:
44966 if (TARGET_AVX)
44968 tmp = gen_reg_rtx (V4SFmode);
44969 if (elt < 4)
44970 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
44971 else
44972 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
44973 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44974 return;
44976 break;
44978 case E_V4DFmode:
44979 if (TARGET_AVX)
44981 tmp = gen_reg_rtx (V2DFmode);
44982 if (elt < 2)
44983 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
44984 else
44985 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
44986 ix86_expand_vector_extract (false, target, tmp, elt & 1);
44987 return;
44989 break;
44991 case E_V32QImode:
44992 if (TARGET_AVX)
44994 tmp = gen_reg_rtx (V16QImode);
44995 if (elt < 16)
44996 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
44997 else
44998 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
44999 ix86_expand_vector_extract (false, target, tmp, elt & 15);
45000 return;
45002 break;
45004 case E_V16HImode:
45005 if (TARGET_AVX)
45007 tmp = gen_reg_rtx (V8HImode);
45008 if (elt < 8)
45009 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
45010 else
45011 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
45012 ix86_expand_vector_extract (false, target, tmp, elt & 7);
45013 return;
45015 break;
45017 case E_V8SImode:
45018 if (TARGET_AVX)
45020 tmp = gen_reg_rtx (V4SImode);
45021 if (elt < 4)
45022 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
45023 else
45024 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
45025 ix86_expand_vector_extract (false, target, tmp, elt & 3);
45026 return;
45028 break;
45030 case E_V4DImode:
45031 if (TARGET_AVX)
45033 tmp = gen_reg_rtx (V2DImode);
45034 if (elt < 2)
45035 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
45036 else
45037 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
45038 ix86_expand_vector_extract (false, target, tmp, elt & 1);
45039 return;
45041 break;
45043 case E_V32HImode:
45044 if (TARGET_AVX512BW)
45046 tmp = gen_reg_rtx (V16HImode);
45047 if (elt < 16)
45048 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
45049 else
45050 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
45051 ix86_expand_vector_extract (false, target, tmp, elt & 15);
45052 return;
45054 break;
45056 case E_V64QImode:
45057 if (TARGET_AVX512BW)
45059 tmp = gen_reg_rtx (V32QImode);
45060 if (elt < 32)
45061 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
45062 else
45063 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
45064 ix86_expand_vector_extract (false, target, tmp, elt & 31);
45065 return;
45067 break;
45069 case E_V16SFmode:
45070 tmp = gen_reg_rtx (V8SFmode);
45071 if (elt < 8)
45072 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
45073 else
45074 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
45075 ix86_expand_vector_extract (false, target, tmp, elt & 7);
45076 return;
45078 case E_V8DFmode:
45079 tmp = gen_reg_rtx (V4DFmode);
45080 if (elt < 4)
45081 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
45082 else
45083 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
45084 ix86_expand_vector_extract (false, target, tmp, elt & 3);
45085 return;
45087 case E_V16SImode:
45088 tmp = gen_reg_rtx (V8SImode);
45089 if (elt < 8)
45090 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
45091 else
45092 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
45093 ix86_expand_vector_extract (false, target, tmp, elt & 7);
45094 return;
45096 case E_V8DImode:
45097 tmp = gen_reg_rtx (V4DImode);
45098 if (elt < 4)
45099 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
45100 else
45101 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
45102 ix86_expand_vector_extract (false, target, tmp, elt & 3);
45103 return;
45105 case E_V8QImode:
45106 /* ??? Could extract the appropriate HImode element and shift. */
45107 default:
45108 break;
45111 if (use_vec_extr)
45113 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
45114 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
45116 /* Let the rtl optimizers know about the zero extension performed. */
45117 if (inner_mode == QImode || inner_mode == HImode)
45119 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
45120 target = gen_lowpart (SImode, target);
45123 emit_insn (gen_rtx_SET (target, tmp));
45125 else
45127 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
45129 emit_move_insn (mem, vec);
45131 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
45132 emit_move_insn (target, tmp);
45136 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
45137 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
45138 The upper bits of DEST are undefined, though they shouldn't cause
45139 exceptions (some bits from src or all zeros are ok). */
45141 static void
45142 emit_reduc_half (rtx dest, rtx src, int i)
45144 rtx tem, d = dest;
45145 switch (GET_MODE (src))
45147 case E_V4SFmode:
45148 if (i == 128)
45149 tem = gen_sse_movhlps (dest, src, src);
45150 else
45151 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
45152 GEN_INT (1 + 4), GEN_INT (1 + 4));
45153 break;
45154 case E_V2DFmode:
45155 tem = gen_vec_interleave_highv2df (dest, src, src);
45156 break;
45157 case E_V16QImode:
45158 case E_V8HImode:
45159 case E_V4SImode:
45160 case E_V2DImode:
45161 d = gen_reg_rtx (V1TImode);
45162 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
45163 GEN_INT (i / 2));
45164 break;
45165 case E_V8SFmode:
45166 if (i == 256)
45167 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
45168 else
45169 tem = gen_avx_shufps256 (dest, src, src,
45170 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
45171 break;
45172 case E_V4DFmode:
45173 if (i == 256)
45174 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
45175 else
45176 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
45177 break;
45178 case E_V32QImode:
45179 case E_V16HImode:
45180 case E_V8SImode:
45181 case E_V4DImode:
45182 if (i == 256)
45184 if (GET_MODE (dest) != V4DImode)
45185 d = gen_reg_rtx (V4DImode);
45186 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
45187 gen_lowpart (V4DImode, src),
45188 const1_rtx);
45190 else
45192 d = gen_reg_rtx (V2TImode);
45193 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
45194 GEN_INT (i / 2));
45196 break;
45197 case E_V64QImode:
45198 case E_V32HImode:
45199 case E_V16SImode:
45200 case E_V16SFmode:
45201 case E_V8DImode:
45202 case E_V8DFmode:
45203 if (i > 128)
45204 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
45205 gen_lowpart (V16SImode, src),
45206 gen_lowpart (V16SImode, src),
45207 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
45208 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
45209 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
45210 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
45211 GEN_INT (0xC), GEN_INT (0xD),
45212 GEN_INT (0xE), GEN_INT (0xF),
45213 GEN_INT (0x10), GEN_INT (0x11),
45214 GEN_INT (0x12), GEN_INT (0x13),
45215 GEN_INT (0x14), GEN_INT (0x15),
45216 GEN_INT (0x16), GEN_INT (0x17));
45217 else
45218 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
45219 gen_lowpart (V16SImode, src),
45220 GEN_INT (i == 128 ? 0x2 : 0x1),
45221 GEN_INT (0x3),
45222 GEN_INT (0x3),
45223 GEN_INT (0x3),
45224 GEN_INT (i == 128 ? 0x6 : 0x5),
45225 GEN_INT (0x7),
45226 GEN_INT (0x7),
45227 GEN_INT (0x7),
45228 GEN_INT (i == 128 ? 0xA : 0x9),
45229 GEN_INT (0xB),
45230 GEN_INT (0xB),
45231 GEN_INT (0xB),
45232 GEN_INT (i == 128 ? 0xE : 0xD),
45233 GEN_INT (0xF),
45234 GEN_INT (0xF),
45235 GEN_INT (0xF));
45236 break;
45237 default:
45238 gcc_unreachable ();
45240 emit_insn (tem);
45241 if (d != dest)
45242 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
45245 /* Expand a vector reduction. FN is the binary pattern to reduce;
45246 DEST is the destination; IN is the input vector. */
45248 void
45249 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
45251 rtx half, dst, vec = in;
45252 machine_mode mode = GET_MODE (in);
45253 int i;
45255 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
45256 if (TARGET_SSE4_1
45257 && mode == V8HImode
45258 && fn == gen_uminv8hi3)
45260 emit_insn (gen_sse4_1_phminposuw (dest, in));
45261 return;
45264 for (i = GET_MODE_BITSIZE (mode);
45265 i > GET_MODE_UNIT_BITSIZE (mode);
45266 i >>= 1)
45268 half = gen_reg_rtx (mode);
45269 emit_reduc_half (half, vec, i);
45270 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
45271 dst = dest;
45272 else
45273 dst = gen_reg_rtx (mode);
45274 emit_insn (fn (dst, half, vec));
45275 vec = dst;
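
/* A standalone illustrative sketch of the reduction loop above,
   written for a plain array: the upper half is combined element-wise
   with the lower half, halving the active width each step until a
   single element remains (here with max as the reduction).  */
#include <stdio.h>

static int reduce_max (int *v, int n)
{
  for (int half = n / 2; half >= 1; half /= 2)
    for (int i = 0; i < half; i++)
      v[i] = v[i] > v[i + half] ? v[i] : v[i + half];
  return v[0];
}

int main (void)
{
  int v[8] = { 3, 7, 1, 9, 4, 2, 8, 5 };
  printf ("%d\n", reduce_max (v, 8));   /* 9 */
  return 0;
}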
45279 /* Target hook for scalar_mode_supported_p. */
45280 static bool
45281 ix86_scalar_mode_supported_p (machine_mode mode)
45283 if (DECIMAL_FLOAT_MODE_P (mode))
45284 return default_decimal_float_supported_p ();
45285 else if (mode == TFmode)
45286 return true;
45287 else
45288 return default_scalar_mode_supported_p (mode);
45291 /* Implements target hook vector_mode_supported_p. */
45292 static bool
45293 ix86_vector_mode_supported_p (machine_mode mode)
45295 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
45296 return true;
45297 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
45298 return true;
45299 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
45300 return true;
45301 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
45302 return true;
45303 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
45304 return true;
45305 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
45306 return true;
45307 return false;
45310 /* Target hook for c_mode_for_suffix. */
45311 static machine_mode
45312 ix86_c_mode_for_suffix (char suffix)
45314 if (suffix == 'q')
45315 return TFmode;
45316 if (suffix == 'w')
45317 return XFmode;
45319 return VOIDmode;
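/* For illustration (added note): these are the floating constant suffixes
   GCC accepts on x86, so with the usual spellings

     __float128 q = 1.0q;	-> TFmode
     __float80  w = 1.0w;	-> XFmode

   a 'q' literal gets TFmode and a 'w' literal gets XFmode.  */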
45322 /* Worker function for TARGET_MD_ASM_ADJUST.
45324 We implement asm flag outputs, and maintain source compatibility
45325 with the old cc0-based compiler. */
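/* A usage sketch (added, not from the original sources): flag outputs are
   spelled "=@cc<cond>" in user asm, for example

     int x = ..., y = ..., zero;
     asm ("subl %2, %0" : "+r" (x), "=@ccz" (zero) : "ir" (y));

   For each such output the code below substitutes a read of the flags
   register with the matching comparison code, so no extra setcc is needed
   inside the asm template.  */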
45327 static rtx_insn *
45328 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
45329 vec<const char *> &constraints,
45330 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
45332 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
45333 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
45335 bool saw_asm_flag = false;
45337 start_sequence ();
45338 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
45340 const char *con = constraints[i];
45341 if (strncmp (con, "=@cc", 4) != 0)
45342 continue;
45343 con += 4;
45344 if (strchr (con, ',') != NULL)
45346 error ("alternatives not allowed in asm flag output");
45347 continue;
45350 bool invert = false;
45351 if (con[0] == 'n')
45352 invert = true, con++;
45354 machine_mode mode = CCmode;
45355 rtx_code code = UNKNOWN;
45357 switch (con[0])
45359 case 'a':
45360 if (con[1] == 0)
45361 mode = CCAmode, code = EQ;
45362 else if (con[1] == 'e' && con[2] == 0)
45363 mode = CCCmode, code = NE;
45364 break;
45365 case 'b':
45366 if (con[1] == 0)
45367 mode = CCCmode, code = EQ;
45368 else if (con[1] == 'e' && con[2] == 0)
45369 mode = CCAmode, code = NE;
45370 break;
45371 case 'c':
45372 if (con[1] == 0)
45373 mode = CCCmode, code = EQ;
45374 break;
45375 case 'e':
45376 if (con[1] == 0)
45377 mode = CCZmode, code = EQ;
45378 break;
45379 case 'g':
45380 if (con[1] == 0)
45381 mode = CCGCmode, code = GT;
45382 else if (con[1] == 'e' && con[2] == 0)
45383 mode = CCGCmode, code = GE;
45384 break;
45385 case 'l':
45386 if (con[1] == 0)
45387 mode = CCGCmode, code = LT;
45388 else if (con[1] == 'e' && con[2] == 0)
45389 mode = CCGCmode, code = LE;
45390 break;
45391 case 'o':
45392 if (con[1] == 0)
45393 mode = CCOmode, code = EQ;
45394 break;
45395 case 'p':
45396 if (con[1] == 0)
45397 mode = CCPmode, code = EQ;
45398 break;
45399 case 's':
45400 if (con[1] == 0)
45401 mode = CCSmode, code = EQ;
45402 break;
45403 case 'z':
45404 if (con[1] == 0)
45405 mode = CCZmode, code = EQ;
45406 break;
45408 if (code == UNKNOWN)
45410 error ("unknown asm flag output %qs", constraints[i]);
45411 continue;
45413 if (invert)
45414 code = reverse_condition (code);
45416 rtx dest = outputs[i];
45417 if (!saw_asm_flag)
45419 /* This is the first asm flag output. Here we put the flags
45420 register in as the real output and adjust the condition to
45421 allow it. */
45422 constraints[i] = "=Bf";
45423 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
45424 saw_asm_flag = true;
45426 else
45428 /* We don't need the flags register as output twice. */
45429 constraints[i] = "=X";
45430 outputs[i] = gen_rtx_SCRATCH (SImode);
45433 rtx x = gen_rtx_REG (mode, FLAGS_REG);
45434 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
45436 machine_mode dest_mode = GET_MODE (dest);
45437 if (!SCALAR_INT_MODE_P (dest_mode))
45439 error ("invalid type for asm flag output");
45440 continue;
45443 if (dest_mode == DImode && !TARGET_64BIT)
45444 dest_mode = SImode;
45446 if (dest_mode != QImode)
45448 rtx destqi = gen_reg_rtx (QImode);
45449 emit_insn (gen_rtx_SET (destqi, x));
45451 if (TARGET_ZERO_EXTEND_WITH_AND
45452 && optimize_function_for_speed_p (cfun))
45454 x = force_reg (dest_mode, const0_rtx);
45456 emit_insn (gen_movstrictqi
45457 (gen_lowpart (QImode, x), destqi));
45459 else
45460 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
45463 if (dest_mode != GET_MODE (dest))
45465 rtx tmp = gen_reg_rtx (SImode);
45467 emit_insn (gen_rtx_SET (tmp, x));
45468 emit_insn (gen_zero_extendsidi2 (dest, tmp));
45470 else
45471 emit_insn (gen_rtx_SET (dest, x));
45473 rtx_insn *seq = get_insns ();
45474 end_sequence ();
45476 if (saw_asm_flag)
45477 return seq;
45478 else
45480 /* If we had no asm flag outputs, clobber the flags. */
45481 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
45482 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
45483 return NULL;
45487 /* Implement the targetm.encode_section_info target hook. */
45489 static void ATTRIBUTE_UNUSED
45490 ix86_encode_section_info (tree decl, rtx rtl, int first)
45492 default_encode_section_info (decl, rtl, first);
45494 if (ix86_in_large_data_p (decl))
45495 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
45498 /* Worker function for REVERSE_CONDITION. */
45500 enum rtx_code
45501 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
45503 return (mode != CCFPmode && mode != CCFPUmode
45504 ? reverse_condition (code)
45505 : reverse_condition_maybe_unordered (code));
45508 /* Output code to perform an x87 FP register move, from OPERANDS[1]
45509 to OPERANDS[0]. */
45511 const char *
45512 output_387_reg_move (rtx_insn *insn, rtx *operands)
45514 if (REG_P (operands[0]))
45516 if (REG_P (operands[1])
45517 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
45519 if (REGNO (operands[0]) == FIRST_STACK_REG)
45520 return output_387_ffreep (operands, 0);
45521 return "fstp\t%y0";
45523 if (STACK_TOP_P (operands[0]))
45524 return "fld%Z1\t%y1";
45525 return "fst\t%y0";
45527 else if (MEM_P (operands[0]))
45529 gcc_assert (REG_P (operands[1]));
45530 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
45531 return "fstp%Z0\t%y0";
45532 else
45534 /* There is no non-popping store to memory for XFmode.
45535 So if we need one, follow the store with a load. */
45536 if (GET_MODE (operands[0]) == XFmode)
45537 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
45538 else
45539 return "fst%Z0\t%y0";
45542 else
45543 gcc_unreachable();
45546 /* Output code to perform a conditional jump to LABEL, if C2 flag in
45547 FP status register is set. */
45549 void
45550 ix86_emit_fp_unordered_jump (rtx label)
45552 rtx reg = gen_reg_rtx (HImode);
45553 rtx temp;
45555 emit_insn (gen_x86_fnstsw_1 (reg));
45557 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
45559 emit_insn (gen_x86_sahf_1 (reg));
45561 temp = gen_rtx_REG (CCmode, FLAGS_REG);
45562 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
45564 else
45566 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
45568 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
45569 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
45572 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
45573 gen_rtx_LABEL_REF (VOIDmode, label),
45574 pc_rtx);
45575 temp = gen_rtx_SET (pc_rtx, temp);
45577 emit_jump_insn (temp);
45578 predict_jump (REG_BR_PROB_BASE * 10 / 100);
45581 /* Output code to perform a log1p XFmode calculation. */
45583 void ix86_emit_i387_log1p (rtx op0, rtx op1)
45585 rtx_code_label *label1 = gen_label_rtx ();
45586 rtx_code_label *label2 = gen_label_rtx ();
45588 rtx tmp = gen_reg_rtx (XFmode);
45589 rtx tmp2 = gen_reg_rtx (XFmode);
45590 rtx test;
45592 emit_insn (gen_absxf2 (tmp, op1));
45593 test = gen_rtx_GE (VOIDmode, tmp,
45594 const_double_from_real_value (
45595 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
45596 XFmode));
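/* Added note: the constant above is 1 - sqrt(2)/2, the documented domain
   limit of the x87 fyl2xp1 instruction.  For |op1| below that limit we can
   use fyl2xp1 directly; otherwise we branch to label1 and compute
   log2 (1 + op1) with fyl2x instead.  */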
45597 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
45599 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
45600 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
45601 emit_jump (label2);
45603 emit_label (label1);
45604 emit_move_insn (tmp, CONST1_RTX (XFmode));
45605 emit_insn (gen_addxf3 (tmp, op1, tmp));
45606 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
45607 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
45609 emit_label (label2);
45612 /* Emit code for round calculation. */
45613 void ix86_emit_i387_round (rtx op0, rtx op1)
45615 machine_mode inmode = GET_MODE (op1);
45616 machine_mode outmode = GET_MODE (op0);
45617 rtx e1, e2, res, tmp, tmp1, half;
45618 rtx scratch = gen_reg_rtx (HImode);
45619 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
45620 rtx_code_label *jump_label = gen_label_rtx ();
45621 rtx insn;
45622 rtx (*gen_abs) (rtx, rtx);
45623 rtx (*gen_neg) (rtx, rtx);
45625 switch (inmode)
45627 case E_SFmode:
45628 gen_abs = gen_abssf2;
45629 break;
45630 case E_DFmode:
45631 gen_abs = gen_absdf2;
45632 break;
45633 case E_XFmode:
45634 gen_abs = gen_absxf2;
45635 break;
45636 default:
45637 gcc_unreachable ();
45640 switch (outmode)
45642 case E_SFmode:
45643 gen_neg = gen_negsf2;
45644 break;
45645 case E_DFmode:
45646 gen_neg = gen_negdf2;
45647 break;
45648 case E_XFmode:
45649 gen_neg = gen_negxf2;
45650 break;
45651 case E_HImode:
45652 gen_neg = gen_neghi2;
45653 break;
45654 case E_SImode:
45655 gen_neg = gen_negsi2;
45656 break;
45657 case E_DImode:
45658 gen_neg = gen_negdi2;
45659 break;
45660 default:
45661 gcc_unreachable ();
45664 e1 = gen_reg_rtx (inmode);
45665 e2 = gen_reg_rtx (inmode);
45666 res = gen_reg_rtx (outmode);
45668 half = const_double_from_real_value (dconsthalf, inmode);
45670 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
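/* Added note: this formula rounds halfway cases away from zero, e.g.
   round (2.5) = 3 and round (-2.5) = -3, which is why we cannot simply use
   frndint under the default round-to-nearest-even control word.  */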
45672 /* scratch = fxam(op1) */
45673 emit_insn (gen_rtx_SET (scratch,
45674 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
45675 UNSPEC_FXAM)));
45676 /* e1 = fabs(op1) */
45677 emit_insn (gen_abs (e1, op1));
45679 /* e2 = e1 + 0.5 */
45680 half = force_reg (inmode, half);
45681 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
45683 /* res = floor(e2) */
45684 if (inmode != XFmode)
45686 tmp1 = gen_reg_rtx (XFmode);
45688 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
45690 else
45691 tmp1 = e2;
45693 switch (outmode)
45695 case E_SFmode:
45696 case E_DFmode:
45698 rtx tmp0 = gen_reg_rtx (XFmode);
45700 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
45702 emit_insn (gen_rtx_SET (res,
45703 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
45704 UNSPEC_TRUNC_NOOP)));
45706 break;
45707 case E_XFmode:
45708 emit_insn (gen_frndintxf2_floor (res, tmp1));
45709 break;
45710 case E_HImode:
45711 emit_insn (gen_lfloorxfhi2 (res, tmp1));
45712 break;
45713 case E_SImode:
45714 emit_insn (gen_lfloorxfsi2 (res, tmp1));
45715 break;
45716 case E_DImode:
45717 emit_insn (gen_lfloorxfdi2 (res, tmp1));
45718 break;
45719 default:
45720 gcc_unreachable ();
45723 /* flags = signbit(a) */
45724 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
45726 /* if (flags) then res = -res */
45727 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
45728 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
45729 gen_rtx_LABEL_REF (VOIDmode, jump_label),
45730 pc_rtx);
45731 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
45732 predict_jump (REG_BR_PROB_BASE * 50 / 100);
45733 JUMP_LABEL (insn) = jump_label;
45735 emit_insn (gen_neg (res, res));
45737 emit_label (jump_label);
45738 LABEL_NUSES (jump_label) = 1;
45740 emit_move_insn (op0, res);
45743 /* Output code to perform a Newton-Raphson approximation of a single precision
45744 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
45746 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
45748 rtx x0, x1, e0, e1;
45750 x0 = gen_reg_rtx (mode);
45751 e0 = gen_reg_rtx (mode);
45752 e1 = gen_reg_rtx (mode);
45753 x1 = gen_reg_rtx (mode);
45755 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
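/* Added derivation: with x0 = rcp(b), the expression above is one
   Newton-Raphson step for f(x) = 1/x - b:

     x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0 * x0)

   The hardware reciprocal estimate has limited precision, and one step
   roughly doubles the number of correct bits before the final multiply
   by a.  */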
45757 b = force_reg (mode, b);
45759 /* x0 = rcp(b) estimate */
45760 if (mode == V16SFmode || mode == V8DFmode)
45762 if (TARGET_AVX512ER)
45764 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45765 UNSPEC_RCP28)));
45766 /* res = a * x0 */
45767 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
45768 return;
45770 else
45771 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45772 UNSPEC_RCP14)));
45774 else
45775 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
45776 UNSPEC_RCP)));
45778 /* e0 = x0 * b */
45779 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
45781 /* e0 = x0 * e0 */
45782 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
45784 /* e1 = x0 + x0 */
45785 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
45787 /* x1 = e1 - e0 */
45788 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
45790 /* res = a * x1 */
45791 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
45794 /* Output code to perform a Newton-Raphson approximation of a
45795 single precision floating point [reciprocal] square root. */
45797 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
45799 rtx x0, e0, e1, e2, e3, mthree, mhalf;
45800 REAL_VALUE_TYPE r;
45801 int unspec;
45803 x0 = gen_reg_rtx (mode);
45804 e0 = gen_reg_rtx (mode);
45805 e1 = gen_reg_rtx (mode);
45806 e2 = gen_reg_rtx (mode);
45807 e3 = gen_reg_rtx (mode);
45809 if (TARGET_AVX512ER && mode == V16SFmode)
45811 if (recip)
45812 /* res = rsqrt28(a) estimate */
45813 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45814 UNSPEC_RSQRT28)));
45815 else
45817 /* x0 = rsqrt28(a) estimate */
45818 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45819 UNSPEC_RSQRT28)));
45820 /* res = rcp28(x0) estimate */
45821 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
45822 UNSPEC_RCP28)));
45824 return;
45827 real_from_integer (&r, VOIDmode, -3, SIGNED);
45828 mthree = const_double_from_real_value (r, SFmode);
45830 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
45831 mhalf = const_double_from_real_value (r, SFmode);
45832 unspec = UNSPEC_RSQRT;
45834 if (VECTOR_MODE_P (mode))
45836 mthree = ix86_build_const_vector (mode, true, mthree);
45837 mhalf = ix86_build_const_vector (mode, true, mhalf);
45838 /* There is no 512-bit rsqrt. There is however rsqrt14. */
45839 if (GET_MODE_SIZE (mode) == 64)
45840 unspec = UNSPEC_RSQRT14;
45843 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
45844 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
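/* Added derivation: with x0 = rsqrt(a), one Newton-Raphson step for
   f(x) = 1/(x*x) - a is

     x1 = 0.5 * x0 * (3 - a * x0 * x0) = -0.5 * x0 * (a * x0 * x0 - 3)

   which is the rsqrt form above; multiplying once more by a gives the
   sqrt form, since sqrt(a) = a * rsqrt(a).  */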
45846 a = force_reg (mode, a);
45848 /* x0 = rsqrt(a) estimate */
45849 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45850 unspec)));
45852 /* If a == 0.0, mask out the infinite rsqrt estimate to avoid a NaN for sqrt(0.0). */
45853 if (!recip)
45855 rtx zero = force_reg (mode, CONST0_RTX(mode));
45856 rtx mask;
45858 /* Handle masked compare. */
45859 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
45861 mask = gen_reg_rtx (HImode);
45862 /* Imm value 0x4 corresponds to not-equal comparison. */
45863 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
45864 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
45866 else
45868 mask = gen_reg_rtx (mode);
45869 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
45870 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
45874 /* e0 = x0 * a */
45875 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
45876 /* e1 = e0 * x0 */
45877 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
45879 /* e2 = e1 - 3. */
45880 mthree = force_reg (mode, mthree);
45881 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
45883 mhalf = force_reg (mode, mhalf);
45884 if (recip)
45885 /* e3 = -.5 * x0 */
45886 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
45887 else
45888 /* e3 = -.5 * e0 */
45889 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
45890 /* ret = e2 * e3 */
45891 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
45894 #ifdef TARGET_SOLARIS
45895 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
45897 static void
45898 i386_solaris_elf_named_section (const char *name, unsigned int flags,
45899 tree decl)
45901 /* With Binutils 2.15, the "@unwind" marker must be specified on
45902 every occurrence of the ".eh_frame" section, not just the first
45903 one. */
45904 if (TARGET_64BIT
45905 && strcmp (name, ".eh_frame") == 0)
45907 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
45908 flags & SECTION_WRITE ? "aw" : "a");
45909 return;
45912 #ifndef USE_GAS
45913 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
45915 solaris_elf_asm_comdat_section (name, flags, decl);
45916 return;
45918 #endif
45920 default_elf_asm_named_section (name, flags, decl);
45922 #endif /* TARGET_SOLARIS */
45924 /* Return the mangling of TYPE if it is an extended fundamental type. */
45926 static const char *
45927 ix86_mangle_type (const_tree type)
45929 type = TYPE_MAIN_VARIANT (type);
45931 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
45932 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
45933 return NULL;
45935 switch (TYPE_MODE (type))
45937 case E_TFmode:
45938 /* __float128 is "g". */
45939 return "g";
45940 case E_XFmode:
45941 /* "long double" or __float80 is "e". */
45942 return "e";
45943 default:
45944 return NULL;
45948 static GTY(()) tree ix86_tls_stack_chk_guard_decl;
45950 static tree
45951 ix86_stack_protect_guard (void)
45953 if (TARGET_SSP_TLS_GUARD)
45955 tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1);
45956 int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg);
45957 tree type = build_qualified_type (type_node, qual);
45958 tree t;
45960 if (global_options_set.x_ix86_stack_protector_guard_symbol_str)
45962 t = ix86_tls_stack_chk_guard_decl;
45964 if (t == NULL)
45966 rtx x;
45968 t = build_decl
45969 (UNKNOWN_LOCATION, VAR_DECL,
45970 get_identifier (ix86_stack_protector_guard_symbol_str),
45971 type);
45972 TREE_STATIC (t) = 1;
45973 TREE_PUBLIC (t) = 1;
45974 DECL_EXTERNAL (t) = 1;
45975 TREE_USED (t) = 1;
45976 TREE_THIS_VOLATILE (t) = 1;
45977 DECL_ARTIFICIAL (t) = 1;
45978 DECL_IGNORED_P (t) = 1;
45980 /* Do not share RTL as the declaration is visible outside of
45981 the current function. */
45982 x = DECL_RTL (t);
45983 RTX_FLAG (x, used) = 1;
45985 ix86_tls_stack_chk_guard_decl = t;
45988 else
45990 tree asptrtype = build_pointer_type (type);
45992 t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset);
45993 t = build2 (MEM_REF, asptrtype, t,
45994 build_int_cst (asptrtype, 0));
45997 return t;
46000 return default_stack_protect_guard ();
46003 /* For 32-bit code we can save PIC register setup by using
46004 the hidden __stack_chk_fail_local function instead of calling
46005 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
46006 register, so it is better to call __stack_chk_fail directly. */
46008 static tree ATTRIBUTE_UNUSED
46009 ix86_stack_protect_fail (void)
46011 return TARGET_64BIT
46012 ? default_external_stack_protect_fail ()
46013 : default_hidden_stack_protect_fail ();
46016 /* Select a format to encode pointers in exception handling data. CODE
46017 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
46018 true if the symbol may be affected by dynamic relocations.
46020 ??? All x86 object file formats are capable of representing this.
46021 After all, the relocation needed is the same as for the call insn.
46022 Whether or not a particular assembler allows us to enter such, I
46023 guess we'll have to see. */
46024 int
46025 asm_preferred_eh_data_format (int code, int global)
46027 if (flag_pic)
46029 int type = DW_EH_PE_sdata8;
46030 if (!TARGET_64BIT
46031 || ix86_cmodel == CM_SMALL_PIC
46032 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
46033 type = DW_EH_PE_sdata4;
46034 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
46036 if (ix86_cmodel == CM_SMALL
46037 || (ix86_cmodel == CM_MEDIUM && code))
46038 return DW_EH_PE_udata4;
46039 return DW_EH_PE_absptr;
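/* Added example: 64-bit small-model PIC code gets
   DW_EH_PE_pcrel | DW_EH_PE_sdata4, with DW_EH_PE_indirect added for
   symbols that dynamic relocations may affect; non-PIC small-model code
   uses DW_EH_PE_udata4, and everything else falls back to DW_EH_PE_absptr.  */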
46042 /* Expand copysign from SIGN to the positive value ABS_VALUE
46043 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
46044 the sign-bit. */
46045 static void
46046 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
46048 machine_mode mode = GET_MODE (sign);
46049 rtx sgn = gen_reg_rtx (mode);
46050 if (mask == NULL_RTX)
46052 machine_mode vmode;
46054 if (mode == SFmode)
46055 vmode = V4SFmode;
46056 else if (mode == DFmode)
46057 vmode = V2DFmode;
46058 else
46059 vmode = mode;
46061 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
46062 if (!VECTOR_MODE_P (mode))
46064 /* We need to generate a scalar mode mask in this case. */
46065 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
46066 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
46067 mask = gen_reg_rtx (mode);
46068 emit_insn (gen_rtx_SET (mask, tmp));
46071 else
46072 mask = gen_rtx_NOT (mode, mask);
46073 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
46074 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
46077 /* Expand fabs (OP0) and return a new rtx that holds the result. The
46078 mask for masking out the sign-bit is stored in *SMASK, if that is
46079 non-null. */
46080 static rtx
46081 ix86_expand_sse_fabs (rtx op0, rtx *smask)
46083 machine_mode vmode, mode = GET_MODE (op0);
46084 rtx xa, mask;
46086 xa = gen_reg_rtx (mode);
46087 if (mode == SFmode)
46088 vmode = V4SFmode;
46089 else if (mode == DFmode)
46090 vmode = V2DFmode;
46091 else
46092 vmode = mode;
46093 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
46094 if (!VECTOR_MODE_P (mode))
46096 /* We need to generate a scalar mode mask in this case. */
46097 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
46098 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
46099 mask = gen_reg_rtx (mode);
46100 emit_insn (gen_rtx_SET (mask, tmp));
46102 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
46104 if (smask)
46105 *smask = mask;
46107 return xa;
46110 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
46111 swapping the operands if SWAP_OPERANDS is true. The expanded
46112 code is a forward jump to a newly created label in case the
46113 comparison is true. The generated label rtx is returned. */
46114 static rtx_code_label *
46115 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
46116 bool swap_operands)
46118 machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
46119 rtx_code_label *label;
46120 rtx tmp;
46122 if (swap_operands)
46123 std::swap (op0, op1);
46125 label = gen_label_rtx ();
46126 tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
46127 emit_insn (gen_rtx_SET (tmp, gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
46128 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
46129 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
46130 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
46131 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
46132 JUMP_LABEL (tmp) = label;
46134 return label;
46137 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
46138 using comparison code CODE. Operands are swapped for the comparison if
46139 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
46140 static rtx
46141 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
46142 bool swap_operands)
46144 rtx (*insn)(rtx, rtx, rtx, rtx);
46145 machine_mode mode = GET_MODE (op0);
46146 rtx mask = gen_reg_rtx (mode);
46148 if (swap_operands)
46149 std::swap (op0, op1);
46151 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
46153 emit_insn (insn (mask, op0, op1,
46154 gen_rtx_fmt_ee (code, mode, op0, op1)));
46155 return mask;
46158 /* Generate and return a rtx of mode MODE for 2**n where n is the number
46159 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
46160 static rtx
46161 ix86_gen_TWO52 (machine_mode mode)
46163 REAL_VALUE_TYPE TWO52r;
46164 rtx TWO52;
46166 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
46167 TWO52 = const_double_from_real_value (TWO52r, mode);
46168 TWO52 = force_reg (mode, TWO52);
46170 return TWO52;
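/* Added note: the callers below use this constant for the classic
   add-and-subtract trick.  For DFmode, any |x| < 2**52 satisfies
   x + 2**52 >= 2**52, a range where the spacing between representable
   values is exactly 1.0, so the addition rounds x to an integer in the
   current rounding mode and subtracting 2**52 again recovers that
   integer.  The SFmode case works the same way with 2**23.  */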
46173 /* Expand SSE sequence for computing lround from OP1 storing
46174 into OP0. */
46175 void
46176 ix86_expand_lround (rtx op0, rtx op1)
46178 /* C code for the stuff we're doing below:
46179 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
46180 return (long)tmp;
46182 machine_mode mode = GET_MODE (op1);
46183 const struct real_format *fmt;
46184 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
46185 rtx adj;
46187 /* load nextafter (0.5, 0.0) */
46188 fmt = REAL_MODE_FORMAT (mode);
46189 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
46190 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
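/* Added note: we add nextafter (0.5, 0.0), the largest value below 0.5,
   rather than 0.5 itself, so that inputs just below 0.5 do not round up to
   exactly 1.0 during the addition and then truncate to 1 instead of 0.  */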
46192 /* adj = copysign (0.5, op1) */
46193 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
46194 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
46196 /* adj = op1 + adj */
46197 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
46199 /* op0 = (imode)adj */
46200 expand_fix (op0, adj, 0);
46203 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
46204 into OPERAND0. */
46205 void
46206 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
46208 /* C code for the stuff we're doing below (for do_floor):
46209 xi = (long)op1;
46210 xi -= (double)xi > op1 ? 1 : 0;
46211 return xi;
46213 machine_mode fmode = GET_MODE (op1);
46214 machine_mode imode = GET_MODE (op0);
46215 rtx ireg, freg, tmp;
46216 rtx_code_label *label;
46218 /* reg = (long)op1 */
46219 ireg = gen_reg_rtx (imode);
46220 expand_fix (ireg, op1, 0);
46222 /* freg = (double)reg */
46223 freg = gen_reg_rtx (fmode);
46224 expand_float (freg, ireg, 0);
46226 /* ireg = (freg > op1) ? ireg - 1 : ireg */
46227 label = ix86_expand_sse_compare_and_jump (UNLE,
46228 freg, op1, !do_floor);
46229 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
46230 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
46231 emit_move_insn (ireg, tmp);
46233 emit_label (label);
46234 LABEL_NUSES (label) = 1;
46236 emit_move_insn (op0, ireg);
46239 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
46240 result in OPERAND0. */
46241 void
46242 ix86_expand_rint (rtx operand0, rtx operand1)
46244 /* C code for the stuff we're doing below:
46245 xa = fabs (operand1);
46246 if (!isless (xa, 2**52))
46247 return operand1;
46248 xa = xa + 2**52 - 2**52;
46249 return copysign (xa, operand1);
46251 machine_mode mode = GET_MODE (operand0);
46252 rtx res, xa, TWO52, mask;
46253 rtx_code_label *label;
46255 res = gen_reg_rtx (mode);
46256 emit_move_insn (res, operand1);
46258 /* xa = abs (operand1) */
46259 xa = ix86_expand_sse_fabs (res, &mask);
46261 /* if (!isless (xa, TWO52)) goto label; */
46262 TWO52 = ix86_gen_TWO52 (mode);
46263 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46265 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46266 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
46268 ix86_sse_copysign_to_positive (res, xa, res, mask);
46270 emit_label (label);
46271 LABEL_NUSES (label) = 1;
46273 emit_move_insn (operand0, res);
46276 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
46277 into OPERAND0. */
46278 void
46279 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
46281 /* C code for the stuff we expand below.
46282 double xa = fabs (x), x2;
46283 if (!isless (xa, TWO52))
46284 return x;
46285 xa = xa + TWO52 - TWO52;
46286 x2 = copysign (xa, x);
46287 Compensate. Floor:
46288 if (x2 > x)
46289 x2 -= 1;
46290 Compensate. Ceil:
46291 if (x2 < x)
46292 x2 -= -1;
46293 return x2;
46295 machine_mode mode = GET_MODE (operand0);
46296 rtx xa, TWO52, tmp, one, res, mask;
46297 rtx_code_label *label;
46299 TWO52 = ix86_gen_TWO52 (mode);
46301 /* Temporary for holding the result, initialized to the input
46302 operand to ease control flow. */
46303 res = gen_reg_rtx (mode);
46304 emit_move_insn (res, operand1);
46306 /* xa = abs (operand1) */
46307 xa = ix86_expand_sse_fabs (res, &mask);
46309 /* if (!isless (xa, TWO52)) goto label; */
46310 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46312 /* xa = xa + TWO52 - TWO52; */
46313 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46314 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
46316 /* xa = copysign (xa, operand1) */
46317 ix86_sse_copysign_to_positive (xa, xa, res, mask);
46319 /* generate 1.0 or -1.0 */
46320 one = force_reg (mode,
46321 const_double_from_real_value (do_floor
46322 ? dconst1 : dconstm1, mode));
46324 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
46325 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
46326 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46327 /* We always need to subtract here to preserve signed zero. */
46328 tmp = expand_simple_binop (mode, MINUS,
46329 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46330 emit_move_insn (res, tmp);
46332 emit_label (label);
46333 LABEL_NUSES (label) = 1;
46335 emit_move_insn (operand0, res);
46338 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
46339 into OPERAND0. */
46340 void
46341 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
46343 /* C code for the stuff we expand below.
46344 double xa = fabs (x), x2;
46345 if (!isless (xa, TWO52))
46346 return x;
46347 x2 = (double)(long)x;
46348 Compensate. Floor:
46349 if (x2 > x)
46350 x2 -= 1;
46351 Compensate. Ceil:
46352 if (x2 < x)
46353 x2 += 1;
46354 if (HONOR_SIGNED_ZEROS (mode))
46355 return copysign (x2, x);
46356 return x2;
46358 machine_mode mode = GET_MODE (operand0);
46359 rtx xa, xi, TWO52, tmp, one, res, mask;
46360 rtx_code_label *label;
46362 TWO52 = ix86_gen_TWO52 (mode);
46364 /* Temporary for holding the result, initialized to the input
46365 operand to ease control flow. */
46366 res = gen_reg_rtx (mode);
46367 emit_move_insn (res, operand1);
46369 /* xa = abs (operand1) */
46370 xa = ix86_expand_sse_fabs (res, &mask);
46372 /* if (!isless (xa, TWO52)) goto label; */
46373 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46375 /* xa = (double)(long)x */
46376 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
46377 expand_fix (xi, res, 0);
46378 expand_float (xa, xi, 0);
46380 /* generate 1.0 */
46381 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
46383 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
46384 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
46385 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46386 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
46387 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46388 emit_move_insn (res, tmp);
46390 if (HONOR_SIGNED_ZEROS (mode))
46391 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
46393 emit_label (label);
46394 LABEL_NUSES (label) = 1;
46396 emit_move_insn (operand0, res);
46399 /* Expand SSE sequence for computing round from OPERAND1 storing
46400 into OPERAND0, using a sequence that does not rely on the DImode truncation
46401 via cvttsd2siq, which is only available on 64-bit targets. */
46402 void
46403 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
46405 /* C code for the stuff we expand below.
46406 double xa = fabs (x), xa2, x2;
46407 if (!isless (xa, TWO52))
46408 return x;
46409 Using the absolute value and copying back sign makes
46410 -0.0 -> -0.0 correct.
46411 xa2 = xa + TWO52 - TWO52;
46412 Compensate.
46413 dxa = xa2 - xa;
46414 if (dxa <= -0.5)
46415 xa2 += 1;
46416 else if (dxa > 0.5)
46417 xa2 -= 1;
46418 x2 = copysign (xa2, x);
46419 return x2;
46421 machine_mode mode = GET_MODE (operand0);
46422 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
46423 rtx_code_label *label;
46425 TWO52 = ix86_gen_TWO52 (mode);
46427 /* Temporary for holding the result, initialized to the input
46428 operand to ease control flow. */
46429 res = gen_reg_rtx (mode);
46430 emit_move_insn (res, operand1);
46432 /* xa = abs (operand1) */
46433 xa = ix86_expand_sse_fabs (res, &mask);
46435 /* if (!isless (xa, TWO52)) goto label; */
46436 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46438 /* xa2 = xa + TWO52 - TWO52; */
46439 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46440 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
46442 /* dxa = xa2 - xa; */
46443 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
46445 /* generate 0.5, 1.0 and -0.5 */
46446 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
46447 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
46448 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
46449 0, OPTAB_DIRECT);
46451 /* Compensate. */
46452 tmp = gen_reg_rtx (mode);
46453 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
46454 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
46455 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46456 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46457 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
46458 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
46459 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
46460 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
46462 /* res = copysign (xa2, operand1) */
46463 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
46465 emit_label (label);
46466 LABEL_NUSES (label) = 1;
46468 emit_move_insn (operand0, res);
46471 /* Expand SSE sequence for computing trunc from OPERAND1 storing
46472 into OPERAND0. */
46473 void
46474 ix86_expand_trunc (rtx operand0, rtx operand1)
46476 /* C code for SSE variant we expand below.
46477 double xa = fabs (x), x2;
46478 if (!isless (xa, TWO52))
46479 return x;
46480 x2 = (double)(long)x;
46481 if (HONOR_SIGNED_ZEROS (mode))
46482 return copysign (x2, x);
46483 return x2;
46485 machine_mode mode = GET_MODE (operand0);
46486 rtx xa, xi, TWO52, res, mask;
46487 rtx_code_label *label;
46489 TWO52 = ix86_gen_TWO52 (mode);
46491 /* Temporary for holding the result, initialized to the input
46492 operand to ease control flow. */
46493 res = gen_reg_rtx (mode);
46494 emit_move_insn (res, operand1);
46496 /* xa = abs (operand1) */
46497 xa = ix86_expand_sse_fabs (res, &mask);
46499 /* if (!isless (xa, TWO52)) goto label; */
46500 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46502 /* x = (double)(long)x */
46503 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
46504 expand_fix (xi, res, 0);
46505 expand_float (res, xi, 0);
46507 if (HONOR_SIGNED_ZEROS (mode))
46508 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
46510 emit_label (label);
46511 LABEL_NUSES (label) = 1;
46513 emit_move_insn (operand0, res);
46516 /* Expand SSE sequence for computing trunc from OPERAND1 storing
46517 into OPERAND0. */
46518 void
46519 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
46521 machine_mode mode = GET_MODE (operand0);
46522 rtx xa, mask, TWO52, one, res, smask, tmp;
46523 rtx_code_label *label;
46525 /* C code for SSE variant we expand below.
46526 double xa = fabs (x), x2;
46527 if (!isless (xa, TWO52))
46528 return x;
46529 xa2 = xa + TWO52 - TWO52;
46530 Compensate:
46531 if (xa2 > xa)
46532 xa2 -= 1.0;
46533 x2 = copysign (xa2, x);
46534 return x2;
46537 TWO52 = ix86_gen_TWO52 (mode);
46539 /* Temporary for holding the result, initialized to the input
46540 operand to ease control flow. */
46541 res = gen_reg_rtx (mode);
46542 emit_move_insn (res, operand1);
46544 /* xa = abs (operand1) */
46545 xa = ix86_expand_sse_fabs (res, &smask);
46547 /* if (!isless (xa, TWO52)) goto label; */
46548 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46550 /* res = xa + TWO52 - TWO52; */
46551 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
46552 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
46553 emit_move_insn (res, tmp);
46555 /* generate 1.0 */
46556 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
46558 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
46559 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
46560 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
46561 tmp = expand_simple_binop (mode, MINUS,
46562 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
46563 emit_move_insn (res, tmp);
46565 /* res = copysign (res, operand1) */
46566 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
46568 emit_label (label);
46569 LABEL_NUSES (label) = 1;
46571 emit_move_insn (operand0, res);
46574 /* Expand SSE sequence for computing round from OPERAND1 storing
46575 into OPERAND0. */
46576 void
46577 ix86_expand_round (rtx operand0, rtx operand1)
46579 /* C code for the stuff we're doing below:
46580 double xa = fabs (x);
46581 if (!isless (xa, TWO52))
46582 return x;
46583 xa = (double)(long)(xa + nextafter (0.5, 0.0));
46584 return copysign (xa, x);
46586 machine_mode mode = GET_MODE (operand0);
46587 rtx res, TWO52, xa, xi, half, mask;
46588 rtx_code_label *label;
46589 const struct real_format *fmt;
46590 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
46592 /* Temporary for holding the result, initialized to the input
46593 operand to ease control flow. */
46594 res = gen_reg_rtx (mode);
46595 emit_move_insn (res, operand1);
46597 TWO52 = ix86_gen_TWO52 (mode);
46598 xa = ix86_expand_sse_fabs (res, &mask);
46599 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
46601 /* load nextafter (0.5, 0.0) */
46602 fmt = REAL_MODE_FORMAT (mode);
46603 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
46604 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
46606 /* xa = xa + 0.5 */
46607 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
46608 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
46610 /* xa = (double)(int64_t)xa */
46611 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
46612 expand_fix (xi, xa, 0);
46613 expand_float (xa, xi, 0);
46615 /* res = copysign (xa, operand1) */
46616 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
46618 emit_label (label);
46619 LABEL_NUSES (label) = 1;
46621 emit_move_insn (operand0, res);
46624 /* Expand SSE sequence for computing round
46625 from OP1 storing into OP0 using sse4 round insn. */
46626 void
46627 ix86_expand_round_sse4 (rtx op0, rtx op1)
46629 machine_mode mode = GET_MODE (op0);
46630 rtx e1, e2, res, half;
46631 const struct real_format *fmt;
46632 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
46633 rtx (*gen_copysign) (rtx, rtx, rtx);
46634 rtx (*gen_round) (rtx, rtx, rtx);
46636 switch (mode)
46638 case E_SFmode:
46639 gen_copysign = gen_copysignsf3;
46640 gen_round = gen_sse4_1_roundsf2;
46641 break;
46642 case E_DFmode:
46643 gen_copysign = gen_copysigndf3;
46644 gen_round = gen_sse4_1_rounddf2;
46645 break;
46646 default:
46647 gcc_unreachable ();
46650 /* round (a) = trunc (a + copysign (0.5, a)) */
46652 /* load nextafter (0.5, 0.0) */
46653 fmt = REAL_MODE_FORMAT (mode);
46654 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
46655 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
46656 half = const_double_from_real_value (pred_half, mode);
46658 /* e1 = copysign (0.5, op1) */
46659 e1 = gen_reg_rtx (mode);
46660 emit_insn (gen_copysign (e1, half, op1));
46662 /* e2 = op1 + e1 */
46663 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
46665 /* res = trunc (e2) */
46666 res = gen_reg_rtx (mode);
46667 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
46669 emit_move_insn (op0, res);
46673 /* Table of valid machine attributes. */
46674 static const struct attribute_spec ix86_attribute_table[] =
46676 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
46677 affects_type_identity } */
46678 /* Stdcall attribute says callee is responsible for popping arguments
46679 if they are not variable. */
46680 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46681 true },
46682 /* Fastcall attribute says callee is responsible for popping arguments
46683 if they are not variable. */
46684 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46685 true },
46686 /* Thiscall attribute says callee is responsible for popping arguments
46687 if they are not variable. */
46688 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46689 true },
46690 /* Cdecl attribute says the callee is a normal C declaration */
46691 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46692 true },
46693 /* Regparm attribute specifies how many integer arguments are to be
46694 passed in registers. */
46695 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
46696 true },
46697 /* Sseregparm attribute says we are using x86_64 calling conventions
46698 for FP arguments. */
46699 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
46700 true },
46701 /* The transactional memory builtins are implicitly regparm or fastcall
46702 depending on the ABI. Override the generic do-nothing attribute that
46703 these builtins were declared with. */
46704 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
46705 true },
46706 /* force_align_arg_pointer says this function realigns the stack at entry. */
46707 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
46708 false, true, true, ix86_handle_force_align_arg_pointer_attribute, false },
46709 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
46710 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
46711 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
46712 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
46713 false },
46714 #endif
46715 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
46716 false },
46717 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
46718 false },
46719 #ifdef SUBTARGET_ATTRIBUTE_TABLE
46720 SUBTARGET_ATTRIBUTE_TABLE,
46721 #endif
46722 /* ms_abi and sysv_abi calling convention function attributes. */
46723 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
46724 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
46725 { "ms_abi va_list", 0, 0, false, false, false, NULL, false },
46726 { "sysv_abi va_list", 0, 0, false, false, false, NULL, false },
46727 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
46728 false },
46729 { "callee_pop_aggregate_return", 1, 1, false, true, true,
46730 ix86_handle_callee_pop_aggregate_return, true },
46731 { "interrupt", 0, 0, false, true, true,
46732 ix86_handle_interrupt_attribute, false },
46733 { "no_caller_saved_registers", 0, 0, false, true, true,
46734 ix86_handle_no_caller_saved_registers_attribute, false },
46735 { "naked", 0, 0, true, false, false,
46736 ix86_handle_fndecl_attribute, false },
46738 /* End element. */
46739 { NULL, 0, 0, false, false, false, NULL, false }
46742 /* Implement targetm.vectorize.builtin_vectorization_cost. */
46743 static int
46744 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
46745 tree vectype, int)
46747 switch (type_of_cost)
46749 case scalar_stmt:
46750 return ix86_cost->scalar_stmt_cost;
46752 case scalar_load:
46753 return ix86_cost->scalar_load_cost;
46755 case scalar_store:
46756 return ix86_cost->scalar_store_cost;
46758 case vector_stmt:
46759 return ix86_cost->vec_stmt_cost;
46761 case vector_load:
46762 return ix86_cost->vec_align_load_cost;
46764 case vector_store:
46765 return ix86_cost->vec_store_cost;
46767 case vec_to_scalar:
46768 return ix86_cost->vec_to_scalar_cost;
46770 case scalar_to_vec:
46771 return ix86_cost->scalar_to_vec_cost;
46773 case unaligned_load:
46774 case unaligned_store:
46775 return ix86_cost->vec_unalign_load_cost;
46777 case cond_branch_taken:
46778 return ix86_cost->cond_taken_branch_cost;
46780 case cond_branch_not_taken:
46781 return ix86_cost->cond_not_taken_branch_cost;
46783 case vec_perm:
46784 case vec_promote_demote:
46785 return ix86_cost->vec_stmt_cost;
46787 case vec_construct:
46788 return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
46790 default:
46791 gcc_unreachable ();
46795 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
46796 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
46797 insn every time. */
46799 static GTY(()) rtx_insn *vselect_insn;
46801 /* Initialize vselect_insn. */
46803 static void
46804 init_vselect_insn (void)
46806 unsigned i;
46807 rtx x;
46809 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
46810 for (i = 0; i < MAX_VECT_LEN; ++i)
46811 XVECEXP (x, 0, i) = const0_rtx;
46812 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
46813 const0_rtx), x);
46814 x = gen_rtx_SET (const0_rtx, x);
46815 start_sequence ();
46816 vselect_insn = emit_insn (x);
46817 end_sequence ();
46820 /* Construct (set target (vec_select op0 (parallel perm))) and
46821 return true if that's a valid instruction in the active ISA. */
46823 static bool
46824 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
46825 unsigned nelt, bool testing_p)
46827 unsigned int i;
46828 rtx x, save_vconcat;
46829 int icode;
46831 if (vselect_insn == NULL_RTX)
46832 init_vselect_insn ();
46834 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
46835 PUT_NUM_ELEM (XVEC (x, 0), nelt);
46836 for (i = 0; i < nelt; ++i)
46837 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
46838 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
46839 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
46840 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
46841 SET_DEST (PATTERN (vselect_insn)) = target;
46842 icode = recog_memoized (vselect_insn);
46844 if (icode >= 0 && !testing_p)
46845 emit_insn (copy_rtx (PATTERN (vselect_insn)));
46847 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
46848 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
46849 INSN_CODE (vselect_insn) = -1;
46851 return icode >= 0;
46854 /* Similar, but generate a vec_concat from op0 and op1 as well. */
46856 static bool
46857 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
46858 const unsigned char *perm, unsigned nelt,
46859 bool testing_p)
46861 machine_mode v2mode;
46862 rtx x;
46863 bool ok;
46865 if (vselect_insn == NULL_RTX)
46866 init_vselect_insn ();
46868 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
46869 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
46870 PUT_MODE (x, v2mode);
46871 XEXP (x, 0) = op0;
46872 XEXP (x, 1) = op1;
46873 ok = expand_vselect (target, x, perm, nelt, testing_p);
46874 XEXP (x, 0) = const0_rtx;
46875 XEXP (x, 1) = const0_rtx;
46876 return ok;
46879 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46880 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
46882 static bool
46883 expand_vec_perm_blend (struct expand_vec_perm_d *d)
46885 machine_mode mmode, vmode = d->vmode;
46886 unsigned i, mask, nelt = d->nelt;
46887 rtx target, op0, op1, maskop, x;
46888 rtx rperm[32], vperm;
46890 if (d->one_operand_p)
46891 return false;
46892 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
46893 && (TARGET_AVX512BW
46894 || GET_MODE_UNIT_SIZE (vmode) >= 4))
46896 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
46898 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
46900 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
46902 else
46903 return false;
46905 /* This is a blend, not a permute. Elements must stay in their
46906 respective lanes. */
46907 for (i = 0; i < nelt; ++i)
46909 unsigned e = d->perm[i];
46910 if (!(e == i || e == i + nelt))
46911 return false;
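/* Added example: for V4SF (nelt == 4) a permutation such as {0, 5, 2, 7}
   passes this check, since each result element i is taken from slot i of
   either op0 or op1, and becomes a blend with mask 0b1010; {1, 4, 2, 3}
   fails because result element 0 would have to come from position 1,
   which a blend cannot do.  */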
46914 if (d->testing_p)
46915 return true;
46917 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
46918 decision should be extracted elsewhere, so that we only try that
46919 sequence once all budget==3 options have been tried. */
46920 target = d->target;
46921 op0 = d->op0;
46922 op1 = d->op1;
46923 mask = 0;
46925 switch (vmode)
46927 case E_V8DFmode:
46928 case E_V16SFmode:
46929 case E_V4DFmode:
46930 case E_V8SFmode:
46931 case E_V2DFmode:
46932 case E_V4SFmode:
46933 case E_V8HImode:
46934 case E_V8SImode:
46935 case E_V32HImode:
46936 case E_V64QImode:
46937 case E_V16SImode:
46938 case E_V8DImode:
46939 for (i = 0; i < nelt; ++i)
46940 mask |= (d->perm[i] >= nelt) << i;
46941 break;
46943 case E_V2DImode:
46944 for (i = 0; i < 2; ++i)
46945 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
46946 vmode = V8HImode;
46947 goto do_subreg;
46949 case E_V4SImode:
46950 for (i = 0; i < 4; ++i)
46951 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
46952 vmode = V8HImode;
46953 goto do_subreg;
46955 case E_V16QImode:
46956 /* See if bytes move in pairs so we can use pblendw with
46957 an immediate argument, rather than pblendvb with a vector
46958 argument. */
46959 for (i = 0; i < 16; i += 2)
46960 if (d->perm[i] + 1 != d->perm[i + 1])
46962 use_pblendvb:
46963 for (i = 0; i < nelt; ++i)
46964 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
46966 finish_pblendvb:
46967 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
46968 vperm = force_reg (vmode, vperm);
46970 if (GET_MODE_SIZE (vmode) == 16)
46971 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
46972 else
46973 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
46974 if (target != d->target)
46975 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46976 return true;
46979 for (i = 0; i < 8; ++i)
46980 mask |= (d->perm[i * 2] >= 16) << i;
46981 vmode = V8HImode;
46982 /* FALLTHRU */
46984 do_subreg:
46985 target = gen_reg_rtx (vmode);
46986 op0 = gen_lowpart (vmode, op0);
46987 op1 = gen_lowpart (vmode, op1);
46988 break;
46990 case E_V32QImode:
46991 /* See if bytes move in pairs. If not, vpblendvb must be used. */
46992 for (i = 0; i < 32; i += 2)
46993 if (d->perm[i] + 1 != d->perm[i + 1])
46994 goto use_pblendvb;
46995 /* See if bytes move in quadruplets. If yes, vpblendd
46996 with immediate can be used. */
46997 for (i = 0; i < 32; i += 4)
46998 if (d->perm[i] + 2 != d->perm[i + 2])
46999 break;
47000 if (i < 32)
47002 /* See if bytes move the same in both lanes. If yes,
47003 vpblendw with immediate can be used. */
47004 for (i = 0; i < 16; i += 2)
47005 if (d->perm[i] + 16 != d->perm[i + 16])
47006 goto use_pblendvb;
47008 /* Use vpblendw. */
47009 for (i = 0; i < 16; ++i)
47010 mask |= (d->perm[i * 2] >= 32) << i;
47011 vmode = V16HImode;
47012 goto do_subreg;
47015 /* Use vpblendd. */
47016 for (i = 0; i < 8; ++i)
47017 mask |= (d->perm[i * 4] >= 32) << i;
47018 vmode = V8SImode;
47019 goto do_subreg;
47021 case E_V16HImode:
47022 /* See if words move in pairs. If yes, vpblendd can be used. */
47023 for (i = 0; i < 16; i += 2)
47024 if (d->perm[i] + 1 != d->perm[i + 1])
47025 break;
47026 if (i < 16)
47028 /* See if words move the same in both lanes. If not,
47029 vpblendvb must be used. */
47030 for (i = 0; i < 8; i++)
47031 if (d->perm[i] + 8 != d->perm[i + 8])
47033 /* Use vpblendvb. */
47034 for (i = 0; i < 32; ++i)
47035 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
47037 vmode = V32QImode;
47038 nelt = 32;
47039 target = gen_reg_rtx (vmode);
47040 op0 = gen_lowpart (vmode, op0);
47041 op1 = gen_lowpart (vmode, op1);
47042 goto finish_pblendvb;
47045 /* Use vpblendw. */
47046 for (i = 0; i < 16; ++i)
47047 mask |= (d->perm[i] >= 16) << i;
47048 break;
47051 /* Use vpblendd. */
47052 for (i = 0; i < 8; ++i)
47053 mask |= (d->perm[i * 2] >= 16) << i;
47054 vmode = V8SImode;
47055 goto do_subreg;
47057 case E_V4DImode:
47058 /* Use vpblendd. */
47059 for (i = 0; i < 4; ++i)
47060 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
47061 vmode = V8SImode;
47062 goto do_subreg;
47064 default:
47065 gcc_unreachable ();
47068 switch (vmode)
47070 case E_V8DFmode:
47071 case E_V8DImode:
47072 mmode = QImode;
47073 break;
47074 case E_V16SFmode:
47075 case E_V16SImode:
47076 mmode = HImode;
47077 break;
47078 case E_V32HImode:
47079 mmode = SImode;
47080 break;
47081 case E_V64QImode:
47082 mmode = DImode;
47083 break;
47084 default:
47085 mmode = VOIDmode;
47088 if (mmode != VOIDmode)
47089 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
47090 else
47091 maskop = GEN_INT (mask);
47093 /* This matches five different patterns with the different modes. */
47094 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
47095 x = gen_rtx_SET (target, x);
47096 emit_insn (x);
47097 if (target != d->target)
47098 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
47100 return true;
47103 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
47104 in terms of the variable form of vpermilps.
47106 Note that we will have already failed the immediate-operand form of vpermilps,
47107 which requires that the high and low part shuffle be identical; the
47108 variable form doesn't require that. */
47110 static bool
47111 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
47113 rtx rperm[8], vperm;
47114 unsigned i;
47116 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
47117 return false;
47119 /* We can only permute within the 128-bit lane. */
47120 for (i = 0; i < 8; ++i)
47122 unsigned e = d->perm[i];
47123 if (i < 4 ? e >= 4 : e < 4)
47124 return false;
47127 if (d->testing_p)
47128 return true;
47130 for (i = 0; i < 8; ++i)
47132 unsigned e = d->perm[i];
47134 /* Within each 128-bit lane, the elements of op0 are numbered
47135 from 0 and the elements of op1 are numbered from 4. */
47136 if (e >= 8 + 4)
47137 e -= 8;
47138 else if (e >= 4)
47139 e -= 4;
47141 rperm[i] = GEN_INT (e);
47144 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
47145 vperm = force_reg (V8SImode, vperm);
47146 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
47148 return true;
47151 /* Return true if permutation D can be performed as VMODE permutation
47152 instead. */
47154 static bool
47155 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
47157 unsigned int i, j, chunk;
47159 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
47160 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
47161 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
47162 return false;
47164 if (GET_MODE_NUNITS (vmode) >= d->nelt)
47165 return true;
47167 chunk = d->nelt / GET_MODE_NUNITS (vmode);
47168 for (i = 0; i < d->nelt; i += chunk)
47169 if (d->perm[i] & (chunk - 1))
47170 return false;
47171 else
47172 for (j = 1; j < chunk; ++j)
47173 if (d->perm[i] + j != d->perm[i + j])
47174 return false;
47176 return true;
47179 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
47180 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
47182 static bool
47183 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
47185 unsigned i, nelt, eltsz, mask;
47186 unsigned char perm[64];
47187 machine_mode vmode = V16QImode;
47188 rtx rperm[64], vperm, target, op0, op1;
47190 nelt = d->nelt;
47192 if (!d->one_operand_p)
47194 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
47196 if (TARGET_AVX2
47197 && valid_perm_using_mode_p (V2TImode, d))
47199 if (d->testing_p)
47200 return true;
47202 /* Use vperm2i128 insn. The pattern uses
47203 V4DImode instead of V2TImode. */
47204 target = d->target;
47205 if (d->vmode != V4DImode)
47206 target = gen_reg_rtx (V4DImode);
47207 op0 = gen_lowpart (V4DImode, d->op0);
47208 op1 = gen_lowpart (V4DImode, d->op1);
47209 rperm[0]
47210 = GEN_INT ((d->perm[0] / (nelt / 2))
47211 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
47212 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
47213 if (target != d->target)
47214 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
47215 return true;
47217 return false;
47220 else
47222 if (GET_MODE_SIZE (d->vmode) == 16)
47224 if (!TARGET_SSSE3)
47225 return false;
47227 else if (GET_MODE_SIZE (d->vmode) == 32)
47229 if (!TARGET_AVX2)
47230 return false;
47232 /* V4DImode should be already handled through
47233 expand_vselect by vpermq instruction. */
47234 gcc_assert (d->vmode != V4DImode);
47236 vmode = V32QImode;
47237 if (d->vmode == V8SImode
47238 || d->vmode == V16HImode
47239 || d->vmode == V32QImode)
47241 /* First see if vpermq can be used for
47242 V8SImode/V16HImode/V32QImode. */
47243 if (valid_perm_using_mode_p (V4DImode, d))
47245 for (i = 0; i < 4; i++)
47246 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
47247 if (d->testing_p)
47248 return true;
47249 target = gen_reg_rtx (V4DImode);
47250 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
47251 perm, 4, false))
47253 emit_move_insn (d->target,
47254 gen_lowpart (d->vmode, target));
47255 return true;
47257 return false;
47260 /* Next see if vpermd can be used. */
47261 if (valid_perm_using_mode_p (V8SImode, d))
47262 vmode = V8SImode;
47264 /* Or if vpermps can be used. */
47265 else if (d->vmode == V8SFmode)
47266 vmode = V8SImode;
47268 if (vmode == V32QImode)
47270 /* vpshufb only works within 128-bit lanes; it is not
47271 possible to shuffle bytes between the lanes. */
47272 for (i = 0; i < nelt; ++i)
47273 if ((d->perm[i] ^ i) & (nelt / 2))
47274 return false;
47277 else if (GET_MODE_SIZE (d->vmode) == 64)
47279 if (!TARGET_AVX512BW)
47280 return false;
47282 /* If vpermq didn't work, vpshufb won't work either. */
47283 if (d->vmode == V8DFmode || d->vmode == V8DImode)
47284 return false;
47286 vmode = V64QImode;
47287 if (d->vmode == V16SImode
47288 || d->vmode == V32HImode
47289 || d->vmode == V64QImode)
47291 /* First see if vpermq can be used for
47292 V16SImode/V32HImode/V64QImode. */
47293 if (valid_perm_using_mode_p (V8DImode, d))
47295 for (i = 0; i < 8; i++)
47296 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
47297 if (d->testing_p)
47298 return true;
47299 target = gen_reg_rtx (V8DImode);
47300 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
47301 perm, 8, false))
47303 emit_move_insn (d->target,
47304 gen_lowpart (d->vmode, target));
47305 return true;
47307 return false;
47310 /* Next see if vpermd can be used. */
47311 if (valid_perm_using_mode_p (V16SImode, d))
47312 vmode = V16SImode;
47314 /* Or if vpermps can be used. */
47315 else if (d->vmode == V16SFmode)
47316 vmode = V16SImode;
47317 if (vmode == V64QImode)
47319 /* vpshufb only works within 128-bit lanes; it is not
47320 possible to shuffle bytes between the lanes. */
47321 for (i = 0; i < nelt; ++i)
47322 if ((d->perm[i] ^ i) & (nelt / 4))
47323 return false;
47326 else
47327 return false;
47330 if (d->testing_p)
47331 return true;
47333 if (vmode == V8SImode)
47334 for (i = 0; i < 8; ++i)
47335 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
47336 else if (vmode == V16SImode)
47337 for (i = 0; i < 16; ++i)
47338 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
47339 else
47341 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47342 if (!d->one_operand_p)
47343 mask = 2 * nelt - 1;
47344 else if (vmode == V16QImode)
47345 mask = nelt - 1;
47346 else if (vmode == V64QImode)
47347 mask = nelt / 4 - 1;
47348 else
47349 mask = nelt / 2 - 1;
47351 for (i = 0; i < nelt; ++i)
47353 unsigned j, e = d->perm[i] & mask;
47354 for (j = 0; j < eltsz; ++j)
47355 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
47359 vperm = gen_rtx_CONST_VECTOR (vmode,
47360 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
47361 vperm = force_reg (vmode, vperm);
47363 target = d->target;
47364 if (d->vmode != vmode)
47365 target = gen_reg_rtx (vmode);
47366 op0 = gen_lowpart (vmode, d->op0);
47367 if (d->one_operand_p)
47369 if (vmode == V16QImode)
47370 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
47371 else if (vmode == V32QImode)
47372 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
47373 else if (vmode == V64QImode)
47374 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
47375 else if (vmode == V8SFmode)
47376 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
47377 else if (vmode == V8SImode)
47378 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
47379 else if (vmode == V16SFmode)
47380 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
47381 else if (vmode == V16SImode)
47382 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
47383 else
47384 gcc_unreachable ();
47386 else
47388 op1 = gen_lowpart (vmode, d->op1);
47389 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
47391 if (target != d->target)
47392 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
47394 return true;
47397 /* For V*[QHS]Imode permutations, check whether the same permutation
47398 can instead be performed in a 2x, 4x or 8x wider inner mode. */
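/* For instance, the V16QImode permutation
   { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 }
   moves even/odd byte pairs together, so it narrows to the V8HImode
   permutation { 4, 5, 6, 7, 0, 1, 2, 3 }, then to V4SImode { 2, 3, 0, 1 },
   and finally to the V2DImode lane swap { 1, 0 }.  */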
47400 static bool
47401 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
47402 struct expand_vec_perm_d *nd)
47404 int i;
47405 machine_mode mode = VOIDmode;
47407 switch (d->vmode)
47409 case E_V16QImode: mode = V8HImode; break;
47410 case E_V32QImode: mode = V16HImode; break;
47411 case E_V64QImode: mode = V32HImode; break;
47412 case E_V8HImode: mode = V4SImode; break;
47413 case E_V16HImode: mode = V8SImode; break;
47414 case E_V32HImode: mode = V16SImode; break;
47415 case E_V4SImode: mode = V2DImode; break;
47416 case E_V8SImode: mode = V4DImode; break;
47417 case E_V16SImode: mode = V8DImode; break;
47418 default: return false;
47420 for (i = 0; i < d->nelt; i += 2)
47421 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
47422 return false;
47423 nd->vmode = mode;
47424 nd->nelt = d->nelt / 2;
47425 for (i = 0; i < nd->nelt; i++)
47426 nd->perm[i] = d->perm[2 * i] / 2;
47427 if (GET_MODE_INNER (mode) != DImode)
47428 canonicalize_vector_int_perm (nd, nd);
47429 if (nd != d)
47431 nd->one_operand_p = d->one_operand_p;
47432 nd->testing_p = d->testing_p;
47433 if (d->op0 == d->op1)
47434 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
47435 else
47437 nd->op0 = gen_lowpart (nd->vmode, d->op0);
47438 nd->op1 = gen_lowpart (nd->vmode, d->op1);
47440 if (d->testing_p)
47441 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
47442 else
47443 nd->target = gen_reg_rtx (nd->vmode);
47445 return true;
47448 /* Try to expand one-operand permutation with constant mask. */
47450 static bool
47451 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
47453 machine_mode mode = GET_MODE (d->op0);
47454 machine_mode maskmode = mode;
47455 rtx (*gen) (rtx, rtx, rtx) = NULL;
47456 rtx target, op0, mask;
47457 rtx vec[64];
47459 if (!rtx_equal_p (d->op0, d->op1))
47460 return false;
47462 if (!TARGET_AVX512F)
47463 return false;
47465 switch (mode)
47467 case E_V16SImode:
47468 gen = gen_avx512f_permvarv16si;
47469 break;
47470 case E_V16SFmode:
47471 gen = gen_avx512f_permvarv16sf;
47472 maskmode = V16SImode;
47473 break;
47474 case E_V8DImode:
47475 gen = gen_avx512f_permvarv8di;
47476 break;
47477 case E_V8DFmode:
47478 gen = gen_avx512f_permvarv8df;
47479 maskmode = V8DImode;
47480 break;
47481 default:
47482 return false;
47485 target = d->target;
47486 op0 = d->op0;
47487 for (int i = 0; i < d->nelt; ++i)
47488 vec[i] = GEN_INT (d->perm[i]);
47489 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
47490 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
47491 return true;
47494 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
47495 in a single instruction. */
47497 static bool
47498 expand_vec_perm_1 (struct expand_vec_perm_d *d)
47500 unsigned i, nelt = d->nelt;
47501 struct expand_vec_perm_d nd;
47503 /* Check plain VEC_SELECT first, because AVX has instructions that could
47504 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
47505 input where SEL+CONCAT may not. */
47506 if (d->one_operand_p)
47508 int mask = nelt - 1;
47509 bool identity_perm = true;
47510 bool broadcast_perm = true;
47512 for (i = 0; i < nelt; i++)
47514 nd.perm[i] = d->perm[i] & mask;
47515 if (nd.perm[i] != i)
47516 identity_perm = false;
47517 if (nd.perm[i])
47518 broadcast_perm = false;
47521 if (identity_perm)
47523 if (!d->testing_p)
47524 emit_move_insn (d->target, d->op0);
47525 return true;
47527 else if (broadcast_perm && TARGET_AVX2)
47529 /* Use vpbroadcast{b,w,d}. */
47530 rtx (*gen) (rtx, rtx) = NULL;
47531 switch (d->vmode)
47533 case E_V64QImode:
47534 if (TARGET_AVX512BW)
47535 gen = gen_avx512bw_vec_dupv64qi_1;
47536 break;
47537 case E_V32QImode:
47538 gen = gen_avx2_pbroadcastv32qi_1;
47539 break;
47540 case E_V32HImode:
47541 if (TARGET_AVX512BW)
47542 gen = gen_avx512bw_vec_dupv32hi_1;
47543 break;
47544 case E_V16HImode:
47545 gen = gen_avx2_pbroadcastv16hi_1;
47546 break;
47547 case E_V16SImode:
47548 if (TARGET_AVX512F)
47549 gen = gen_avx512f_vec_dupv16si_1;
47550 break;
47551 case E_V8SImode:
47552 gen = gen_avx2_pbroadcastv8si_1;
47553 break;
47554 case E_V16QImode:
47555 gen = gen_avx2_pbroadcastv16qi;
47556 break;
47557 case E_V8HImode:
47558 gen = gen_avx2_pbroadcastv8hi;
47559 break;
47560 case E_V16SFmode:
47561 if (TARGET_AVX512F)
47562 gen = gen_avx512f_vec_dupv16sf_1;
47563 break;
47564 case E_V8SFmode:
47565 gen = gen_avx2_vec_dupv8sf_1;
47566 break;
47567 case E_V8DFmode:
47568 if (TARGET_AVX512F)
47569 gen = gen_avx512f_vec_dupv8df_1;
47570 break;
47571 case E_V8DImode:
47572 if (TARGET_AVX512F)
47573 gen = gen_avx512f_vec_dupv8di_1;
47574 break;
47575 /* For other modes, prefer the other shuffles this function creates. */
47576 default: break;
47578 if (gen != NULL)
47580 if (!d->testing_p)
47581 emit_insn (gen (d->target, d->op0));
47582 return true;
47586 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
47587 return true;
47589 /* There are plenty of patterns in sse.md that are written for
47590 SEL+CONCAT and are not replicated for a single op. Perhaps
47591 that should be changed, to avoid the nastiness here. */
47593 /* Recognize interleave style patterns, which means incrementing
47594 every other permutation operand. */
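/* E.g. for V4SImode with d->perm == { 0, 0, 1, 1 } this builds
   nd.perm == { 0, 4, 1, 5 }, which on the concatenation of op0 with itself
   is exactly the interleave-low (punpckldq) pattern.  */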
47595 for (i = 0; i < nelt; i += 2)
47597 nd.perm[i] = d->perm[i] & mask;
47598 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
47600 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
47601 d->testing_p))
47602 return true;
47604 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
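/* E.g. for V4SFmode with d->perm == { 2, 0, 3, 1 } this builds
   nd.perm == { 2, 0, 7, 5 }: the low two elements come from the first copy
   of op0 and the high two from the second, matching the shufps pattern.  */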
47605 if (nelt >= 4)
47607 for (i = 0; i < nelt; i += 4)
47609 nd.perm[i + 0] = d->perm[i + 0] & mask;
47610 nd.perm[i + 1] = d->perm[i + 1] & mask;
47611 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
47612 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
47615 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
47616 d->testing_p))
47617 return true;
47621 /* Finally, try the fully general two operand permute. */
47622 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
47623 d->testing_p))
47624 return true;
47626 /* Recognize interleave style patterns with reversed operands. */
47627 if (!d->one_operand_p)
47629 for (i = 0; i < nelt; ++i)
47631 unsigned e = d->perm[i];
47632 if (e >= nelt)
47633 e -= nelt;
47634 else
47635 e += nelt;
47636 nd.perm[i] = e;
47639 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
47640 d->testing_p))
47641 return true;
47644 /* Try the SSE4.1 blend variable merge instructions. */
47645 if (expand_vec_perm_blend (d))
47646 return true;
47648 /* Try one of the AVX vpermil variable permutations. */
47649 if (expand_vec_perm_vpermil (d))
47650 return true;
47652 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
47653 vpshufb, vpermd, vpermps or vpermq variable permutation. */
47654 if (expand_vec_perm_pshufb (d))
47655 return true;
47657 /* Try the AVX2 vpalignr instruction. */
47658 if (expand_vec_perm_palignr (d, true))
47659 return true;
47661 /* Try the AVX512F vperm{s,d} instructions. */
47662 if (ix86_expand_vec_one_operand_perm_avx512 (d))
47663 return true;
47665 /* Try the AVX512F vpermi2 instructions. */
47666 if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
47667 return true;
47669 /* See if we can get the same permutation in different vector integer
47670 mode. */
47671 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
47673 if (!d->testing_p)
47674 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
47675 return true;
47677 return false;
47680 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
47681 in terms of a pair of pshuflw + pshufhw instructions. */
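/* For example, d->perm == { 3, 1, 2, 0, 7, 5, 6, 4 } is handled by a
   pshuflw with selector { 3, 1, 2, 0 } (high quadword untouched) followed
   by a pshufhw with selector { 7, 5, 6, 4 } (low quadword untouched).  */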
47683 static bool
47684 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
47686 unsigned char perm2[MAX_VECT_LEN];
47687 unsigned i;
47688 bool ok;
47690 if (d->vmode != V8HImode || !d->one_operand_p)
47691 return false;
47693 /* The two permutations only operate in 64-bit lanes. */
47694 for (i = 0; i < 4; ++i)
47695 if (d->perm[i] >= 4)
47696 return false;
47697 for (i = 4; i < 8; ++i)
47698 if (d->perm[i] < 4)
47699 return false;
47701 if (d->testing_p)
47702 return true;
47704 /* Emit the pshuflw. */
47705 memcpy (perm2, d->perm, 4);
47706 for (i = 4; i < 8; ++i)
47707 perm2[i] = i;
47708 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
47709 gcc_assert (ok);
47711 /* Emit the pshufhw. */
47712 memcpy (perm2 + 4, d->perm + 4, 4);
47713 for (i = 0; i < 4; ++i)
47714 perm2[i] = i;
47715 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
47716 gcc_assert (ok);
47718 return true;
47721 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47722 the permutation using the SSSE3 palignr instruction. This succeeds
47723 when all of the elements in PERM fit within one vector and we merely
47724 need to shift them down so that a single vector permutation has a
47725 chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
47726 the vpalignr instruction itself can perform the requested permutation. */
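/* As an illustration, a two-operand V16QImode permutation selecting
   elements { 3, 4, ..., 18 } has min == 3 and max - min == 15 < 16, so a
   single palignr of op1:op0 by 3 bytes already yields the elements in
   order and no follow-up shuffle is needed.  */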
47728 static bool
47729 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
47731 unsigned i, nelt = d->nelt;
47732 unsigned min, max, minswap, maxswap;
47733 bool in_order, ok, swap = false;
47734 rtx shift, target;
47735 struct expand_vec_perm_d dcopy;
47737 /* Even with AVX, palignr only operates on 128-bit vectors;
47738 with AVX2, palignr operates within each of the two 128-bit lanes. */
47739 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
47740 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
47741 return false;
47743 min = 2 * nelt;
47744 max = 0;
47745 minswap = 2 * nelt;
47746 maxswap = 0;
47747 for (i = 0; i < nelt; ++i)
47749 unsigned e = d->perm[i];
47750 unsigned eswap = d->perm[i] ^ nelt;
47751 if (GET_MODE_SIZE (d->vmode) == 32)
47753 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
47754 eswap = e ^ (nelt / 2);
47756 if (e < min)
47757 min = e;
47758 if (e > max)
47759 max = e;
47760 if (eswap < minswap)
47761 minswap = eswap;
47762 if (eswap > maxswap)
47763 maxswap = eswap;
47765 if (min == 0
47766 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
47768 if (d->one_operand_p
47769 || minswap == 0
47770 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
47771 ? nelt / 2 : nelt))
47772 return false;
47773 swap = true;
47774 min = minswap;
47775 max = maxswap;
47778 /* Given that we have SSSE3, we know we'll be able to implement the
47779 single operand permutation after the palignr with pshufb for
47780 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
47781 first. */
47782 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
47783 return true;
47785 dcopy = *d;
47786 if (swap)
47788 dcopy.op0 = d->op1;
47789 dcopy.op1 = d->op0;
47790 for (i = 0; i < nelt; ++i)
47791 dcopy.perm[i] ^= nelt;
47794 in_order = true;
47795 for (i = 0; i < nelt; ++i)
47797 unsigned e = dcopy.perm[i];
47798 if (GET_MODE_SIZE (d->vmode) == 32
47799 && e >= nelt
47800 && (e & (nelt / 2 - 1)) < min)
47801 e = e - min - (nelt / 2);
47802 else
47803 e = e - min;
47804 if (e != i)
47805 in_order = false;
47806 dcopy.perm[i] = e;
47808 dcopy.one_operand_p = true;
47810 if (single_insn_only_p && !in_order)
47811 return false;
47813 /* For AVX2, test whether we can permute the result in one instruction. */
47814 if (d->testing_p)
47816 if (in_order)
47817 return true;
47818 dcopy.op1 = dcopy.op0;
47819 return expand_vec_perm_1 (&dcopy);
47822 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
47823 if (GET_MODE_SIZE (d->vmode) == 16)
47825 target = gen_reg_rtx (TImode);
47826 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
47827 gen_lowpart (TImode, dcopy.op0), shift));
47829 else
47831 target = gen_reg_rtx (V2TImode);
47832 emit_insn (gen_avx2_palignrv2ti (target,
47833 gen_lowpart (V2TImode, dcopy.op1),
47834 gen_lowpart (V2TImode, dcopy.op0),
47835 shift));
47838 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
47840 /* Test for the degenerate case where the alignment by itself
47841 produces the desired permutation. */
47842 if (in_order)
47844 emit_move_insn (d->target, dcopy.op0);
47845 return true;
47848 ok = expand_vec_perm_1 (&dcopy);
47849 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
47851 return ok;
47854 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
47855 the permutation using the SSE4_1 pblendv instruction. Potentially
47856 reduces the permutation from 2 pshufb and an ior to 1 pshufb and a pblendv. */
47858 static bool
47859 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
47861 unsigned i, which, nelt = d->nelt;
47862 struct expand_vec_perm_d dcopy, dcopy1;
47863 machine_mode vmode = d->vmode;
47864 bool ok;
47866 /* Use the same checks as in expand_vec_perm_blend. */
47867 if (d->one_operand_p)
47868 return false;
47869 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
47871 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
47873 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
47875 else
47876 return false;
47878 /* Figure out which permutation elements do not stay in their
47879 respective lanes. */
47880 for (i = 0, which = 0; i < nelt; ++i)
47882 unsigned e = d->perm[i];
47883 if (e != i)
47884 which |= (e < nelt ? 1 : 2);
47886 /* We can pblend the part where elements do not stay in their
47887 respective lanes only when these elements all come from one
47888 half of the permutation.
47889 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
47890 lanes, but both are >= 8.
47891 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
47892 respective lanes, and 8 >= 8 but 2 is not. */
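/* Continuing the V8HImode example { 0 1 8 3 4 5 9 7 } above: only
   op1-sourced elements move (which == 2), so op1 is first shuffled with
   the one-operand permutation { 0, 1, 0, 3, 4, 5, 1, 7 }, and the result
   is then blended into op0 at positions 2 and 6 via
   { 0, 1, 10, 3, 4, 5, 14, 7 }.  */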
47893 if (which != 1 && which != 2)
47894 return false;
47895 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
47896 return true;
47898 /* First apply a one-operand permutation to the part whose
47899 elements do not stay in their respective lanes. */
47900 dcopy = *d;
47901 if (which == 2)
47902 dcopy.op0 = dcopy.op1 = d->op1;
47903 else
47904 dcopy.op0 = dcopy.op1 = d->op0;
47905 if (!d->testing_p)
47906 dcopy.target = gen_reg_rtx (vmode);
47907 dcopy.one_operand_p = true;
47909 for (i = 0; i < nelt; ++i)
47910 dcopy.perm[i] = d->perm[i] & (nelt - 1);
47912 ok = expand_vec_perm_1 (&dcopy);
47913 if (GET_MODE_SIZE (vmode) != 16 && !ok)
47914 return false;
47915 else
47916 gcc_assert (ok);
47917 if (d->testing_p)
47918 return true;
47920 /* Next we put permuted elements into their positions. */
47921 dcopy1 = *d;
47922 if (which == 2)
47923 dcopy1.op1 = dcopy.target;
47924 else
47925 dcopy1.op0 = dcopy.target;
47927 for (i = 0; i < nelt; ++i)
47928 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
47930 ok = expand_vec_perm_blend (&dcopy1);
47931 gcc_assert (ok);
47933 return true;
47936 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
47938 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47939 a two vector permutation into a single vector permutation by using
47940 an interleave operation to merge the vectors. */
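/* Illustrative example: the V4SImode permutation { 1, 5, 0, 4 } draws only
   on the low halves of both inputs, so punpckldq first produces
   { op0[0], op1[0], op0[1], op1[1] }, after which the remapped
   single-operand shuffle { 2, 3, 0, 1 } (e.g. a pshufd) yields the final
   result.  */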
47942 static bool
47943 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
47945 struct expand_vec_perm_d dremap, dfinal;
47946 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
47947 unsigned HOST_WIDE_INT contents;
47948 unsigned char remap[2 * MAX_VECT_LEN];
47949 rtx_insn *seq;
47950 bool ok, same_halves = false;
47952 if (GET_MODE_SIZE (d->vmode) == 16)
47954 if (d->one_operand_p)
47955 return false;
47957 else if (GET_MODE_SIZE (d->vmode) == 32)
47959 if (!TARGET_AVX)
47960 return false;
47961 /* For 32-byte modes allow this even when d->one_operand_p.
47962 The lack of cross-lane shuffling in some instructions
47963 might prevent a single insn shuffle. */
47964 dfinal = *d;
47965 dfinal.testing_p = true;
47966 /* If expand_vec_perm_interleave3 can expand this into
47967 a 3 insn sequence, give up and let it be expanded as
47968 a 3 insn sequence. While that is one insn longer,
47969 it doesn't need a memory operand, and in the common
47970 case where the interleave low and high permutations
47971 with the same operands are adjacent, the pair needs
47972 only 4 insns after CSE. */
47973 if (expand_vec_perm_interleave3 (&dfinal))
47974 return false;
47976 else
47977 return false;
47979 /* Examine from whence the elements come. */
47980 contents = 0;
47981 for (i = 0; i < nelt; ++i)
47982 contents |= HOST_WIDE_INT_1U << d->perm[i];
47984 memset (remap, 0xff, sizeof (remap));
47985 dremap = *d;
47987 if (GET_MODE_SIZE (d->vmode) == 16)
47989 unsigned HOST_WIDE_INT h1, h2, h3, h4;
47991 /* Split the two input vectors into 4 halves. */
47992 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
47993 h2 = h1 << nelt2;
47994 h3 = h2 << nelt2;
47995 h4 = h3 << nelt2;
47997 /* If the elements all come from the low halves, use interleave low;
47998 similarly for interleave high. If the elements come from mismatched
47999 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
48000 if ((contents & (h1 | h3)) == contents)
48002 /* punpckl* */
48003 for (i = 0; i < nelt2; ++i)
48005 remap[i] = i * 2;
48006 remap[i + nelt] = i * 2 + 1;
48007 dremap.perm[i * 2] = i;
48008 dremap.perm[i * 2 + 1] = i + nelt;
48010 if (!TARGET_SSE2 && d->vmode == V4SImode)
48011 dremap.vmode = V4SFmode;
48013 else if ((contents & (h2 | h4)) == contents)
48015 /* punpckh* */
48016 for (i = 0; i < nelt2; ++i)
48018 remap[i + nelt2] = i * 2;
48019 remap[i + nelt + nelt2] = i * 2 + 1;
48020 dremap.perm[i * 2] = i + nelt2;
48021 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
48023 if (!TARGET_SSE2 && d->vmode == V4SImode)
48024 dremap.vmode = V4SFmode;
48026 else if ((contents & (h1 | h4)) == contents)
48028 /* shufps */
48029 for (i = 0; i < nelt2; ++i)
48031 remap[i] = i;
48032 remap[i + nelt + nelt2] = i + nelt2;
48033 dremap.perm[i] = i;
48034 dremap.perm[i + nelt2] = i + nelt + nelt2;
48036 if (nelt != 4)
48038 /* shufpd */
48039 dremap.vmode = V2DImode;
48040 dremap.nelt = 2;
48041 dremap.perm[0] = 0;
48042 dremap.perm[1] = 3;
48045 else if ((contents & (h2 | h3)) == contents)
48047 /* shufps */
48048 for (i = 0; i < nelt2; ++i)
48050 remap[i + nelt2] = i;
48051 remap[i + nelt] = i + nelt2;
48052 dremap.perm[i] = i + nelt2;
48053 dremap.perm[i + nelt2] = i + nelt;
48055 if (nelt != 4)
48057 /* shufpd */
48058 dremap.vmode = V2DImode;
48059 dremap.nelt = 2;
48060 dremap.perm[0] = 1;
48061 dremap.perm[1] = 2;
48064 else
48065 return false;
48067 else
48069 unsigned int nelt4 = nelt / 4, nzcnt = 0;
48070 unsigned HOST_WIDE_INT q[8];
48071 unsigned int nonzero_halves[4];
48073 /* Split the two input vectors into 8 quarters. */
48074 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
48075 for (i = 1; i < 8; ++i)
48076 q[i] = q[0] << (nelt4 * i);
48077 for (i = 0; i < 4; ++i)
48078 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
48080 nonzero_halves[nzcnt] = i;
48081 ++nzcnt;
48084 if (nzcnt == 1)
48086 gcc_assert (d->one_operand_p);
48087 nonzero_halves[1] = nonzero_halves[0];
48088 same_halves = true;
48090 else if (d->one_operand_p)
48092 gcc_assert (nonzero_halves[0] == 0);
48093 gcc_assert (nonzero_halves[1] == 1);
48096 if (nzcnt <= 2)
48098 if (d->perm[0] / nelt2 == nonzero_halves[1])
48100 /* Attempt to increase the likelihood that dfinal
48101 shuffle will be intra-lane. */
48102 std::swap (nonzero_halves[0], nonzero_halves[1]);
48105 /* vperm2f128 or vperm2i128. */
48106 for (i = 0; i < nelt2; ++i)
48108 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
48109 remap[i + nonzero_halves[0] * nelt2] = i;
48110 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
48111 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
48114 if (d->vmode != V8SFmode
48115 && d->vmode != V4DFmode
48116 && d->vmode != V8SImode)
48118 dremap.vmode = V8SImode;
48119 dremap.nelt = 8;
48120 for (i = 0; i < 4; ++i)
48122 dremap.perm[i] = i + nonzero_halves[0] * 4;
48123 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
48127 else if (d->one_operand_p)
48128 return false;
48129 else if (TARGET_AVX2
48130 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
48132 /* vpunpckl* */
48133 for (i = 0; i < nelt4; ++i)
48135 remap[i] = i * 2;
48136 remap[i + nelt] = i * 2 + 1;
48137 remap[i + nelt2] = i * 2 + nelt2;
48138 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
48139 dremap.perm[i * 2] = i;
48140 dremap.perm[i * 2 + 1] = i + nelt;
48141 dremap.perm[i * 2 + nelt2] = i + nelt2;
48142 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
48145 else if (TARGET_AVX2
48146 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
48148 /* vpunpckh* */
48149 for (i = 0; i < nelt4; ++i)
48151 remap[i + nelt4] = i * 2;
48152 remap[i + nelt + nelt4] = i * 2 + 1;
48153 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
48154 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
48155 dremap.perm[i * 2] = i + nelt4;
48156 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
48157 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
48158 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
48161 else
48162 return false;
48165 /* Use the remapping array set up above to move the elements from their
48166 swizzled locations into their final destinations. */
48167 dfinal = *d;
48168 for (i = 0; i < nelt; ++i)
48170 unsigned e = remap[d->perm[i]];
48171 gcc_assert (e < nelt);
48172 /* If same_halves is true, both halves of the remapped vector are the
48173 same. Avoid cross-lane accesses if possible. */
48174 if (same_halves && i >= nelt2)
48176 gcc_assert (e < nelt2);
48177 dfinal.perm[i] = e + nelt2;
48179 else
48180 dfinal.perm[i] = e;
48182 if (!d->testing_p)
48184 dremap.target = gen_reg_rtx (dremap.vmode);
48185 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
48187 dfinal.op1 = dfinal.op0;
48188 dfinal.one_operand_p = true;
48190 /* Test if the final remap can be done with a single insn. For V4SFmode or
48191 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
48192 start_sequence ();
48193 ok = expand_vec_perm_1 (&dfinal);
48194 seq = get_insns ();
48195 end_sequence ();
48197 if (!ok)
48198 return false;
48200 if (d->testing_p)
48201 return true;
48203 if (dremap.vmode != dfinal.vmode)
48205 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
48206 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
48209 ok = expand_vec_perm_1 (&dremap);
48210 gcc_assert (ok);
48212 emit_insn (seq);
48213 return true;
48216 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
48217 a single vector cross-lane permutation into vpermq followed
48218 by any of the single insn permutations. */
48220 static bool
48221 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
48223 struct expand_vec_perm_d dremap, dfinal;
48224 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
48225 unsigned contents[2];
48226 bool ok;
48228 if (!(TARGET_AVX2
48229 && (d->vmode == V32QImode || d->vmode == V16HImode)
48230 && d->one_operand_p))
48231 return false;
48233 contents[0] = 0;
48234 contents[1] = 0;
48235 for (i = 0; i < nelt2; ++i)
48237 contents[0] |= 1u << (d->perm[i] / nelt4);
48238 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
48241 for (i = 0; i < 2; ++i)
48243 unsigned int cnt = 0;
48244 for (j = 0; j < 4; ++j)
48245 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
48246 return false;
48249 if (d->testing_p)
48250 return true;
48252 dremap = *d;
48253 dremap.vmode = V4DImode;
48254 dremap.nelt = 4;
48255 dremap.target = gen_reg_rtx (V4DImode);
48256 dremap.op0 = gen_lowpart (V4DImode, d->op0);
48257 dremap.op1 = dremap.op0;
48258 dremap.one_operand_p = true;
48259 for (i = 0; i < 2; ++i)
48261 unsigned int cnt = 0;
48262 for (j = 0; j < 4; ++j)
48263 if ((contents[i] & (1u << j)) != 0)
48264 dremap.perm[2 * i + cnt++] = j;
48265 for (; cnt < 2; ++cnt)
48266 dremap.perm[2 * i + cnt] = 0;
48269 dfinal = *d;
48270 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
48271 dfinal.op1 = dfinal.op0;
48272 dfinal.one_operand_p = true;
48273 for (i = 0, j = 0; i < nelt; ++i)
48275 if (i == nelt2)
48276 j = 2;
48277 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
48278 if ((d->perm[i] / nelt4) == dremap.perm[j])
48280 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
48281 dfinal.perm[i] |= nelt4;
48282 else
48283 gcc_unreachable ();
48286 ok = expand_vec_perm_1 (&dremap);
48287 gcc_assert (ok);
48289 ok = expand_vec_perm_1 (&dfinal);
48290 gcc_assert (ok);
48292 return true;
48295 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
48296 a vector permutation using two instructions, vperm2f128 resp.
48297 vperm2i128 followed by any single in-lane permutation. */
48299 static bool
48300 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
48302 struct expand_vec_perm_d dfirst, dsecond;
48303 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
48304 bool ok;
48306 if (!TARGET_AVX
48307 || GET_MODE_SIZE (d->vmode) != 32
48308 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
48309 return false;
48311 dsecond = *d;
48312 dsecond.one_operand_p = false;
48313 dsecond.testing_p = true;
48315 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
48316 immediate. For perm < 16 the second permutation uses
48317 d->op0 as first operand, for perm >= 16 it uses d->op1
48318 as first operand. The second operand is the result of
48319 vperm2[fi]128. */
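/* In other words, the two 2-bit fields of PERM select which 128-bit source
   lane (0 = op0 low, 1 = op0 high, 2 = op1 low, 3 = op1 high) feeds the
   low resp. high lane of the vperm2[fi]128 result.  E.g. perm == 6
   (0b0110) requests op1's low lane and op0's high lane and yields the
   immediate ((6 << 2) | 6) & 0x33 == 0x12.  */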
48320 for (perm = 0; perm < 32; perm++)
48322 /* Ignore permutations which do not move anything cross-lane. */
48323 if (perm < 16)
48325 /* The second shuffle for e.g. V4DFmode has
48326 0123 and ABCD operands.
48327 Ignore AB23, as 23 is already in the second lane
48328 of the first operand. */
48329 if ((perm & 0xc) == (1 << 2)) continue;
48330 /* And 01CD, as 01 is in the first lane of the first
48331 operand. */
48332 if ((perm & 3) == 0) continue;
48333 /* And 4567, as then the vperm2[fi]128 doesn't change
48334 anything on the original 4567 second operand. */
48335 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
48337 else
48339 /* The second shuffle for e.g. V4DFmode has
48340 4567 and ABCD operands.
48341 Ignore AB67, as 67 is already in the second lane
48342 of the first operand. */
48343 if ((perm & 0xc) == (3 << 2)) continue;
48344 /* And 45CD, as 45 is in the first lane of the first
48345 operand. */
48346 if ((perm & 3) == 2) continue;
48347 /* And 0123, as then the vperm2[fi]128 doesn't change
48348 anything on the original 0123 first operand. */
48349 if ((perm & 0xf) == (1 << 2)) continue;
48352 for (i = 0; i < nelt; i++)
48354 j = d->perm[i] / nelt2;
48355 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
48356 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
48357 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
48358 dsecond.perm[i] = d->perm[i] & (nelt - 1);
48359 else
48360 break;
48363 if (i == nelt)
48365 start_sequence ();
48366 ok = expand_vec_perm_1 (&dsecond);
48367 end_sequence ();
48369 else
48370 ok = false;
48372 if (ok)
48374 if (d->testing_p)
48375 return true;
48377 /* Found a usable second shuffle. dfirst will be
48378 vperm2f128 on d->op0 and d->op1. */
48379 dsecond.testing_p = false;
48380 dfirst = *d;
48381 dfirst.target = gen_reg_rtx (d->vmode);
48382 for (i = 0; i < nelt; i++)
48383 dfirst.perm[i] = (i & (nelt2 - 1))
48384 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
48386 canonicalize_perm (&dfirst);
48387 ok = expand_vec_perm_1 (&dfirst);
48388 gcc_assert (ok);
48390 /* And dsecond is some single insn shuffle, taking
48391 d->op0 and result of vperm2f128 (if perm < 16) or
48392 d->op1 and result of vperm2f128 (otherwise). */
48393 if (perm >= 16)
48394 dsecond.op0 = dsecond.op1;
48395 dsecond.op1 = dfirst.target;
48397 ok = expand_vec_perm_1 (&dsecond);
48398 gcc_assert (ok);
48400 return true;
48403 /* For one operand, the only useful vperm2f128 permutation is 0x01
48404 aka lanes swap. */
48405 if (d->one_operand_p)
48406 return false;
48409 return false;
48412 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
48413 a two vector permutation using 2 intra-lane interleave insns
48414 and cross-lane shuffle for 32-byte vectors. */
48416 static bool
48417 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
48419 unsigned i, nelt;
48420 rtx (*gen) (rtx, rtx, rtx);
48422 if (d->one_operand_p)
48423 return false;
48424 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
48426 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
48428 else
48429 return false;
48431 nelt = d->nelt;
48432 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
48433 return false;
48434 for (i = 0; i < nelt; i += 2)
48435 if (d->perm[i] != d->perm[0] + i / 2
48436 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
48437 return false;
48439 if (d->testing_p)
48440 return true;
48442 switch (d->vmode)
48444 case E_V32QImode:
48445 if (d->perm[0])
48446 gen = gen_vec_interleave_highv32qi;
48447 else
48448 gen = gen_vec_interleave_lowv32qi;
48449 break;
48450 case E_V16HImode:
48451 if (d->perm[0])
48452 gen = gen_vec_interleave_highv16hi;
48453 else
48454 gen = gen_vec_interleave_lowv16hi;
48455 break;
48456 case E_V8SImode:
48457 if (d->perm[0])
48458 gen = gen_vec_interleave_highv8si;
48459 else
48460 gen = gen_vec_interleave_lowv8si;
48461 break;
48462 case E_V4DImode:
48463 if (d->perm[0])
48464 gen = gen_vec_interleave_highv4di;
48465 else
48466 gen = gen_vec_interleave_lowv4di;
48467 break;
48468 case E_V8SFmode:
48469 if (d->perm[0])
48470 gen = gen_vec_interleave_highv8sf;
48471 else
48472 gen = gen_vec_interleave_lowv8sf;
48473 break;
48474 case E_V4DFmode:
48475 if (d->perm[0])
48476 gen = gen_vec_interleave_highv4df;
48477 else
48478 gen = gen_vec_interleave_lowv4df;
48479 break;
48480 default:
48481 gcc_unreachable ();
48484 emit_insn (gen (d->target, d->op0, d->op1));
48485 return true;
48488 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
48489 a single vector permutation using a single intra-lane vector
48490 permutation, vperm2f128 swapping the lanes and vblend* insn blending
48491 the non-swapped and swapped vectors together. */
48493 static bool
48494 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
48496 struct expand_vec_perm_d dfirst, dsecond;
48497 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
48498 rtx_insn *seq;
48499 bool ok;
48500 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
48502 if (!TARGET_AVX
48503 || TARGET_AVX2
48504 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
48505 || !d->one_operand_p)
48506 return false;
48508 dfirst = *d;
48509 for (i = 0; i < nelt; i++)
48510 dfirst.perm[i] = 0xff;
48511 for (i = 0, msk = 0; i < nelt; i++)
48513 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
48514 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
48515 return false;
48516 dfirst.perm[j] = d->perm[i];
48517 if (j != i)
48518 msk |= (1 << i);
48520 for (i = 0; i < nelt; i++)
48521 if (dfirst.perm[i] == 0xff)
48522 dfirst.perm[i] = i;
48524 if (!d->testing_p)
48525 dfirst.target = gen_reg_rtx (dfirst.vmode);
48527 start_sequence ();
48528 ok = expand_vec_perm_1 (&dfirst);
48529 seq = get_insns ();
48530 end_sequence ();
48532 if (!ok)
48533 return false;
48535 if (d->testing_p)
48536 return true;
48538 emit_insn (seq);
48540 dsecond = *d;
48541 dsecond.op0 = dfirst.target;
48542 dsecond.op1 = dfirst.target;
48543 dsecond.one_operand_p = true;
48544 dsecond.target = gen_reg_rtx (dsecond.vmode);
48545 for (i = 0; i < nelt; i++)
48546 dsecond.perm[i] = i ^ nelt2;
48548 ok = expand_vec_perm_1 (&dsecond);
48549 gcc_assert (ok);
48551 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
48552 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
48553 return true;
48556 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
48557 permutation using two vperm2f128, followed by a vshufpd insn blending
48558 the two vectors together. */
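/* Sketch of the decomposition for d->perm == { 2, 4, 1, 7 }: the first
   vperm2f128 gathers { 2, 3, 0, 1 }, the second gathers { 4, 5, 6, 7 },
   and the final vshufpd { 0, 4, 3, 7 } picks one element from each per
   lane to produce { op0[2], op1[0], op0[1], op1[3] }.  */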
48560 static bool
48561 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
48563 struct expand_vec_perm_d dfirst, dsecond, dthird;
48564 bool ok;
48566 if (!TARGET_AVX || (d->vmode != V4DFmode))
48567 return false;
48569 if (d->testing_p)
48570 return true;
48572 dfirst = *d;
48573 dsecond = *d;
48574 dthird = *d;
48576 dfirst.perm[0] = (d->perm[0] & ~1);
48577 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
48578 dfirst.perm[2] = (d->perm[2] & ~1);
48579 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
48580 dsecond.perm[0] = (d->perm[1] & ~1);
48581 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
48582 dsecond.perm[2] = (d->perm[3] & ~1);
48583 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
48584 dthird.perm[0] = (d->perm[0] % 2);
48585 dthird.perm[1] = (d->perm[1] % 2) + 4;
48586 dthird.perm[2] = (d->perm[2] % 2) + 2;
48587 dthird.perm[3] = (d->perm[3] % 2) + 6;
48589 dfirst.target = gen_reg_rtx (dfirst.vmode);
48590 dsecond.target = gen_reg_rtx (dsecond.vmode);
48591 dthird.op0 = dfirst.target;
48592 dthird.op1 = dsecond.target;
48593 dthird.one_operand_p = false;
48595 canonicalize_perm (&dfirst);
48596 canonicalize_perm (&dsecond);
48598 ok = expand_vec_perm_1 (&dfirst)
48599 && expand_vec_perm_1 (&dsecond)
48600 && expand_vec_perm_1 (&dthird);
48602 gcc_assert (ok);
48604 return true;
48607 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
48608 permutation with two pshufb insns and an ior. We should have already
48609 failed all two instruction sequences. */
48611 static bool
48612 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
48614 rtx rperm[2][16], vperm, l, h, op, m128;
48615 unsigned int i, nelt, eltsz;
48617 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
48618 return false;
48619 gcc_assert (!d->one_operand_p);
48621 if (d->testing_p)
48622 return true;
48624 nelt = d->nelt;
48625 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48627 /* Generate two permutation masks. If the required element is within
48628 the given vector it is shuffled into the proper lane. If the required
48629 element is in the other vector, force a zero into the lane by setting
48630 bit 7 in the permutation mask. */
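/* For instance, in V8HImode (eltsz == 2), if d->perm[i] selects element 3
   of op1 (i.e. d->perm[i] == 11), the op1 mask gets the byte indexes
   { 6, 7 } at position i while the op0 mask gets -128 there, so the
   subsequent ior keeps only op1's contribution.  */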
48631 m128 = GEN_INT (-128);
48632 for (i = 0; i < nelt; ++i)
48634 unsigned j, e = d->perm[i];
48635 unsigned which = (e >= nelt);
48636 if (e >= nelt)
48637 e -= nelt;
48639 for (j = 0; j < eltsz; ++j)
48641 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
48642 rperm[1-which][i*eltsz + j] = m128;
48646 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
48647 vperm = force_reg (V16QImode, vperm);
48649 l = gen_reg_rtx (V16QImode);
48650 op = gen_lowpart (V16QImode, d->op0);
48651 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
48653 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
48654 vperm = force_reg (V16QImode, vperm);
48656 h = gen_reg_rtx (V16QImode);
48657 op = gen_lowpart (V16QImode, d->op1);
48658 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
48660 op = d->target;
48661 if (d->vmode != V16QImode)
48662 op = gen_reg_rtx (V16QImode);
48663 emit_insn (gen_iorv16qi3 (op, l, h));
48664 if (op != d->target)
48665 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48667 return true;
48670 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
48671 with two vpshufb insns, vpermq and vpor. We should have already failed
48672 all two or three instruction sequences. */
48674 static bool
48675 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
48677 rtx rperm[2][32], vperm, l, h, hp, op, m128;
48678 unsigned int i, nelt, eltsz;
48680 if (!TARGET_AVX2
48681 || !d->one_operand_p
48682 || (d->vmode != V32QImode && d->vmode != V16HImode))
48683 return false;
48685 if (d->testing_p)
48686 return true;
48688 nelt = d->nelt;
48689 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48691 /* Generate two permutation masks. If the required element is within
48692 the same lane, it is shuffled in. If the required element is from the
48693 other lane, force a zero by setting bit 7 in the permutation mask.
48694 The other mask has non-negative elements where an element is
48695 requested from the other lane; those elements are also moved to the
48696 other lane, so that the result of vpshufb can have the two V2TImode
48697 halves swapped. */
48698 m128 = GEN_INT (-128);
48699 for (i = 0; i < nelt; ++i)
48701 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48702 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
48704 for (j = 0; j < eltsz; ++j)
48706 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
48707 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
48711 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
48712 vperm = force_reg (V32QImode, vperm);
48714 h = gen_reg_rtx (V32QImode);
48715 op = gen_lowpart (V32QImode, d->op0);
48716 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
48718 /* Swap the 128-bit lanes of h into hp. */
48719 hp = gen_reg_rtx (V4DImode);
48720 op = gen_lowpart (V4DImode, h);
48721 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
48722 const1_rtx));
48724 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
48725 vperm = force_reg (V32QImode, vperm);
48727 l = gen_reg_rtx (V32QImode);
48728 op = gen_lowpart (V32QImode, d->op0);
48729 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
48731 op = d->target;
48732 if (d->vmode != V32QImode)
48733 op = gen_reg_rtx (V32QImode);
48734 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
48735 if (op != d->target)
48736 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48738 return true;
48741 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48742 and extract-odd permutations of two V32QImode or V16HImode operands
48743 with two vpshufb insns, vpor and vpermq. We should have already
48744 failed all two or three instruction sequences. */
48746 static bool
48747 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
48749 rtx rperm[2][32], vperm, l, h, ior, op, m128;
48750 unsigned int i, nelt, eltsz;
48752 if (!TARGET_AVX2
48753 || d->one_operand_p
48754 || (d->vmode != V32QImode && d->vmode != V16HImode))
48755 return false;
48757 for (i = 0; i < d->nelt; ++i)
48758 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
48759 return false;
48761 if (d->testing_p)
48762 return true;
48764 nelt = d->nelt;
48765 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48767 /* Generate two permutation masks. In the first permutation mask
48768 the first quarter will contain indexes for the first half
48769 of the op0, the second quarter will contain bit 7 set, third quarter
48770 will contain indexes for the second half of the op0 and the
48771 last quarter bit 7 set. In the second permutation mask
48772 the first quarter will contain bit 7 set, the second quarter
48773 indexes for the first half of the op1, the third quarter bit 7 set
48774 and last quarter indexes for the second half of the op1.
48775 I.e. the first mask e.g. for V32QImode extract even will be:
48776 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
48777 (all values masked with 0xf except for -128) and second mask
48778 for extract even will be
48779 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
48780 m128 = GEN_INT (-128);
48781 for (i = 0; i < nelt; ++i)
48783 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48784 unsigned which = d->perm[i] >= nelt;
48785 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
48787 for (j = 0; j < eltsz; ++j)
48789 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
48790 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
48794 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
48795 vperm = force_reg (V32QImode, vperm);
48797 l = gen_reg_rtx (V32QImode);
48798 op = gen_lowpart (V32QImode, d->op0);
48799 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
48801 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
48802 vperm = force_reg (V32QImode, vperm);
48804 h = gen_reg_rtx (V32QImode);
48805 op = gen_lowpart (V32QImode, d->op1);
48806 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
48808 ior = gen_reg_rtx (V32QImode);
48809 emit_insn (gen_iorv32qi3 (ior, l, h));
48811 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
48812 op = gen_reg_rtx (V4DImode);
48813 ior = gen_lowpart (V4DImode, ior);
48814 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
48815 const1_rtx, GEN_INT (3)));
48816 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48818 return true;
48821 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48822 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
48823 with two "and" and "pack" or two "shift" and "pack" insns. We should
48824 have already failed all two instruction sequences. */
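/* Rough sketch of the V16QImode case: for extract-even both operands are
   viewed as V8HImode and ANDed with a vector of 0x00ff words to clear the
   odd bytes, and packuswb packs the remaining low bytes; for extract-odd
   the operands are logically shifted right by 8 bits instead.  The 256-bit
   AVX2 variants additionally need the final vpermq { 0, 2, 1, 3 } emitted
   below to undo the lane interleaving of the pack instructions.  */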
48826 static bool
48827 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
48829 rtx op, dop0, dop1, t, rperm[16];
48830 unsigned i, odd, c, s, nelt = d->nelt;
48831 bool end_perm = false;
48832 machine_mode half_mode;
48833 rtx (*gen_and) (rtx, rtx, rtx);
48834 rtx (*gen_pack) (rtx, rtx, rtx);
48835 rtx (*gen_shift) (rtx, rtx, rtx);
48837 if (d->one_operand_p)
48838 return false;
48840 switch (d->vmode)
48842 case E_V8HImode:
48843 /* Required for "pack". */
48844 if (!TARGET_SSE4_1)
48845 return false;
48846 c = 0xffff;
48847 s = 16;
48848 half_mode = V4SImode;
48849 gen_and = gen_andv4si3;
48850 gen_pack = gen_sse4_1_packusdw;
48851 gen_shift = gen_lshrv4si3;
48852 break;
48853 case E_V16QImode:
48854 /* No check as all instructions are SSE2. */
48855 c = 0xff;
48856 s = 8;
48857 half_mode = V8HImode;
48858 gen_and = gen_andv8hi3;
48859 gen_pack = gen_sse2_packuswb;
48860 gen_shift = gen_lshrv8hi3;
48861 break;
48862 case E_V16HImode:
48863 if (!TARGET_AVX2)
48864 return false;
48865 c = 0xffff;
48866 s = 16;
48867 half_mode = V8SImode;
48868 gen_and = gen_andv8si3;
48869 gen_pack = gen_avx2_packusdw;
48870 gen_shift = gen_lshrv8si3;
48871 end_perm = true;
48872 break;
48873 case E_V32QImode:
48874 if (!TARGET_AVX2)
48875 return false;
48876 c = 0xff;
48877 s = 8;
48878 half_mode = V16HImode;
48879 gen_and = gen_andv16hi3;
48880 gen_pack = gen_avx2_packuswb;
48881 gen_shift = gen_lshrv16hi3;
48882 end_perm = true;
48883 break;
48884 default:
48885 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
48886 general shuffles. */
48887 return false;
48890 /* Check that permutation is even or odd. */
48891 odd = d->perm[0];
48892 if (odd > 1)
48893 return false;
48895 for (i = 1; i < nelt; ++i)
48896 if (d->perm[i] != 2 * i + odd)
48897 return false;
48899 if (d->testing_p)
48900 return true;
48902 dop0 = gen_reg_rtx (half_mode);
48903 dop1 = gen_reg_rtx (half_mode);
48904 if (odd == 0)
48906 for (i = 0; i < nelt / 2; i++)
48907 rperm[i] = GEN_INT (c);
48908 t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
48909 t = force_reg (half_mode, t);
48910 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
48911 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
48913 else
48915 emit_insn (gen_shift (dop0,
48916 gen_lowpart (half_mode, d->op0),
48917 GEN_INT (s)));
48918 emit_insn (gen_shift (dop1,
48919 gen_lowpart (half_mode, d->op1),
48920 GEN_INT (s)));
48922 /* For the AVX2 256-bit case we need to permute the pack result. */
48923 if (TARGET_AVX2 && end_perm)
48925 op = gen_reg_rtx (d->vmode);
48926 t = gen_reg_rtx (V4DImode);
48927 emit_insn (gen_pack (op, dop0, dop1));
48928 emit_insn (gen_avx2_permv4di_1 (t,
48929 gen_lowpart (V4DImode, op),
48930 const0_rtx,
48931 const2_rtx,
48932 const1_rtx,
48933 GEN_INT (3)));
48934 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
48936 else
48937 emit_insn (gen_pack (d->target, dop0, dop1));
48939 return true;
48942 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48943 and extract-odd permutations of two V64QI operands
48944 with two "shifts", two "truncs" and one "concat" insns for "odd"
48945 and two "truncs" and one "concat" insn for "even".
48946 We should have already failed all two instruction sequences. */
48948 static bool
48949 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
48951 rtx t1, t2, t3, t4;
48952 unsigned i, odd, nelt = d->nelt;
48954 if (!TARGET_AVX512BW
48955 || d->one_operand_p
48956 || d->vmode != V64QImode)
48957 return false;
48959 /* Check that permutation is even or odd. */
48960 odd = d->perm[0];
48961 if (odd > 1)
48962 return false;
48964 for (i = 1; i < nelt; ++i)
48965 if (d->perm[i] != 2 * i + odd)
48966 return false;
48968 if (d->testing_p)
48969 return true;
48972 if (odd)
48974 t1 = gen_reg_rtx (V32HImode);
48975 t2 = gen_reg_rtx (V32HImode);
48976 emit_insn (gen_lshrv32hi3 (t1,
48977 gen_lowpart (V32HImode, d->op0),
48978 GEN_INT (8)));
48979 emit_insn (gen_lshrv32hi3 (t2,
48980 gen_lowpart (V32HImode, d->op1),
48981 GEN_INT (8)));
48983 else
48985 t1 = gen_lowpart (V32HImode, d->op0);
48986 t2 = gen_lowpart (V32HImode, d->op1);
48989 t3 = gen_reg_rtx (V32QImode);
48990 t4 = gen_reg_rtx (V32QImode);
48991 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
48992 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
48993 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
48995 return true;
48998 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
48999 and extract-odd permutations. */
49001 static bool
49002 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
49004 rtx t1, t2, t3, t4, t5;
49006 switch (d->vmode)
49008 case E_V4DFmode:
49009 if (d->testing_p)
49010 break;
49011 t1 = gen_reg_rtx (V4DFmode);
49012 t2 = gen_reg_rtx (V4DFmode);
49014 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
49015 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
49016 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
49018 /* Now an unpck[lh]pd will produce the result required. */
49019 if (odd)
49020 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
49021 else
49022 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
49023 emit_insn (t3);
49024 break;
49026 case E_V8SFmode:
49028 int mask = odd ? 0xdd : 0x88;
49030 if (d->testing_p)
49031 break;
49032 t1 = gen_reg_rtx (V8SFmode);
49033 t2 = gen_reg_rtx (V8SFmode);
49034 t3 = gen_reg_rtx (V8SFmode);
49036 /* Shuffle within the 128-bit lanes to produce:
49037 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
49038 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
49039 GEN_INT (mask)));
49041 /* Shuffle the lanes around to produce:
49042 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
49043 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
49044 GEN_INT (0x3)));
49046 /* Shuffle within the 128-bit lanes to produce:
49047 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
49048 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
49050 /* Shuffle within the 128-bit lanes to produce:
49051 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
49052 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
49054 /* Shuffle the lanes around to produce:
49055 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
49056 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
49057 GEN_INT (0x20)));
49059 break;
49061 case E_V2DFmode:
49062 case E_V4SFmode:
49063 case E_V2DImode:
49064 case E_V4SImode:
49065 /* These are always directly implementable by expand_vec_perm_1. */
49066 gcc_unreachable ();
49068 case E_V8HImode:
49069 if (TARGET_SSE4_1)
49070 return expand_vec_perm_even_odd_pack (d);
49071 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
49072 return expand_vec_perm_pshufb2 (d);
49073 else
49075 if (d->testing_p)
49076 break;
49077 /* We need 2*log2(N)-1 operations to achieve odd/even
49078 with interleave. */
49079 t1 = gen_reg_rtx (V8HImode);
49080 t2 = gen_reg_rtx (V8HImode);
49081 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
49082 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
49083 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
49084 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
49085 if (odd)
49086 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
49087 else
49088 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
49089 emit_insn (t3);
49091 break;
49093 case E_V16QImode:
49094 return expand_vec_perm_even_odd_pack (d);
49096 case E_V16HImode:
49097 case E_V32QImode:
49098 return expand_vec_perm_even_odd_pack (d);
49100 case E_V64QImode:
49101 return expand_vec_perm_even_odd_trunc (d);
49103 case E_V4DImode:
49104 if (!TARGET_AVX2)
49106 struct expand_vec_perm_d d_copy = *d;
49107 d_copy.vmode = V4DFmode;
49108 if (d->testing_p)
49109 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
49110 else
49111 d_copy.target = gen_reg_rtx (V4DFmode);
49112 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
49113 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
49114 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
49116 if (!d->testing_p)
49117 emit_move_insn (d->target,
49118 gen_lowpart (V4DImode, d_copy.target));
49119 return true;
49121 return false;
49124 if (d->testing_p)
49125 break;
49127 t1 = gen_reg_rtx (V4DImode);
49128 t2 = gen_reg_rtx (V4DImode);
49130 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
49131 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
49132 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
49134 /* Now a vpunpck[lh]qdq will produce the result required. */
49135 if (odd)
49136 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
49137 else
49138 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
49139 emit_insn (t3);
49140 break;
49142 case E_V8SImode:
49143 if (!TARGET_AVX2)
49145 struct expand_vec_perm_d d_copy = *d;
49146 d_copy.vmode = V8SFmode;
49147 if (d->testing_p)
49148 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
49149 else
49150 d_copy.target = gen_reg_rtx (V8SFmode);
49151 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
49152 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
49153 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
49155 if (!d->testing_p)
49156 emit_move_insn (d->target,
49157 gen_lowpart (V8SImode, d_copy.target));
49158 return true;
49160 return false;
49163 if (d->testing_p)
49164 break;
49166 t1 = gen_reg_rtx (V8SImode);
49167 t2 = gen_reg_rtx (V8SImode);
49168 t3 = gen_reg_rtx (V4DImode);
49169 t4 = gen_reg_rtx (V4DImode);
49170 t5 = gen_reg_rtx (V4DImode);
49172 /* Shuffle the lanes around into
49173 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
49174 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
49175 gen_lowpart (V4DImode, d->op1),
49176 GEN_INT (0x20)));
49177 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
49178 gen_lowpart (V4DImode, d->op1),
49179 GEN_INT (0x31)));
49181 /* Swap the 2nd and 3rd position in each lane into
49182 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
49183 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
49184 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
49185 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
49186 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
49188 /* Now a vpunpck[lh]qdq will produce
49189 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
49190 if (odd)
49191 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
49192 gen_lowpart (V4DImode, t2));
49193 else
49194 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
49195 gen_lowpart (V4DImode, t2));
49196 emit_insn (t3);
49197 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
49198 break;
49200 default:
49201 gcc_unreachable ();
49204 return true;
49207 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
49208 extract-even and extract-odd permutations. */
49210 static bool
49211 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
49213 unsigned i, odd, nelt = d->nelt;
49215 odd = d->perm[0];
49216 if (odd != 0 && odd != 1)
49217 return false;
49219 for (i = 1; i < nelt; ++i)
49220 if (d->perm[i] != 2 * i + odd)
49221 return false;
49223 return expand_vec_perm_even_odd_1 (d, odd);
49226 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
49227 permutations. We assume that expand_vec_perm_1 has already failed. */
49229 static bool
49230 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
49232 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
49233 machine_mode vmode = d->vmode;
49234 unsigned char perm2[4];
49235 rtx op0 = d->op0, dest;
49236 bool ok;
49238 switch (vmode)
49240 case E_V4DFmode:
49241 case E_V8SFmode:
49242 /* These are special-cased in sse.md so that we can optionally
49243 use the vbroadcast instruction. They expand to two insns
49244 if the input happens to be in a register. */
49245 gcc_unreachable ();
49247 case E_V2DFmode:
49248 case E_V2DImode:
49249 case E_V4SFmode:
49250 case E_V4SImode:
49251 /* These are always implementable using standard shuffle patterns. */
49252 gcc_unreachable ();
49254 case E_V8HImode:
49255 case E_V16QImode:
49256 /* These can be implemented via interleave. We save one insn by
49257 stopping once we have promoted to V4SImode and then use pshufd. */
49258 if (d->testing_p)
49259 return true;
49262 rtx dest;
49263 rtx (*gen) (rtx, rtx, rtx)
49264 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
49265 : gen_vec_interleave_lowv8hi;
49267 if (elt >= nelt2)
49269 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
49270 : gen_vec_interleave_highv8hi;
49271 elt -= nelt2;
49273 nelt2 /= 2;
49275 dest = gen_reg_rtx (vmode);
49276 emit_insn (gen (dest, op0, op0));
49277 vmode = get_mode_wider_vector (vmode);
49278 op0 = gen_lowpart (vmode, dest);
49280 while (vmode != V4SImode);
49282 memset (perm2, elt, 4);
49283 dest = gen_reg_rtx (V4SImode);
49284 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
49285 gcc_assert (ok);
49286 if (!d->testing_p)
49287 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
49288 return true;
49290 case E_V64QImode:
49291 case E_V32QImode:
49292 case E_V16HImode:
49293 case E_V8SImode:
49294 case E_V4DImode:
49295 /* For AVX2 broadcasts of the first element vpbroadcast* or
49296 vpermq should be used by expand_vec_perm_1. */
49297 gcc_assert (!TARGET_AVX2 || d->perm[0]);
49298 return false;
49300 default:
49301 gcc_unreachable ();
49305 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
49306 broadcast permutations. */
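/* For example, a V4SImode selector of { 2, 2, 2, 2 } broadcasts element 2
   of the single input operand.  */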
49308 static bool
49309 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
49311 unsigned i, elt, nelt = d->nelt;
49313 if (!d->one_operand_p)
49314 return false;
49316 elt = d->perm[0];
49317 for (i = 1; i < nelt; ++i)
49318 if (d->perm[i] != elt)
49319 return false;
49321 return expand_vec_perm_broadcast_1 (d);
49324 /* Implement arbitrary permutations of two V64QImode operands
49325 with 2 vpermi2w, 2 vpshufb and one vpor instruction. */
49326 static bool
49327 expand_vec_perm_vpermi2_vpshub2 (struct expand_vec_perm_d *d)
49329 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
49330 return false;
49332 if (d->testing_p)
49333 return true;
49335 struct expand_vec_perm_d ds[2];
49336 rtx rperm[128], vperm, target0, target1;
49337 unsigned int i, nelt;
49338 machine_mode vmode;
49340 nelt = d->nelt;
49341 vmode = V64QImode;
49343 for (i = 0; i < 2; i++)
49345 ds[i] = *d;
49346 ds[i].vmode = V32HImode;
49347 ds[i].nelt = 32;
49348 ds[i].target = gen_reg_rtx (V32HImode);
49349 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
49350 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
49353 /* Prepare permutations such that the first one takes care of
49354 putting the even bytes into the right positions or one higher
49355 positions (ds[0]) and the second one takes care of
49356 putting the odd bytes into the right positions or one below
49357 (ds[1]). */
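/* For instance, if d->perm[6] == 13, then ds[0].perm[3] == 6 places source
   word 6 (source bytes 12/13) into word 3 of ds[0].target, and the pshufb
   mask entry rperm[6] == (6 & 14) + (13 & 1) == 7 then selects the high
   byte of that word, i.e. source byte 13.  The odd destination bytes are
   zeroed in this mask and are produced by the second pshufb; the final
   vpor merges the two results.  */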
49359 for (i = 0; i < nelt; i++)
49361 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
49362 if (i & 1)
49364 rperm[i] = constm1_rtx;
49365 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
49367 else
49369 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
49370 rperm[i + 64] = constm1_rtx;
49374 bool ok = expand_vec_perm_1 (&ds[0]);
49375 gcc_assert (ok);
49376 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
49378 ok = expand_vec_perm_1 (&ds[1]);
49379 gcc_assert (ok);
49380 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
49382 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
49383 vperm = force_reg (vmode, vperm);
49384 target0 = gen_reg_rtx (V64QImode);
49385 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
49387 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
49388 vperm = force_reg (vmode, vperm);
49389 target1 = gen_reg_rtx (V64QImode);
49390 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
49392 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
49393 return true;
49396 /* Implement arbitrary permutation of two V32QImode and V16HImode operands
49397 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
49398 all the shorter instruction sequences. */
49400 static bool
49401 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
49403 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
49404 unsigned int i, nelt, eltsz;
49405 bool used[4];
49407 if (!TARGET_AVX2
49408 || d->one_operand_p
49409 || (d->vmode != V32QImode && d->vmode != V16HImode))
49410 return false;
49412 if (d->testing_p)
49413 return true;
49415 nelt = d->nelt;
49416 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
49418 /* Generate 4 permutation masks. If the required element is within
49419 the same lane, it is shuffled in. If the required element is from the
49420 other lane, force a zero by setting bit 7 in the permutation mask.
49421 In the other mask, an element is non-negative only if it is requested
49422 from the other lane; it is also moved to the other lane, so that the
49423 result of vpshufb can have its two V2TImode halves
49424 swapped. */
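/* The four masks therefore cover: rperm[0] op0 within-lane, rperm[1] op0
   cross-lane, rperm[2] op1 within-lane and rperm[3] op1 cross-lane; the
   cross-lane results are fixed up by the vpermq lane swap below.  */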
49425 m128 = GEN_INT (-128);
49426 for (i = 0; i < 32; ++i)
49428 rperm[0][i] = m128;
49429 rperm[1][i] = m128;
49430 rperm[2][i] = m128;
49431 rperm[3][i] = m128;
49433 used[0] = false;
49434 used[1] = false;
49435 used[2] = false;
49436 used[3] = false;
49437 for (i = 0; i < nelt; ++i)
49439 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
49440 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
49441 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
49443 for (j = 0; j < eltsz; ++j)
49444 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
49445 used[which] = true;
49448 for (i = 0; i < 2; ++i)
49450 if (!used[2 * i + 1])
49452 h[i] = NULL_RTX;
49453 continue;
49455 vperm = gen_rtx_CONST_VECTOR (V32QImode,
49456 gen_rtvec_v (32, rperm[2 * i + 1]));
49457 vperm = force_reg (V32QImode, vperm);
49458 h[i] = gen_reg_rtx (V32QImode);
49459 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
49460 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
49463 /* Swap the 128-bit lanes of h[X]. */
49464 for (i = 0; i < 2; ++i)
49466 if (h[i] == NULL_RTX)
49467 continue;
49468 op = gen_reg_rtx (V4DImode);
49469 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
49470 const2_rtx, GEN_INT (3), const0_rtx,
49471 const1_rtx));
49472 h[i] = gen_lowpart (V32QImode, op);
49475 for (i = 0; i < 2; ++i)
49477 if (!used[2 * i])
49479 l[i] = NULL_RTX;
49480 continue;
49482 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
49483 vperm = force_reg (V32QImode, vperm);
49484 l[i] = gen_reg_rtx (V32QImode);
49485 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
49486 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
49489 for (i = 0; i < 2; ++i)
49491 if (h[i] && l[i])
49493 op = gen_reg_rtx (V32QImode);
49494 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
49495 l[i] = op;
49497 else if (h[i])
49498 l[i] = h[i];
49501 gcc_assert (l[0] && l[1]);
49502 op = d->target;
49503 if (d->vmode != V32QImode)
49504 op = gen_reg_rtx (V32QImode);
49505 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
49506 if (op != d->target)
49507 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
49508 return true;
49511 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
49512 With all of the interface bits taken care of, perform the expansion
49513 in D and return true on success. */
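/* The helpers below are tried roughly in order of increasing sequence
   length: single-instruction expansions first, then two-, three- and
   four-instruction sequences, longer fallbacks, and finally a retry of
   expand_vec_perm_1 with the permutation canonicalized to a different
   vector integer mode.  */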
49515 static bool
49516 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
49518 /* Try a single instruction expansion. */
49519 if (expand_vec_perm_1 (d))
49520 return true;
49522 /* Try sequences of two instructions. */
49524 if (expand_vec_perm_pshuflw_pshufhw (d))
49525 return true;
49527 if (expand_vec_perm_palignr (d, false))
49528 return true;
49530 if (expand_vec_perm_interleave2 (d))
49531 return true;
49533 if (expand_vec_perm_broadcast (d))
49534 return true;
49536 if (expand_vec_perm_vpermq_perm_1 (d))
49537 return true;
49539 if (expand_vec_perm_vperm2f128 (d))
49540 return true;
49542 if (expand_vec_perm_pblendv (d))
49543 return true;
49545 /* Try sequences of three instructions. */
49547 if (expand_vec_perm_even_odd_pack (d))
49548 return true;
49550 if (expand_vec_perm_2vperm2f128_vshuf (d))
49551 return true;
49553 if (expand_vec_perm_pshufb2 (d))
49554 return true;
49556 if (expand_vec_perm_interleave3 (d))
49557 return true;
49559 if (expand_vec_perm_vperm2f128_vblend (d))
49560 return true;
49562 /* Try sequences of four instructions. */
49564 if (expand_vec_perm_even_odd_trunc (d))
49565 return true;
49566 if (expand_vec_perm_vpshufb2_vpermq (d))
49567 return true;
49569 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
49570 return true;
49572 if (expand_vec_perm_vpermi2_vpshub2 (d))
49573 return true;
49575 /* ??? Look for narrow permutations whose element orderings would
49576 allow the promotion to a wider mode. */
49578 /* ??? Look for sequences of interleave or a wider permute that place
49579 the data into the correct lanes for a half-vector shuffle like
49580 pshuf[lh]w or vpermilps. */
49582 /* ??? Look for sequences of interleave that produce the desired results.
49583 The combinatorics of punpck[lh] get pretty ugly... */
49585 if (expand_vec_perm_even_odd (d))
49586 return true;
49588 /* Even longer sequences. */
49589 if (expand_vec_perm_vpshufb4_vpermq2 (d))
49590 return true;
49592 /* See if we can get the same permutation in a different vector integer
49593 mode. */
49594 struct expand_vec_perm_d nd;
49595 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
49597 if (!d->testing_p)
49598 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
49599 return true;
49602 return false;
49605 /* If a permutation only uses one operand, make it clear. Returns true
49606 if the permutation references both operands. */
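/* For example, a V4SImode selector of { 4, 5, 6, 7 } references only the
   second operand; it is folded to { 0, 1, 2, 3 } with op0 = op1, and the
   function returns false because only one input is referenced.  */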
49608 static bool
49609 canonicalize_perm (struct expand_vec_perm_d *d)
49611 int i, which, nelt = d->nelt;
49613 for (i = which = 0; i < nelt; ++i)
49614 which |= (d->perm[i] < nelt ? 1 : 2);
49616 d->one_operand_p = true;
49617 switch (which)
49619 default:
49620 gcc_unreachable();
49622 case 3:
49623 if (!rtx_equal_p (d->op0, d->op1))
49625 d->one_operand_p = false;
49626 break;
49628 /* The elements of PERM do not suggest that only the first operand
49629 is used, but both operands are identical. Allow easier matching
49630 of the permutation by folding the permutation into the single
49631 input vector. */
49632 /* FALLTHRU */
49634 case 2:
49635 for (i = 0; i < nelt; ++i)
49636 d->perm[i] &= nelt - 1;
49637 d->op0 = d->op1;
49638 break;
49640 case 1:
49641 d->op1 = d->op0;
49642 break;
49645 return (which == 3);
49648 bool
49649 ix86_expand_vec_perm_const (rtx operands[4])
49651 struct expand_vec_perm_d d;
49652 unsigned char perm[MAX_VECT_LEN];
49653 int i, nelt;
49654 bool two_args;
49655 rtx sel;
49657 d.target = operands[0];
49658 d.op0 = operands[1];
49659 d.op1 = operands[2];
49660 sel = operands[3];
49662 d.vmode = GET_MODE (d.target);
49663 gcc_assert (VECTOR_MODE_P (d.vmode));
49664 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49665 d.testing_p = false;
49667 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
49668 gcc_assert (XVECLEN (sel, 0) == nelt);
49669 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
49671 for (i = 0; i < nelt; ++i)
49673 rtx e = XVECEXP (sel, 0, i);
49674 int ei = INTVAL (e) & (2 * nelt - 1);
49675 d.perm[i] = ei;
49676 perm[i] = ei;
49679 two_args = canonicalize_perm (&d);
49681 if (ix86_expand_vec_perm_const_1 (&d))
49682 return true;
49684 /* If the selector says both arguments are needed, but the operands are the
49685 same, the above tried to expand with one_operand_p and flattened selector.
49686 If that didn't work, retry without one_operand_p; we succeeded with that
49687 during testing. */
49688 if (two_args && d.one_operand_p)
49690 d.one_operand_p = false;
49691 memcpy (d.perm, perm, sizeof (perm));
49692 return ix86_expand_vec_perm_const_1 (&d);
49695 return false;
49698 /* Implement targetm.vectorize.vec_perm_const_ok. */
49700 static bool
49701 ix86_vectorize_vec_perm_const_ok (machine_mode vmode,
49702 const unsigned char *sel)
49704 struct expand_vec_perm_d d;
49705 unsigned int i, nelt, which;
49706 bool ret;
49708 d.vmode = vmode;
49709 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49710 d.testing_p = true;
49712 /* Given sufficient ISA support we can just return true here
49713 for selected vector modes. */
49714 switch (d.vmode)
49716 case E_V16SFmode:
49717 case E_V16SImode:
49718 case E_V8DImode:
49719 case E_V8DFmode:
49720 if (TARGET_AVX512F)
49721 /* All implementable with a single vpermi2 insn. */
49722 return true;
49723 break;
49724 case E_V32HImode:
49725 if (TARGET_AVX512BW)
49726 /* All implementable with a single vpermi2 insn. */
49727 return true;
49728 break;
49729 case E_V64QImode:
49730 if (TARGET_AVX512BW)
49731 /* Implementable with 2 vpermi2, 2 vpshufb and 1 or insn. */
49732 return true;
49733 break;
49734 case E_V8SImode:
49735 case E_V8SFmode:
49736 case E_V4DFmode:
49737 case E_V4DImode:
49738 if (TARGET_AVX512VL)
49739 /* All implementable with a single vpermi2 insn. */
49740 return true;
49741 break;
49742 case E_V16HImode:
49743 if (TARGET_AVX2)
49744 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
49745 return true;
49746 break;
49747 case E_V32QImode:
49748 if (TARGET_AVX2)
49749 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
49750 return true;
49751 break;
49752 case E_V4SImode:
49753 case E_V4SFmode:
49754 case E_V8HImode:
49755 case E_V16QImode:
49756 /* All implementable with a single vpperm insn. */
49757 if (TARGET_XOP)
49758 return true;
49759 /* All implementable with 2 pshufb + 1 ior. */
49760 if (TARGET_SSSE3)
49761 return true;
49762 break;
49763 case E_V2DImode:
49764 case E_V2DFmode:
49765 /* All implementable with shufpd or unpck[lh]pd. */
49766 return true;
49767 default:
49768 return false;
49771 /* Extract the values from the vector CST into the permutation
49772 array in D. */
49773 memcpy (d.perm, sel, nelt);
49774 for (i = which = 0; i < nelt; ++i)
49776 unsigned char e = d.perm[i];
49777 gcc_assert (e < 2 * nelt);
49778 which |= (e < nelt ? 1 : 2);
49781 /* For all elements from the second vector, fold the elements to the first. */
49782 if (which == 2)
49783 for (i = 0; i < nelt; ++i)
49784 d.perm[i] -= nelt;
49786 /* Check whether the mask can be applied to the vector type. */
49787 d.one_operand_p = (which != 3);
49789 /* Implementable with shufps or pshufd. */
49790 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
49791 return true;
49793 /* Otherwise we have to go through the motions and see if we can
49794 figure out how to generate the requested permutation. */
49795 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
49796 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
49797 if (!d.one_operand_p)
49798 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
49800 start_sequence ();
49801 ret = ix86_expand_vec_perm_const_1 (&d);
49802 end_sequence ();
49804 return ret;
49807 void
49808 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
49810 struct expand_vec_perm_d d;
49811 unsigned i, nelt;
49813 d.target = targ;
49814 d.op0 = op0;
49815 d.op1 = op1;
49816 d.vmode = GET_MODE (targ);
49817 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49818 d.one_operand_p = false;
49819 d.testing_p = false;
49821 for (i = 0; i < nelt; ++i)
49822 d.perm[i] = i * 2 + odd;
49824 /* We'll either be able to implement the permutation directly... */
49825 if (expand_vec_perm_1 (&d))
49826 return;
49828 /* ... or we use the special-case patterns. */
49829 expand_vec_perm_even_odd_1 (&d, odd);
49832 static void
49833 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
49835 struct expand_vec_perm_d d;
49836 unsigned i, nelt, base;
49837 bool ok;
49839 d.target = targ;
49840 d.op0 = op0;
49841 d.op1 = op1;
49842 d.vmode = GET_MODE (targ);
49843 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49844 d.one_operand_p = false;
49845 d.testing_p = false;
49847 base = high_p ? nelt / 2 : 0;
49848 for (i = 0; i < nelt / 2; ++i)
49850 d.perm[i * 2] = i + base;
49851 d.perm[i * 2 + 1] = i + base + nelt;
49854 /* Note that for AVX this isn't one instruction. */
49855 ok = ix86_expand_vec_perm_const_1 (&d);
49856 gcc_assert (ok);
49860 /* Expand a vector operation CODE for a V*QImode in terms of the
49861 same operation on V*HImode. */
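/* For example, a V16QImode shift is carried out as two V8HImode shifts on
   the unpacked (zero- or sign-extended) halves of the operand, after which
   the low byte of each 16-bit result is gathered back into the V16QImode
   destination.  */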
49863 void
49864 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
49866 machine_mode qimode = GET_MODE (dest);
49867 machine_mode himode;
49868 rtx (*gen_il) (rtx, rtx, rtx);
49869 rtx (*gen_ih) (rtx, rtx, rtx);
49870 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
49871 struct expand_vec_perm_d d;
49872 bool ok, full_interleave;
49873 bool uns_p = false;
49874 int i;
49876 switch (qimode)
49878 case E_V16QImode:
49879 himode = V8HImode;
49880 gen_il = gen_vec_interleave_lowv16qi;
49881 gen_ih = gen_vec_interleave_highv16qi;
49882 break;
49883 case E_V32QImode:
49884 himode = V16HImode;
49885 gen_il = gen_avx2_interleave_lowv32qi;
49886 gen_ih = gen_avx2_interleave_highv32qi;
49887 break;
49888 case E_V64QImode:
49889 himode = V32HImode;
49890 gen_il = gen_avx512bw_interleave_lowv64qi;
49891 gen_ih = gen_avx512bw_interleave_highv64qi;
49892 break;
49893 default:
49894 gcc_unreachable ();
49897 op2_l = op2_h = op2;
49898 switch (code)
49900 case MULT:
49901 /* Unpack data such that we've got a source byte in each low byte of
49902 each word. We don't care what goes into the high byte of each word.
49903 Rather than trying to get zero in there, most convenient is to let
49904 it be a copy of the low byte. */
49905 op2_l = gen_reg_rtx (qimode);
49906 op2_h = gen_reg_rtx (qimode);
49907 emit_insn (gen_il (op2_l, op2, op2));
49908 emit_insn (gen_ih (op2_h, op2, op2));
49909 /* FALLTHRU */
49911 op1_l = gen_reg_rtx (qimode);
49912 op1_h = gen_reg_rtx (qimode);
49913 emit_insn (gen_il (op1_l, op1, op1));
49914 emit_insn (gen_ih (op1_h, op1, op1));
49915 full_interleave = qimode == V16QImode;
49916 break;
49918 case ASHIFT:
49919 case LSHIFTRT:
49920 uns_p = true;
49921 /* FALLTHRU */
49922 case ASHIFTRT:
49923 op1_l = gen_reg_rtx (himode);
49924 op1_h = gen_reg_rtx (himode);
49925 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
49926 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
49927 full_interleave = true;
49928 break;
49929 default:
49930 gcc_unreachable ();
49933 /* Perform the operation. */
49934 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
49935 1, OPTAB_DIRECT);
49936 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
49937 1, OPTAB_DIRECT);
49938 gcc_assert (res_l && res_h);
49940 /* Merge the data back into the right place. */
49941 d.target = dest;
49942 d.op0 = gen_lowpart (qimode, res_l);
49943 d.op1 = gen_lowpart (qimode, res_h);
49944 d.vmode = qimode;
49945 d.nelt = GET_MODE_NUNITS (qimode);
49946 d.one_operand_p = false;
49947 d.testing_p = false;
49949 if (full_interleave)
49951 /* For SSE2, we used a full interleave, so the desired
49952 results are in the even elements. */
49953 for (i = 0; i < d.nelt; ++i)
49954 d.perm[i] = i * 2;
49956 else
49958 /* For AVX, the interleave used above was not cross-lane. So the
49959 extraction is evens but with the second and third quarter swapped.
49960 Happily, that is even one insn shorter than even extraction.
49961 For AVX512BW we have 4 lanes. We extract evens from within a lane,
49962 always first from the first and then from the second source operand,
49963 the index bits above the low 4 bits remain the same.
49964 Thus, for d.nelt == 32 we want permutation
49965 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
49966 and for d.nelt == 64 we want permutation
49967 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
49968 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
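/* As a check of the formula below, for d.nelt == 32 and i == 9 we get
   (18 & 14) + 32 + 0 == 34, and for i == 17 we get (34 & 14) + 0 + 16 == 18,
   matching the layout described above.  */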
49969 for (i = 0; i < d.nelt; ++i)
49970 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
49973 ok = ix86_expand_vec_perm_const_1 (&d);
49974 gcc_assert (ok);
49976 set_unique_reg_note (get_last_insn (), REG_EQUAL,
49977 gen_rtx_fmt_ee (code, qimode, op1, op2));
49980 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
49981 if op is CONST_VECTOR with all odd elements equal to their
49982 preceding element. */
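/* For example, { 3, 3, 7, 7 } satisfies this; { 3, 5, 5, 5 } does not.  */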
49984 static bool
49985 const_vector_equal_evenodd_p (rtx op)
49987 machine_mode mode = GET_MODE (op);
49988 int i, nunits = GET_MODE_NUNITS (mode);
49989 if (GET_CODE (op) != CONST_VECTOR
49990 || nunits != CONST_VECTOR_NUNITS (op))
49991 return false;
49992 for (i = 0; i < nunits; i += 2)
49993 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
49994 return false;
49995 return true;
49998 void
49999 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
50000 bool uns_p, bool odd_p)
50002 machine_mode mode = GET_MODE (op1);
50003 machine_mode wmode = GET_MODE (dest);
50004 rtx x;
50005 rtx orig_op1 = op1, orig_op2 = op2;
50007 if (!nonimmediate_operand (op1, mode))
50008 op1 = force_reg (mode, op1);
50009 if (!nonimmediate_operand (op2, mode))
50010 op2 = force_reg (mode, op2);
50012 /* We only play even/odd games with vectors of SImode. */
50013 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
50015 /* If we're looking for the odd results, shift those members down to
50016 the even slots. For some cpus this is faster than a PSHUFD. */
50017 if (odd_p)
50019 /* For XOP use vpmacsdqh, but only for smult, as it is only
50020 signed. */
50021 if (TARGET_XOP && mode == V4SImode && !uns_p)
50023 x = force_reg (wmode, CONST0_RTX (wmode));
50024 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
50025 return;
50028 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
50029 if (!const_vector_equal_evenodd_p (orig_op1))
50030 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
50031 x, NULL, 1, OPTAB_DIRECT);
50032 if (!const_vector_equal_evenodd_p (orig_op2))
50033 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
50034 x, NULL, 1, OPTAB_DIRECT);
50035 op1 = gen_lowpart (mode, op1);
50036 op2 = gen_lowpart (mode, op2);
50039 if (mode == V16SImode)
50041 if (uns_p)
50042 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
50043 else
50044 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
50046 else if (mode == V8SImode)
50048 if (uns_p)
50049 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
50050 else
50051 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
50053 else if (uns_p)
50054 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
50055 else if (TARGET_SSE4_1)
50056 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
50057 else
50059 rtx s1, s2, t0, t1, t2;
50061 /* The easiest way to implement this without PMULDQ is to go through
50062 the motions as if we are performing a full 64-bit multiply, except
50063 that we need to do less shuffling of the elements. */
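/* That is, the low 64 bits of the signed product are computed as
   LO(A)*LO(B) + ((SIGN(A)*LO(B) + SIGN(B)*LO(A)) << 32), where SIGN(X)
   is 0 or -1 depending on the sign of X.  */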
50065 /* Compute the sign-extension, aka highparts, of the two operands. */
50066 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
50067 op1, pc_rtx, pc_rtx);
50068 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
50069 op2, pc_rtx, pc_rtx);
50071 /* Multiply LO(A) * HI(B), and vice-versa. */
50072 t1 = gen_reg_rtx (wmode);
50073 t2 = gen_reg_rtx (wmode);
50074 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
50075 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
50077 /* Multiply LO(A) * LO(B). */
50078 t0 = gen_reg_rtx (wmode);
50079 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
50081 /* Combine and shift the highparts into place. */
50082 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
50083 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
50084 1, OPTAB_DIRECT);
50086 /* Combine high and low parts. */
50087 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
50088 return;
50090 emit_insn (x);
50093 void
50094 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
50095 bool uns_p, bool high_p)
50097 machine_mode wmode = GET_MODE (dest);
50098 machine_mode mode = GET_MODE (op1);
50099 rtx t1, t2, t3, t4, mask;
50101 switch (mode)
50103 case E_V4SImode:
50104 t1 = gen_reg_rtx (mode);
50105 t2 = gen_reg_rtx (mode);
50106 if (TARGET_XOP && !uns_p)
50108 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
50109 shuffle the elements once so that all elements are in the right
50110 place for immediate use: { A C B D }. */
50111 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
50112 const1_rtx, GEN_INT (3)));
50113 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
50114 const1_rtx, GEN_INT (3)));
50116 else
50118 /* Put the elements into place for the multiply. */
50119 ix86_expand_vec_interleave (t1, op1, op1, high_p);
50120 ix86_expand_vec_interleave (t2, op2, op2, high_p);
50121 high_p = false;
50123 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
50124 break;
50126 case E_V8SImode:
50127 /* Shuffle the elements between the lanes. After this we
50128 have { A B E F | C D G H } for each operand. */
50129 t1 = gen_reg_rtx (V4DImode);
50130 t2 = gen_reg_rtx (V4DImode);
50131 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
50132 const0_rtx, const2_rtx,
50133 const1_rtx, GEN_INT (3)));
50134 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
50135 const0_rtx, const2_rtx,
50136 const1_rtx, GEN_INT (3)));
50138 /* Shuffle the elements within the lanes. After this we
50139 have { A A B B | C C D D } or { E E F F | G G H H }. */
50140 t3 = gen_reg_rtx (V8SImode);
50141 t4 = gen_reg_rtx (V8SImode);
50142 mask = GEN_INT (high_p
50143 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
50144 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
50145 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
50146 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
50148 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
50149 break;
50151 case E_V8HImode:
50152 case E_V16HImode:
50153 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
50154 uns_p, OPTAB_DIRECT);
50155 t2 = expand_binop (mode,
50156 uns_p ? umul_highpart_optab : smul_highpart_optab,
50157 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
50158 gcc_assert (t1 && t2);
50160 t3 = gen_reg_rtx (mode);
50161 ix86_expand_vec_interleave (t3, t1, t2, high_p);
50162 emit_move_insn (dest, gen_lowpart (wmode, t3));
50163 break;
50165 case E_V16QImode:
50166 case E_V32QImode:
50167 case E_V32HImode:
50168 case E_V16SImode:
50169 case E_V64QImode:
50170 t1 = gen_reg_rtx (wmode);
50171 t2 = gen_reg_rtx (wmode);
50172 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
50173 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
50175 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
50176 break;
50178 default:
50179 gcc_unreachable ();
50183 void
50184 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
50186 rtx res_1, res_2, res_3, res_4;
50188 res_1 = gen_reg_rtx (V4SImode);
50189 res_2 = gen_reg_rtx (V4SImode);
50190 res_3 = gen_reg_rtx (V2DImode);
50191 res_4 = gen_reg_rtx (V2DImode);
50192 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
50193 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
50195 /* Move the results in element 2 down to element 1; we don't care
50196 what goes in elements 2 and 3. Then we can merge the parts
50197 back together with an interleave.
50199 Note that two other sequences were tried:
50200 (1) Use interleaves at the start instead of psrldq, which allows
50201 us to use a single shufps to merge things back at the end.
50202 (2) Use shufps here to combine the two vectors, then pshufd to
50203 put the elements in the correct order.
50204 In both cases the cost of the reformatting stall was too high
50205 and the overall sequence slower. */
50207 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
50208 const0_rtx, const2_rtx,
50209 const0_rtx, const0_rtx));
50210 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
50211 const0_rtx, const2_rtx,
50212 const0_rtx, const0_rtx));
50213 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
50215 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
50218 void
50219 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
50221 machine_mode mode = GET_MODE (op0);
50222 rtx t1, t2, t3, t4, t5, t6;
50224 if (TARGET_AVX512DQ && mode == V8DImode)
50225 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
50226 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
50227 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
50228 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
50229 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
50230 else if (TARGET_XOP && mode == V2DImode)
50232 /* op1: A,B,C,D, op2: E,F,G,H */
50233 op1 = gen_lowpart (V4SImode, op1);
50234 op2 = gen_lowpart (V4SImode, op2);
50236 t1 = gen_reg_rtx (V4SImode);
50237 t2 = gen_reg_rtx (V4SImode);
50238 t3 = gen_reg_rtx (V2DImode);
50239 t4 = gen_reg_rtx (V2DImode);
50241 /* t1: B,A,D,C */
50242 emit_insn (gen_sse2_pshufd_1 (t1, op1,
50243 GEN_INT (1),
50244 GEN_INT (0),
50245 GEN_INT (3),
50246 GEN_INT (2)));
50248 /* t2: (B*E),(A*F),(D*G),(C*H) */
50249 emit_insn (gen_mulv4si3 (t2, t1, op2));
50251 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
50252 emit_insn (gen_xop_phadddq (t3, t2));
50254 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
50255 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
50257 /* Multiply lower parts and add all */
50258 t5 = gen_reg_rtx (V2DImode);
50259 emit_insn (gen_vec_widen_umult_even_v4si (t5,
50260 gen_lowpart (V4SImode, op1),
50261 gen_lowpart (V4SImode, op2)));
50262 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
50265 else
50267 machine_mode nmode;
50268 rtx (*umul) (rtx, rtx, rtx);
50270 if (mode == V2DImode)
50272 umul = gen_vec_widen_umult_even_v4si;
50273 nmode = V4SImode;
50275 else if (mode == V4DImode)
50277 umul = gen_vec_widen_umult_even_v8si;
50278 nmode = V8SImode;
50280 else if (mode == V8DImode)
50282 umul = gen_vec_widen_umult_even_v16si;
50283 nmode = V16SImode;
50285 else
50286 gcc_unreachable ();
50289 /* Multiply low parts. */
50290 t1 = gen_reg_rtx (mode);
50291 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
50293 /* Shift input vectors right 32 bits so we can multiply high parts. */
50294 t6 = GEN_INT (32);
50295 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
50296 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
50298 /* Multiply high parts by low parts. */
50299 t4 = gen_reg_rtx (mode);
50300 t5 = gen_reg_rtx (mode);
50301 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
50302 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
50304 /* Combine and shift the highparts back. */
50305 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
50306 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
50308 /* Combine high and low parts. */
50309 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
50312 set_unique_reg_note (get_last_insn (), REG_EQUAL,
50313 gen_rtx_MULT (mode, op1, op2));
50316 /* Return 1 if control transfer instruction INSN
50317 should be encoded with bnd prefix.
50318 If insn is NULL then return 1 when control
50319 transfer instructions should be prefixed with
50320 bnd by default for current function. */
50322 bool
50323 ix86_bnd_prefixed_insn_p (rtx insn)
50325 /* For call insns check special flag. */
50326 if (insn && CALL_P (insn))
50328 rtx call = get_call_rtx_from (insn);
50329 if (call)
50330 return CALL_EXPR_WITH_BOUNDS_P (call);
50333 /* All other insns are prefixed only if function is instrumented. */
50334 return chkp_function_instrumented_p (current_function_decl);
50337 /* Calculate integer abs() using only SSE2 instructions. */
50339 void
50340 ix86_expand_sse2_abs (rtx target, rtx input)
50342 machine_mode mode = GET_MODE (target);
50343 rtx tmp0, tmp1, x;
50345 switch (mode)
50347 /* For 32-bit signed integer X, the best way to calculate the absolute
50348 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
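/* For example, with X == -5 and W == 32: the arithmetic shift gives -1,
   -5 ^ -1 == 4, and 4 - (-1) == 5.  */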
50349 case E_V4SImode:
50350 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
50351 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
50352 NULL, 0, OPTAB_DIRECT);
50353 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
50354 NULL, 0, OPTAB_DIRECT);
50355 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
50356 target, 0, OPTAB_DIRECT);
50357 break;
50359 /* For 16-bit signed integer X, the best way to calculate the absolute
50360 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
50361 case E_V8HImode:
50362 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
50364 x = expand_simple_binop (mode, SMAX, tmp0, input,
50365 target, 0, OPTAB_DIRECT);
50366 break;
50368 /* For 8-bit signed integer X, the best way to calculate the absolute
50369 value of X is min ((unsigned char) X, (unsigned char) (-X)),
50370 as SSE2 provides the PMINUB insn. */
50371 case E_V16QImode:
50372 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
50374 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
50375 target, 0, OPTAB_DIRECT);
50376 break;
50378 default:
50379 gcc_unreachable ();
50382 if (x != target)
50383 emit_move_insn (target, x);
50386 /* Expand an extract from a vector register through pextr insn.
50387 Return true if successful. */
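/* For example, extracting a 16-bit field at bit position 48 from a
   V8HImode register becomes a vec_select of element 48 / 16 == 3,
   i.e. a pextrw.  */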
50389 bool
50390 ix86_expand_pextr (rtx *operands)
50392 rtx dst = operands[0];
50393 rtx src = operands[1];
50395 unsigned int size = INTVAL (operands[2]);
50396 unsigned int pos = INTVAL (operands[3]);
50398 if (SUBREG_P (dst))
50400 /* Reject non-lowpart subregs. */
50401 if (SUBREG_BYTE (dst) > 0)
50402 return false;
50403 dst = SUBREG_REG (dst);
50406 if (SUBREG_P (src))
50408 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
50409 src = SUBREG_REG (src);
50412 switch (GET_MODE (src))
50414 case E_V16QImode:
50415 case E_V8HImode:
50416 case E_V4SImode:
50417 case E_V2DImode:
50418 case E_V1TImode:
50419 case E_TImode:
50421 machine_mode srcmode, dstmode;
50422 rtx d, pat;
50424 dstmode = mode_for_size (size, MODE_INT, 0);
50426 switch (dstmode)
50428 case E_QImode:
50429 if (!TARGET_SSE4_1)
50430 return false;
50431 srcmode = V16QImode;
50432 break;
50434 case E_HImode:
50435 if (!TARGET_SSE2)
50436 return false;
50437 srcmode = V8HImode;
50438 break;
50440 case E_SImode:
50441 if (!TARGET_SSE4_1)
50442 return false;
50443 srcmode = V4SImode;
50444 break;
50446 case E_DImode:
50447 gcc_assert (TARGET_64BIT);
50448 if (!TARGET_SSE4_1)
50449 return false;
50450 srcmode = V2DImode;
50451 break;
50453 default:
50454 return false;
50457 /* Reject extractions from misaligned positions. */
50458 if (pos & (size-1))
50459 return false;
50461 if (GET_MODE (dst) == dstmode)
50462 d = dst;
50463 else
50464 d = gen_reg_rtx (dstmode);
50466 /* Construct insn pattern. */
50467 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
50468 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
50470 /* Let the rtl optimizers know about the zero extension performed. */
50471 if (dstmode == QImode || dstmode == HImode)
50473 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
50474 d = gen_lowpart (SImode, d);
50477 emit_insn (gen_rtx_SET (d, pat));
50479 if (d != dst)
50480 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
50481 return true;
50484 default:
50485 return false;
50489 /* Expand an insert into a vector register through pinsr insn.
50490 Return true if successful. */
50492 bool
50493 ix86_expand_pinsr (rtx *operands)
50495 rtx dst = operands[0];
50496 rtx src = operands[3];
50498 unsigned int size = INTVAL (operands[1]);
50499 unsigned int pos = INTVAL (operands[2]);
50501 if (SUBREG_P (dst))
50503 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
50504 dst = SUBREG_REG (dst);
50507 switch (GET_MODE (dst))
50509 case E_V16QImode:
50510 case E_V8HImode:
50511 case E_V4SImode:
50512 case E_V2DImode:
50513 case E_V1TImode:
50514 case E_TImode:
50516 machine_mode srcmode, dstmode;
50517 rtx (*pinsr)(rtx, rtx, rtx, rtx);
50518 rtx d;
50520 srcmode = mode_for_size (size, MODE_INT, 0);
50522 switch (srcmode)
50524 case E_QImode:
50525 if (!TARGET_SSE4_1)
50526 return false;
50527 dstmode = V16QImode;
50528 pinsr = gen_sse4_1_pinsrb;
50529 break;
50531 case E_HImode:
50532 if (!TARGET_SSE2)
50533 return false;
50534 dstmode = V8HImode;
50535 pinsr = gen_sse2_pinsrw;
50536 break;
50538 case E_SImode:
50539 if (!TARGET_SSE4_1)
50540 return false;
50541 dstmode = V4SImode;
50542 pinsr = gen_sse4_1_pinsrd;
50543 break;
50545 case E_DImode:
50546 gcc_assert (TARGET_64BIT);
50547 if (!TARGET_SSE4_1)
50548 return false;
50549 dstmode = V2DImode;
50550 pinsr = gen_sse4_1_pinsrq;
50551 break;
50553 default:
50554 return false;
50557 /* Reject insertions to misaligned positions. */
50558 if (pos & (size-1))
50559 return false;
50561 if (SUBREG_P (src))
50563 unsigned int srcpos = SUBREG_BYTE (src);
50565 if (srcpos > 0)
50567 rtx extr_ops[4];
50569 extr_ops[0] = gen_reg_rtx (srcmode);
50570 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
50571 extr_ops[2] = GEN_INT (size);
50572 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
50574 if (!ix86_expand_pextr (extr_ops))
50575 return false;
50577 src = extr_ops[0];
50579 else
50580 src = gen_lowpart (srcmode, SUBREG_REG (src));
50583 if (GET_MODE (dst) == dstmode)
50584 d = dst;
50585 else
50586 d = gen_reg_rtx (dstmode);
50588 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
50589 gen_lowpart (srcmode, src),
50590 GEN_INT (1 << (pos / size))));
50591 if (d != dst)
50592 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
50593 return true;
50596 default:
50597 return false;
50601 /* This function returns the calling abi specific va_list type node.
50602 It returns the FNDECL specific va_list type. */
50604 static tree
50605 ix86_fn_abi_va_list (tree fndecl)
50607 if (!TARGET_64BIT)
50608 return va_list_type_node;
50609 gcc_assert (fndecl != NULL_TREE);
50611 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
50612 return ms_va_list_type_node;
50613 else
50614 return sysv_va_list_type_node;
50617 /* Returns the canonical va_list type specified by TYPE. If there
50618 is no valid TYPE provided, it returns NULL_TREE. */
50620 static tree
50621 ix86_canonical_va_list_type (tree type)
50623 if (TARGET_64BIT)
50625 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
50626 return ms_va_list_type_node;
50628 if ((TREE_CODE (type) == ARRAY_TYPE
50629 && integer_zerop (array_type_nelts (type)))
50630 || POINTER_TYPE_P (type))
50632 tree elem_type = TREE_TYPE (type);
50633 if (TREE_CODE (elem_type) == RECORD_TYPE
50634 && lookup_attribute ("sysv_abi va_list",
50635 TYPE_ATTRIBUTES (elem_type)))
50636 return sysv_va_list_type_node;
50639 return NULL_TREE;
50642 return std_canonical_va_list_type (type);
50645 /* Iterate through the target-specific builtin types for va_list.
50646 IDX denotes the iterator, *PTREE is set to the result type of
50647 the va_list builtin, and *PNAME to its internal type.
50648 Returns zero if there is no element for this index, otherwise
50649 IDX should be increased upon the next call.
50650 Note, do not iterate a base builtin's name like __builtin_va_list.
50651 Used from c_common_nodes_and_builtins. */
50653 static int
50654 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
50656 if (TARGET_64BIT)
50658 switch (idx)
50660 default:
50661 break;
50663 case 0:
50664 *ptree = ms_va_list_type_node;
50665 *pname = "__builtin_ms_va_list";
50666 return 1;
50668 case 1:
50669 *ptree = sysv_va_list_type_node;
50670 *pname = "__builtin_sysv_va_list";
50671 return 1;
50675 return 0;
50678 #undef TARGET_SCHED_DISPATCH
50679 #define TARGET_SCHED_DISPATCH has_dispatch
50680 #undef TARGET_SCHED_DISPATCH_DO
50681 #define TARGET_SCHED_DISPATCH_DO do_dispatch
50682 #undef TARGET_SCHED_REASSOCIATION_WIDTH
50683 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
50684 #undef TARGET_SCHED_REORDER
50685 #define TARGET_SCHED_REORDER ix86_sched_reorder
50686 #undef TARGET_SCHED_ADJUST_PRIORITY
50687 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
50688 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
50689 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
50690 ix86_dependencies_evaluation_hook
50692 /* The size of the dispatch window is the total number of bytes of
50693 object code allowed in a window. */
50694 #define DISPATCH_WINDOW_SIZE 16
50696 /* Number of dispatch windows considered for scheduling. */
50697 #define MAX_DISPATCH_WINDOWS 3
50699 /* Maximum number of instructions in a window. */
50700 #define MAX_INSN 4
50702 /* Maximum number of immediate operands in a window. */
50703 #define MAX_IMM 4
50705 /* Maximum number of immediate bits allowed in a window. */
50706 #define MAX_IMM_SIZE 128
50708 /* Maximum number of 32 bit immediates allowed in a window. */
50709 #define MAX_IMM_32 4
50711 /* Maximum number of 64 bit immediates allowed in a window. */
50712 #define MAX_IMM_64 2
50714 /* Maximum total of loads or prefetches allowed in a window. */
50715 #define MAX_LOAD 2
50717 /* Maximum total of stores allowed in a window. */
50718 #define MAX_STORE 1
50720 #undef BIG
50721 #define BIG 100
50724 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
50725 enum dispatch_group {
50726 disp_no_group = 0,
50727 disp_load,
50728 disp_store,
50729 disp_load_store,
50730 disp_prefetch,
50731 disp_imm,
50732 disp_imm_32,
50733 disp_imm_64,
50734 disp_branch,
50735 disp_cmp,
50736 disp_jcc,
50737 disp_last
50740 /* Number of allowable groups in a dispatch window. It is an array
50741 indexed by dispatch_group enum. 100 is used as a big number,
50742 because the number of these kinds of operations does not have any
50743 effect in a dispatch window, but we need them for other reasons in
50744 the table. */
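/* For example, the disp_load entry is 2 and the disp_store entry is 1,
   matching MAX_LOAD and MAX_STORE above.  */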
50745 static unsigned int num_allowable_groups[disp_last] = {
50746 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
50749 char group_name[disp_last + 1][16] = {
50750 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
50751 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
50752 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
50755 /* Instruction path. */
50756 enum insn_path {
50757 no_path = 0,
50758 path_single, /* Single micro op. */
50759 path_double, /* Double micro op. */
50760 path_multi, /* Instructions with more than 2 micro ops. */
50761 last_path
50764 /* sched_insn_info defines a window to the instructions scheduled in
50765 the basic block. It contains a pointer to the insn_info table and
50766 the instruction scheduled.
50768 Windows are allocated for each basic block and are linked
50769 together. */
50770 typedef struct sched_insn_info_s {
50771 rtx insn;
50772 enum dispatch_group group;
50773 enum insn_path path;
50774 int byte_len;
50775 int imm_bytes;
50776 } sched_insn_info;
50778 /* Linked list of dispatch windows. This is a two way list of
50779 dispatch windows of a basic block. It contains information about
50780 the number of uops in the window and the total number of
50781 instructions and of bytes in the object code for this dispatch
50782 window. */
50783 typedef struct dispatch_windows_s {
50784 int num_insn; /* Number of insn in the window. */
50785 int num_uops; /* Number of uops in the window. */
50786 int window_size; /* Number of bytes in the window. */
50787 int window_num; /* Window number, either 0 or 1. */
50788 int num_imm; /* Number of immediates in an insn. */
50789 int num_imm_32; /* Number of 32 bit immediates in an insn. */
50790 int num_imm_64; /* Number of 64 bit immediates in an insn. */
50791 int imm_size; /* Total immediates in the window. */
50792 int num_loads; /* Total memory loads in the window. */
50793 int num_stores; /* Total memory stores in the window. */
50794 int violation; /* Violation exists in window. */
50795 sched_insn_info *window; /* Pointer to the window. */
50796 struct dispatch_windows_s *next;
50797 struct dispatch_windows_s *prev;
50798 } dispatch_windows;
50800 /* Immediate values used in an insn. */
50801 typedef struct imm_info_s
50803 int imm;
50804 int imm32;
50805 int imm64;
50806 } imm_info;
50808 static dispatch_windows *dispatch_window_list;
50809 static dispatch_windows *dispatch_window_list1;
50811 /* Get dispatch group of insn. */
50813 static enum dispatch_group
50814 get_mem_group (rtx_insn *insn)
50816 enum attr_memory memory;
50818 if (INSN_CODE (insn) < 0)
50819 return disp_no_group;
50820 memory = get_attr_memory (insn);
50821 if (memory == MEMORY_STORE)
50822 return disp_store;
50824 if (memory == MEMORY_LOAD)
50825 return disp_load;
50827 if (memory == MEMORY_BOTH)
50828 return disp_load_store;
50830 return disp_no_group;
50833 /* Return true if insn is a compare instruction. */
50835 static bool
50836 is_cmp (rtx_insn *insn)
50838 enum attr_type type;
50840 type = get_attr_type (insn);
50841 return (type == TYPE_TEST
50842 || type == TYPE_ICMP
50843 || type == TYPE_FCMP
50844 || GET_CODE (PATTERN (insn)) == COMPARE);
50847 /* Return true if a dispatch violation was encountered. */
50849 static bool
50850 dispatch_violation (void)
50852 if (dispatch_window_list->next)
50853 return dispatch_window_list->next->violation;
50854 return dispatch_window_list->violation;
50857 /* Return true if insn is a branch instruction. */
50859 static bool
50860 is_branch (rtx_insn *insn)
50862 return (CALL_P (insn) || JUMP_P (insn));
50865 /* Return true if insn is a prefetch instruction. */
50867 static bool
50868 is_prefetch (rtx_insn *insn)
50870 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
50873 /* This function initializes a dispatch window and the list container holding a
50874 pointer to the window. */
50876 static void
50877 init_window (int window_num)
50879 int i;
50880 dispatch_windows *new_list;
50882 if (window_num == 0)
50883 new_list = dispatch_window_list;
50884 else
50885 new_list = dispatch_window_list1;
50887 new_list->num_insn = 0;
50888 new_list->num_uops = 0;
50889 new_list->window_size = 0;
50890 new_list->next = NULL;
50891 new_list->prev = NULL;
50892 new_list->window_num = window_num;
50893 new_list->num_imm = 0;
50894 new_list->num_imm_32 = 0;
50895 new_list->num_imm_64 = 0;
50896 new_list->imm_size = 0;
50897 new_list->num_loads = 0;
50898 new_list->num_stores = 0;
50899 new_list->violation = false;
50901 for (i = 0; i < MAX_INSN; i++)
50903 new_list->window[i].insn = NULL;
50904 new_list->window[i].group = disp_no_group;
50905 new_list->window[i].path = no_path;
50906 new_list->window[i].byte_len = 0;
50907 new_list->window[i].imm_bytes = 0;
50909 return;
50912 /* This function allocates and initializes a dispatch window and the
50913 list container holding a pointer to the window. */
50915 static dispatch_windows *
50916 allocate_window (void)
50918 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
50919 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
50921 return new_list;
50924 /* This routine initializes the dispatch scheduling information. It
50925 initiates building dispatch scheduler tables and constructs the
50926 first dispatch window. */
50928 static void
50929 init_dispatch_sched (void)
50931 /* Allocate a dispatch list and a window. */
50932 dispatch_window_list = allocate_window ();
50933 dispatch_window_list1 = allocate_window ();
50934 init_window (0);
50935 init_window (1);
50938 /* This function returns true if a branch is detected. End of a basic block
50939 does not have to be a branch, but here we assume only branches end a
50940 window. */
50942 static bool
50943 is_end_basic_block (enum dispatch_group group)
50945 return group == disp_branch;
50948 /* This function is called when the end of window processing is reached. */
50950 static void
50951 process_end_window (void)
50953 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
50954 if (dispatch_window_list->next)
50956 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
50957 gcc_assert (dispatch_window_list->window_size
50958 + dispatch_window_list1->window_size <= 48);
50959 init_window (1);
50961 init_window (0);
50964 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
50965 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
50966 for 48 bytes of instructions. Note that these windows are not dispatch
50967 windows whose size is DISPATCH_WINDOW_SIZE. */
50969 static dispatch_windows *
50970 allocate_next_window (int window_num)
50972 if (window_num == 0)
50974 if (dispatch_window_list->next)
50975 init_window (1);
50976 init_window (0);
50977 return dispatch_window_list;
50980 dispatch_window_list->next = dispatch_window_list1;
50981 dispatch_window_list1->prev = dispatch_window_list;
50983 return dispatch_window_list1;
50986 /* Compute number of immediate operands of an instruction. */
50988 static void
50989 find_constant (rtx in_rtx, imm_info *imm_values)
50991 if (INSN_P (in_rtx))
50992 in_rtx = PATTERN (in_rtx);
50993 subrtx_iterator::array_type array;
50994 FOR_EACH_SUBRTX (iter, array, in_rtx, ALL)
50995 if (const_rtx x = *iter)
50996 switch (GET_CODE (x))
50998 case CONST:
50999 case SYMBOL_REF:
51000 case CONST_INT:
51001 (imm_values->imm)++;
51002 if (x86_64_immediate_operand (CONST_CAST_RTX (x), SImode))
51003 (imm_values->imm32)++;
51004 else
51005 (imm_values->imm64)++;
51006 break;
51008 case CONST_DOUBLE:
51009 case CONST_WIDE_INT:
51010 (imm_values->imm)++;
51011 (imm_values->imm64)++;
51012 break;
51014 case CODE_LABEL:
51015 if (LABEL_KIND (x) == LABEL_NORMAL)
51017 (imm_values->imm)++;
51018 (imm_values->imm32)++;
51020 break;
51022 default:
51023 break;
51027 /* Return total size of immediate operands of an instruction along with number
51028 of corresponding immediate-operands. It initializes its parameters to zero
51029 before calling FIND_CONSTANT.
51030 INSN is the input instruction. IMM is the total of immediates.
51031 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
51032 bit immediates. */
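/* For example, an insn with one 32-bit and one 64-bit immediate yields
   *IMM == 2, *IMM32 == 1, *IMM64 == 1 and a return value of
   1*4 + 1*8 == 12 bytes.  */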
51034 static int
51035 get_num_immediates (rtx_insn *insn, int *imm, int *imm32, int *imm64)
51037 imm_info imm_values = {0, 0, 0};
51039 find_constant (insn, &imm_values);
51040 *imm = imm_values.imm;
51041 *imm32 = imm_values.imm32;
51042 *imm64 = imm_values.imm64;
51043 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
51046 /* This function indicates whether the instruction INSN has any
51047 immediate operands. */
51049 static bool
51050 has_immediate (rtx_insn *insn)
51052 int num_imm_operand;
51053 int num_imm32_operand;
51054 int num_imm64_operand;
51056 if (insn)
51057 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
51058 &num_imm64_operand);
51059 return false;
51062 /* Return single or double path for instructions. */
51064 static enum insn_path
51065 get_insn_path (rtx_insn *insn)
51067 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
51069 if ((int)path == 0)
51070 return path_single;
51072 if ((int)path == 1)
51073 return path_double;
51075 return path_multi;
51078 /* Return insn dispatch group. */
51080 static enum dispatch_group
51081 get_insn_group (rtx_insn *insn)
51083 enum dispatch_group group = get_mem_group (insn);
51084 if (group)
51085 return group;
51087 if (is_branch (insn))
51088 return disp_branch;
51090 if (is_cmp (insn))
51091 return disp_cmp;
51093 if (has_immediate (insn))
51094 return disp_imm;
51096 if (is_prefetch (insn))
51097 return disp_prefetch;
51099 return disp_no_group;
51102 /* Count number of GROUP restricted instructions in a dispatch
51103 window WINDOW_LIST. */
51105 static int
51106 count_num_restricted (rtx_insn *insn, dispatch_windows *window_list)
51108 enum dispatch_group group = get_insn_group (insn);
51109 int imm_size;
51110 int num_imm_operand;
51111 int num_imm32_operand;
51112 int num_imm64_operand;
51114 if (group == disp_no_group)
51115 return 0;
51117 if (group == disp_imm)
51119 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
51120 &num_imm64_operand);
51121 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
51122 || num_imm_operand + window_list->num_imm > MAX_IMM
51123 || (num_imm32_operand > 0
51124 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
51125 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
51126 || (num_imm64_operand > 0
51127 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
51128 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
51129 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
51130 && num_imm64_operand > 0
51131 && ((window_list->num_imm_64 > 0
51132 && window_list->num_insn >= 2)
51133 || window_list->num_insn >= 3)))
51134 return BIG;
51136 return 1;
51139 if ((group == disp_load_store
51140 && (window_list->num_loads >= MAX_LOAD
51141 || window_list->num_stores >= MAX_STORE))
51142 || ((group == disp_load
51143 || group == disp_prefetch)
51144 && window_list->num_loads >= MAX_LOAD)
51145 || (group == disp_store
51146 && window_list->num_stores >= MAX_STORE))
51147 return BIG;
51149 return 1;
51152 /* This function returns true if insn satisfies dispatch rules on the
51153 last window scheduled. */
51155 static bool
51156 fits_dispatch_window (rtx_insn *insn)
51158 dispatch_windows *window_list = dispatch_window_list;
51159 dispatch_windows *window_list_next = dispatch_window_list->next;
51160 unsigned int num_restrict;
51161 enum dispatch_group group = get_insn_group (insn);
51162 enum insn_path path = get_insn_path (insn);
51163 int sum;
51165 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
51166 instructions should be given the lowest priority in the
51167 scheduling process in Haifa scheduler to make sure they will be
51168 scheduled in the same dispatch window as the reference to them. */
51169 if (group == disp_jcc || group == disp_cmp)
51170 return false;
51172 /* Check nonrestricted. */
51173 if (group == disp_no_group || group == disp_branch)
51174 return true;
51176 /* Get last dispatch window. */
51177 if (window_list_next)
51178 window_list = window_list_next;
51180 if (window_list->window_num == 1)
51182 sum = window_list->prev->window_size + window_list->window_size;
51184 if (sum == 32
51185 || (min_insn_size (insn) + sum) >= 48)
51186 /* Window 1 is full. Go for next window. */
51187 return true;
51190 num_restrict = count_num_restricted (insn, window_list);
51192 if (num_restrict > num_allowable_groups[group])
51193 return false;
51195 /* See if it fits in the first window. */
51196 if (window_list->window_num == 0)
51198 /* The first window should have only single and double path
51199 uops. */
51200 if (path == path_double
51201 && (window_list->num_uops + 2) > MAX_INSN)
51202 return false;
51203 else if (path != path_single)
51204 return false;
51206 return true;
51209 /* Add an instruction INSN with NUM_UOPS micro-operations to the
51210 dispatch window WINDOW_LIST. */
51212 static void
51213 add_insn_window (rtx_insn *insn, dispatch_windows *window_list, int num_uops)
51215 int byte_len = min_insn_size (insn);
51216 int num_insn = window_list->num_insn;
51217 int imm_size;
51218 sched_insn_info *window = window_list->window;
51219 enum dispatch_group group = get_insn_group (insn);
51220 enum insn_path path = get_insn_path (insn);
51221 int num_imm_operand;
51222 int num_imm32_operand;
51223 int num_imm64_operand;
51225 if (!window_list->violation && group != disp_cmp
51226 && !fits_dispatch_window (insn))
51227 window_list->violation = true;
51229 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
51230 &num_imm64_operand);
51232 /* Initialize window with new instruction. */
51233 window[num_insn].insn = insn;
51234 window[num_insn].byte_len = byte_len;
51235 window[num_insn].group = group;
51236 window[num_insn].path = path;
51237 window[num_insn].imm_bytes = imm_size;
51239 window_list->window_size += byte_len;
51240 window_list->num_insn = num_insn + 1;
51241 window_list->num_uops = window_list->num_uops + num_uops;
51242 window_list->imm_size += imm_size;
51243 window_list->num_imm += num_imm_operand;
51244 window_list->num_imm_32 += num_imm32_operand;
51245 window_list->num_imm_64 += num_imm64_operand;
51247 if (group == disp_store)
51248 window_list->num_stores += 1;
51249 else if (group == disp_load
51250 || group == disp_prefetch)
51251 window_list->num_loads += 1;
51252 else if (group == disp_load_store)
51254 window_list->num_stores += 1;
51255 window_list->num_loads += 1;
51259 /* Adds a scheduled instruction, INSN, to the current dispatch window.
51260 If the total instruction bytes or the number of instructions in
51261 the window would exceed the allowed maximum, a new window is allocated. */
51263 static void
51264 add_to_dispatch_window (rtx_insn *insn)
51266 int byte_len;
51267 dispatch_windows *window_list;
51268 dispatch_windows *next_list;
51269 dispatch_windows *window0_list;
51270 enum insn_path path;
51271 enum dispatch_group insn_group;
51272 bool insn_fits;
51273 int num_insn;
51274 int num_uops;
51275 int window_num;
51276 int insn_num_uops;
51277 int sum;
51279 if (INSN_CODE (insn) < 0)
51280 return;
51282 byte_len = min_insn_size (insn);
51283 window_list = dispatch_window_list;
51284 next_list = window_list->next;
51285 path = get_insn_path (insn);
51286 insn_group = get_insn_group (insn);
51288 /* Get the last dispatch window. */
51289 if (next_list)
51290 window_list = dispatch_window_list->next;
51292 if (path == path_single)
51293 insn_num_uops = 1;
51294 else if (path == path_double)
51295 insn_num_uops = 2;
51296 else
51297 insn_num_uops = (int) path;
51299 /* If the current window is full, get a new window.
51300 Window number zero is full if MAX_INSN uops are scheduled in it.
51301 Window number one is full if window zero's bytes plus window
51302 one's bytes equal 32, if adding the bytes of the new instruction
51303 makes the total greater than 48, or if it already has MAX_INSN
51304 instructions in it. */
51305 num_insn = window_list->num_insn;
51306 num_uops = window_list->num_uops;
51307 window_num = window_list->window_num;
51308 insn_fits = fits_dispatch_window (insn);
51310 if (num_insn >= MAX_INSN
51311 || num_uops + insn_num_uops > MAX_INSN
51312 || !(insn_fits))
51314 window_num = ~window_num & 1;
51315 window_list = allocate_next_window (window_num);
51318 if (window_num == 0)
51320 add_insn_window (insn, window_list, insn_num_uops);
51321 if (window_list->num_insn >= MAX_INSN
51322 && insn_group == disp_branch)
51324 process_end_window ();
51325 return;
51328 else if (window_num == 1)
51330 window0_list = window_list->prev;
51331 sum = window0_list->window_size + window_list->window_size;
51332 if (sum == 32
51333 || (byte_len + sum) >= 48)
51335 process_end_window ();
51336 window_list = dispatch_window_list;
51339 add_insn_window (insn, window_list, insn_num_uops);
51341 else
51342 gcc_unreachable ();
51344 if (is_end_basic_block (insn_group))
51346 /* End of basic block is reached; do the end-of-basic-block processing. */
51347 process_end_window ();
51348 return;
51352 /* Print the dispatch window, WINDOW_NUM, to FILE. */
51354 DEBUG_FUNCTION static void
51355 debug_dispatch_window_file (FILE *file, int window_num)
51357 dispatch_windows *list;
51358 int i;
51360 if (window_num == 0)
51361 list = dispatch_window_list;
51362 else
51363 list = dispatch_window_list1;
51365 fprintf (file, "Window #%d:\n", list->window_num);
51366 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
51367 list->num_insn, list->num_uops, list->window_size);
51368 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
51369 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
51371 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
51372 list->num_stores);
51373 fprintf (file, " insn info:\n");
51375 for (i = 0; i < MAX_INSN; i++)
51377 if (!list->window[i].insn)
51378 break;
51379 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
51380 i, group_name[list->window[i].group],
51381 i, (void *)list->window[i].insn,
51382 i, list->window[i].path,
51383 i, list->window[i].byte_len,
51384 i, list->window[i].imm_bytes);
51388 /* Print to stdout a dispatch window. */
51390 DEBUG_FUNCTION void
51391 debug_dispatch_window (int window_num)
51393 debug_dispatch_window_file (stdout, window_num);
51396 /* Print INSN dispatch information to FILE. */
51398 DEBUG_FUNCTION static void
51399 debug_insn_dispatch_info_file (FILE *file, rtx_insn *insn)
51401 int byte_len;
51402 enum insn_path path;
51403 enum dispatch_group group;
51404 int imm_size;
51405 int num_imm_operand;
51406 int num_imm32_operand;
51407 int num_imm64_operand;
51409 if (INSN_CODE (insn) < 0)
51410 return;
51412 byte_len = min_insn_size (insn);
51413 path = get_insn_path (insn);
51414 group = get_insn_group (insn);
51415 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
51416 &num_imm64_operand);
51418 fprintf (file, " insn info:\n");
51419 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
51420 group_name[group], path, byte_len);
51421 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
51422 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
51425 /* Print to STDERR the status of the ready list with respect to
51426 dispatch windows. */
51428 DEBUG_FUNCTION void
51429 debug_ready_dispatch (void)
51431 int i;
51432 int no_ready = number_in_ready ();
51434 fprintf (stdout, "Number of ready: %d\n", no_ready);
51436 for (i = 0; i < no_ready; i++)
51437 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
51440 /* This routine is the driver of the dispatch scheduler. */
51442 static void
51443 do_dispatch (rtx_insn *insn, int mode)
51445 if (mode == DISPATCH_INIT)
51446 init_dispatch_sched ();
51447 else if (mode == ADD_TO_DISPATCH_WINDOW)
51448 add_to_dispatch_window (insn);
51451 /* Return TRUE if Dispatch Scheduling is supported. */
51453 static bool
51454 has_dispatch (rtx_insn *insn, int action)
51456 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3
51457 || TARGET_BDVER4 || TARGET_ZNVER1) && flag_dispatch_scheduler)
51458 switch (action)
51460 default:
51461 return false;
51463 case IS_DISPATCH_ON:
51464 return true;
51466 case IS_CMP:
51467 return is_cmp (insn);
51469 case DISPATCH_VIOLATION:
51470 return dispatch_violation ();
51472 case FITS_DISPATCH_WINDOW:
51473 return fits_dispatch_window (insn);
51476 return false;
51479 /* Implementation of the reassociation_width target hook, used by the
51480 reassoc phase to identify the parallelism level in a reassociated
51481 tree. The statement's tree_code is passed in OPC. The type of the
51482 arguments is passed in MODE.
51484 Currently parallel reassociation is enabled for Atom
51485 processors only and we set reassociation width to be 2
51486 because Atom may issue up to 2 instructions per cycle.
51488 Return value should be fixed if parallel reassociation is
51489 enabled for other processors. */
51491 static int
51492 ix86_reassociation_width (unsigned int, machine_mode mode)
51494 /* Vector part. */
51495 if (VECTOR_MODE_P (mode))
51497 if (TARGET_VECTOR_PARALLEL_EXECUTION)
51498 return 2;
51499 else
51500 return 1;
51503 /* Scalar part. */
51504 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
51505 return 2;
51506 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
51507 return ((TARGET_64BIT && ix86_tune == PROCESSOR_HASWELL)? 4 : 2);
51508 else
51509 return 1;
51512 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
51513 place emms and femms instructions. */
51515 static machine_mode
51516 ix86_preferred_simd_mode (machine_mode mode)
51518 if (!TARGET_SSE)
51519 return word_mode;
51521 switch (mode)
51523 case E_QImode:
51524 return TARGET_AVX512BW ? V64QImode :
51525 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
51526 case E_HImode:
51527 return TARGET_AVX512BW ? V32HImode :
51528 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
51529 case E_SImode:
51530 return TARGET_AVX512F ? V16SImode :
51531 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
51532 case E_DImode:
51533 return TARGET_AVX512F ? V8DImode :
51534 (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
51536 case E_SFmode:
51537 if (TARGET_AVX512F)
51538 return V16SFmode;
51539 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
51540 return V8SFmode;
51541 else
51542 return V4SFmode;
51544 case E_DFmode:
51545 if (TARGET_AVX512F)
51546 return V8DFmode;
51547 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
51548 return V4DFmode;
51549 else if (TARGET_SSE2)
51550 return V2DFmode;
51551 /* FALLTHRU */
51553 default:
51554 return word_mode;
51558 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
51559 vectors. If AVX512F is enabled then try vectorizing with 512bit,
51560 256bit and 128bit vectors. */
51562 static unsigned int
51563 ix86_autovectorize_vector_sizes (void)
51565 return TARGET_AVX512F ? 64 | 32 | 16 :
51566 (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
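/* For illustration: with AVX-512F enabled the hook returns 64 | 32 | 16,
   i.e. the vectorizer may retry a loop with 64-, 32- and 16-byte vectors;
   a return value of 0 means only the preferred SIMD mode is tried.  */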
51569 /* Implementation of targetm.vectorize.get_mask_mode. */
51571 static machine_mode
51572 ix86_get_mask_mode (unsigned nunits, unsigned vector_size)
51574 unsigned elem_size = vector_size / nunits;
51576 /* Scalar mask case. */
51577 if ((TARGET_AVX512F && vector_size == 64)
51578 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
51580 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
51581 return smallest_mode_for_size (nunits, MODE_INT);
51584 machine_mode elem_mode
51585 = smallest_mode_for_size (elem_size * BITS_PER_UNIT, MODE_INT);
51587 gcc_assert (elem_size * nunits == vector_size);
51589 return mode_for_vector (elem_mode, nunits);
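/* Worked example: for a 64-byte vector of 16 SImode lanes (AVX512F) the
   scalar-mask path yields the smallest 16-bit integer mode, HImode; for an
   AVX2-only 32-byte vector of 8 SImode lanes the mask is itself a vector of
   element-sized integers, V8SImode.  */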
51594 /* Return class of registers which could be used for pseudo of MODE
51595 and of class RCLASS for spilling instead of memory. Return NO_REGS
51596 if it is not possible or not profitable. */
51598 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
51600 static reg_class_t
51601 ix86_spill_class (reg_class_t rclass, machine_mode mode)
51603 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
51604 && TARGET_SSE2
51605 && TARGET_INTER_UNIT_MOVES_TO_VEC
51606 && TARGET_INTER_UNIT_MOVES_FROM_VEC
51607 && (mode == SImode || (TARGET_64BIT && mode == DImode))
51608 && INTEGER_CLASS_P (rclass))
51609 return ALL_SSE_REGS;
51610 return NO_REGS;
51613 /* Implement TARGET_MAX_NOCE_IFCVT_SEQ_COST. Like the default implementation,
51614 but returns a lower bound. */
51616 static unsigned int
51617 ix86_max_noce_ifcvt_seq_cost (edge e)
51619 bool predictable_p = predictable_edge_p (e);
51621 enum compiler_param param
51622 = (predictable_p
51623 ? PARAM_MAX_RTL_IF_CONVERSION_PREDICTABLE_COST
51624 : PARAM_MAX_RTL_IF_CONVERSION_UNPREDICTABLE_COST);
51626 /* If we have a parameter set, use that, otherwise take a guess using
51627 BRANCH_COST. */
51628 if (global_options_set.x_param_values[param])
51629 return PARAM_VALUE (param);
51630 else
51631 return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
51634 /* Return true if SEQ is a good candidate as a replacement for the
51635 if-convertible sequence described in IF_INFO. */
51637 static bool
51638 ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
51640 if (TARGET_ONE_IF_CONV_INSN && if_info->speed_p)
51642 int cmov_cnt = 0;
51643 /* Punt if SEQ contains more than one CMOV or FCMOV instruction.
51644 Maybe we should allow even more conditional moves as long as they
51645 are used far enough not to stall the CPU, or also consider
51646 IF_INFO->TEST_BB succ edge probabilities. */
51647 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
51649 rtx set = single_set (insn);
51650 if (!set)
51651 continue;
51652 if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
51653 continue;
51654 rtx src = SET_SRC (set);
51655 machine_mode mode = GET_MODE (src);
51656 if (GET_MODE_CLASS (mode) != MODE_INT
51657 && GET_MODE_CLASS (mode) != MODE_FLOAT)
51658 continue;
51659 if ((!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
51660 || (!REG_P (XEXP (src, 2)) && !MEM_P (XEXP (src, 2))))
51661 continue;
51662 /* insn is CMOV or FCMOV. */
51663 if (++cmov_cnt > 1)
51664 return false;
51667 return default_noce_conversion_profitable_p (seq, if_info);
51670 /* Implement targetm.vectorize.init_cost. */
51672 static void *
51673 ix86_init_cost (struct loop *)
51675 unsigned *cost = XNEWVEC (unsigned, 3);
51676 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
51677 return cost;
51680 /* Implement targetm.vectorize.add_stmt_cost. */
51682 static unsigned
51683 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
51684 struct _stmt_vec_info *stmt_info, int misalign,
51685 enum vect_cost_model_location where)
51687 unsigned *cost = (unsigned *) data;
51688 unsigned retval = 0;
51690 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
51691 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
51693 /* Penalize DFmode vector operations for Bonnell. */
51694 if (TARGET_BONNELL && kind == vector_stmt
51695 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
51696 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
51698 /* Statements in an inner loop relative to the loop being
51699 vectorized are weighted more heavily. The value here is
51700 arbitrary and could potentially be improved with analysis. */
51701 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
51702 count *= 50; /* FIXME. */
51704 retval = (unsigned) (count * stmt_cost);
51706 /* We need to multiply all vector stmt costs by 1.7 (estimated cost)
51707 for Silvermont, as it has an out-of-order integer pipeline and can execute
51708 2 scalar instructions per tick, but has an in-order SIMD pipeline. */
51709 if ((TARGET_SILVERMONT || TARGET_INTEL)
51710 && stmt_info && stmt_info->stmt)
51712 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
51713 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
51714 retval = (retval * 17) / 10;
51717 cost[where] += retval;
51719 return retval;
51722 /* Implement targetm.vectorize.finish_cost. */
51724 static void
51725 ix86_finish_cost (void *data, unsigned *prologue_cost,
51726 unsigned *body_cost, unsigned *epilogue_cost)
51728 unsigned *cost = (unsigned *) data;
51729 *prologue_cost = cost[vect_prologue];
51730 *body_cost = cost[vect_body];
51731 *epilogue_cost = cost[vect_epilogue];
51734 /* Implement targetm.vectorize.destroy_cost_data. */
51736 static void
51737 ix86_destroy_cost_data (void *data)
51739 free (data);
51742 /* Validate target specific memory model bits in VAL. */
51744 static unsigned HOST_WIDE_INT
51745 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
51747 enum memmodel model = memmodel_from_int (val);
51748 bool strong;
51750 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
51751 |MEMMODEL_MASK)
51752 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
51754 warning (OPT_Winvalid_memory_model,
51755 "Unknown architecture specific memory model");
51756 return MEMMODEL_SEQ_CST;
51758 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
51759 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
51761 warning (OPT_Winvalid_memory_model,
51762 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
51763 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
51765 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
51767 warning (OPT_Winvalid_memory_model,
51768 "HLE_RELEASE not used with RELEASE or stronger memory model");
51769 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
51771 return val;
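/* For reference, with -mhle the HLE bits are combined with a memory model in
   user code, e.g.
     __atomic_exchange_n (&lockvar, 1, __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE);
   pairing __ATOMIC_HLE_ACQUIRE with a model weaker than acquire is what
   triggers the second warning above.  */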
51774 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
51775 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
51776 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
51777 or number of vecsize_mangle variants that should be emitted. */
51779 static int
51780 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
51781 struct cgraph_simd_clone *clonei,
51782 tree base_type, int num)
51784 int ret = 1;
51786 if (clonei->simdlen
51787 && (clonei->simdlen < 2
51788 || clonei->simdlen > 1024
51789 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
51791 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51792 "unsupported simdlen %d", clonei->simdlen);
51793 return 0;
51796 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
51797 if (TREE_CODE (ret_type) != VOID_TYPE)
51798 switch (TYPE_MODE (ret_type))
51800 case E_QImode:
51801 case E_HImode:
51802 case E_SImode:
51803 case E_DImode:
51804 case E_SFmode:
51805 case E_DFmode:
51806 /* case E_SCmode: */
51807 /* case E_DCmode: */
51808 break;
51809 default:
51810 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51811 "unsupported return type %qT for simd\n", ret_type);
51812 return 0;
51815 tree t;
51816 int i;
51818 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
51819 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
51820 switch (TYPE_MODE (TREE_TYPE (t)))
51822 case E_QImode:
51823 case E_HImode:
51824 case E_SImode:
51825 case E_DImode:
51826 case E_SFmode:
51827 case E_DFmode:
51828 /* case E_SCmode: */
51829 /* case E_DCmode: */
51830 break;
51831 default:
51832 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51833 "unsupported argument type %qT for simd\n", TREE_TYPE (t));
51834 return 0;
51837 if (clonei->cilk_elemental)
51839 /* Parse the processor clause here. If not present, default to 'b'. */
51840 clonei->vecsize_mangle = 'b';
51842 else if (!TREE_PUBLIC (node->decl))
51844 /* If the function isn't exported, we can pick up just one ISA
51845 for the clones. */
51846 if (TARGET_AVX512F)
51847 clonei->vecsize_mangle = 'e';
51848 else if (TARGET_AVX2)
51849 clonei->vecsize_mangle = 'd';
51850 else if (TARGET_AVX)
51851 clonei->vecsize_mangle = 'c';
51852 else
51853 clonei->vecsize_mangle = 'b';
51854 ret = 1;
51856 else
51858 clonei->vecsize_mangle = "bcde"[num];
51859 ret = 4;
51861 clonei->mask_mode = VOIDmode;
51862 switch (clonei->vecsize_mangle)
51864 case 'b':
51865 clonei->vecsize_int = 128;
51866 clonei->vecsize_float = 128;
51867 break;
51868 case 'c':
51869 clonei->vecsize_int = 128;
51870 clonei->vecsize_float = 256;
51871 break;
51872 case 'd':
51873 clonei->vecsize_int = 256;
51874 clonei->vecsize_float = 256;
51875 break;
51876 case 'e':
51877 clonei->vecsize_int = 512;
51878 clonei->vecsize_float = 512;
51879 if (TYPE_MODE (base_type) == QImode)
51880 clonei->mask_mode = DImode;
51881 else
51882 clonei->mask_mode = SImode;
51883 break;
51885 if (clonei->simdlen == 0)
51887 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
51888 clonei->simdlen = clonei->vecsize_int;
51889 else
51890 clonei->simdlen = clonei->vecsize_float;
51891 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
51893 else if (clonei->simdlen > 16)
51895 /* For compatibility with ICC, use the same upper bounds
51896 for simdlen. In particular, for CTYPE below, use the return type,
51897 unless the function returns void, in which case use the characteristic
51898 type. If it is possible for the given SIMDLEN to pass a CTYPE value
51899 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
51900 for 64-bit code), accept that SIMDLEN; otherwise warn and don't
51901 emit the corresponding clone. */
51902 tree ctype = ret_type;
51903 if (TREE_CODE (ret_type) == VOID_TYPE)
51904 ctype = base_type;
51905 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
51906 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
51907 cnt /= clonei->vecsize_int;
51908 else
51909 cnt /= clonei->vecsize_float;
51910 if (cnt > (TARGET_64BIT ? 16 : 8))
51912 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
51913 "unsupported simdlen %d", clonei->simdlen);
51914 return 0;
51917 return ret;
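/* Worked example of the simdlen check above (assuming a float return type):
   simdlen(32) with the 'b' (128-bit) variant needs 32 * 32 / 128 = 8 XMM
   registers, which is accepted both for 64-bit code (limit 16) and 32-bit
   code (limit 8); simdlen(64) would need 16 registers and is rejected for
   32-bit code.  */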
51920 /* Add target attribute to SIMD clone NODE if needed. */
51922 static void
51923 ix86_simd_clone_adjust (struct cgraph_node *node)
51925 const char *str = NULL;
51926 gcc_assert (node->decl == cfun->decl);
51927 switch (node->simdclone->vecsize_mangle)
51929 case 'b':
51930 if (!TARGET_SSE2)
51931 str = "sse2";
51932 break;
51933 case 'c':
51934 if (!TARGET_AVX)
51935 str = "avx";
51936 break;
51937 case 'd':
51938 if (!TARGET_AVX2)
51939 str = "avx2";
51940 break;
51941 case 'e':
51942 if (!TARGET_AVX512F)
51943 str = "avx512f";
51944 break;
51945 default:
51946 gcc_unreachable ();
51948 if (str == NULL)
51949 return;
51950 push_cfun (NULL);
51951 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
51952 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
51953 gcc_assert (ok);
51954 pop_cfun ();
51955 ix86_reset_previous_fndecl ();
51956 ix86_set_current_function (node->decl);
51959 /* If SIMD clone NODE can't be used in a vectorized loop
51960 in the current function, return -1; otherwise return the badness of using it
51961 (0 if it is the most desirable from the vecsize_mangle point of view, 1 if
51962 slightly less desirable, etc.). */
51964 static int
51965 ix86_simd_clone_usable (struct cgraph_node *node)
51967 switch (node->simdclone->vecsize_mangle)
51969 case 'b':
51970 if (!TARGET_SSE2)
51971 return -1;
51972 if (!TARGET_AVX)
51973 return 0;
51974 return TARGET_AVX2 ? 2 : 1;
51975 case 'c':
51976 if (!TARGET_AVX)
51977 return -1;
51978 return TARGET_AVX2 ? 1 : 0;
51979 case 'd':
51980 if (!TARGET_AVX2)
51981 return -1;
51982 return 0;
51983 case 'e':
51984 if (!TARGET_AVX512F)
51985 return -1;
51986 return 0;
51987 default:
51988 gcc_unreachable ();
51992 /* This function adjusts the unroll factor based on
51993 the hardware capabilities. For example, bdver3 has
51994 a loop buffer which makes unrolling of smaller
51995 loops less important. This function decides the
51996 unroll factor using the number of memory references
51997 (the value 32 is used) as a heuristic. */
51999 static unsigned
52000 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
52002 basic_block *bbs;
52003 rtx_insn *insn;
52004 unsigned i;
52005 unsigned mem_count = 0;
52007 if (!TARGET_ADJUST_UNROLL)
52008 return nunroll;
52010 /* Count the number of memory references within the loop body.
52011 This value determines the unrolling factor for bdver3 and bdver4
52012 architectures. */
52013 subrtx_iterator::array_type array;
52014 bbs = get_loop_body (loop);
52015 for (i = 0; i < loop->num_nodes; i++)
52016 FOR_BB_INSNS (bbs[i], insn)
52017 if (NONDEBUG_INSN_P (insn))
52018 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
52019 if (const_rtx x = *iter)
52020 if (MEM_P (x))
52022 machine_mode mode = GET_MODE (x);
52023 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
52024 if (n_words > 4)
52025 mem_count += 2;
52026 else
52027 mem_count += 1;
52029 free (bbs);
52031 if (mem_count && mem_count <= 32)
52032 return 32 / mem_count;
52034 return nunroll;
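/* For illustration: on bdver3/bdver4 a loop body with 8 counted memory
   references gets an unroll factor of 32 / 8 = 4; references wider than
   four words count double, and loops with more than 32 references keep
   the caller's NUNROLL.  */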
52038 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
52040 static bool
52041 ix86_float_exceptions_rounding_supported_p (void)
52043 /* For x87 floating point with standard excess precision handling,
52044 there is no adddf3 pattern (since x87 floating point only has
52045 XFmode operations) so the default hook implementation gets this
52046 wrong. */
52047 return TARGET_80387 || TARGET_SSE_MATH;
52050 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
52052 static void
52053 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
52055 if (!TARGET_80387 && !TARGET_SSE_MATH)
52056 return;
52057 tree exceptions_var = create_tmp_var_raw (integer_type_node);
52058 if (TARGET_80387)
52060 tree fenv_index_type = build_index_type (size_int (6));
52061 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
52062 tree fenv_var = create_tmp_var_raw (fenv_type);
52063 TREE_ADDRESSABLE (fenv_var) = 1;
52064 tree fenv_ptr = build_pointer_type (fenv_type);
52065 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
52066 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
52067 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
52068 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
52069 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
52070 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
52071 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
52072 tree hold_fnclex = build_call_expr (fnclex, 0);
52073 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
52074 NULL_TREE, NULL_TREE);
52075 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
52076 hold_fnclex);
52077 *clear = build_call_expr (fnclex, 0);
52078 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
52079 tree fnstsw_call = build_call_expr (fnstsw, 0);
52080 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
52081 sw_var, fnstsw_call);
52082 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
52083 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
52084 exceptions_var, exceptions_x87);
52085 *update = build2 (COMPOUND_EXPR, integer_type_node,
52086 sw_mod, update_mod);
52087 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
52088 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
52090 if (TARGET_SSE_MATH)
52092 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
52093 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
52094 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
52095 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
52096 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
52097 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
52098 mxcsr_orig_var, stmxcsr_hold_call);
52099 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
52100 mxcsr_orig_var,
52101 build_int_cst (unsigned_type_node, 0x1f80));
52102 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
52103 build_int_cst (unsigned_type_node, 0xffffffc0));
52104 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
52105 mxcsr_mod_var, hold_mod_val);
52106 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
52107 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
52108 hold_assign_orig, hold_assign_mod);
52109 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
52110 ldmxcsr_hold_call);
52111 if (*hold)
52112 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
52113 else
52114 *hold = hold_all;
52115 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
52116 if (*clear)
52117 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
52118 ldmxcsr_clear_call);
52119 else
52120 *clear = ldmxcsr_clear_call;
52121 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
52122 tree exceptions_sse = fold_convert (integer_type_node,
52123 stxmcsr_update_call);
52124 if (*update)
52126 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
52127 exceptions_var, exceptions_sse);
52128 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
52129 exceptions_var, exceptions_mod);
52130 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
52131 exceptions_assign);
52133 else
52134 *update = build2 (MODIFY_EXPR, integer_type_node,
52135 exceptions_var, exceptions_sse);
52136 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
52137 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
52138 ldmxcsr_update_call);
52140 tree atomic_feraiseexcept
52141 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
52142 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
52143 1, exceptions_var);
52144 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
52145 atomic_feraiseexcept_call);
52148 /* Return the mode to be used for bounds, or VOIDmode
52149 if bounds are not supported. */
52151 static machine_mode
52152 ix86_mpx_bound_mode ()
52154 /* Do not support pointer checker if MPX
52155 is not enabled. */
52156 if (!TARGET_MPX)
52158 if (flag_check_pointer_bounds)
52159 warning (0, "Pointer Checker requires MPX support on this target."
52160 " Use -mmpx options to enable MPX.");
52161 return VOIDmode;
52164 return BNDmode;
52167 /* Return constant used to statically initialize constant bounds.
52169 This function is used to create special bound values. For now
52170 only INIT bounds and NONE bounds are expected. More special
52171 values may be added later. */
52173 static tree
52174 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
52176 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
52177 : build_zero_cst (pointer_sized_int_node);
52178 tree high = ub ? build_zero_cst (pointer_sized_int_node)
52179 : build_minus_one_cst (pointer_sized_int_node);
52181 /* This function is supposed to be used to create INIT and
52182 NONE bounds only. */
52183 gcc_assert ((lb == 0 && ub == -1)
52184 || (lb == -1 && ub == 0));
52186 return build_complex (NULL, low, high);
52189 /* Generate a list of statements STMTS to initialize pointer bounds
52190 variable VAR with bounds LB and UB. Return the number of generated
52191 statements. */
52193 static int
52194 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
52196 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
52197 tree lhs, modify, var_p;
52199 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
52200 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
52202 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
52203 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
52204 append_to_statement_list (modify, stmts);
52206 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
52207 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
52208 TYPE_SIZE_UNIT (pointer_sized_int_node)));
52209 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
52210 append_to_statement_list (modify, stmts);
52212 return 2;
52215 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
52216 /* For i386, a common symbol is local only for non-PIE binaries. For
52217 x86-64, a common symbol is local only for non-PIE binaries or when the
52218 linker supports copy relocations in PIE binaries. */
52220 static bool
52221 ix86_binds_local_p (const_tree exp)
52223 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
52224 (!flag_pic
52225 || (TARGET_64BIT
52226 && HAVE_LD_PIE_COPYRELOC != 0)));
52228 #endif
52230 /* If MEM is in the form of [base+offset], extract the two parts
52231 of the address into BASE and OFFSET; otherwise return false. */
52233 static bool
52234 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
52236 rtx addr;
52238 gcc_assert (MEM_P (mem));
52240 addr = XEXP (mem, 0);
52242 if (GET_CODE (addr) == CONST)
52243 addr = XEXP (addr, 0);
52245 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
52247 *base = addr;
52248 *offset = const0_rtx;
52249 return true;
52252 if (GET_CODE (addr) == PLUS
52253 && (REG_P (XEXP (addr, 0))
52254 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
52255 && CONST_INT_P (XEXP (addr, 1)))
52257 *base = XEXP (addr, 0);
52258 *offset = XEXP (addr, 1);
52259 return true;
52262 return false;
52265 /* Given OPERANDS of consecutive load/store, check if we can merge
52266 them into move multiple. LOAD is true if they are load instructions.
52267 MODE is the mode of memory operands. */
52269 bool
52270 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
52271 machine_mode mode)
52273 HOST_WIDE_INT offval_1, offval_2, msize;
52274 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
52276 if (load)
52278 mem_1 = operands[1];
52279 mem_2 = operands[3];
52280 reg_1 = operands[0];
52281 reg_2 = operands[2];
52283 else
52285 mem_1 = operands[0];
52286 mem_2 = operands[2];
52287 reg_1 = operands[1];
52288 reg_2 = operands[3];
52291 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
52293 if (REGNO (reg_1) != REGNO (reg_2))
52294 return false;
52296 /* Check if the addresses are in the form of [base+offset]. */
52297 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
52298 return false;
52299 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
52300 return false;
52302 /* Check if the bases are the same. */
52303 if (!rtx_equal_p (base_1, base_2))
52304 return false;
52306 offval_1 = INTVAL (offset_1);
52307 offval_2 = INTVAL (offset_2);
52308 msize = GET_MODE_SIZE (mode);
52309 /* Check if mem_1 is adjacent to mem_2 and mem_1 has lower address. */
52310 if (offval_1 + msize != offval_2)
52311 return false;
52313 return true;
52316 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
52318 static bool
52319 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
52320 optimization_type opt_type)
52322 switch (op)
52324 case asin_optab:
52325 case acos_optab:
52326 case log1p_optab:
52327 case exp_optab:
52328 case exp10_optab:
52329 case exp2_optab:
52330 case expm1_optab:
52331 case ldexp_optab:
52332 case scalb_optab:
52333 case round_optab:
52334 return opt_type == OPTIMIZE_FOR_SPEED;
52336 case rint_optab:
52337 if (SSE_FLOAT_MODE_P (mode1)
52338 && TARGET_SSE_MATH
52339 && !flag_trapping_math
52340 && !TARGET_SSE4_1)
52341 return opt_type == OPTIMIZE_FOR_SPEED;
52342 return true;
52344 case floor_optab:
52345 case ceil_optab:
52346 case btrunc_optab:
52347 if (SSE_FLOAT_MODE_P (mode1)
52348 && TARGET_SSE_MATH
52349 && !flag_trapping_math
52350 && TARGET_SSE4_1)
52351 return true;
52352 return opt_type == OPTIMIZE_FOR_SPEED;
52354 case rsqrt_optab:
52355 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
52357 default:
52358 return true;
52362 /* Address space support.
52364 This is not "far pointers" in the 16-bit sense, but an easy way
52365 to use %fs and %gs segment prefixes. Therefore:
52367 (a) All address spaces have the same modes,
52368 (b) All address spaces have the same address forms,
52369 (c) While %fs and %gs are technically subsets of the generic
52370 address space, they are probably not subsets of each other.
52371 (d) Since we have no access to the segment base register values
52372 without resorting to a system call, we cannot convert a
52373 non-default address space to a default address space.
52374 Therefore we do not claim %fs or %gs are subsets of generic.
52376 Therefore we can (mostly) use the default hooks. */
52378 /* All use of segmentation is assumed to make address 0 valid. */
52380 static bool
52381 ix86_addr_space_zero_address_valid (addr_space_t as)
52383 return as != ADDR_SPACE_GENERIC;
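/* In user code these address spaces surface as the __seg_fs / __seg_gs
   qualifiers; a rough sketch (the TCB layout here is purely hypothetical):
     struct tcb { int tid; };
     static inline int get_tid (void)
     { return ((__seg_fs struct tcb *) 0)->tid; }
   which is why address 0 is reported as valid for these spaces above.  */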
52386 static void
52387 ix86_init_libfuncs (void)
52389 if (TARGET_64BIT)
52391 set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
52392 set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
52394 else
52396 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
52397 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
52400 #if TARGET_MACHO
52401 darwin_rename_builtins ();
52402 #endif
52405 /* Generate call to __divmoddi4. */
52407 static void
52408 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
52409 rtx op0, rtx op1,
52410 rtx *quot_p, rtx *rem_p)
52412 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
52414 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
52415 mode, 3,
52416 op0, GET_MODE (op0),
52417 op1, GET_MODE (op1),
52418 XEXP (rem, 0), Pmode);
52419 *quot_p = quot;
52420 *rem_p = rem;
52423 /* Set the value of FLT_EVAL_METHOD in float.h. When using only the
52424 FPU, assume that the fpcw is set to extended precision; when using
52425 only SSE, rounding is correct; when using both SSE and the FPU,
52426 the rounding precision is indeterminate, since either may be chosen
52427 apparently at random. */
52429 static enum flt_eval_method
52430 ix86_excess_precision (enum excess_precision_type type)
52432 switch (type)
52434 case EXCESS_PRECISION_TYPE_FAST:
52435 /* The fastest type to promote to will always be the native type,
52436 whether that occurs with implicit excess precision or
52437 otherwise. */
52438 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
52439 case EXCESS_PRECISION_TYPE_STANDARD:
52440 case EXCESS_PRECISION_TYPE_IMPLICIT:
52441 /* Otherwise, the excess precision we want when we are
52442 in a standards compliant mode, and the implicit precision we
52443 provide would be identical were it not for the unpredictable
52444 cases. */
52445 if (!TARGET_80387)
52446 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
52447 else if (!TARGET_MIX_SSE_I387)
52449 if (!TARGET_SSE_MATH)
52450 return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
52451 else if (TARGET_SSE2)
52452 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
52455 /* If we are in standards compliant mode, but we know we will
52456 calculate in unpredictable precision, return
52457 FLT_EVAL_METHOD_FLOAT. There is no reason to introduce explicit
52458 excess precision if the target can't guarantee it will honor
52459 it. */
52460 return (type == EXCESS_PRECISION_TYPE_STANDARD
52461 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
52462 : FLT_EVAL_METHOD_UNPREDICTABLE);
52463 default:
52464 gcc_unreachable ();
52467 return FLT_EVAL_METHOD_UNPREDICTABLE;
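/* Concretely, for the common option combinations: -mfpmath=387 in a
   standards-compliant mode yields FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE,
   -mfpmath=sse with SSE2 yields FLT_EVAL_METHOD_PROMOTE_TO_FLOAT, and
   -mfpmath=both reaches the standards-vs-implicit return above:
   promote-to-float in a standards mode, unpredictable otherwise.  */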
52470 /* Target-specific selftests. */
52472 #if CHECKING_P
52474 namespace selftest {
52476 /* Verify that hard regs are dumped as expected (in compact mode). */
52478 static void
52479 ix86_test_dumping_hard_regs ()
52481 ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
52482 ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
52485 /* Test dumping an insn with repeated references to the same SCRATCH,
52486 to verify the rtx_reuse code. */
52488 static void
52489 ix86_test_dumping_memory_blockage ()
52491 set_new_first_and_last_insn (NULL, NULL);
52493 rtx pat = gen_memory_blockage ();
52494 rtx_reuse_manager r;
52495 r.preprocess (pat);
52497 /* Verify that the repeated references to the SCRATCH are shown using
52498 reuse IDs. The first should be prefixed with a reuse ID,
52499 and the second should be dumped as a "reuse_rtx" of that ID.
52500 The expected string assumes Pmode == DImode. */
52501 if (Pmode == DImode)
52502 ASSERT_RTL_DUMP_EQ_WITH_REUSE
52503 ("(cinsn 1 (set (mem/v:BLK (0|scratch:DI) [0 A8])\n"
52504 " (unspec:BLK [\n"
52505 " (mem/v:BLK (reuse_rtx 0) [0 A8])\n"
52506 " ] UNSPEC_MEMORY_BLOCKAGE)))\n", pat, &r);
52509 /* Verify loading an RTL dump; specifically a dump of copying
52510 a param on x86_64 from a hard reg into the frame.
52511 This test is target-specific since the dump contains target-specific
52512 hard reg names. */
52514 static void
52515 ix86_test_loading_dump_fragment_1 ()
52517 rtl_dump_test t (SELFTEST_LOCATION,
52518 locate_file ("x86_64/copy-hard-reg-into-frame.rtl"));
52520 rtx_insn *insn = get_insn_by_uid (1);
52522 /* The block structure and indentation here is purely for
52523 readability; it mirrors the structure of the rtx. */
52524 tree mem_expr;
52526 rtx pat = PATTERN (insn);
52527 ASSERT_EQ (SET, GET_CODE (pat));
52529 rtx dest = SET_DEST (pat);
52530 ASSERT_EQ (MEM, GET_CODE (dest));
52531 /* Verify the "/c" was parsed. */
52532 ASSERT_TRUE (RTX_FLAG (dest, call));
52533 ASSERT_EQ (SImode, GET_MODE (dest));
52535 rtx addr = XEXP (dest, 0);
52536 ASSERT_EQ (PLUS, GET_CODE (addr));
52537 ASSERT_EQ (DImode, GET_MODE (addr));
52539 rtx lhs = XEXP (addr, 0);
52540 /* Verify that the "frame" REG was consolidated. */
52541 ASSERT_RTX_PTR_EQ (frame_pointer_rtx, lhs);
52544 rtx rhs = XEXP (addr, 1);
52545 ASSERT_EQ (CONST_INT, GET_CODE (rhs));
52546 ASSERT_EQ (-4, INTVAL (rhs));
52549 /* Verify the "[1 i+0 S4 A32]" was parsed. */
52550 ASSERT_EQ (1, MEM_ALIAS_SET (dest));
52551 /* "i" should have been handled by synthesizing a global int
52552 variable named "i". */
52553 mem_expr = MEM_EXPR (dest);
52554 ASSERT_NE (mem_expr, NULL);
52555 ASSERT_EQ (VAR_DECL, TREE_CODE (mem_expr));
52556 ASSERT_EQ (integer_type_node, TREE_TYPE (mem_expr));
52557 ASSERT_EQ (IDENTIFIER_NODE, TREE_CODE (DECL_NAME (mem_expr)));
52558 ASSERT_STREQ ("i", IDENTIFIER_POINTER (DECL_NAME (mem_expr)));
52559 /* "+0". */
52560 ASSERT_TRUE (MEM_OFFSET_KNOWN_P (dest));
52561 ASSERT_EQ (0, MEM_OFFSET (dest));
52562 /* "S4". */
52563 ASSERT_EQ (4, MEM_SIZE (dest));
52564 /* "A32. */
52565 ASSERT_EQ (32, MEM_ALIGN (dest));
52568 rtx src = SET_SRC (pat);
52569 ASSERT_EQ (REG, GET_CODE (src));
52570 ASSERT_EQ (SImode, GET_MODE (src));
52571 ASSERT_EQ (5, REGNO (src));
52572 tree reg_expr = REG_EXPR (src);
52573 /* "i" here should point to the same var as for the MEM_EXPR. */
52574 ASSERT_EQ (reg_expr, mem_expr);
52579 /* Verify that the RTL loader copes with a call_insn dump.
52580 This test is target-specific since the dump contains a target-specific
52581 hard reg name. */
52583 static void
52584 ix86_test_loading_call_insn ()
52586 /* The test dump includes register "xmm0", which requires TARGET_SSE
52587 to exist. */
52588 if (!TARGET_SSE)
52589 return;
52591 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/call-insn.rtl"));
52593 rtx_insn *insn = get_insns ();
52594 ASSERT_EQ (CALL_INSN, GET_CODE (insn));
52596 /* "/j". */
52597 ASSERT_TRUE (RTX_FLAG (insn, jump));
52599 rtx pat = PATTERN (insn);
52600 ASSERT_EQ (CALL, GET_CODE (SET_SRC (pat)));
52602 /* Verify REG_NOTES. */
52604 /* "(expr_list:REG_CALL_DECL". */
52605 ASSERT_EQ (EXPR_LIST, GET_CODE (REG_NOTES (insn)));
52606 rtx_expr_list *note0 = as_a <rtx_expr_list *> (REG_NOTES (insn));
52607 ASSERT_EQ (REG_CALL_DECL, REG_NOTE_KIND (note0));
52609 /* "(expr_list:REG_EH_REGION (const_int 0 [0])". */
52610 rtx_expr_list *note1 = note0->next ();
52611 ASSERT_EQ (REG_EH_REGION, REG_NOTE_KIND (note1));
52613 ASSERT_EQ (NULL, note1->next ());
52616 /* Verify CALL_INSN_FUNCTION_USAGE. */
52618 /* "(expr_list:DF (use (reg:DF 21 xmm0))". */
52619 rtx_expr_list *usage
52620 = as_a <rtx_expr_list *> (CALL_INSN_FUNCTION_USAGE (insn));
52621 ASSERT_EQ (EXPR_LIST, GET_CODE (usage));
52622 ASSERT_EQ (DFmode, GET_MODE (usage));
52623 ASSERT_EQ (USE, GET_CODE (usage->element ()));
52624 ASSERT_EQ (NULL, usage->next ());
52628 /* Verify that the RTL loader copes with a dump from print_rtx_function.
52629 This test is target-specific since the dump contains target-specific
52630 hard reg names. */
52632 static void
52633 ix86_test_loading_full_dump ()
52635 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/times-two.rtl"));
52637 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
52639 rtx_insn *insn_1 = get_insn_by_uid (1);
52640 ASSERT_EQ (NOTE, GET_CODE (insn_1));
52642 rtx_insn *insn_7 = get_insn_by_uid (7);
52643 ASSERT_EQ (INSN, GET_CODE (insn_7));
52644 ASSERT_EQ (PARALLEL, GET_CODE (PATTERN (insn_7)));
52646 rtx_insn *insn_15 = get_insn_by_uid (15);
52647 ASSERT_EQ (INSN, GET_CODE (insn_15));
52648 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
52650 /* Verify crtl->return_rtx. */
52651 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
52652 ASSERT_EQ (0, REGNO (crtl->return_rtx));
52653 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
52656 /* Verify that the RTL loader copes with UNSPEC and UNSPEC_VOLATILE insns.
52657 In particular, verify that it correctly loads the 2nd operand.
52658 This test is target-specific since these are machine-specific
52659 operands (and enums). */
52661 static void
52662 ix86_test_loading_unspec ()
52664 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/unspec.rtl"));
52666 ASSERT_STREQ ("test_unspec", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
52668 ASSERT_TRUE (cfun);
52670 /* Test of an UNSPEC. */
52671 rtx_insn *insn = get_insns ();
52672 ASSERT_EQ (INSN, GET_CODE (insn));
52673 rtx set = single_set (insn);
52674 ASSERT_NE (NULL, set);
52675 rtx dst = SET_DEST (set);
52676 ASSERT_EQ (MEM, GET_CODE (dst));
52677 rtx src = SET_SRC (set);
52678 ASSERT_EQ (UNSPEC, GET_CODE (src));
52679 ASSERT_EQ (BLKmode, GET_MODE (src));
52680 ASSERT_EQ (UNSPEC_MEMORY_BLOCKAGE, XINT (src, 1));
52682 rtx v0 = XVECEXP (src, 0, 0);
52684 /* Verify that the two uses of the first SCRATCH have pointer
52685 equality. */
52686 rtx scratch_a = XEXP (dst, 0);
52687 ASSERT_EQ (SCRATCH, GET_CODE (scratch_a));
52689 rtx scratch_b = XEXP (v0, 0);
52690 ASSERT_EQ (SCRATCH, GET_CODE (scratch_b));
52692 ASSERT_EQ (scratch_a, scratch_b);
52694 /* Verify that the two mems are thus treated as equal. */
52695 ASSERT_TRUE (rtx_equal_p (dst, v0));
52697 /* Verify that the insn is recognized. */
52698 ASSERT_NE (-1, recog_memoized (insn));
52700 /* Test of an UNSPEC_VOLATILE, which has its own enum values. */
52701 insn = NEXT_INSN (insn);
52702 ASSERT_EQ (INSN, GET_CODE (insn));
52704 set = single_set (insn);
52705 ASSERT_NE (NULL, set);
52707 src = SET_SRC (set);
52708 ASSERT_EQ (UNSPEC_VOLATILE, GET_CODE (src));
52709 ASSERT_EQ (UNSPECV_RDTSCP, XINT (src, 1));
52712 /* Run all target-specific selftests. */
52714 static void
52715 ix86_run_selftests (void)
52717 ix86_test_dumping_hard_regs ();
52718 ix86_test_dumping_memory_blockage ();
52720 /* Various tests of loading RTL dumps, here because they contain
52721 ix86-isms (e.g. names of hard regs). */
52722 ix86_test_loading_dump_fragment_1 ();
52723 ix86_test_loading_call_insn ();
52724 ix86_test_loading_full_dump ();
52725 ix86_test_loading_unspec ();
52728 } // namespace selftest
52730 #endif /* CHECKING_P */
52732 /* Initialize the GCC target structure. */
52733 #undef TARGET_RETURN_IN_MEMORY
52734 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
52736 #undef TARGET_LEGITIMIZE_ADDRESS
52737 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
52739 #undef TARGET_ATTRIBUTE_TABLE
52740 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
52741 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
52742 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
52743 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
52744 # undef TARGET_MERGE_DECL_ATTRIBUTES
52745 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
52746 #endif
52748 #undef TARGET_COMP_TYPE_ATTRIBUTES
52749 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
52751 #undef TARGET_INIT_BUILTINS
52752 #define TARGET_INIT_BUILTINS ix86_init_builtins
52753 #undef TARGET_BUILTIN_DECL
52754 #define TARGET_BUILTIN_DECL ix86_builtin_decl
52755 #undef TARGET_EXPAND_BUILTIN
52756 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
52758 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
52759 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
52760 ix86_builtin_vectorized_function
52762 #undef TARGET_VECTORIZE_BUILTIN_GATHER
52763 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
52765 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
52766 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
52768 #undef TARGET_BUILTIN_RECIPROCAL
52769 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
52771 #undef TARGET_ASM_FUNCTION_EPILOGUE
52772 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
52774 #undef TARGET_ENCODE_SECTION_INFO
52775 #ifndef SUBTARGET_ENCODE_SECTION_INFO
52776 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
52777 #else
52778 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
52779 #endif
52781 #undef TARGET_ASM_OPEN_PAREN
52782 #define TARGET_ASM_OPEN_PAREN ""
52783 #undef TARGET_ASM_CLOSE_PAREN
52784 #define TARGET_ASM_CLOSE_PAREN ""
52786 #undef TARGET_ASM_BYTE_OP
52787 #define TARGET_ASM_BYTE_OP ASM_BYTE
52789 #undef TARGET_ASM_ALIGNED_HI_OP
52790 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
52791 #undef TARGET_ASM_ALIGNED_SI_OP
52792 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
52793 #ifdef ASM_QUAD
52794 #undef TARGET_ASM_ALIGNED_DI_OP
52795 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
52796 #endif
52798 #undef TARGET_PROFILE_BEFORE_PROLOGUE
52799 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
52801 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
52802 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
52804 #undef TARGET_ASM_UNALIGNED_HI_OP
52805 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
52806 #undef TARGET_ASM_UNALIGNED_SI_OP
52807 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
52808 #undef TARGET_ASM_UNALIGNED_DI_OP
52809 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
52811 #undef TARGET_PRINT_OPERAND
52812 #define TARGET_PRINT_OPERAND ix86_print_operand
52813 #undef TARGET_PRINT_OPERAND_ADDRESS
52814 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
52815 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
52816 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
52817 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
52818 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
52820 #undef TARGET_SCHED_INIT_GLOBAL
52821 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
52822 #undef TARGET_SCHED_ADJUST_COST
52823 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
52824 #undef TARGET_SCHED_ISSUE_RATE
52825 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
52826 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
52827 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
52828 ia32_multipass_dfa_lookahead
52829 #undef TARGET_SCHED_MACRO_FUSION_P
52830 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
52831 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
52832 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
52834 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
52835 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
52837 #undef TARGET_MEMMODEL_CHECK
52838 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
52840 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
52841 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
52843 #ifdef HAVE_AS_TLS
52844 #undef TARGET_HAVE_TLS
52845 #define TARGET_HAVE_TLS true
52846 #endif
52847 #undef TARGET_CANNOT_FORCE_CONST_MEM
52848 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
52849 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
52850 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
52852 #undef TARGET_DELEGITIMIZE_ADDRESS
52853 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
52855 #undef TARGET_MS_BITFIELD_LAYOUT_P
52856 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
52858 #if TARGET_MACHO
52859 #undef TARGET_BINDS_LOCAL_P
52860 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
52861 #else
52862 #undef TARGET_BINDS_LOCAL_P
52863 #define TARGET_BINDS_LOCAL_P ix86_binds_local_p
52864 #endif
52865 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
52866 #undef TARGET_BINDS_LOCAL_P
52867 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
52868 #endif
52870 #undef TARGET_ASM_OUTPUT_MI_THUNK
52871 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
52872 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
52873 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
52875 #undef TARGET_ASM_FILE_START
52876 #define TARGET_ASM_FILE_START x86_file_start
52878 #undef TARGET_OPTION_OVERRIDE
52879 #define TARGET_OPTION_OVERRIDE ix86_option_override
52881 #undef TARGET_REGISTER_MOVE_COST
52882 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
52883 #undef TARGET_MEMORY_MOVE_COST
52884 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
52885 #undef TARGET_RTX_COSTS
52886 #define TARGET_RTX_COSTS ix86_rtx_costs
52887 #undef TARGET_ADDRESS_COST
52888 #define TARGET_ADDRESS_COST ix86_address_cost
52890 #undef TARGET_FLAGS_REGNUM
52891 #define TARGET_FLAGS_REGNUM FLAGS_REG
52892 #undef TARGET_FIXED_CONDITION_CODE_REGS
52893 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
52894 #undef TARGET_CC_MODES_COMPATIBLE
52895 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
52897 #undef TARGET_MACHINE_DEPENDENT_REORG
52898 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
52900 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
52901 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
52903 #undef TARGET_BUILD_BUILTIN_VA_LIST
52904 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
52906 #undef TARGET_FOLD_BUILTIN
52907 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
52909 #undef TARGET_GIMPLE_FOLD_BUILTIN
52910 #define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin
52912 #undef TARGET_COMPARE_VERSION_PRIORITY
52913 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
52915 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
52916 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
52917 ix86_generate_version_dispatcher_body
52919 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
52920 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
52921 ix86_get_function_versions_dispatcher
52923 #undef TARGET_ENUM_VA_LIST_P
52924 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
52926 #undef TARGET_FN_ABI_VA_LIST
52927 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
52929 #undef TARGET_CANONICAL_VA_LIST_TYPE
52930 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
52932 #undef TARGET_EXPAND_BUILTIN_VA_START
52933 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
52935 #undef TARGET_MD_ASM_ADJUST
52936 #define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust
52938 #undef TARGET_C_EXCESS_PRECISION
52939 #define TARGET_C_EXCESS_PRECISION ix86_excess_precision
52940 #undef TARGET_PROMOTE_PROTOTYPES
52941 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
52942 #undef TARGET_SETUP_INCOMING_VARARGS
52943 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
52944 #undef TARGET_MUST_PASS_IN_STACK
52945 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
52946 #undef TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS
52947 #define TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS ix86_allocate_stack_slots_for_args
52948 #undef TARGET_FUNCTION_ARG_ADVANCE
52949 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
52950 #undef TARGET_FUNCTION_ARG
52951 #define TARGET_FUNCTION_ARG ix86_function_arg
52952 #undef TARGET_INIT_PIC_REG
52953 #define TARGET_INIT_PIC_REG ix86_init_pic_reg
52954 #undef TARGET_USE_PSEUDO_PIC_REG
52955 #define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
52956 #undef TARGET_FUNCTION_ARG_BOUNDARY
52957 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
52958 #undef TARGET_PASS_BY_REFERENCE
52959 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
52960 #undef TARGET_INTERNAL_ARG_POINTER
52961 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
52962 #undef TARGET_UPDATE_STACK_BOUNDARY
52963 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
52964 #undef TARGET_GET_DRAP_RTX
52965 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
52966 #undef TARGET_STRICT_ARGUMENT_NAMING
52967 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
52968 #undef TARGET_STATIC_CHAIN
52969 #define TARGET_STATIC_CHAIN ix86_static_chain
52970 #undef TARGET_TRAMPOLINE_INIT
52971 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
52972 #undef TARGET_RETURN_POPS_ARGS
52973 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
52975 #undef TARGET_WARN_FUNC_RETURN
52976 #define TARGET_WARN_FUNC_RETURN ix86_warn_func_return
52978 #undef TARGET_LEGITIMATE_COMBINED_INSN
52979 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
52981 #undef TARGET_ASAN_SHADOW_OFFSET
52982 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
52984 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
52985 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
52987 #undef TARGET_SCALAR_MODE_SUPPORTED_P
52988 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
52990 #undef TARGET_VECTOR_MODE_SUPPORTED_P
52991 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
52993 #undef TARGET_C_MODE_FOR_SUFFIX
52994 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
52996 #ifdef HAVE_AS_TLS
52997 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
52998 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
52999 #endif
53001 #ifdef SUBTARGET_INSERT_ATTRIBUTES
53002 #undef TARGET_INSERT_ATTRIBUTES
53003 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
53004 #endif
53006 #undef TARGET_MANGLE_TYPE
53007 #define TARGET_MANGLE_TYPE ix86_mangle_type
53009 #undef TARGET_STACK_PROTECT_GUARD
53010 #define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard
53012 #if !TARGET_MACHO
53013 #undef TARGET_STACK_PROTECT_FAIL
53014 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
53015 #endif
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE ix86_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change

#undef TARGET_MEMBER_TYPE_FORCES_BLK
#define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk

#undef TARGET_INSTANTIATE_DECLS
#define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
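
/* Register-class queries used by reload and the register allocator.  */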
#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD ix86_secondary_reload

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
#undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
#define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
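
/* Vectorizer cost model and SIMD configuration hooks.  */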
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  ix86_builtin_vectorization_cost
#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
  ix86_vectorize_vec_perm_const_ok
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
  ix86_preferred_simd_mode
#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  ix86_autovectorize_vector_sizes
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
#undef TARGET_VECTORIZE_INIT_COST
#define TARGET_VECTORIZE_INIT_COST ix86_init_cost
#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
#undef TARGET_VECTORIZE_FINISH_COST
#define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
#undef TARGET_VECTORIZE_DESTROY_COST_DATA
#define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
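
/* Per-function target options: attribute ((target)) handling, option
   save/restore for streaming, and function multiversioning.  */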
#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE ix86_function_specific_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE ix86_function_specific_restore

#undef TARGET_OPTION_POST_STREAM_IN
#define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT ix86_function_specific_print

#undef TARGET_OPTION_FUNCTION_VERSIONS
#define TARGET_OPTION_FUNCTION_VERSIONS common_function_versions

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P ix86_can_inline_p

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p

#undef TARGET_REGISTER_PRIORITY
#define TARGET_REGISTER_PRIORITY ix86_register_priority

#undef TARGET_REGISTER_USAGE_LEVELING_P
#define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p

#undef TARGET_COMPUTE_FRAME_LAYOUT
#define TARGET_COMPUTE_FRAME_LAYOUT ix86_compute_frame_layout

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE ix86_can_eliminate

#undef TARGET_EXTRA_LIVE_ON_ENTRY
#define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry

#undef TARGET_ASM_CODE_END
#define TARGET_ASM_CODE_END ix86_code_end

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage

#undef TARGET_LOOP_UNROLL_ADJUST
#define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust

/* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657.  */
#undef TARGET_SPILL_CLASS
#define TARGET_SPILL_CLASS ix86_spill_class
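
/* SIMD clones for OpenMP "declare simd" functions.  */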
#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  ix86_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST \
  ix86_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE \
  ix86_simd_clone_usable

#undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
#define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
  ix86_float_exceptions_rounding_supported_p
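
/* Mode-switching hooks for the optimize_mode_switching pass; on x86
   these cover AVX/SSE transitions (vzeroupper) and x87 control-word
   changes.  */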
#undef TARGET_MODE_EMIT
#define TARGET_MODE_EMIT ix86_emit_mode_set

#undef TARGET_MODE_NEEDED
#define TARGET_MODE_NEEDED ix86_mode_needed

#undef TARGET_MODE_AFTER
#define TARGET_MODE_AFTER ix86_mode_after

#undef TARGET_MODE_ENTRY
#define TARGET_MODE_ENTRY ix86_mode_entry

#undef TARGET_MODE_EXIT
#define TARGET_MODE_EXIT ix86_mode_exit

#undef TARGET_MODE_PRIORITY
#define TARGET_MODE_PRIORITY ix86_mode_priority

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
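
/* Pointer Bounds Checker (Intel MPX) hooks.  */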
#undef TARGET_LOAD_BOUNDS_FOR_ARG
#define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds

#undef TARGET_STORE_BOUNDS_FOR_ARG
#define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds

#undef TARGET_LOAD_RETURNED_BOUNDS
#define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds

#undef TARGET_STORE_RETURNED_BOUNDS
#define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds

#undef TARGET_CHKP_BOUND_MODE
#define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode

#undef TARGET_BUILTIN_CHKP_FUNCTION
#define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function

#undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
#define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds

#undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
#define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant

#undef TARGET_CHKP_INITIALIZE_BOUNDS
#define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds

#undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
#define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds

#undef TARGET_OFFLOAD_OPTIONS
#define TARGET_OFFLOAD_OPTIONS \
  ix86_offload_options
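
/* Biggest alignment, in bits, that any variable may be given; 512 bits
   presumably corresponds to the 64-byte ZMM registers.  */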
#undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
#define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p

#undef TARGET_HARD_REGNO_SCRATCH_OK
#define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok

#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1

#undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
#define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p

#undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
#define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS ix86_init_libfuncs

#undef TARGET_EXPAND_DIVMOD_LIBFUNC
#define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc

#undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
#define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost

#undef TARGET_NOCE_CONVERSION_PROFITABLE_P
#define TARGET_NOCE_CONVERSION_PROFITABLE_P ix86_noce_conversion_profitable_p
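
/* Target-specific selftests, run only when GCC itself is built with
   checking enabled.  */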
#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
#endif /* #if CHECKING_P */
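
/* The hooks selected above are reached through the targetm vector rather
   than by calling the ix86_* functions directly; e.g. the middle end calls
   targetm.calls.function_arg, which TARGET_FUNCTION_ARG maps to
   ix86_function_arg.  TARGET_INITIALIZER fills in every field, taking the
   defaults from target.def for hooks not overridden here.  */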
struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-i386.h"