1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2013 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "stringpool.h"
27 #include "attribs.h"
28 #include "calls.h"
29 #include "stor-layout.h"
30 #include "varasm.h"
31 #include "tm_p.h"
32 #include "regs.h"
33 #include "hard-reg-set.h"
34 #include "insn-config.h"
35 #include "conditions.h"
36 #include "output.h"
37 #include "insn-codes.h"
38 #include "insn-attr.h"
39 #include "flags.h"
40 #include "except.h"
41 #include "function.h"
42 #include "recog.h"
43 #include "expr.h"
44 #include "optabs.h"
45 #include "diagnostic-core.h"
46 #include "toplev.h"
47 #include "basic-block.h"
48 #include "ggc.h"
49 #include "target.h"
50 #include "target-def.h"
51 #include "common/common-target.h"
52 #include "langhooks.h"
53 #include "reload.h"
54 #include "cgraph.h"
55 #include "pointer-set.h"
56 #include "hash-table.h"
57 #include "vec.h"
58 #include "basic-block.h"
59 #include "tree-ssa-alias.h"
60 #include "internal-fn.h"
61 #include "gimple-fold.h"
62 #include "tree-eh.h"
63 #include "gimple-expr.h"
64 #include "is-a.h"
65 #include "gimple.h"
66 #include "gimplify.h"
67 #include "dwarf2.h"
68 #include "df.h"
69 #include "tm-constrs.h"
70 #include "params.h"
71 #include "cselib.h"
72 #include "debug.h"
73 #include "sched-int.h"
74 #include "sbitmap.h"
75 #include "fibheap.h"
76 #include "opts.h"
77 #include "diagnostic.h"
78 #include "dumpfile.h"
79 #include "tree-pass.h"
80 #include "context.h"
81 #include "pass_manager.h"
83 static rtx legitimize_dllimport_symbol (rtx, bool);
84 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
85 static rtx legitimize_pe_coff_symbol (rtx, bool);
87 #ifndef CHECK_STACK_LIMIT
88 #define CHECK_STACK_LIMIT (-1)
89 #endif
91 /* Return index of given mode in mult and division cost tables. */
92 #define MODE_INDEX(mode) \
93 ((mode) == QImode ? 0 \
94 : (mode) == HImode ? 1 \
95 : (mode) == SImode ? 2 \
96 : (mode) == DImode ? 3 \
97 : 4)
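/* Illustrative sketch: MODE_INDEX picks the per-mode slot in the mult and
   divide cost arrays of the structures below.  Assuming the struct
   processor_costs field names from i386.h (mult_init, mult_bit, divide) and
   the global ix86_cost pointer, a cost query looks roughly like:

     int idx = MODE_INDEX (mode);             // 0..3 for QI..DI, 4 = other
     int mul = ix86_cost->mult_init[idx]      // cost of starting a multiply
               + nbits * ix86_cost->mult_bit; // plus a per-set-bit cost
     int div = ix86_cost->divide[idx];        // divide/mod cost

   where nbits stands for the number of set bits in a constant multiplier.  */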
99 /* Processor costs (relative to an add) */
100 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
101 #define COSTS_N_BYTES(N) ((N) * 2)
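/* Worked example of the scale, using only the assumption stated above: with
   COSTS_N_INSNS (N) == (N) * 4 and a 2-byte add, COSTS_N_BYTES (2) == 4 ==
   COSTS_N_INSNS (1), so the size-cost table below measures costs in bytes on
   a scale where a 2-byte addition counts as one instruction; e.g.
   COSTS_N_BYTES (3) == 6 rates a 3-byte instruction at 1.5 adds.  */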
103 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
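/* How the stringop_algs tables below are read (a sketch of the layout
   declared in i386.h; treat the details as an assumption): the first member
   of each initializer is the algorithm used when the block size is unknown
   at compile time, followed by {max_size, algorithm, noalign} entries tried
   in order, where max_size == -1 means "any larger size".  Element [0] of
   each two-element array is the 32-bit tuning and element [1] the 64-bit
   tuning, which is why DUMMY_STRINGOP_ALGS fills the second slot for
   processors tuned only for 32-bit code.  */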
105 static stringop_algs ix86_size_memcpy[2] = {
106 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
107 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
108 static stringop_algs ix86_size_memset[2] = {
109 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
110 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
112 const
113 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
114 COSTS_N_BYTES (2), /* cost of an add instruction */
115 COSTS_N_BYTES (3), /* cost of a lea instruction */
116 COSTS_N_BYTES (2), /* variable shift costs */
117 COSTS_N_BYTES (3), /* constant shift costs */
118 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
119 COSTS_N_BYTES (3), /* HI */
120 COSTS_N_BYTES (3), /* SI */
121 COSTS_N_BYTES (3), /* DI */
122 COSTS_N_BYTES (5)}, /* other */
123 0, /* cost of multiply per each bit set */
124 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
125 COSTS_N_BYTES (3), /* HI */
126 COSTS_N_BYTES (3), /* SI */
127 COSTS_N_BYTES (3), /* DI */
128 COSTS_N_BYTES (5)}, /* other */
129 COSTS_N_BYTES (3), /* cost of movsx */
130 COSTS_N_BYTES (3), /* cost of movzx */
131 0, /* "large" insn */
132 2, /* MOVE_RATIO */
133 2, /* cost for loading QImode using movzbl */
134 {2, 2, 2}, /* cost of loading integer registers
135 in QImode, HImode and SImode.
136 Relative to reg-reg move (2). */
137 {2, 2, 2}, /* cost of storing integer registers */
138 2, /* cost of reg,reg fld/fst */
139 {2, 2, 2}, /* cost of loading fp registers
140 in SFmode, DFmode and XFmode */
141 {2, 2, 2}, /* cost of storing fp registers
142 in SFmode, DFmode and XFmode */
143 3, /* cost of moving MMX register */
144 {3, 3}, /* cost of loading MMX registers
145 in SImode and DImode */
146 {3, 3}, /* cost of storing MMX registers
147 in SImode and DImode */
148 3, /* cost of moving SSE register */
149 {3, 3, 3}, /* cost of loading SSE registers
150 in SImode, DImode and TImode */
151 {3, 3, 3}, /* cost of storing SSE registers
152 in SImode, DImode and TImode */
153 3, /* MMX or SSE register to integer */
154 0, /* size of l1 cache */
155 0, /* size of l2 cache */
156 0, /* size of prefetch block */
157 0, /* number of parallel prefetches */
158 2, /* Branch cost */
159 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
160 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
161 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
162 COSTS_N_BYTES (2), /* cost of FABS instruction. */
163 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
164 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
165 ix86_size_memcpy,
166 ix86_size_memset,
167 1, /* scalar_stmt_cost. */
168 1, /* scalar load_cost. */
169 1, /* scalar_store_cost. */
170 1, /* vec_stmt_cost. */
171 1, /* vec_to_scalar_cost. */
172 1, /* scalar_to_vec_cost. */
173 1, /* vec_align_load_cost. */
174 1, /* vec_unalign_load_cost. */
175 1, /* vec_store_cost. */
176 1, /* cond_taken_branch_cost. */
 177 1, /* cond_not_taken_branch_cost. */
 178 };
180 /* Processor costs (relative to an add) */
181 static stringop_algs i386_memcpy[2] = {
182 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
183 DUMMY_STRINGOP_ALGS};
184 static stringop_algs i386_memset[2] = {
185 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
186 DUMMY_STRINGOP_ALGS};
188 static const
189 struct processor_costs i386_cost = { /* 386 specific costs */
190 COSTS_N_INSNS (1), /* cost of an add instruction */
191 COSTS_N_INSNS (1), /* cost of a lea instruction */
192 COSTS_N_INSNS (3), /* variable shift costs */
193 COSTS_N_INSNS (2), /* constant shift costs */
194 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
195 COSTS_N_INSNS (6), /* HI */
196 COSTS_N_INSNS (6), /* SI */
197 COSTS_N_INSNS (6), /* DI */
198 COSTS_N_INSNS (6)}, /* other */
199 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
200 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
201 COSTS_N_INSNS (23), /* HI */
202 COSTS_N_INSNS (23), /* SI */
203 COSTS_N_INSNS (23), /* DI */
204 COSTS_N_INSNS (23)}, /* other */
205 COSTS_N_INSNS (3), /* cost of movsx */
206 COSTS_N_INSNS (2), /* cost of movzx */
207 15, /* "large" insn */
208 3, /* MOVE_RATIO */
209 4, /* cost for loading QImode using movzbl */
210 {2, 4, 2}, /* cost of loading integer registers
211 in QImode, HImode and SImode.
212 Relative to reg-reg move (2). */
213 {2, 4, 2}, /* cost of storing integer registers */
214 2, /* cost of reg,reg fld/fst */
215 {8, 8, 8}, /* cost of loading fp registers
216 in SFmode, DFmode and XFmode */
217 {8, 8, 8}, /* cost of storing fp registers
218 in SFmode, DFmode and XFmode */
219 2, /* cost of moving MMX register */
220 {4, 8}, /* cost of loading MMX registers
221 in SImode and DImode */
222 {4, 8}, /* cost of storing MMX registers
223 in SImode and DImode */
224 2, /* cost of moving SSE register */
225 {4, 8, 16}, /* cost of loading SSE registers
226 in SImode, DImode and TImode */
227 {4, 8, 16}, /* cost of storing SSE registers
228 in SImode, DImode and TImode */
229 3, /* MMX or SSE register to integer */
230 0, /* size of l1 cache */
231 0, /* size of l2 cache */
232 0, /* size of prefetch block */
233 0, /* number of parallel prefetches */
234 1, /* Branch cost */
235 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
236 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
237 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
238 COSTS_N_INSNS (22), /* cost of FABS instruction. */
239 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
240 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
241 i386_memcpy,
242 i386_memset,
243 1, /* scalar_stmt_cost. */
244 1, /* scalar load_cost. */
245 1, /* scalar_store_cost. */
246 1, /* vec_stmt_cost. */
247 1, /* vec_to_scalar_cost. */
248 1, /* scalar_to_vec_cost. */
249 1, /* vec_align_load_cost. */
250 2, /* vec_unalign_load_cost. */
251 1, /* vec_store_cost. */
252 3, /* cond_taken_branch_cost. */
 253 1, /* cond_not_taken_branch_cost. */
 254 };
256 static stringop_algs i486_memcpy[2] = {
257 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
258 DUMMY_STRINGOP_ALGS};
259 static stringop_algs i486_memset[2] = {
260 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
261 DUMMY_STRINGOP_ALGS};
263 static const
264 struct processor_costs i486_cost = { /* 486 specific costs */
265 COSTS_N_INSNS (1), /* cost of an add instruction */
266 COSTS_N_INSNS (1), /* cost of a lea instruction */
267 COSTS_N_INSNS (3), /* variable shift costs */
268 COSTS_N_INSNS (2), /* constant shift costs */
269 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
270 COSTS_N_INSNS (12), /* HI */
271 COSTS_N_INSNS (12), /* SI */
272 COSTS_N_INSNS (12), /* DI */
273 COSTS_N_INSNS (12)}, /* other */
274 1, /* cost of multiply per each bit set */
275 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
276 COSTS_N_INSNS (40), /* HI */
277 COSTS_N_INSNS (40), /* SI */
278 COSTS_N_INSNS (40), /* DI */
279 COSTS_N_INSNS (40)}, /* other */
280 COSTS_N_INSNS (3), /* cost of movsx */
281 COSTS_N_INSNS (2), /* cost of movzx */
282 15, /* "large" insn */
283 3, /* MOVE_RATIO */
284 4, /* cost for loading QImode using movzbl */
285 {2, 4, 2}, /* cost of loading integer registers
286 in QImode, HImode and SImode.
287 Relative to reg-reg move (2). */
288 {2, 4, 2}, /* cost of storing integer registers */
289 2, /* cost of reg,reg fld/fst */
290 {8, 8, 8}, /* cost of loading fp registers
291 in SFmode, DFmode and XFmode */
292 {8, 8, 8}, /* cost of storing fp registers
293 in SFmode, DFmode and XFmode */
294 2, /* cost of moving MMX register */
295 {4, 8}, /* cost of loading MMX registers
296 in SImode and DImode */
297 {4, 8}, /* cost of storing MMX registers
298 in SImode and DImode */
299 2, /* cost of moving SSE register */
300 {4, 8, 16}, /* cost of loading SSE registers
301 in SImode, DImode and TImode */
302 {4, 8, 16}, /* cost of storing SSE registers
303 in SImode, DImode and TImode */
304 3, /* MMX or SSE register to integer */
305 4, /* size of l1 cache. 486 has 8kB cache
306 shared for code and data, so 4kB is
307 not really precise. */
308 4, /* size of l2 cache */
309 0, /* size of prefetch block */
310 0, /* number of parallel prefetches */
311 1, /* Branch cost */
312 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
313 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
314 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
315 COSTS_N_INSNS (3), /* cost of FABS instruction. */
316 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
317 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
318 i486_memcpy,
319 i486_memset,
320 1, /* scalar_stmt_cost. */
321 1, /* scalar load_cost. */
322 1, /* scalar_store_cost. */
323 1, /* vec_stmt_cost. */
324 1, /* vec_to_scalar_cost. */
325 1, /* scalar_to_vec_cost. */
326 1, /* vec_align_load_cost. */
327 2, /* vec_unalign_load_cost. */
328 1, /* vec_store_cost. */
329 3, /* cond_taken_branch_cost. */
 330 1, /* cond_not_taken_branch_cost. */
 331 };
333 static stringop_algs pentium_memcpy[2] = {
334 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
335 DUMMY_STRINGOP_ALGS};
336 static stringop_algs pentium_memset[2] = {
337 {libcall, {{-1, rep_prefix_4_byte, false}}},
338 DUMMY_STRINGOP_ALGS};
340 static const
341 struct processor_costs pentium_cost = {
342 COSTS_N_INSNS (1), /* cost of an add instruction */
343 COSTS_N_INSNS (1), /* cost of a lea instruction */
344 COSTS_N_INSNS (4), /* variable shift costs */
345 COSTS_N_INSNS (1), /* constant shift costs */
346 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
347 COSTS_N_INSNS (11), /* HI */
348 COSTS_N_INSNS (11), /* SI */
349 COSTS_N_INSNS (11), /* DI */
350 COSTS_N_INSNS (11)}, /* other */
351 0, /* cost of multiply per each bit set */
352 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
353 COSTS_N_INSNS (25), /* HI */
354 COSTS_N_INSNS (25), /* SI */
355 COSTS_N_INSNS (25), /* DI */
356 COSTS_N_INSNS (25)}, /* other */
357 COSTS_N_INSNS (3), /* cost of movsx */
358 COSTS_N_INSNS (2), /* cost of movzx */
359 8, /* "large" insn */
360 6, /* MOVE_RATIO */
361 6, /* cost for loading QImode using movzbl */
362 {2, 4, 2}, /* cost of loading integer registers
363 in QImode, HImode and SImode.
364 Relative to reg-reg move (2). */
365 {2, 4, 2}, /* cost of storing integer registers */
366 2, /* cost of reg,reg fld/fst */
367 {2, 2, 6}, /* cost of loading fp registers
368 in SFmode, DFmode and XFmode */
369 {4, 4, 6}, /* cost of storing fp registers
370 in SFmode, DFmode and XFmode */
371 8, /* cost of moving MMX register */
372 {8, 8}, /* cost of loading MMX registers
373 in SImode and DImode */
374 {8, 8}, /* cost of storing MMX registers
375 in SImode and DImode */
376 2, /* cost of moving SSE register */
377 {4, 8, 16}, /* cost of loading SSE registers
378 in SImode, DImode and TImode */
379 {4, 8, 16}, /* cost of storing SSE registers
380 in SImode, DImode and TImode */
381 3, /* MMX or SSE register to integer */
382 8, /* size of l1 cache. */
383 8, /* size of l2 cache */
384 0, /* size of prefetch block */
385 0, /* number of parallel prefetches */
386 2, /* Branch cost */
387 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
388 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
389 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
390 COSTS_N_INSNS (1), /* cost of FABS instruction. */
391 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
392 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
393 pentium_memcpy,
394 pentium_memset,
395 1, /* scalar_stmt_cost. */
396 1, /* scalar load_cost. */
397 1, /* scalar_store_cost. */
398 1, /* vec_stmt_cost. */
399 1, /* vec_to_scalar_cost. */
400 1, /* scalar_to_vec_cost. */
401 1, /* vec_align_load_cost. */
402 2, /* vec_unalign_load_cost. */
403 1, /* vec_store_cost. */
404 3, /* cond_taken_branch_cost. */
 405 1, /* cond_not_taken_branch_cost. */
 406 };
 408 /* PentiumPro has optimized rep instructions for blocks aligned to 8 bytes
 409 (we ensure the alignment). For small blocks an inline loop is still a
 410 noticeable win, for bigger blocks either rep movsl or rep movsb is the
 411 way to go. Rep movsb apparently has a more expensive startup time in the CPU,
 412 but after 4K the difference is down in the noise. */
413 static stringop_algs pentiumpro_memcpy[2] = {
414 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
415 {8192, rep_prefix_4_byte, false},
416 {-1, rep_prefix_1_byte, false}}},
417 DUMMY_STRINGOP_ALGS};
418 static stringop_algs pentiumpro_memset[2] = {
419 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
420 {8192, rep_prefix_4_byte, false},
421 {-1, libcall, false}}},
422 DUMMY_STRINGOP_ALGS};
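/* Worked reading of the tables above (same assumptions as the note next to
   DUMMY_STRINGOP_ALGS): for a 32-bit PentiumPro memcpy, blocks of at most
   128 bytes use an inline loop, up to 1024 bytes an unrolled loop, up to
   8192 bytes rep movsl (rep_prefix_4_byte), and anything larger rep movsb
   (rep_prefix_1_byte); when the size is not known at compile time, the
   leading rep_prefix_4_byte entry applies.  */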
423 static const
424 struct processor_costs pentiumpro_cost = {
425 COSTS_N_INSNS (1), /* cost of an add instruction */
426 COSTS_N_INSNS (1), /* cost of a lea instruction */
427 COSTS_N_INSNS (1), /* variable shift costs */
428 COSTS_N_INSNS (1), /* constant shift costs */
429 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
430 COSTS_N_INSNS (4), /* HI */
431 COSTS_N_INSNS (4), /* SI */
432 COSTS_N_INSNS (4), /* DI */
433 COSTS_N_INSNS (4)}, /* other */
434 0, /* cost of multiply per each bit set */
435 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
436 COSTS_N_INSNS (17), /* HI */
437 COSTS_N_INSNS (17), /* SI */
438 COSTS_N_INSNS (17), /* DI */
439 COSTS_N_INSNS (17)}, /* other */
440 COSTS_N_INSNS (1), /* cost of movsx */
441 COSTS_N_INSNS (1), /* cost of movzx */
442 8, /* "large" insn */
443 6, /* MOVE_RATIO */
444 2, /* cost for loading QImode using movzbl */
445 {4, 4, 4}, /* cost of loading integer registers
446 in QImode, HImode and SImode.
447 Relative to reg-reg move (2). */
448 {2, 2, 2}, /* cost of storing integer registers */
449 2, /* cost of reg,reg fld/fst */
450 {2, 2, 6}, /* cost of loading fp registers
451 in SFmode, DFmode and XFmode */
452 {4, 4, 6}, /* cost of storing fp registers
453 in SFmode, DFmode and XFmode */
454 2, /* cost of moving MMX register */
455 {2, 2}, /* cost of loading MMX registers
456 in SImode and DImode */
457 {2, 2}, /* cost of storing MMX registers
458 in SImode and DImode */
459 2, /* cost of moving SSE register */
460 {2, 2, 8}, /* cost of loading SSE registers
461 in SImode, DImode and TImode */
462 {2, 2, 8}, /* cost of storing SSE registers
463 in SImode, DImode and TImode */
464 3, /* MMX or SSE register to integer */
465 8, /* size of l1 cache. */
466 256, /* size of l2 cache */
467 32, /* size of prefetch block */
468 6, /* number of parallel prefetches */
469 2, /* Branch cost */
470 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
471 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
472 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
473 COSTS_N_INSNS (2), /* cost of FABS instruction. */
474 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
475 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
476 pentiumpro_memcpy,
477 pentiumpro_memset,
478 1, /* scalar_stmt_cost. */
479 1, /* scalar load_cost. */
480 1, /* scalar_store_cost. */
481 1, /* vec_stmt_cost. */
482 1, /* vec_to_scalar_cost. */
483 1, /* scalar_to_vec_cost. */
484 1, /* vec_align_load_cost. */
485 2, /* vec_unalign_load_cost. */
486 1, /* vec_store_cost. */
487 3, /* cond_taken_branch_cost. */
 488 1, /* cond_not_taken_branch_cost. */
 489 };
491 static stringop_algs geode_memcpy[2] = {
492 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
493 DUMMY_STRINGOP_ALGS};
494 static stringop_algs geode_memset[2] = {
495 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
496 DUMMY_STRINGOP_ALGS};
497 static const
498 struct processor_costs geode_cost = {
499 COSTS_N_INSNS (1), /* cost of an add instruction */
500 COSTS_N_INSNS (1), /* cost of a lea instruction */
501 COSTS_N_INSNS (2), /* variable shift costs */
502 COSTS_N_INSNS (1), /* constant shift costs */
503 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
504 COSTS_N_INSNS (4), /* HI */
505 COSTS_N_INSNS (7), /* SI */
506 COSTS_N_INSNS (7), /* DI */
507 COSTS_N_INSNS (7)}, /* other */
508 0, /* cost of multiply per each bit set */
509 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
510 COSTS_N_INSNS (23), /* HI */
511 COSTS_N_INSNS (39), /* SI */
512 COSTS_N_INSNS (39), /* DI */
513 COSTS_N_INSNS (39)}, /* other */
514 COSTS_N_INSNS (1), /* cost of movsx */
515 COSTS_N_INSNS (1), /* cost of movzx */
516 8, /* "large" insn */
517 4, /* MOVE_RATIO */
518 1, /* cost for loading QImode using movzbl */
519 {1, 1, 1}, /* cost of loading integer registers
520 in QImode, HImode and SImode.
521 Relative to reg-reg move (2). */
522 {1, 1, 1}, /* cost of storing integer registers */
523 1, /* cost of reg,reg fld/fst */
524 {1, 1, 1}, /* cost of loading fp registers
525 in SFmode, DFmode and XFmode */
526 {4, 6, 6}, /* cost of storing fp registers
527 in SFmode, DFmode and XFmode */
529 1, /* cost of moving MMX register */
530 {1, 1}, /* cost of loading MMX registers
531 in SImode and DImode */
532 {1, 1}, /* cost of storing MMX registers
533 in SImode and DImode */
534 1, /* cost of moving SSE register */
535 {1, 1, 1}, /* cost of loading SSE registers
536 in SImode, DImode and TImode */
537 {1, 1, 1}, /* cost of storing SSE registers
538 in SImode, DImode and TImode */
539 1, /* MMX or SSE register to integer */
540 64, /* size of l1 cache. */
541 128, /* size of l2 cache. */
542 32, /* size of prefetch block */
543 1, /* number of parallel prefetches */
544 1, /* Branch cost */
545 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
546 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
547 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
548 COSTS_N_INSNS (1), /* cost of FABS instruction. */
549 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
550 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
551 geode_memcpy,
552 geode_memset,
553 1, /* scalar_stmt_cost. */
554 1, /* scalar load_cost. */
555 1, /* scalar_store_cost. */
556 1, /* vec_stmt_cost. */
557 1, /* vec_to_scalar_cost. */
558 1, /* scalar_to_vec_cost. */
559 1, /* vec_align_load_cost. */
560 2, /* vec_unalign_load_cost. */
561 1, /* vec_store_cost. */
562 3, /* cond_taken_branch_cost. */
 563 1, /* cond_not_taken_branch_cost. */
 564 };
566 static stringop_algs k6_memcpy[2] = {
567 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
568 DUMMY_STRINGOP_ALGS};
569 static stringop_algs k6_memset[2] = {
570 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
571 DUMMY_STRINGOP_ALGS};
572 static const
573 struct processor_costs k6_cost = {
574 COSTS_N_INSNS (1), /* cost of an add instruction */
575 COSTS_N_INSNS (2), /* cost of a lea instruction */
576 COSTS_N_INSNS (1), /* variable shift costs */
577 COSTS_N_INSNS (1), /* constant shift costs */
578 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
579 COSTS_N_INSNS (3), /* HI */
580 COSTS_N_INSNS (3), /* SI */
581 COSTS_N_INSNS (3), /* DI */
582 COSTS_N_INSNS (3)}, /* other */
583 0, /* cost of multiply per each bit set */
584 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
585 COSTS_N_INSNS (18), /* HI */
586 COSTS_N_INSNS (18), /* SI */
587 COSTS_N_INSNS (18), /* DI */
588 COSTS_N_INSNS (18)}, /* other */
589 COSTS_N_INSNS (2), /* cost of movsx */
590 COSTS_N_INSNS (2), /* cost of movzx */
591 8, /* "large" insn */
592 4, /* MOVE_RATIO */
593 3, /* cost for loading QImode using movzbl */
594 {4, 5, 4}, /* cost of loading integer registers
595 in QImode, HImode and SImode.
596 Relative to reg-reg move (2). */
597 {2, 3, 2}, /* cost of storing integer registers */
598 4, /* cost of reg,reg fld/fst */
599 {6, 6, 6}, /* cost of loading fp registers
600 in SFmode, DFmode and XFmode */
601 {4, 4, 4}, /* cost of storing fp registers
602 in SFmode, DFmode and XFmode */
603 2, /* cost of moving MMX register */
604 {2, 2}, /* cost of loading MMX registers
605 in SImode and DImode */
606 {2, 2}, /* cost of storing MMX registers
607 in SImode and DImode */
608 2, /* cost of moving SSE register */
609 {2, 2, 8}, /* cost of loading SSE registers
610 in SImode, DImode and TImode */
611 {2, 2, 8}, /* cost of storing SSE registers
612 in SImode, DImode and TImode */
613 6, /* MMX or SSE register to integer */
614 32, /* size of l1 cache. */
615 32, /* size of l2 cache. Some models
616 have integrated l2 cache, but
617 optimizing for k6 is not important
618 enough to worry about that. */
619 32, /* size of prefetch block */
620 1, /* number of parallel prefetches */
621 1, /* Branch cost */
622 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
623 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
624 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
625 COSTS_N_INSNS (2), /* cost of FABS instruction. */
626 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
627 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
628 k6_memcpy,
629 k6_memset,
630 1, /* scalar_stmt_cost. */
631 1, /* scalar load_cost. */
632 1, /* scalar_store_cost. */
633 1, /* vec_stmt_cost. */
634 1, /* vec_to_scalar_cost. */
635 1, /* scalar_to_vec_cost. */
636 1, /* vec_align_load_cost. */
637 2, /* vec_unalign_load_cost. */
638 1, /* vec_store_cost. */
639 3, /* cond_taken_branch_cost. */
 640 1, /* cond_not_taken_branch_cost. */
 641 };
 643 /* For some reason, Athlon deals better with the REP prefix (relative to loops)
 644 compared to K8. Alignment becomes important after 8 bytes for memcpy and
 645 128 bytes for memset. */
646 static stringop_algs athlon_memcpy[2] = {
647 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
648 DUMMY_STRINGOP_ALGS};
649 static stringop_algs athlon_memset[2] = {
650 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
651 DUMMY_STRINGOP_ALGS};
652 static const
653 struct processor_costs athlon_cost = {
654 COSTS_N_INSNS (1), /* cost of an add instruction */
655 COSTS_N_INSNS (2), /* cost of a lea instruction */
656 COSTS_N_INSNS (1), /* variable shift costs */
657 COSTS_N_INSNS (1), /* constant shift costs */
658 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
659 COSTS_N_INSNS (5), /* HI */
660 COSTS_N_INSNS (5), /* SI */
661 COSTS_N_INSNS (5), /* DI */
662 COSTS_N_INSNS (5)}, /* other */
663 0, /* cost of multiply per each bit set */
664 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
665 COSTS_N_INSNS (26), /* HI */
666 COSTS_N_INSNS (42), /* SI */
667 COSTS_N_INSNS (74), /* DI */
668 COSTS_N_INSNS (74)}, /* other */
669 COSTS_N_INSNS (1), /* cost of movsx */
670 COSTS_N_INSNS (1), /* cost of movzx */
671 8, /* "large" insn */
672 9, /* MOVE_RATIO */
673 4, /* cost for loading QImode using movzbl */
674 {3, 4, 3}, /* cost of loading integer registers
675 in QImode, HImode and SImode.
676 Relative to reg-reg move (2). */
677 {3, 4, 3}, /* cost of storing integer registers */
678 4, /* cost of reg,reg fld/fst */
679 {4, 4, 12}, /* cost of loading fp registers
680 in SFmode, DFmode and XFmode */
681 {6, 6, 8}, /* cost of storing fp registers
682 in SFmode, DFmode and XFmode */
683 2, /* cost of moving MMX register */
684 {4, 4}, /* cost of loading MMX registers
685 in SImode and DImode */
686 {4, 4}, /* cost of storing MMX registers
687 in SImode and DImode */
688 2, /* cost of moving SSE register */
689 {4, 4, 6}, /* cost of loading SSE registers
690 in SImode, DImode and TImode */
691 {4, 4, 5}, /* cost of storing SSE registers
692 in SImode, DImode and TImode */
693 5, /* MMX or SSE register to integer */
694 64, /* size of l1 cache. */
695 256, /* size of l2 cache. */
696 64, /* size of prefetch block */
697 6, /* number of parallel prefetches */
698 5, /* Branch cost */
699 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
700 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
701 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
702 COSTS_N_INSNS (2), /* cost of FABS instruction. */
703 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
704 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
705 athlon_memcpy,
706 athlon_memset,
707 1, /* scalar_stmt_cost. */
708 1, /* scalar load_cost. */
709 1, /* scalar_store_cost. */
710 1, /* vec_stmt_cost. */
711 1, /* vec_to_scalar_cost. */
712 1, /* scalar_to_vec_cost. */
713 1, /* vec_align_load_cost. */
714 2, /* vec_unalign_load_cost. */
715 1, /* vec_store_cost. */
716 3, /* cond_taken_branch_cost. */
 717 1, /* cond_not_taken_branch_cost. */
 718 };
 720 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
 721 small blocks it is better to use a loop. For large blocks, a libcall can
 722 do nontemporal accesses and beat inline code considerably. */
723 static stringop_algs k8_memcpy[2] = {
724 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
725 {-1, rep_prefix_4_byte, false}}},
726 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
727 {-1, libcall, false}}}};
728 static stringop_algs k8_memset[2] = {
729 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
730 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
731 {libcall, {{48, unrolled_loop, false},
732 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
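/* Same reading as above: the second element of k8_memcpy and k8_memset is
   the 64-bit tuning.  For memcpy, blocks up to 16 bytes use an inline loop,
   up to 8192 bytes rep movsq (rep_prefix_8_byte), and larger or unknown-size
   blocks fall back to a library call.  */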
733 static const
734 struct processor_costs k8_cost = {
735 COSTS_N_INSNS (1), /* cost of an add instruction */
736 COSTS_N_INSNS (2), /* cost of a lea instruction */
737 COSTS_N_INSNS (1), /* variable shift costs */
738 COSTS_N_INSNS (1), /* constant shift costs */
739 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
740 COSTS_N_INSNS (4), /* HI */
741 COSTS_N_INSNS (3), /* SI */
742 COSTS_N_INSNS (4), /* DI */
743 COSTS_N_INSNS (5)}, /* other */
744 0, /* cost of multiply per each bit set */
745 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
746 COSTS_N_INSNS (26), /* HI */
747 COSTS_N_INSNS (42), /* SI */
748 COSTS_N_INSNS (74), /* DI */
749 COSTS_N_INSNS (74)}, /* other */
750 COSTS_N_INSNS (1), /* cost of movsx */
751 COSTS_N_INSNS (1), /* cost of movzx */
752 8, /* "large" insn */
753 9, /* MOVE_RATIO */
754 4, /* cost for loading QImode using movzbl */
755 {3, 4, 3}, /* cost of loading integer registers
756 in QImode, HImode and SImode.
757 Relative to reg-reg move (2). */
758 {3, 4, 3}, /* cost of storing integer registers */
759 4, /* cost of reg,reg fld/fst */
760 {4, 4, 12}, /* cost of loading fp registers
761 in SFmode, DFmode and XFmode */
762 {6, 6, 8}, /* cost of storing fp registers
763 in SFmode, DFmode and XFmode */
764 2, /* cost of moving MMX register */
765 {3, 3}, /* cost of loading MMX registers
766 in SImode and DImode */
767 {4, 4}, /* cost of storing MMX registers
768 in SImode and DImode */
769 2, /* cost of moving SSE register */
770 {4, 3, 6}, /* cost of loading SSE registers
771 in SImode, DImode and TImode */
772 {4, 4, 5}, /* cost of storing SSE registers
773 in SImode, DImode and TImode */
774 5, /* MMX or SSE register to integer */
775 64, /* size of l1 cache. */
776 512, /* size of l2 cache. */
777 64, /* size of prefetch block */
778 /* New AMD processors never drop prefetches; if they cannot be performed
779 immediately, they are queued. We set number of simultaneous prefetches
780 to a large constant to reflect this (it probably is not a good idea not
781 to limit number of prefetches at all, as their execution also takes some
782 time). */
783 100, /* number of parallel prefetches */
784 3, /* Branch cost */
785 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
786 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
787 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
788 COSTS_N_INSNS (2), /* cost of FABS instruction. */
789 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
790 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
792 k8_memcpy,
793 k8_memset,
794 4, /* scalar_stmt_cost. */
795 2, /* scalar load_cost. */
796 2, /* scalar_store_cost. */
797 5, /* vec_stmt_cost. */
798 0, /* vec_to_scalar_cost. */
799 2, /* scalar_to_vec_cost. */
800 2, /* vec_align_load_cost. */
801 3, /* vec_unalign_load_cost. */
802 3, /* vec_store_cost. */
803 3, /* cond_taken_branch_cost. */
 804 2, /* cond_not_taken_branch_cost. */
 805 };
 807 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
 808 very small blocks it is better to use a loop. For large blocks, a libcall can
 809 do nontemporal accesses and beat inline code considerably. */
810 static stringop_algs amdfam10_memcpy[2] = {
811 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
812 {-1, rep_prefix_4_byte, false}}},
813 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
814 {-1, libcall, false}}}};
815 static stringop_algs amdfam10_memset[2] = {
816 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
817 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
818 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
819 {-1, libcall, false}}}};
820 struct processor_costs amdfam10_cost = {
821 COSTS_N_INSNS (1), /* cost of an add instruction */
822 COSTS_N_INSNS (2), /* cost of a lea instruction */
823 COSTS_N_INSNS (1), /* variable shift costs */
824 COSTS_N_INSNS (1), /* constant shift costs */
825 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
826 COSTS_N_INSNS (4), /* HI */
827 COSTS_N_INSNS (3), /* SI */
828 COSTS_N_INSNS (4), /* DI */
829 COSTS_N_INSNS (5)}, /* other */
830 0, /* cost of multiply per each bit set */
831 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
832 COSTS_N_INSNS (35), /* HI */
833 COSTS_N_INSNS (51), /* SI */
834 COSTS_N_INSNS (83), /* DI */
835 COSTS_N_INSNS (83)}, /* other */
836 COSTS_N_INSNS (1), /* cost of movsx */
837 COSTS_N_INSNS (1), /* cost of movzx */
838 8, /* "large" insn */
839 9, /* MOVE_RATIO */
840 4, /* cost for loading QImode using movzbl */
841 {3, 4, 3}, /* cost of loading integer registers
842 in QImode, HImode and SImode.
843 Relative to reg-reg move (2). */
844 {3, 4, 3}, /* cost of storing integer registers */
845 4, /* cost of reg,reg fld/fst */
846 {4, 4, 12}, /* cost of loading fp registers
847 in SFmode, DFmode and XFmode */
848 {6, 6, 8}, /* cost of storing fp registers
849 in SFmode, DFmode and XFmode */
850 2, /* cost of moving MMX register */
851 {3, 3}, /* cost of loading MMX registers
852 in SImode and DImode */
853 {4, 4}, /* cost of storing MMX registers
854 in SImode and DImode */
855 2, /* cost of moving SSE register */
856 {4, 4, 3}, /* cost of loading SSE registers
857 in SImode, DImode and TImode */
858 {4, 4, 5}, /* cost of storing SSE registers
859 in SImode, DImode and TImode */
860 3, /* MMX or SSE register to integer */
861 /* On K8:
862 MOVD reg64, xmmreg Double FSTORE 4
863 MOVD reg32, xmmreg Double FSTORE 4
864 On AMDFAM10:
865 MOVD reg64, xmmreg Double FADD 3
866 1/1 1/1
867 MOVD reg32, xmmreg Double FADD 3
868 1/1 1/1 */
869 64, /* size of l1 cache. */
870 512, /* size of l2 cache. */
871 64, /* size of prefetch block */
872 /* New AMD processors never drop prefetches; if they cannot be performed
873 immediately, they are queued. We set number of simultaneous prefetches
874 to a large constant to reflect this (it probably is not a good idea not
875 to limit number of prefetches at all, as their execution also takes some
876 time). */
877 100, /* number of parallel prefetches */
878 2, /* Branch cost */
879 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
880 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
881 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
882 COSTS_N_INSNS (2), /* cost of FABS instruction. */
883 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
884 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
886 amdfam10_memcpy,
887 amdfam10_memset,
888 4, /* scalar_stmt_cost. */
889 2, /* scalar load_cost. */
890 2, /* scalar_store_cost. */
891 6, /* vec_stmt_cost. */
892 0, /* vec_to_scalar_cost. */
893 2, /* scalar_to_vec_cost. */
894 2, /* vec_align_load_cost. */
895 2, /* vec_unalign_load_cost. */
896 2, /* vec_store_cost. */
897 2, /* cond_taken_branch_cost. */
 898 1, /* cond_not_taken_branch_cost. */
 899 };
 901 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
 902 very small blocks it is better to use a loop. For large blocks, a libcall
 903 can do nontemporal accesses and beat inline code considerably. */
904 static stringop_algs bdver1_memcpy[2] = {
905 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
906 {-1, rep_prefix_4_byte, false}}},
907 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
908 {-1, libcall, false}}}};
909 static stringop_algs bdver1_memset[2] = {
910 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
911 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
912 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
913 {-1, libcall, false}}}};
915 const struct processor_costs bdver1_cost = {
916 COSTS_N_INSNS (1), /* cost of an add instruction */
917 COSTS_N_INSNS (1), /* cost of a lea instruction */
918 COSTS_N_INSNS (1), /* variable shift costs */
919 COSTS_N_INSNS (1), /* constant shift costs */
920 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
921 COSTS_N_INSNS (4), /* HI */
922 COSTS_N_INSNS (4), /* SI */
923 COSTS_N_INSNS (6), /* DI */
924 COSTS_N_INSNS (6)}, /* other */
925 0, /* cost of multiply per each bit set */
926 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
927 COSTS_N_INSNS (35), /* HI */
928 COSTS_N_INSNS (51), /* SI */
929 COSTS_N_INSNS (83), /* DI */
930 COSTS_N_INSNS (83)}, /* other */
931 COSTS_N_INSNS (1), /* cost of movsx */
932 COSTS_N_INSNS (1), /* cost of movzx */
933 8, /* "large" insn */
934 9, /* MOVE_RATIO */
935 4, /* cost for loading QImode using movzbl */
936 {5, 5, 4}, /* cost of loading integer registers
937 in QImode, HImode and SImode.
938 Relative to reg-reg move (2). */
939 {4, 4, 4}, /* cost of storing integer registers */
940 2, /* cost of reg,reg fld/fst */
941 {5, 5, 12}, /* cost of loading fp registers
942 in SFmode, DFmode and XFmode */
943 {4, 4, 8}, /* cost of storing fp registers
944 in SFmode, DFmode and XFmode */
945 2, /* cost of moving MMX register */
946 {4, 4}, /* cost of loading MMX registers
947 in SImode and DImode */
948 {4, 4}, /* cost of storing MMX registers
949 in SImode and DImode */
950 2, /* cost of moving SSE register */
951 {4, 4, 4}, /* cost of loading SSE registers
952 in SImode, DImode and TImode */
953 {4, 4, 4}, /* cost of storing SSE registers
954 in SImode, DImode and TImode */
955 2, /* MMX or SSE register to integer */
956 /* On K8:
957 MOVD reg64, xmmreg Double FSTORE 4
958 MOVD reg32, xmmreg Double FSTORE 4
959 On AMDFAM10:
960 MOVD reg64, xmmreg Double FADD 3
961 1/1 1/1
962 MOVD reg32, xmmreg Double FADD 3
963 1/1 1/1 */
964 16, /* size of l1 cache. */
965 2048, /* size of l2 cache. */
966 64, /* size of prefetch block */
967 /* New AMD processors never drop prefetches; if they cannot be performed
968 immediately, they are queued. We set number of simultaneous prefetches
969 to a large constant to reflect this (it probably is not a good idea not
970 to limit number of prefetches at all, as their execution also takes some
971 time). */
972 100, /* number of parallel prefetches */
973 2, /* Branch cost */
974 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
975 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
976 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
977 COSTS_N_INSNS (2), /* cost of FABS instruction. */
978 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
979 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
981 bdver1_memcpy,
982 bdver1_memset,
983 6, /* scalar_stmt_cost. */
984 4, /* scalar load_cost. */
985 4, /* scalar_store_cost. */
986 6, /* vec_stmt_cost. */
987 0, /* vec_to_scalar_cost. */
988 2, /* scalar_to_vec_cost. */
989 4, /* vec_align_load_cost. */
990 4, /* vec_unalign_load_cost. */
991 4, /* vec_store_cost. */
992 2, /* cond_taken_branch_cost. */
 993 1, /* cond_not_taken_branch_cost. */
 994 };
 996 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
 997 very small blocks it is better to use a loop. For large blocks, a libcall
 998 can do nontemporal accesses and beat inline code considerably. */
1000 static stringop_algs bdver2_memcpy[2] = {
1001 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1002 {-1, rep_prefix_4_byte, false}}},
1003 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1004 {-1, libcall, false}}}};
1005 static stringop_algs bdver2_memset[2] = {
1006 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1007 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1008 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1009 {-1, libcall, false}}}};
1011 const struct processor_costs bdver2_cost = {
1012 COSTS_N_INSNS (1), /* cost of an add instruction */
1013 COSTS_N_INSNS (1), /* cost of a lea instruction */
1014 COSTS_N_INSNS (1), /* variable shift costs */
1015 COSTS_N_INSNS (1), /* constant shift costs */
1016 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1017 COSTS_N_INSNS (4), /* HI */
1018 COSTS_N_INSNS (4), /* SI */
1019 COSTS_N_INSNS (6), /* DI */
1020 COSTS_N_INSNS (6)}, /* other */
1021 0, /* cost of multiply per each bit set */
1022 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1023 COSTS_N_INSNS (35), /* HI */
1024 COSTS_N_INSNS (51), /* SI */
1025 COSTS_N_INSNS (83), /* DI */
1026 COSTS_N_INSNS (83)}, /* other */
1027 COSTS_N_INSNS (1), /* cost of movsx */
1028 COSTS_N_INSNS (1), /* cost of movzx */
1029 8, /* "large" insn */
1030 9, /* MOVE_RATIO */
1031 4, /* cost for loading QImode using movzbl */
1032 {5, 5, 4}, /* cost of loading integer registers
1033 in QImode, HImode and SImode.
1034 Relative to reg-reg move (2). */
1035 {4, 4, 4}, /* cost of storing integer registers */
1036 2, /* cost of reg,reg fld/fst */
1037 {5, 5, 12}, /* cost of loading fp registers
1038 in SFmode, DFmode and XFmode */
1039 {4, 4, 8}, /* cost of storing fp registers
1040 in SFmode, DFmode and XFmode */
1041 2, /* cost of moving MMX register */
1042 {4, 4}, /* cost of loading MMX registers
1043 in SImode and DImode */
1044 {4, 4}, /* cost of storing MMX registers
1045 in SImode and DImode */
1046 2, /* cost of moving SSE register */
1047 {4, 4, 4}, /* cost of loading SSE registers
1048 in SImode, DImode and TImode */
1049 {4, 4, 4}, /* cost of storing SSE registers
1050 in SImode, DImode and TImode */
1051 2, /* MMX or SSE register to integer */
1052 /* On K8:
1053 MOVD reg64, xmmreg Double FSTORE 4
1054 MOVD reg32, xmmreg Double FSTORE 4
1055 On AMDFAM10:
1056 MOVD reg64, xmmreg Double FADD 3
1057 1/1 1/1
1058 MOVD reg32, xmmreg Double FADD 3
1059 1/1 1/1 */
1060 16, /* size of l1 cache. */
1061 2048, /* size of l2 cache. */
1062 64, /* size of prefetch block */
1063 /* New AMD processors never drop prefetches; if they cannot be performed
1064 immediately, they are queued. We set number of simultaneous prefetches
1065 to a large constant to reflect this (it probably is not a good idea not
1066 to limit number of prefetches at all, as their execution also takes some
1067 time). */
1068 100, /* number of parallel prefetches */
1069 2, /* Branch cost */
1070 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1071 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1072 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1073 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1074 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1075 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1077 bdver2_memcpy,
1078 bdver2_memset,
1079 6, /* scalar_stmt_cost. */
1080 4, /* scalar load_cost. */
1081 4, /* scalar_store_cost. */
1082 6, /* vec_stmt_cost. */
1083 0, /* vec_to_scalar_cost. */
1084 2, /* scalar_to_vec_cost. */
1085 4, /* vec_align_load_cost. */
1086 4, /* vec_unalign_load_cost. */
1087 4, /* vec_store_cost. */
1088 2, /* cond_taken_branch_cost. */
 1089 1, /* cond_not_taken_branch_cost. */
 1090 };
 1093 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
 1094 very small blocks it is better to use a loop. For large blocks, a libcall
 1095 can do nontemporal accesses and beat inline code considerably. */
1096 static stringop_algs bdver3_memcpy[2] = {
1097 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1098 {-1, rep_prefix_4_byte, false}}},
1099 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1100 {-1, libcall, false}}}};
1101 static stringop_algs bdver3_memset[2] = {
1102 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1103 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1104 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1105 {-1, libcall, false}}}};
1106 struct processor_costs bdver3_cost = {
1107 COSTS_N_INSNS (1), /* cost of an add instruction */
1108 COSTS_N_INSNS (1), /* cost of a lea instruction */
1109 COSTS_N_INSNS (1), /* variable shift costs */
1110 COSTS_N_INSNS (1), /* constant shift costs */
1111 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1112 COSTS_N_INSNS (4), /* HI */
1113 COSTS_N_INSNS (4), /* SI */
1114 COSTS_N_INSNS (6), /* DI */
1115 COSTS_N_INSNS (6)}, /* other */
1116 0, /* cost of multiply per each bit set */
1117 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1118 COSTS_N_INSNS (35), /* HI */
1119 COSTS_N_INSNS (51), /* SI */
1120 COSTS_N_INSNS (83), /* DI */
1121 COSTS_N_INSNS (83)}, /* other */
1122 COSTS_N_INSNS (1), /* cost of movsx */
1123 COSTS_N_INSNS (1), /* cost of movzx */
1124 8, /* "large" insn */
1125 9, /* MOVE_RATIO */
1126 4, /* cost for loading QImode using movzbl */
1127 {5, 5, 4}, /* cost of loading integer registers
1128 in QImode, HImode and SImode.
1129 Relative to reg-reg move (2). */
1130 {4, 4, 4}, /* cost of storing integer registers */
1131 2, /* cost of reg,reg fld/fst */
1132 {5, 5, 12}, /* cost of loading fp registers
1133 in SFmode, DFmode and XFmode */
1134 {4, 4, 8}, /* cost of storing fp registers
1135 in SFmode, DFmode and XFmode */
1136 2, /* cost of moving MMX register */
1137 {4, 4}, /* cost of loading MMX registers
1138 in SImode and DImode */
1139 {4, 4}, /* cost of storing MMX registers
1140 in SImode and DImode */
1141 2, /* cost of moving SSE register */
1142 {4, 4, 4}, /* cost of loading SSE registers
1143 in SImode, DImode and TImode */
1144 {4, 4, 4}, /* cost of storing SSE registers
1145 in SImode, DImode and TImode */
1146 2, /* MMX or SSE register to integer */
1147 16, /* size of l1 cache. */
1148 2048, /* size of l2 cache. */
1149 64, /* size of prefetch block */
1150 /* New AMD processors never drop prefetches; if they cannot be performed
1151 immediately, they are queued. We set number of simultaneous prefetches
1152 to a large constant to reflect this (it probably is not a good idea not
1153 to limit number of prefetches at all, as their execution also takes some
1154 time). */
1155 100, /* number of parallel prefetches */
1156 2, /* Branch cost */
1157 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1158 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1159 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1160 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1161 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1162 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1164 bdver3_memcpy,
1165 bdver3_memset,
1166 6, /* scalar_stmt_cost. */
1167 4, /* scalar load_cost. */
1168 4, /* scalar_store_cost. */
1169 6, /* vec_stmt_cost. */
1170 0, /* vec_to_scalar_cost. */
1171 2, /* scalar_to_vec_cost. */
1172 4, /* vec_align_load_cost. */
1173 4, /* vec_unalign_load_cost. */
1174 4, /* vec_store_cost. */
1175 2, /* cond_taken_branch_cost. */
 1176 1, /* cond_not_taken_branch_cost. */
 1177 };
 1179 /* BDVER4 has an optimized REP instruction for medium-sized blocks, but for
 1180 very small blocks it is better to use a loop. For large blocks, a libcall
 1181 can do nontemporal accesses and beat inline code considerably. */
1182 static stringop_algs bdver4_memcpy[2] = {
1183 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1184 {-1, rep_prefix_4_byte, false}}},
1185 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1186 {-1, libcall, false}}}};
1187 static stringop_algs bdver4_memset[2] = {
1188 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1189 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1190 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1191 {-1, libcall, false}}}};
1192 struct processor_costs bdver4_cost = {
1193 COSTS_N_INSNS (1), /* cost of an add instruction */
1194 COSTS_N_INSNS (1), /* cost of a lea instruction */
1195 COSTS_N_INSNS (1), /* variable shift costs */
1196 COSTS_N_INSNS (1), /* constant shift costs */
1197 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1198 COSTS_N_INSNS (4), /* HI */
1199 COSTS_N_INSNS (4), /* SI */
1200 COSTS_N_INSNS (6), /* DI */
1201 COSTS_N_INSNS (6)}, /* other */
1202 0, /* cost of multiply per each bit set */
1203 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1204 COSTS_N_INSNS (35), /* HI */
1205 COSTS_N_INSNS (51), /* SI */
1206 COSTS_N_INSNS (83), /* DI */
1207 COSTS_N_INSNS (83)}, /* other */
1208 COSTS_N_INSNS (1), /* cost of movsx */
1209 COSTS_N_INSNS (1), /* cost of movzx */
1210 8, /* "large" insn */
1211 9, /* MOVE_RATIO */
1212 4, /* cost for loading QImode using movzbl */
1213 {5, 5, 4}, /* cost of loading integer registers
1214 in QImode, HImode and SImode.
1215 Relative to reg-reg move (2). */
1216 {4, 4, 4}, /* cost of storing integer registers */
1217 2, /* cost of reg,reg fld/fst */
1218 {5, 5, 12}, /* cost of loading fp registers
1219 in SFmode, DFmode and XFmode */
1220 {4, 4, 8}, /* cost of storing fp registers
1221 in SFmode, DFmode and XFmode */
1222 2, /* cost of moving MMX register */
1223 {4, 4}, /* cost of loading MMX registers
1224 in SImode and DImode */
1225 {4, 4}, /* cost of storing MMX registers
1226 in SImode and DImode */
1227 2, /* cost of moving SSE register */
1228 {4, 4, 4}, /* cost of loading SSE registers
1229 in SImode, DImode and TImode */
1230 {4, 4, 4}, /* cost of storing SSE registers
1231 in SImode, DImode and TImode */
1232 2, /* MMX or SSE register to integer */
1233 16, /* size of l1 cache. */
1234 2048, /* size of l2 cache. */
1235 64, /* size of prefetch block */
1236 /* New AMD processors never drop prefetches; if they cannot be performed
1237 immediately, they are queued. We set number of simultaneous prefetches
1238 to a large constant to reflect this (it probably is not a good idea not
1239 to limit number of prefetches at all, as their execution also takes some
1240 time). */
1241 100, /* number of parallel prefetches */
1242 2, /* Branch cost */
1243 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1244 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1245 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1246 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1247 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1248 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1250 bdver4_memcpy,
1251 bdver4_memset,
1252 6, /* scalar_stmt_cost. */
1253 4, /* scalar load_cost. */
1254 4, /* scalar_store_cost. */
1255 6, /* vec_stmt_cost. */
1256 0, /* vec_to_scalar_cost. */
1257 2, /* scalar_to_vec_cost. */
1258 4, /* vec_align_load_cost. */
1259 4, /* vec_unalign_load_cost. */
1260 4, /* vec_store_cost. */
1261 2, /* cond_taken_branch_cost. */
 1262 1, /* cond_not_taken_branch_cost. */
 1263 };
 1265 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
 1266 very small blocks it is better to use a loop. For large blocks, a libcall can
 1267 do nontemporal accesses and beat inline code considerably. */
1268 static stringop_algs btver1_memcpy[2] = {
1269 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1270 {-1, rep_prefix_4_byte, false}}},
1271 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1272 {-1, libcall, false}}}};
1273 static stringop_algs btver1_memset[2] = {
1274 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1275 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1276 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1277 {-1, libcall, false}}}};
1278 const struct processor_costs btver1_cost = {
1279 COSTS_N_INSNS (1), /* cost of an add instruction */
1280 COSTS_N_INSNS (2), /* cost of a lea instruction */
1281 COSTS_N_INSNS (1), /* variable shift costs */
1282 COSTS_N_INSNS (1), /* constant shift costs */
1283 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1284 COSTS_N_INSNS (4), /* HI */
1285 COSTS_N_INSNS (3), /* SI */
1286 COSTS_N_INSNS (4), /* DI */
1287 COSTS_N_INSNS (5)}, /* other */
1288 0, /* cost of multiply per each bit set */
1289 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1290 COSTS_N_INSNS (35), /* HI */
1291 COSTS_N_INSNS (51), /* SI */
1292 COSTS_N_INSNS (83), /* DI */
1293 COSTS_N_INSNS (83)}, /* other */
1294 COSTS_N_INSNS (1), /* cost of movsx */
1295 COSTS_N_INSNS (1), /* cost of movzx */
1296 8, /* "large" insn */
1297 9, /* MOVE_RATIO */
1298 4, /* cost for loading QImode using movzbl */
1299 {3, 4, 3}, /* cost of loading integer registers
1300 in QImode, HImode and SImode.
1301 Relative to reg-reg move (2). */
1302 {3, 4, 3}, /* cost of storing integer registers */
1303 4, /* cost of reg,reg fld/fst */
1304 {4, 4, 12}, /* cost of loading fp registers
1305 in SFmode, DFmode and XFmode */
1306 {6, 6, 8}, /* cost of storing fp registers
1307 in SFmode, DFmode and XFmode */
1308 2, /* cost of moving MMX register */
1309 {3, 3}, /* cost of loading MMX registers
1310 in SImode and DImode */
1311 {4, 4}, /* cost of storing MMX registers
1312 in SImode and DImode */
1313 2, /* cost of moving SSE register */
1314 {4, 4, 3}, /* cost of loading SSE registers
1315 in SImode, DImode and TImode */
1316 {4, 4, 5}, /* cost of storing SSE registers
1317 in SImode, DImode and TImode */
1318 3, /* MMX or SSE register to integer */
1319 /* On K8:
1320 MOVD reg64, xmmreg Double FSTORE 4
1321 MOVD reg32, xmmreg Double FSTORE 4
1322 On AMDFAM10:
1323 MOVD reg64, xmmreg Double FADD 3
1324 1/1 1/1
1325 MOVD reg32, xmmreg Double FADD 3
1326 1/1 1/1 */
1327 32, /* size of l1 cache. */
1328 512, /* size of l2 cache. */
1329 64, /* size of prefetch block */
1330 100, /* number of parallel prefetches */
1331 2, /* Branch cost */
1332 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1333 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1334 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1335 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1336 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1337 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1339 btver1_memcpy,
1340 btver1_memset,
1341 4, /* scalar_stmt_cost. */
1342 2, /* scalar load_cost. */
1343 2, /* scalar_store_cost. */
1344 6, /* vec_stmt_cost. */
1345 0, /* vec_to_scalar_cost. */
1346 2, /* scalar_to_vec_cost. */
1347 2, /* vec_align_load_cost. */
1348 2, /* vec_unalign_load_cost. */
1349 2, /* vec_store_cost. */
1350 2, /* cond_taken_branch_cost. */
 1351 1, /* cond_not_taken_branch_cost. */
 1352 };
1354 static stringop_algs btver2_memcpy[2] = {
1355 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1356 {-1, rep_prefix_4_byte, false}}},
1357 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1358 {-1, libcall, false}}}};
1359 static stringop_algs btver2_memset[2] = {
1360 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1361 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1362 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1363 {-1, libcall, false}}}};
1364 const struct processor_costs btver2_cost = {
1365 COSTS_N_INSNS (1), /* cost of an add instruction */
1366 COSTS_N_INSNS (2), /* cost of a lea instruction */
1367 COSTS_N_INSNS (1), /* variable shift costs */
1368 COSTS_N_INSNS (1), /* constant shift costs */
1369 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1370 COSTS_N_INSNS (4), /* HI */
1371 COSTS_N_INSNS (3), /* SI */
1372 COSTS_N_INSNS (4), /* DI */
1373 COSTS_N_INSNS (5)}, /* other */
1374 0, /* cost of multiply per each bit set */
1375 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1376 COSTS_N_INSNS (35), /* HI */
1377 COSTS_N_INSNS (51), /* SI */
1378 COSTS_N_INSNS (83), /* DI */
1379 COSTS_N_INSNS (83)}, /* other */
1380 COSTS_N_INSNS (1), /* cost of movsx */
1381 COSTS_N_INSNS (1), /* cost of movzx */
1382 8, /* "large" insn */
1383 9, /* MOVE_RATIO */
1384 4, /* cost for loading QImode using movzbl */
1385 {3, 4, 3}, /* cost of loading integer registers
1386 in QImode, HImode and SImode.
1387 Relative to reg-reg move (2). */
1388 {3, 4, 3}, /* cost of storing integer registers */
1389 4, /* cost of reg,reg fld/fst */
1390 {4, 4, 12}, /* cost of loading fp registers
1391 in SFmode, DFmode and XFmode */
1392 {6, 6, 8}, /* cost of storing fp registers
1393 in SFmode, DFmode and XFmode */
1394 2, /* cost of moving MMX register */
1395 {3, 3}, /* cost of loading MMX registers
1396 in SImode and DImode */
1397 {4, 4}, /* cost of storing MMX registers
1398 in SImode and DImode */
1399 2, /* cost of moving SSE register */
1400 {4, 4, 3}, /* cost of loading SSE registers
1401 in SImode, DImode and TImode */
1402 {4, 4, 5}, /* cost of storing SSE registers
1403 in SImode, DImode and TImode */
1404 3, /* MMX or SSE register to integer */
1405 /* On K8:
1406 MOVD reg64, xmmreg Double FSTORE 4
1407 MOVD reg32, xmmreg Double FSTORE 4
1408 On AMDFAM10:
1409 MOVD reg64, xmmreg Double FADD 3
1410 1/1 1/1
1411 MOVD reg32, xmmreg Double FADD 3
1412 1/1 1/1 */
1413 32, /* size of l1 cache. */
1414 2048, /* size of l2 cache. */
1415 64, /* size of prefetch block */
1416 100, /* number of parallel prefetches */
1417 2, /* Branch cost */
1418 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1419 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1420 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1421 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1422 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1423 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1424 btver2_memcpy,
1425 btver2_memset,
1426 4, /* scalar_stmt_cost. */
1427 	 2,					/* scalar_load_cost.  */
1428 2, /* scalar_store_cost. */
1429 6, /* vec_stmt_cost. */
1430 0, /* vec_to_scalar_cost. */
1431 2, /* scalar_to_vec_cost. */
1432 2, /* vec_align_load_cost. */
1433 2, /* vec_unalign_load_cost. */
1434 2, /* vec_store_cost. */
1435 2, /* cond_taken_branch_cost. */
1436 1, /* cond_not_taken_branch_cost. */
1439 static stringop_algs pentium4_memcpy[2] = {
1440 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1441 DUMMY_STRINGOP_ALGS};
1442 static stringop_algs pentium4_memset[2] = {
1443 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1444 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1445 DUMMY_STRINGOP_ALGS};
1447 static const
1448 struct processor_costs pentium4_cost = {
1449 COSTS_N_INSNS (1), /* cost of an add instruction */
1450 COSTS_N_INSNS (3), /* cost of a lea instruction */
1451 COSTS_N_INSNS (4), /* variable shift costs */
1452 COSTS_N_INSNS (4), /* constant shift costs */
1453 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1454 COSTS_N_INSNS (15), /* HI */
1455 COSTS_N_INSNS (15), /* SI */
1456 COSTS_N_INSNS (15), /* DI */
1457 COSTS_N_INSNS (15)}, /* other */
1458 0, /* cost of multiply per each bit set */
1459 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1460 COSTS_N_INSNS (56), /* HI */
1461 COSTS_N_INSNS (56), /* SI */
1462 COSTS_N_INSNS (56), /* DI */
1463 COSTS_N_INSNS (56)}, /* other */
1464 COSTS_N_INSNS (1), /* cost of movsx */
1465 COSTS_N_INSNS (1), /* cost of movzx */
1466 16, /* "large" insn */
1467 6, /* MOVE_RATIO */
1468 2, /* cost for loading QImode using movzbl */
1469 {4, 5, 4}, /* cost of loading integer registers
1470 in QImode, HImode and SImode.
1471 Relative to reg-reg move (2). */
1472 {2, 3, 2}, /* cost of storing integer registers */
1473 2, /* cost of reg,reg fld/fst */
1474 {2, 2, 6}, /* cost of loading fp registers
1475 in SFmode, DFmode and XFmode */
1476 {4, 4, 6}, /* cost of storing fp registers
1477 in SFmode, DFmode and XFmode */
1478 2, /* cost of moving MMX register */
1479 {2, 2}, /* cost of loading MMX registers
1480 in SImode and DImode */
1481 {2, 2}, /* cost of storing MMX registers
1482 in SImode and DImode */
1483 12, /* cost of moving SSE register */
1484 {12, 12, 12}, /* cost of loading SSE registers
1485 in SImode, DImode and TImode */
1486 {2, 2, 8}, /* cost of storing SSE registers
1487 in SImode, DImode and TImode */
1488 10, /* MMX or SSE register to integer */
1489 8, /* size of l1 cache. */
1490 256, /* size of l2 cache. */
1491 64, /* size of prefetch block */
1492 6, /* number of parallel prefetches */
1493 2, /* Branch cost */
1494 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1495 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1496 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1497 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1498 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1499 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1500 pentium4_memcpy,
1501 pentium4_memset,
1502 1, /* scalar_stmt_cost. */
1503 	 1,					/* scalar_load_cost.  */
1504 1, /* scalar_store_cost. */
1505 1, /* vec_stmt_cost. */
1506 1, /* vec_to_scalar_cost. */
1507 1, /* scalar_to_vec_cost. */
1508 1, /* vec_align_load_cost. */
1509 2, /* vec_unalign_load_cost. */
1510 1, /* vec_store_cost. */
1511 3, /* cond_taken_branch_cost. */
1512 1, /* cond_not_taken_branch_cost. */
1515 static stringop_algs nocona_memcpy[2] = {
1516 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1517 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1518 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1520 static stringop_algs nocona_memset[2] = {
1521 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1522 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1523 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1524 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1526 static const
1527 struct processor_costs nocona_cost = {
1528 COSTS_N_INSNS (1), /* cost of an add instruction */
1529 COSTS_N_INSNS (1), /* cost of a lea instruction */
1530 COSTS_N_INSNS (1), /* variable shift costs */
1531 COSTS_N_INSNS (1), /* constant shift costs */
1532 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1533 COSTS_N_INSNS (10), /* HI */
1534 COSTS_N_INSNS (10), /* SI */
1535 COSTS_N_INSNS (10), /* DI */
1536 COSTS_N_INSNS (10)}, /* other */
1537 0, /* cost of multiply per each bit set */
1538 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1539 COSTS_N_INSNS (66), /* HI */
1540 COSTS_N_INSNS (66), /* SI */
1541 COSTS_N_INSNS (66), /* DI */
1542 COSTS_N_INSNS (66)}, /* other */
1543 COSTS_N_INSNS (1), /* cost of movsx */
1544 COSTS_N_INSNS (1), /* cost of movzx */
1545 16, /* "large" insn */
1546 17, /* MOVE_RATIO */
1547 4, /* cost for loading QImode using movzbl */
1548 {4, 4, 4}, /* cost of loading integer registers
1549 in QImode, HImode and SImode.
1550 Relative to reg-reg move (2). */
1551 {4, 4, 4}, /* cost of storing integer registers */
1552 3, /* cost of reg,reg fld/fst */
1553 {12, 12, 12}, /* cost of loading fp registers
1554 in SFmode, DFmode and XFmode */
1555 {4, 4, 4}, /* cost of storing fp registers
1556 in SFmode, DFmode and XFmode */
1557 6, /* cost of moving MMX register */
1558 {12, 12}, /* cost of loading MMX registers
1559 in SImode and DImode */
1560 {12, 12}, /* cost of storing MMX registers
1561 in SImode and DImode */
1562 6, /* cost of moving SSE register */
1563 {12, 12, 12}, /* cost of loading SSE registers
1564 in SImode, DImode and TImode */
1565 {12, 12, 12}, /* cost of storing SSE registers
1566 in SImode, DImode and TImode */
1567 8, /* MMX or SSE register to integer */
1568 8, /* size of l1 cache. */
1569 1024, /* size of l2 cache. */
1570 128, /* size of prefetch block */
1571 8, /* number of parallel prefetches */
1572 1, /* Branch cost */
1573 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1574 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1575 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1576 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1577 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1578 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1579 nocona_memcpy,
1580 nocona_memset,
1581 1, /* scalar_stmt_cost. */
1582 	 1,					/* scalar_load_cost.  */
1583 1, /* scalar_store_cost. */
1584 1, /* vec_stmt_cost. */
1585 1, /* vec_to_scalar_cost. */
1586 1, /* scalar_to_vec_cost. */
1587 1, /* vec_align_load_cost. */
1588 2, /* vec_unalign_load_cost. */
1589 1, /* vec_store_cost. */
1590 3, /* cond_taken_branch_cost. */
1591 1, /* cond_not_taken_branch_cost. */
1594 static stringop_algs atom_memcpy[2] = {
1595 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1596 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1597 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1598 static stringop_algs atom_memset[2] = {
1599 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1600 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1601 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1602 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1603 static const
1604 struct processor_costs atom_cost = {
1605 COSTS_N_INSNS (1), /* cost of an add instruction */
1606 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1607 COSTS_N_INSNS (1), /* variable shift costs */
1608 COSTS_N_INSNS (1), /* constant shift costs */
1609 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1610 COSTS_N_INSNS (4), /* HI */
1611 COSTS_N_INSNS (3), /* SI */
1612 COSTS_N_INSNS (4), /* DI */
1613 COSTS_N_INSNS (2)}, /* other */
1614 0, /* cost of multiply per each bit set */
1615 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1616 COSTS_N_INSNS (26), /* HI */
1617 COSTS_N_INSNS (42), /* SI */
1618 COSTS_N_INSNS (74), /* DI */
1619 COSTS_N_INSNS (74)}, /* other */
1620 COSTS_N_INSNS (1), /* cost of movsx */
1621 COSTS_N_INSNS (1), /* cost of movzx */
1622 8, /* "large" insn */
1623 17, /* MOVE_RATIO */
1624 4, /* cost for loading QImode using movzbl */
1625 {4, 4, 4}, /* cost of loading integer registers
1626 in QImode, HImode and SImode.
1627 Relative to reg-reg move (2). */
1628 {4, 4, 4}, /* cost of storing integer registers */
1629 4, /* cost of reg,reg fld/fst */
1630 {12, 12, 12}, /* cost of loading fp registers
1631 in SFmode, DFmode and XFmode */
1632 {6, 6, 8}, /* cost of storing fp registers
1633 in SFmode, DFmode and XFmode */
1634 2, /* cost of moving MMX register */
1635 {8, 8}, /* cost of loading MMX registers
1636 in SImode and DImode */
1637 {8, 8}, /* cost of storing MMX registers
1638 in SImode and DImode */
1639 2, /* cost of moving SSE register */
1640 {8, 8, 8}, /* cost of loading SSE registers
1641 in SImode, DImode and TImode */
1642 {8, 8, 8}, /* cost of storing SSE registers
1643 in SImode, DImode and TImode */
1644 5, /* MMX or SSE register to integer */
1645 32, /* size of l1 cache. */
1646 256, /* size of l2 cache. */
1647 64, /* size of prefetch block */
1648 6, /* number of parallel prefetches */
1649 3, /* Branch cost */
1650 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1651 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1652 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1653 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1654 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1655 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1656 atom_memcpy,
1657 atom_memset,
1658 1, /* scalar_stmt_cost. */
1659 	 1,					/* scalar_load_cost.  */
1660 1, /* scalar_store_cost. */
1661 1, /* vec_stmt_cost. */
1662 1, /* vec_to_scalar_cost. */
1663 1, /* scalar_to_vec_cost. */
1664 1, /* vec_align_load_cost. */
1665 2, /* vec_unalign_load_cost. */
1666 1, /* vec_store_cost. */
1667 3, /* cond_taken_branch_cost. */
1668 1, /* cond_not_taken_branch_cost. */
1671 static stringop_algs slm_memcpy[2] = {
1672 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1673 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1674 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1675 static stringop_algs slm_memset[2] = {
1676 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1677 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1678 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1679 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1680 static const
1681 struct processor_costs slm_cost = {
1682 COSTS_N_INSNS (1), /* cost of an add instruction */
1683 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1684 COSTS_N_INSNS (1), /* variable shift costs */
1685 COSTS_N_INSNS (1), /* constant shift costs */
1686 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1687 COSTS_N_INSNS (3), /* HI */
1688 COSTS_N_INSNS (3), /* SI */
1689 COSTS_N_INSNS (4), /* DI */
1690 COSTS_N_INSNS (2)}, /* other */
1691 0, /* cost of multiply per each bit set */
1692 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1693 COSTS_N_INSNS (26), /* HI */
1694 COSTS_N_INSNS (42), /* SI */
1695 COSTS_N_INSNS (74), /* DI */
1696 COSTS_N_INSNS (74)}, /* other */
1697 COSTS_N_INSNS (1), /* cost of movsx */
1698 COSTS_N_INSNS (1), /* cost of movzx */
1699 8, /* "large" insn */
1700 17, /* MOVE_RATIO */
1701 4, /* cost for loading QImode using movzbl */
1702 {4, 4, 4}, /* cost of loading integer registers
1703 in QImode, HImode and SImode.
1704 Relative to reg-reg move (2). */
1705 {4, 4, 4}, /* cost of storing integer registers */
1706 4, /* cost of reg,reg fld/fst */
1707 {12, 12, 12}, /* cost of loading fp registers
1708 in SFmode, DFmode and XFmode */
1709 {6, 6, 8}, /* cost of storing fp registers
1710 in SFmode, DFmode and XFmode */
1711 2, /* cost of moving MMX register */
1712 {8, 8}, /* cost of loading MMX registers
1713 in SImode and DImode */
1714 {8, 8}, /* cost of storing MMX registers
1715 in SImode and DImode */
1716 2, /* cost of moving SSE register */
1717 {8, 8, 8}, /* cost of loading SSE registers
1718 in SImode, DImode and TImode */
1719 {8, 8, 8}, /* cost of storing SSE registers
1720 in SImode, DImode and TImode */
1721 5, /* MMX or SSE register to integer */
1722 32, /* size of l1 cache. */
1723 256, /* size of l2 cache. */
1724 64, /* size of prefetch block */
1725 6, /* number of parallel prefetches */
1726 3, /* Branch cost */
1727 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1728 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1729 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1730 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1731 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1732 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1733 slm_memcpy,
1734 slm_memset,
1735 1, /* scalar_stmt_cost. */
1736 	 1,					/* scalar_load_cost.  */
1737 1, /* scalar_store_cost. */
1738 1, /* vec_stmt_cost. */
1739 1, /* vec_to_scalar_cost. */
1740 1, /* scalar_to_vec_cost. */
1741 1, /* vec_align_load_cost. */
1742 2, /* vec_unalign_load_cost. */
1743 1, /* vec_store_cost. */
1744 3, /* cond_taken_branch_cost. */
1745 1, /* cond_not_taken_branch_cost. */
1748 /* Generic should produce code tuned for Core-i7 (and newer chips)
1749 and btver1 (and newer chips). */
1751 static stringop_algs generic_memcpy[2] = {
1752 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1753 {-1, libcall, false}}},
1754 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1755 {-1, libcall, false}}}};
1756 static stringop_algs generic_memset[2] = {
1757 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1758 {-1, libcall, false}}},
1759 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1760 {-1, libcall, false}}}};
1761 static const
1762 struct processor_costs generic_cost = {
1763 COSTS_N_INSNS (1), /* cost of an add instruction */
1764 	/* On all chips taken into consideration, lea takes 2 cycles or more.  With
1765 	 such a cost, however, our current implementation of synth_mult ends up
1766 	 using unnecessary temporary registers, causing regressions on several
1767 	 SPECfp benchmarks.  */
1768 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1769 COSTS_N_INSNS (1), /* variable shift costs */
1770 COSTS_N_INSNS (1), /* constant shift costs */
1771 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1772 COSTS_N_INSNS (4), /* HI */
1773 COSTS_N_INSNS (3), /* SI */
1774 COSTS_N_INSNS (4), /* DI */
1775 COSTS_N_INSNS (2)}, /* other */
1776 0, /* cost of multiply per each bit set */
1777 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1778 COSTS_N_INSNS (26), /* HI */
1779 COSTS_N_INSNS (42), /* SI */
1780 COSTS_N_INSNS (74), /* DI */
1781 COSTS_N_INSNS (74)}, /* other */
1782 COSTS_N_INSNS (1), /* cost of movsx */
1783 COSTS_N_INSNS (1), /* cost of movzx */
1784 8, /* "large" insn */
1785 17, /* MOVE_RATIO */
1786 4, /* cost for loading QImode using movzbl */
1787 {4, 4, 4}, /* cost of loading integer registers
1788 in QImode, HImode and SImode.
1789 Relative to reg-reg move (2). */
1790 {4, 4, 4}, /* cost of storing integer registers */
1791 4, /* cost of reg,reg fld/fst */
1792 {12, 12, 12}, /* cost of loading fp registers
1793 in SFmode, DFmode and XFmode */
1794 {6, 6, 8}, /* cost of storing fp registers
1795 in SFmode, DFmode and XFmode */
1796 2, /* cost of moving MMX register */
1797 {8, 8}, /* cost of loading MMX registers
1798 in SImode and DImode */
1799 {8, 8}, /* cost of storing MMX registers
1800 in SImode and DImode */
1801 2, /* cost of moving SSE register */
1802 {8, 8, 8}, /* cost of loading SSE registers
1803 in SImode, DImode and TImode */
1804 {8, 8, 8}, /* cost of storing SSE registers
1805 in SImode, DImode and TImode */
1806 5, /* MMX or SSE register to integer */
1807 32, /* size of l1 cache. */
1808 512, /* size of l2 cache. */
1809 64, /* size of prefetch block */
1810 6, /* number of parallel prefetches */
1811 	 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1812 	 value is increased to the perhaps more appropriate value of 5.  */
1813 3, /* Branch cost */
1814 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1815 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1816 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1817 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1818 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1819 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1820 generic_memcpy,
1821 generic_memset,
1822 1, /* scalar_stmt_cost. */
1823 	 1,					/* scalar_load_cost.  */
1824 1, /* scalar_store_cost. */
1825 1, /* vec_stmt_cost. */
1826 1, /* vec_to_scalar_cost. */
1827 1, /* scalar_to_vec_cost. */
1828 1, /* vec_align_load_cost. */
1829 2, /* vec_unalign_load_cost. */
1830 1, /* vec_store_cost. */
1831 3, /* cond_taken_branch_cost. */
1832 1, /* cond_not_taken_branch_cost. */
1835 	/* core_cost should produce code tuned for the Core family of CPUs.  */
1836 static stringop_algs core_memcpy[2] = {
1837 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1838 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
1839 {-1, libcall, false}}}};
1840 static stringop_algs core_memset[2] = {
1841 {libcall, {{6, loop_1_byte, true},
1842 {24, loop, true},
1843 {8192, rep_prefix_4_byte, true},
1844 {-1, libcall, false}}},
1845 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
1846 {-1, libcall, false}}}};
1848 static const
1849 struct processor_costs core_cost = {
1850 COSTS_N_INSNS (1), /* cost of an add instruction */
1851 	/* On all chips taken into consideration, lea takes 2 cycles or more.  With
1852 	 such a cost, however, our current implementation of synth_mult ends up
1853 	 using unnecessary temporary registers, causing regressions on several
1854 	 SPECfp benchmarks.  */
1855 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1856 COSTS_N_INSNS (1), /* variable shift costs */
1857 COSTS_N_INSNS (1), /* constant shift costs */
1858 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1859 COSTS_N_INSNS (4), /* HI */
1860 COSTS_N_INSNS (3), /* SI */
1861 COSTS_N_INSNS (4), /* DI */
1862 COSTS_N_INSNS (2)}, /* other */
1863 0, /* cost of multiply per each bit set */
1864 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1865 COSTS_N_INSNS (26), /* HI */
1866 COSTS_N_INSNS (42), /* SI */
1867 COSTS_N_INSNS (74), /* DI */
1868 COSTS_N_INSNS (74)}, /* other */
1869 COSTS_N_INSNS (1), /* cost of movsx */
1870 COSTS_N_INSNS (1), /* cost of movzx */
1871 8, /* "large" insn */
1872 17, /* MOVE_RATIO */
1873 4, /* cost for loading QImode using movzbl */
1874 {4, 4, 4}, /* cost of loading integer registers
1875 in QImode, HImode and SImode.
1876 Relative to reg-reg move (2). */
1877 {4, 4, 4}, /* cost of storing integer registers */
1878 4, /* cost of reg,reg fld/fst */
1879 {12, 12, 12}, /* cost of loading fp registers
1880 in SFmode, DFmode and XFmode */
1881 {6, 6, 8}, /* cost of storing fp registers
1882 in SFmode, DFmode and XFmode */
1883 2, /* cost of moving MMX register */
1884 {8, 8}, /* cost of loading MMX registers
1885 in SImode and DImode */
1886 {8, 8}, /* cost of storing MMX registers
1887 in SImode and DImode */
1888 2, /* cost of moving SSE register */
1889 {8, 8, 8}, /* cost of loading SSE registers
1890 in SImode, DImode and TImode */
1891 {8, 8, 8}, /* cost of storing SSE registers
1892 in SImode, DImode and TImode */
1893 5, /* MMX or SSE register to integer */
1894 64, /* size of l1 cache. */
1895 512, /* size of l2 cache. */
1896 64, /* size of prefetch block */
1897 6, /* number of parallel prefetches */
1898 	 /* FIXME: perhaps a more appropriate value is 5.  */
1899 3, /* Branch cost */
1900 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1901 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1902 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1903 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1904 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1905 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1906 core_memcpy,
1907 core_memset,
1908 1, /* scalar_stmt_cost. */
1909 	 1,					/* scalar_load_cost.  */
1910 1, /* scalar_store_cost. */
1911 1, /* vec_stmt_cost. */
1912 1, /* vec_to_scalar_cost. */
1913 1, /* scalar_to_vec_cost. */
1914 1, /* vec_align_load_cost. */
1915 2, /* vec_unalign_load_cost. */
1916 1, /* vec_store_cost. */
1917 3, /* cond_taken_branch_cost. */
1918 1, /* cond_not_taken_branch_cost. */
1922 /* Set by -mtune. */
1923 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1925 /* Set by -mtune or -Os. */
1926 const struct processor_costs *ix86_cost = &pentium_cost;
1928 /* Processor feature/optimization bitmasks. */
1929 #define m_386 (1<<PROCESSOR_I386)
1930 #define m_486 (1<<PROCESSOR_I486)
1931 #define m_PENT (1<<PROCESSOR_PENTIUM)
1932 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1933 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1934 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1935 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1936 #define m_CORE2 (1<<PROCESSOR_CORE2)
1937 #define m_COREI7 (1<<PROCESSOR_COREI7)
1938 #define m_COREI7_AVX (1<<PROCESSOR_COREI7_AVX)
1939 #define m_HASWELL (1<<PROCESSOR_HASWELL)
1940 #define m_CORE_ALL (m_CORE2 | m_COREI7 | m_COREI7_AVX | m_HASWELL)
1941 #define m_ATOM (1<<PROCESSOR_ATOM)
1942 #define m_SLM (1<<PROCESSOR_SLM)
1944 #define m_GEODE (1<<PROCESSOR_GEODE)
1945 #define m_K6 (1<<PROCESSOR_K6)
1946 #define m_K6_GEODE (m_K6 | m_GEODE)
1947 #define m_K8 (1<<PROCESSOR_K8)
1948 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1949 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1950 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1951 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1952 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1953 #define m_BDVER3 (1<<PROCESSOR_BDVER3)
1954 #define m_BDVER4 (1<<PROCESSOR_BDVER4)
1955 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1956 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1957 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
1958 #define m_BTVER (m_BTVER1 | m_BTVER2)
1959 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1961 #define m_GENERIC (1<<PROCESSOR_GENERIC)
1963 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
1964 #undef DEF_TUNE
1965 #define DEF_TUNE(tune, name, selector) name,
1966 #include "x86-tune.def"
1967 #undef DEF_TUNE
1970 /* Feature tests against the various tunings. */
1971 unsigned char ix86_tune_features[X86_TUNE_LAST];
1973 /* Feature tests against the various tunings used to create ix86_tune_features
1974 based on the processor mask. */
1975 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1976 #undef DEF_TUNE
1977 #define DEF_TUNE(tune, name, selector) selector,
1978 #include "x86-tune.def"
1979 #undef DEF_TUNE
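/* Illustrative sketch only (not an actual x86-tune.def entry): each
   DEF_TUNE line supplies the feature enumerator, its -mtune-ctrl name
   string and a processor selector mask, and the two #include passes
   above expand it into ix86_tune_feature_names[] and
   initial_ix86_tune_features[] respectively.  A hypothetical
     DEF_TUNE (X86_TUNE_EXAMPLE, "example_feature", m_CORE_ALL | m_GENERIC)
   would therefore contribute "example_feature" to the name table and
   (m_CORE_ALL | m_GENERIC) to the selector table.  */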
1982 /* Feature tests against the various architecture variations. */
1983 unsigned char ix86_arch_features[X86_ARCH_LAST];
1985 /* Feature tests against the various architecture variations, used to create
1986 ix86_arch_features based on the processor mask. */
1987 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
1988 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
1989 ~(m_386 | m_486 | m_PENT | m_K6),
1991 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
1992 ~m_386,
1994 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
1995 ~(m_386 | m_486),
1997 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
1998 ~m_386,
2000 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2001 ~m_386,
2004 	/* If the average insn count for a single function invocation is
2005 	 lower than this constant, emit fast (but longer) prologue and
2006 	 epilogue code.  */
2007 #define FAST_PROLOGUE_INSN_COUNT 20
2009 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2010 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2011 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2012 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2014 /* Array of the smallest class containing reg number REGNO, indexed by
2015 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2017 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2019 /* ax, dx, cx, bx */
2020 AREG, DREG, CREG, BREG,
2021 /* si, di, bp, sp */
2022 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2023 /* FP registers */
2024 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2025 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2026 /* arg pointer */
2027 NON_Q_REGS,
2028 /* flags, fpsr, fpcr, frame */
2029 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2030 /* SSE registers */
2031 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2032 SSE_REGS, SSE_REGS,
2033 /* MMX registers */
2034 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2035 MMX_REGS, MMX_REGS,
2036 /* REX registers */
2037 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2038 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2039 /* SSE REX registers */
2040 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2041 SSE_REGS, SSE_REGS,
2042 /* AVX-512 SSE registers */
2043 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2044 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2045 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2046 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
2047 /* Mask registers. */
2048 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2049 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
2052 /* The "default" register map used in 32bit mode. */
2054 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2056 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2057 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2058 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2059 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2060 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2061 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2062 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2063 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2064 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2065 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2068 /* The "default" register map used in 64bit mode. */
2070 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2072 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2073 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2074 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2075 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2076 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2077 8,9,10,11,12,13,14,15, /* extended integer registers */
2078 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2079 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
2080 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
2081 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
2084 /* Define the register numbers to be used in Dwarf debugging information.
2085 The SVR4 reference port C compiler uses the following register numbers
2086 in its Dwarf output code:
2087 0 for %eax (gcc regno = 0)
2088 1 for %ecx (gcc regno = 2)
2089 2 for %edx (gcc regno = 1)
2090 3 for %ebx (gcc regno = 3)
2091 4 for %esp (gcc regno = 7)
2092 5 for %ebp (gcc regno = 6)
2093 6 for %esi (gcc regno = 4)
2094 7 for %edi (gcc regno = 5)
2095 The following three DWARF register numbers are never generated by
2096 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2097 believes these numbers have these meanings.
2098 8 for %eip (no gcc equivalent)
2099 9 for %eflags (gcc regno = 17)
2100 10 for %trapno (no gcc equivalent)
2101 It is not at all clear how we should number the FP stack registers
2102 for the x86 architecture. If the version of SDB on x86/svr4 were
2103 a bit less brain dead with respect to floating-point then we would
2104 have a precedent to follow with respect to DWARF register numbers
2105 for x86 FP registers, but the SDB on x86/svr4 is so completely
2106 broken with respect to FP registers that it is hardly worth thinking
2107 of it as something to strive for compatibility with.
2108 The version of x86/svr4 SDB I have at the moment does (partially)
2109 seem to believe that DWARF register number 11 is associated with
2110 the x86 register %st(0), but that's about all. Higher DWARF
2111 register numbers don't seem to be associated with anything in
2112 particular, and even for DWARF regno 11, SDB only seems to under-
2113 stand that it should say that a variable lives in %st(0) (when
2114 asked via an `=' command) if we said it was in DWARF regno 11,
2115 but SDB still prints garbage when asked for the value of the
2116 variable in question (via a `/' command).
2117 (Also note that the labels SDB prints for various FP stack regs
2118 when doing an `x' command are all wrong.)
2119 Note that these problems generally don't affect the native SVR4
2120 C compiler because it doesn't allow the use of -O with -g and
2121 because when it is *not* optimizing, it allocates a memory
2122 location for each floating-point variable, and the memory
2123 location is what gets described in the DWARF AT_location
2124 attribute for the variable in question.
2125 Regardless of the severe mental illness of the x86/svr4 SDB, we
2126 do something sensible here and we use the following DWARF
2127 register numbers. Note that these are all stack-top-relative
2128 numbers.
2129 11 for %st(0) (gcc regno = 8)
2130 12 for %st(1) (gcc regno = 9)
2131 13 for %st(2) (gcc regno = 10)
2132 14 for %st(3) (gcc regno = 11)
2133 15 for %st(4) (gcc regno = 12)
2134 16 for %st(5) (gcc regno = 13)
2135 17 for %st(6) (gcc regno = 14)
2136 18 for %st(7) (gcc regno = 15)
2138 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2140 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2141 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2142 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2143 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2144 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2145 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2146 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2147 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
2148 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
2149 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2152 /* Define parameter passing and return registers. */
2154 static int const x86_64_int_parameter_registers[6] =
2156 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2159 static int const x86_64_ms_abi_int_parameter_registers[4] =
2161 CX_REG, DX_REG, R8_REG, R9_REG
2164 static int const x86_64_int_return_registers[4] =
2166 AX_REG, DX_REG, DI_REG, SI_REG
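/* Illustrative note (SysV ABI sketch, not part of the tables above):
   for a call such as f (1, 2, 3) the integer arguments are passed in
   %rdi, %rsi and %rdx following x86_64_int_parameter_registers, and a
   scalar integer result is returned in %rax, with %rdx used for the
   second half of a 128-bit integer return value.  */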
2169 /* Additional registers that are clobbered by SYSV calls. */
2171 int const x86_64_ms_sysv_extra_clobbered_registers[12] =
2173 SI_REG, DI_REG,
2174 XMM6_REG, XMM7_REG,
2175 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2176 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
2179 /* Define the structure for the machine field in struct function. */
2181 struct GTY(()) stack_local_entry {
2182 unsigned short mode;
2183 unsigned short n;
2184 rtx rtl;
2185 struct stack_local_entry *next;
2188 /* Structure describing stack frame layout.
2189 Stack grows downward:
2191 [arguments]
2192 <- ARG_POINTER
2193 saved pc
2195 saved static chain if ix86_static_chain_on_stack
2197 saved frame pointer if frame_pointer_needed
2198 <- HARD_FRAME_POINTER
2199 [saved regs]
2200 <- regs_save_offset
2201 [padding0]
2203 [saved SSE regs]
2204 <- sse_regs_save_offset
2205 [padding1] |
2206 | <- FRAME_POINTER
2207 [va_arg registers] |
2209 [frame] |
2211 [padding2] | = to_allocate
2212 <- STACK_POINTER
2214 struct ix86_frame
2216 int nsseregs;
2217 int nregs;
2218 int va_arg_size;
2219 int red_zone_size;
2220 int outgoing_arguments_size;
2222 /* The offsets relative to ARG_POINTER. */
2223 HOST_WIDE_INT frame_pointer_offset;
2224 HOST_WIDE_INT hard_frame_pointer_offset;
2225 HOST_WIDE_INT stack_pointer_offset;
2226 HOST_WIDE_INT hfp_save_offset;
2227 HOST_WIDE_INT reg_save_offset;
2228 HOST_WIDE_INT sse_reg_save_offset;
2230 /* When save_regs_using_mov is set, emit prologue using
2231 move instead of push instructions. */
2232 bool save_regs_using_mov;
2235 /* Which cpu are we scheduling for. */
2236 enum attr_cpu ix86_schedule;
2238 /* Which cpu are we optimizing for. */
2239 enum processor_type ix86_tune;
2241 /* Which instruction set architecture to use. */
2242 enum processor_type ix86_arch;
2244 /* True if processor has SSE prefetch instruction. */
2245 unsigned char x86_prefetch_sse;
2247 /* -mstackrealign option */
2248 static const char ix86_force_align_arg_pointer_string[]
2249 = "force_align_arg_pointer";
2251 static rtx (*ix86_gen_leave) (void);
2252 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2253 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2254 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2255 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2256 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2257 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2258 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2259 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2260 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2261 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2262 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2264 /* Preferred alignment for stack boundary in bits. */
2265 unsigned int ix86_preferred_stack_boundary;
2267 /* Alignment for incoming stack boundary in bits specified at
2268 command line. */
2269 static unsigned int ix86_user_incoming_stack_boundary;
2271 /* Default alignment for incoming stack boundary in bits. */
2272 static unsigned int ix86_default_incoming_stack_boundary;
2274 /* Alignment for incoming stack boundary in bits. */
2275 unsigned int ix86_incoming_stack_boundary;
2277 /* Calling abi specific va_list type nodes. */
2278 static GTY(()) tree sysv_va_list_type_node;
2279 static GTY(()) tree ms_va_list_type_node;
2281 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2282 char internal_label_prefix[16];
2283 int internal_label_prefix_len;
2285 /* Fence to use after loop using movnt. */
2286 tree x86_mfence;
2288 	/* Register class used for passing a given 64-bit part of an argument.
2289 	 These represent classes as documented by the psABI, with the exception
2290 	 of the SSESF and SSEDF classes, which are basically the SSE class; gcc just
2291 	 uses SFmode or DFmode moves instead of DImode to avoid reformatting penalties.
2293 	 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2294 	 whenever possible (the upper half does contain padding).  */
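/* Illustrative example (assumed behaviour of the psABI classification,
   not stated in the comment above): a struct such as
     struct s { double d; int i; };
   spans two eightbytes; the first is classified X86_64_SSEDF_CLASS and
   is passed in an SSE register with a DFmode move, while the second is
   X86_64_INTEGERSI_CLASS and is passed in a general register with a
   cheaper SImode move, its upper half being padding.  */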
2295 enum x86_64_reg_class
2297 X86_64_NO_CLASS,
2298 X86_64_INTEGER_CLASS,
2299 X86_64_INTEGERSI_CLASS,
2300 X86_64_SSE_CLASS,
2301 X86_64_SSESF_CLASS,
2302 X86_64_SSEDF_CLASS,
2303 X86_64_SSEUP_CLASS,
2304 X86_64_X87_CLASS,
2305 X86_64_X87UP_CLASS,
2306 X86_64_COMPLEX_X87_CLASS,
2307 X86_64_MEMORY_CLASS
2310 #define MAX_CLASSES 4
2312 /* Table of constants used by fldpi, fldln2, etc.... */
2313 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2314 static bool ext_80387_constants_init = 0;
2317 static struct machine_function * ix86_init_machine_status (void);
2318 static rtx ix86_function_value (const_tree, const_tree, bool);
2319 static bool ix86_function_value_regno_p (const unsigned int);
2320 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2321 const_tree);
2322 static rtx ix86_static_chain (const_tree, bool);
2323 static int ix86_function_regparm (const_tree, const_tree);
2324 static void ix86_compute_frame_layout (struct ix86_frame *);
2325 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2326 rtx, rtx, int);
2327 static void ix86_add_new_builtins (HOST_WIDE_INT);
2328 static tree ix86_canonical_va_list_type (tree);
2329 static void predict_jump (int);
2330 static unsigned int split_stack_prologue_scratch_regno (void);
2331 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2333 enum ix86_function_specific_strings
2335 IX86_FUNCTION_SPECIFIC_ARCH,
2336 IX86_FUNCTION_SPECIFIC_TUNE,
2337 IX86_FUNCTION_SPECIFIC_MAX
2340 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2341 const char *, enum fpmath_unit, bool);
2342 static void ix86_function_specific_save (struct cl_target_option *,
2343 struct gcc_options *opts);
2344 static void ix86_function_specific_restore (struct gcc_options *opts,
2345 struct cl_target_option *);
2346 static void ix86_function_specific_print (FILE *, int,
2347 struct cl_target_option *);
2348 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2349 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2350 struct gcc_options *,
2351 struct gcc_options *,
2352 struct gcc_options *);
2353 static bool ix86_can_inline_p (tree, tree);
2354 static void ix86_set_current_function (tree);
2355 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2357 static enum calling_abi ix86_function_abi (const_tree);
2360 #ifndef SUBTARGET32_DEFAULT_CPU
2361 #define SUBTARGET32_DEFAULT_CPU "i386"
2362 #endif
2364 /* Whether -mtune= or -march= were specified */
2365 static int ix86_tune_defaulted;
2366 static int ix86_arch_specified;
2368 /* Vectorization library interface and handlers. */
2369 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2371 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2372 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2374 /* Processor target table, indexed by processor number */
2375 struct ptt
2377 const struct processor_costs *cost; /* Processor costs */
2378 const int align_loop; /* Default alignments. */
2379 const int align_loop_max_skip;
2380 const int align_jump;
2381 const int align_jump_max_skip;
2382 const int align_func;
2385 static const struct ptt processor_target_table[PROCESSOR_max] =
2387 {&i386_cost, 4, 3, 4, 3, 4},
2388 {&i486_cost, 16, 15, 16, 15, 16},
2389 {&pentium_cost, 16, 7, 16, 7, 16},
2390 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2391 {&geode_cost, 0, 0, 0, 0, 0},
2392 {&k6_cost, 32, 7, 32, 7, 32},
2393 {&athlon_cost, 16, 7, 16, 7, 16},
2394 {&pentium4_cost, 0, 0, 0, 0, 0},
2395 {&k8_cost, 16, 7, 16, 7, 16},
2396 {&nocona_cost, 0, 0, 0, 0, 0},
2397 /* Core 2 */
2398 {&core_cost, 16, 10, 16, 10, 16},
2399 /* Core i7 */
2400 {&core_cost, 16, 10, 16, 10, 16},
2401 /* Core i7 avx */
2402 {&core_cost, 16, 10, 16, 10, 16},
2403 /* Core avx2 */
2404 {&core_cost, 16, 10, 16, 10, 16},
2405 {&generic_cost, 16, 10, 16, 10, 16},
2406 {&amdfam10_cost, 32, 24, 32, 7, 32},
2407 {&bdver1_cost, 16, 10, 16, 7, 11},
2408 {&bdver2_cost, 16, 10, 16, 7, 11},
2409 {&bdver3_cost, 16, 10, 16, 7, 11},
2410 {&bdver4_cost, 16, 10, 16, 7, 11},
2411 {&btver1_cost, 16, 10, 16, 7, 11},
2412 {&btver2_cost, 16, 10, 16, 7, 11},
2413 {&atom_cost, 16, 15, 16, 7, 16},
2414 {&slm_cost, 16, 15, 16, 7, 16}
2417 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2419 "generic",
2420 "i386",
2421 "i486",
2422 "pentium",
2423 "pentium-mmx",
2424 "pentiumpro",
2425 "pentium2",
2426 "pentium3",
2427 "pentium4",
2428 "pentium-m",
2429 "prescott",
2430 "nocona",
2431 "core2",
2432 "corei7",
2433 "corei7-avx",
2434 "core-avx2",
2435 "atom",
2436 "slm",
2437 "intel",
2438 "geode",
2439 "k6",
2440 "k6-2",
2441 "k6-3",
2442 "athlon",
2443 "athlon-4",
2444 "k8",
2445 "amdfam10",
2446 "bdver1",
2447 "bdver2",
2448 "bdver3",
2449 "bdver4",
2450 "btver1",
2451 "btver2"
2454 static bool
2455 gate_insert_vzeroupper (void)
2457 return TARGET_AVX && !TARGET_AVX512F && TARGET_VZEROUPPER;
2460 static unsigned int
2461 rest_of_handle_insert_vzeroupper (void)
2463 int i;
2465 	 /* vzeroupper instructions are inserted immediately after reload to
2466 	 account for possible spills from 256-bit registers.  The pass
2467 	 reuses the mode switching infrastructure by re-running the mode
2468 	 insertion pass, so disable entities that have already been processed.  */
2469 for (i = 0; i < MAX_386_ENTITIES; i++)
2470 ix86_optimize_mode_switching[i] = 0;
2472 ix86_optimize_mode_switching[AVX_U128] = 1;
2474 /* Call optimize_mode_switching. */
2475 g->get_passes ()->execute_pass_mode_switching ();
2476 return 0;
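/* In other words (informal sketch of the code above): all mode-switching
   entities are first disabled, only the AVX_U128 entity is re-enabled,
   and the generic mode-switching pass then inserts vzeroupper at the
   points where the AVX upper-128-bit state must be cleared.  */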
2479 namespace {
2481 const pass_data pass_data_insert_vzeroupper =
2483 RTL_PASS, /* type */
2484 "vzeroupper", /* name */
2485 OPTGROUP_NONE, /* optinfo_flags */
2486 true, /* has_gate */
2487 true, /* has_execute */
2488 TV_NONE, /* tv_id */
2489 0, /* properties_required */
2490 0, /* properties_provided */
2491 0, /* properties_destroyed */
2492 0, /* todo_flags_start */
2493 ( TODO_df_finish | TODO_verify_rtl_sharing | 0 ), /* todo_flags_finish */
2496 class pass_insert_vzeroupper : public rtl_opt_pass
2498 public:
2499 pass_insert_vzeroupper(gcc::context *ctxt)
2500 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2503 /* opt_pass methods: */
2504 bool gate () { return gate_insert_vzeroupper (); }
2505 unsigned int execute () { return rest_of_handle_insert_vzeroupper (); }
2507 }; // class pass_insert_vzeroupper
2509 } // anon namespace
2511 rtl_opt_pass *
2512 make_pass_insert_vzeroupper (gcc::context *ctxt)
2514 return new pass_insert_vzeroupper (ctxt);
2517 /* Return true if a red-zone is in use. */
2519 static inline bool
2520 ix86_using_red_zone (void)
2522 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2525 /* Return a string that documents the current -m options. The caller is
2526 responsible for freeing the string. */
2528 static char *
2529 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2530 const char *tune, enum fpmath_unit fpmath,
2531 bool add_nl_p)
2533 struct ix86_target_opts
2535 const char *option; /* option string */
2536 HOST_WIDE_INT mask; /* isa mask options */
2539 	 /* This table is ordered so that options like -msse4.2 that imply
2540 	 other ISA options are matched first.  */
2541 static struct ix86_target_opts isa_opts[] =
2543 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2544 { "-mfma", OPTION_MASK_ISA_FMA },
2545 { "-mxop", OPTION_MASK_ISA_XOP },
2546 { "-mlwp", OPTION_MASK_ISA_LWP },
2547 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
2548 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
2549 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
2550 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
2551 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2552 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2553 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2554 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2555 { "-msse3", OPTION_MASK_ISA_SSE3 },
2556 { "-msse2", OPTION_MASK_ISA_SSE2 },
2557 { "-msse", OPTION_MASK_ISA_SSE },
2558 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2559 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2560 { "-mmmx", OPTION_MASK_ISA_MMX },
2561 { "-mabm", OPTION_MASK_ISA_ABM },
2562 { "-mbmi", OPTION_MASK_ISA_BMI },
2563 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2564 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2565 { "-mhle", OPTION_MASK_ISA_HLE },
2566 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2567 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2568 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2569 { "-madx", OPTION_MASK_ISA_ADX },
2570 { "-mtbm", OPTION_MASK_ISA_TBM },
2571 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2572 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2573 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2574 { "-maes", OPTION_MASK_ISA_AES },
2575 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2576 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2577 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2578 { "-mf16c", OPTION_MASK_ISA_F16C },
2579 { "-mrtm", OPTION_MASK_ISA_RTM },
2580 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2581 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2584 /* Flag options. */
2585 static struct ix86_target_opts flag_opts[] =
2587 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2588 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2589 { "-m80387", MASK_80387 },
2590 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2591 { "-malign-double", MASK_ALIGN_DOUBLE },
2592 { "-mcld", MASK_CLD },
2593 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2594 { "-mieee-fp", MASK_IEEE_FP },
2595 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2596 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2597 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2598 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2599 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2600 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2601 { "-mno-red-zone", MASK_NO_RED_ZONE },
2602 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2603 { "-mrecip", MASK_RECIP },
2604 { "-mrtd", MASK_RTD },
2605 { "-msseregparm", MASK_SSEREGPARM },
2606 { "-mstack-arg-probe", MASK_STACK_PROBE },
2607 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2608 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2609 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2610 { "-mvzeroupper", MASK_VZEROUPPER },
2611 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2612 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2613 { "-mprefer-avx128", MASK_PREFER_AVX128},
2616 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2618 char isa_other[40];
2619 char target_other[40];
2620 unsigned num = 0;
2621 unsigned i, j;
2622 char *ret;
2623 char *ptr;
2624 size_t len;
2625 size_t line_len;
2626 size_t sep_len;
2627 const char *abi;
2629 memset (opts, '\0', sizeof (opts));
2631 /* Add -march= option. */
2632 if (arch)
2634 opts[num][0] = "-march=";
2635 opts[num++][1] = arch;
2638 /* Add -mtune= option. */
2639 if (tune)
2641 opts[num][0] = "-mtune=";
2642 opts[num++][1] = tune;
2645 /* Add -m32/-m64/-mx32. */
2646 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2648 if ((isa & OPTION_MASK_ABI_64) != 0)
2649 abi = "-m64";
2650 else
2651 abi = "-mx32";
2652 isa &= ~ (OPTION_MASK_ISA_64BIT
2653 | OPTION_MASK_ABI_64
2654 | OPTION_MASK_ABI_X32);
2656 else
2657 abi = "-m32";
2658 opts[num++][0] = abi;
2660 	 /* Pick out the enabled ISA options.  */
2661 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2663 if ((isa & isa_opts[i].mask) != 0)
2665 opts[num++][0] = isa_opts[i].option;
2666 isa &= ~ isa_opts[i].mask;
2670 if (isa && add_nl_p)
2672 opts[num++][0] = isa_other;
2673 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2674 isa);
2677 /* Add flag options. */
2678 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2680 if ((flags & flag_opts[i].mask) != 0)
2682 opts[num++][0] = flag_opts[i].option;
2683 flags &= ~ flag_opts[i].mask;
2687 if (flags && add_nl_p)
2689 opts[num++][0] = target_other;
2690 sprintf (target_other, "(other flags: %#x)", flags);
2693 /* Add -fpmath= option. */
2694 if (fpmath)
2696 opts[num][0] = "-mfpmath=";
2697 switch ((int) fpmath)
2699 case FPMATH_387:
2700 opts[num++][1] = "387";
2701 break;
2703 case FPMATH_SSE:
2704 opts[num++][1] = "sse";
2705 break;
2707 case FPMATH_387 | FPMATH_SSE:
2708 opts[num++][1] = "sse+387";
2709 break;
2711 default:
2712 gcc_unreachable ();
2716 /* Any options? */
2717 if (num == 0)
2718 return NULL;
2720 gcc_assert (num < ARRAY_SIZE (opts));
2722 /* Size the string. */
2723 len = 0;
2724 sep_len = (add_nl_p) ? 3 : 1;
2725 for (i = 0; i < num; i++)
2727 len += sep_len;
2728 for (j = 0; j < 2; j++)
2729 if (opts[i][j])
2730 len += strlen (opts[i][j]);
2733 /* Build the string. */
2734 ret = ptr = (char *) xmalloc (len);
2735 line_len = 0;
2737 for (i = 0; i < num; i++)
2739 size_t len2[2];
2741 for (j = 0; j < 2; j++)
2742 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2744 if (i != 0)
2746 *ptr++ = ' ';
2747 line_len++;
2749 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2751 *ptr++ = '\\';
2752 *ptr++ = '\n';
2753 line_len = 0;
2757 for (j = 0; j < 2; j++)
2758 if (opts[i][j])
2760 memcpy (ptr, opts[i][j], len2[j]);
2761 ptr += len2[j];
2762 line_len += len2[j];
2766 *ptr = '\0';
2767 gcc_assert (ret + len >= ptr);
2769 return ret;
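/* Example of the kind of string this returns (illustrative only; the
   exact contents and ordering depend on the enabled ISA and flag bits):
   for a plain 64-bit SSE2 target the result might resemble
     "-march=x86-64 -mtune=generic -m64 -msse2 -msse -mmmx -mfpmath=sse"
   and, when ADD_NL_P, lines longer than 70 characters are continued
   with a backslash and a newline.  */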
2772 	/* Return true if profiling code should be emitted before the
2773 	 prologue, false otherwise.
2774 	 Note: for x86 the "hotfix" prologue case is diagnosed with sorry ().  */
2775 static bool
2776 ix86_profile_before_prologue (void)
2778 return flag_fentry != 0;
2781 /* Function that is callable from the debugger to print the current
2782 options. */
2783 void ATTRIBUTE_UNUSED
2784 ix86_debug_options (void)
2786 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2787 ix86_arch_string, ix86_tune_string,
2788 ix86_fpmath, true);
2790 if (opts)
2792 fprintf (stderr, "%s\n\n", opts);
2793 free (opts);
2795 else
2796 fputs ("<no options>\n\n", stderr);
2798 return;
2801 static const char *stringop_alg_names[] = {
2802 #define DEF_ENUM
2803 #define DEF_ALG(alg, name) #name,
2804 #include "stringop.def"
2805 #undef DEF_ENUM
2806 #undef DEF_ALG
2809 	/* Parse the parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
2810 	 The string has the following form (or is a comma separated list of such forms):
2812 strategy_alg:max_size:[align|noalign]
2814 where the full size range for the strategy is either [0, max_size] or
2815 [min_size, max_size], in which min_size is the max_size + 1 of the
2816 preceding range. The last size range must have max_size == -1.
2818 Examples:
2821 -mmemcpy-strategy=libcall:-1:noalign
2823 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
2827 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
2829 This is to tell the compiler to use the following strategy for memset
2830 1) when the expected size is between [1, 16], use rep_8byte strategy;
2831 2) when the size is between [17, 2048], use vector_loop;
2832 3) when the size is > 2048, use libcall. */
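/* Illustrative mapping (a sketch using a hypothetical option value; see
   the parser below): given
     -mmemcpy-strategy=unrolled_loop:256:noalign,libcall:-1:noalign
   the override loop at the end of ix86_parse_stringop_strategy_string
   rewrites the active cost table's memcpy entry so that
     size[0] = {256, unrolled_loop, noalign = true}
     size[1] = {-1, libcall, noalign = true}
   i.e. copies of at most 256 bytes use an unrolled loop and anything
   larger falls back to a library call, both without the alignment
   prologue.  */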
2834 struct stringop_size_range
2836 int max;
2837 stringop_alg alg;
2838 bool noalign;
2841 static void
2842 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
2844 const struct stringop_algs *default_algs;
2845 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
2846 char *curr_range_str, *next_range_str;
2847 int i = 0, n = 0;
2849 if (is_memset)
2850 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
2851 else
2852 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
2854 curr_range_str = strategy_str;
2858 int maxs;
2859 stringop_alg alg;
2860 char alg_name[128];
2861 char align[16];
2862 next_range_str = strchr (curr_range_str, ',');
2863 if (next_range_str)
2864 *next_range_str++ = '\0';
2866 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
2867 alg_name, &maxs, align))
2869 error ("wrong arg %s to option %s", curr_range_str,
2870 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2871 return;
2874 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
2876 error ("size ranges of option %s should be increasing",
2877 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2878 return;
2881 for (i = 0; i < last_alg; i++)
2883 if (!strcmp (alg_name, stringop_alg_names[i]))
2885 alg = (stringop_alg) i;
2886 break;
2890 if (i == last_alg)
2892 error ("wrong stringop strategy name %s specified for option %s",
2893 alg_name,
2894 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2895 return;
2898 input_ranges[n].max = maxs;
2899 input_ranges[n].alg = alg;
2900 if (!strcmp (align, "align"))
2901 input_ranges[n].noalign = false;
2902 else if (!strcmp (align, "noalign"))
2903 input_ranges[n].noalign = true;
2904 else
2906 error ("unknown alignment %s specified for option %s",
2907 align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2908 return;
2910 n++;
2911 curr_range_str = next_range_str;
2913 while (curr_range_str);
2915 if (input_ranges[n - 1].max != -1)
2917 error ("the max value for the last size range should be -1"
2918 " for option %s",
2919 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2920 return;
2923 if (n > MAX_STRINGOP_ALGS)
2925 error ("too many size ranges specified in option %s",
2926 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
2927 return;
2930 /* Now override the default algs array. */
2931 for (i = 0; i < n; i++)
2933 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
2934 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
2935 = input_ranges[i].alg;
2936 *const_cast<int *>(&default_algs->size[i].noalign)
2937 = input_ranges[i].noalign;
2942 	/* Parse the -mtune-ctrl= option.  When DUMP is true,
2943 	 print the features that are explicitly set.  */
2945 static void
2946 parse_mtune_ctrl_str (bool dump)
2948 if (!ix86_tune_ctrl_string)
2949 return;
2951 char *next_feature_string = NULL;
2952 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
2953 char *orig = curr_feature_string;
2954 int i;
2957 bool clear = false;
2959 next_feature_string = strchr (curr_feature_string, ',');
2960 if (next_feature_string)
2961 *next_feature_string++ = '\0';
2962 if (*curr_feature_string == '^')
2964 curr_feature_string++;
2965 clear = true;
2967 for (i = 0; i < X86_TUNE_LAST; i++)
2969 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
2971 ix86_tune_features[i] = !clear;
2972 if (dump)
2973 fprintf (stderr, "Explicitly %s feature %s\n",
2974 clear ? "clear" : "set", ix86_tune_feature_names[i]);
2975 break;
2978 if (i == X86_TUNE_LAST)
2979 error ("Unknown parameter to option -mtune-ctrl: %s",
2980 clear ? curr_feature_string - 1 : curr_feature_string);
2981 curr_feature_string = next_feature_string;
2983 while (curr_feature_string);
2984 free (orig);
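/* Usage sketch (the feature names here are hypothetical; the real ones
   come from x86-tune.def): -mtune-ctrl=example_feature,^other_feature
   sets the ix86_tune_features[] entry for "example_feature" to 1 and,
   because of the leading '^', clears the entry for "other_feature",
   overriding whatever set_ix86_tune_features chose for the selected
   -mtune processor.  */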
2987 /* Helper function to set ix86_tune_features. IX86_TUNE is the
2988 processor type. */
2990 static void
2991 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
2993 unsigned int ix86_tune_mask = 1u << ix86_tune;
2994 int i;
2996 for (i = 0; i < X86_TUNE_LAST; ++i)
2998 if (ix86_tune_no_default)
2999 ix86_tune_features[i] = 0;
3000 else
3001 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3004 if (dump)
3006 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
3007 for (i = 0; i < X86_TUNE_LAST; i++)
3008 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
3009 ix86_tune_features[i] ? "on" : "off");
3012 parse_mtune_ctrl_str (dump);
3016 /* Override various settings based on options. If MAIN_ARGS_P, the
3017 options are from the command line, otherwise they are from
3018 attributes. */
3020 static void
3021 ix86_option_override_internal (bool main_args_p,
3022 struct gcc_options *opts,
3023 struct gcc_options *opts_set)
3025 int i;
3026 unsigned int ix86_arch_mask;
3027 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
3028 const char *prefix;
3029 const char *suffix;
3030 const char *sw;
3032 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
3033 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
3034 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
3035 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
3036 #define PTA_AES (HOST_WIDE_INT_1 << 4)
3037 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
3038 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
3039 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
3040 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
3041 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
3042 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
3043 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
3044 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
3045 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
3046 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
3047 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
3048 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
3049 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
3050 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
3051 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
3052 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
3053 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
3054 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
3055 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
3056 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
3057 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
3058 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
3059 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
3060 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
3061 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
3062 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
3063 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
3064 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
3065 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
3066 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
3067 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
3068 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
3069 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
3070 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
3071 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
3072 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
3073 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
3074 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
3075 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
3077 	/* If this reaches 64, we need to widen the struct pta flags below.  */
3079 static struct pta
3081 const char *const name; /* processor name or nickname. */
3082 const enum processor_type processor;
3083 const enum attr_cpu schedule;
3084 const unsigned HOST_WIDE_INT flags;
3086 const processor_alias_table[] =
3088 {"i386", PROCESSOR_I386, CPU_NONE, 0},
3089 {"i486", PROCESSOR_I486, CPU_NONE, 0},
3090 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3091 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3092 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3093 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3094 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3095 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3096 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3097 PTA_MMX | PTA_SSE | PTA_FXSR},
3098 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3099 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3100 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
3101 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3102 PTA_MMX | PTA_SSE | PTA_FXSR},
3103 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3104 PTA_MMX | PTA_SSE | PTA_FXSR},
3105 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3106 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3107 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3108 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3109 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3110 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3111 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3112 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3113 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3114 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3115 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3116 {"core2", PROCESSOR_CORE2, CPU_CORE2,
3117 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3118 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
3119 {"corei7", PROCESSOR_COREI7, CPU_COREI7,
3120 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3
3121 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_POPCNT | PTA_FXSR},
3122 {"corei7-avx", PROCESSOR_COREI7_AVX, CPU_COREI7,
3123 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3124 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3125 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
3126 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3127 {"core-avx-i", PROCESSOR_COREI7_AVX, CPU_COREI7,
3128 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3129 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3130 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3131 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3132 {"core-avx2", PROCESSOR_HASWELL, CPU_COREI7,
3133 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3134 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
3135 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3136 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
3137 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE
3138 | PTA_XSAVEOPT},
3139 {"atom", PROCESSOR_ATOM, CPU_ATOM,
3140 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3141 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
3142 {"slm", PROCESSOR_SLM, CPU_SLM,
3143 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3
3144 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_POPCNT | PTA_AES
3145 | PTA_PCLMUL | PTA_RDRND | PTA_MOVBE | PTA_FXSR},
3146 {"intel", PROCESSOR_SLM, CPU_SLM,
3147 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3
3148 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_POPCNT | PTA_FXSR},
3149 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3150 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3151 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3152 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3153 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3154 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3155 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3156 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3157 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3158 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3159 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3160 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3161 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3162 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3163 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3164 {"x86-64", PROCESSOR_K8, CPU_K8,
3165 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3166 {"k8", PROCESSOR_K8, CPU_K8,
3167 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3168 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3169 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3170 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3171 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3172 {"opteron", PROCESSOR_K8, CPU_K8,
3173 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3174 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3175 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3176 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3177 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3178 {"athlon64", PROCESSOR_K8, CPU_K8,
3179 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3180 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3181 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3182 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3183 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3184 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3185 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3186 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3187 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3188 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3189 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3190 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3191 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3192 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3193 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3194 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3195 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3196 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3197 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3198 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3199 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3200 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3201 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3202 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3203 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3204 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3205 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3206 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3207 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3208 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3209 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3210 | PTA_XSAVEOPT | PTA_FSGSBASE},
3211 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
3212 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3213 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3214 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3215 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
3216 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
3217 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE},
3218 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
3219 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3220 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
3221 | PTA_FXSR | PTA_XSAVE},
3222 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3223 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3224 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
3225 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3226 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3227 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3229 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
3230 PTA_64BIT
3231 | PTA_HLE /* flags are only used for -march switch. */ },
3234 /* -mrecip options. */
3235 static struct
3237 const char *string; /* option name */
3238 unsigned int mask; /* mask bits to set */
3240 const recip_options[] =
3242 { "all", RECIP_MASK_ALL },
3243 { "none", RECIP_MASK_NONE },
3244 { "div", RECIP_MASK_DIV },
3245 { "sqrt", RECIP_MASK_SQRT },
3246 { "vec-div", RECIP_MASK_VEC_DIV },
3247 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3250 int const pta_size = ARRAY_SIZE (processor_alias_table);
3252 /* Set up prefix/suffix so the error messages refer to either the command
3253 line argument, or the attribute(target). */
3254 if (main_args_p)
3256 prefix = "-m";
3257 suffix = "";
3258 sw = "switch";
3260 else
3262 prefix = "option(\"";
3263 suffix = "\")";
3264 sw = "attribute";
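/* Illustrative sketch (hypothetical helper, not part of the build): the
   prefix/suffix/sw triple chosen above is spliced into diagnostics so one
   format string covers both spellings.  With the command-line values the
   text reads "bad value (foo) for -mtune= switch"; with the attribute
   values it reads "bad value (foo) for option(\"tune=\") attribute".  */
#include <stdio.h>
static void
example_report_bad_tune (const char *value, const char *prefix,
                         const char *suffix, const char *sw)
{
  fprintf (stderr, "bad value (%s) for %stune=%s %s\n",
           value, prefix, suffix, sw);
}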
3267 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3268 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3269 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3270 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3271 #ifdef TARGET_BI_ARCH
3272 else
3274 #if TARGET_BI_ARCH == 1
3275 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3276 is on and OPTION_MASK_ABI_X32 is off. We turn off
3277 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3278 -mx32. */
3279 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3280 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3281 #else
3282 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3283 on and OPTION_MASK_ABI_64 is off. We turn off
3284 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3285 -m64. */
3286 if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3287 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3288 #endif
3290 #endif
3292 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3294 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3295 OPTION_MASK_ABI_64 for TARGET_X32. */
3296 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3297 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3299 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3301 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3302 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3303 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3304 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
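/* Illustrative sketch (hypothetical mask values, not part of the build):
   the two ABI bits are kept mutually exclusive -- requesting one ABI sets
   its bit and clears the other, which is what the -mx32 / -m64 handling
   above does with OPTION_MASK_ABI_X32 and OPTION_MASK_ABI_64.  */
#define EXAMPLE_ABI_64  (1u << 0)
#define EXAMPLE_ABI_X32 (1u << 1)
static unsigned int
example_select_abi (unsigned int isa_flags, int want_x32)
{
  if (want_x32)
    {
      isa_flags |= EXAMPLE_ABI_X32;
      isa_flags &= ~EXAMPLE_ABI_64;
    }
  else
    {
      isa_flags |= EXAMPLE_ABI_64;
      isa_flags &= ~EXAMPLE_ABI_X32;
    }
  return isa_flags;
}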
3307 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3308 SUBTARGET_OVERRIDE_OPTIONS;
3309 #endif
3311 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3312 SUBSUBTARGET_OVERRIDE_OPTIONS;
3313 #endif
3315 /* -fPIC is the default for x86_64. */
3316 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
3317 opts->x_flag_pic = 2;
3319 /* Need to check -mtune=generic first. */
3320 if (opts->x_ix86_tune_string)
3322 if (!strcmp (opts->x_ix86_tune_string, "generic")
3323 || !strcmp (opts->x_ix86_tune_string, "i686")
3324 /* As special support for cross compilers we read -mtune=native
3325 as -mtune=generic. With native compilers we never see -mtune=native
3326 here, as it has already been rewritten by the driver. */
3327 || !strcmp (opts->x_ix86_tune_string, "native"))
3329 opts->x_ix86_tune_string = "generic";
3331 /* If this call is for setting the option attribute, allow the
3332 generic that was previously set. */
3333 else if (!main_args_p
3334 && !strcmp (opts->x_ix86_tune_string, "generic"))
3336 else if (!strncmp (opts->x_ix86_tune_string, "generic", 7))
3337 error ("bad value (%s) for %stune=%s %s",
3338 opts->x_ix86_tune_string, prefix, suffix, sw);
3339 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3340 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3341 "%stune=k8%s or %stune=generic%s instead as appropriate",
3342 prefix, suffix, prefix, suffix, prefix, suffix);
3344 else
3346 if (opts->x_ix86_arch_string)
3347 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
3348 if (!opts->x_ix86_tune_string)
3350 opts->x_ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3351 ix86_tune_defaulted = 1;
3354 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
3355 or defaulted. We need to use a sensible tune option. */
3356 if (!strcmp (opts->x_ix86_tune_string, "generic")
3357 || !strcmp (opts->x_ix86_tune_string, "x86-64")
3358 || !strcmp (opts->x_ix86_tune_string, "i686"))
3360 opts->x_ix86_tune_string = "generic";
3364 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
3365 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3367 /* rep; movq isn't available in 32-bit code. */
3368 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3369 opts->x_ix86_stringop_alg = no_stringop;
3372 if (!opts->x_ix86_arch_string)
3373 opts->x_ix86_arch_string
3374 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
3375 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3376 else
3377 ix86_arch_specified = 1;
3379 if (opts_set->x_ix86_pmode)
3381 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
3382 && opts->x_ix86_pmode == PMODE_SI)
3383 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
3384 && opts->x_ix86_pmode == PMODE_DI))
3385 error ("address mode %qs not supported in the %s bit mode",
3386 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
3387 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
3389 else
3390 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
3391 ? PMODE_DI : PMODE_SI;
3393 if (!opts_set->x_ix86_abi)
3394 opts->x_ix86_abi = DEFAULT_ABI;
3396 /* For targets using the MS ABI, enable MS extensions unless they were
3397 explicitly turned off. For targets not using the MS ABI we turn this
3398 option off. */
3399 if (!opts_set->x_flag_ms_extensions)
3400 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
3402 if (opts_set->x_ix86_cmodel)
3404 switch (opts->x_ix86_cmodel)
3406 case CM_SMALL:
3407 case CM_SMALL_PIC:
3408 if (opts->x_flag_pic)
3409 opts->x_ix86_cmodel = CM_SMALL_PIC;
3410 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3411 error ("code model %qs not supported in the %s bit mode",
3412 "small", "32");
3413 break;
3415 case CM_MEDIUM:
3416 case CM_MEDIUM_PIC:
3417 if (opts->x_flag_pic)
3418 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
3419 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3420 error ("code model %qs not supported in the %s bit mode",
3421 "medium", "32");
3422 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3423 error ("code model %qs not supported in x32 mode",
3424 "medium");
3425 break;
3427 case CM_LARGE:
3428 case CM_LARGE_PIC:
3429 if (opts->x_flag_pic)
3430 opts->x_ix86_cmodel = CM_LARGE_PIC;
3431 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3432 error ("code model %qs not supported in the %s bit mode",
3433 "large", "32");
3434 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3435 error ("code model %qs not supported in x32 mode",
3436 "large");
3437 break;
3439 case CM_32:
3440 if (opts->x_flag_pic)
3441 error ("code model %s does not support PIC mode", "32");
3442 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3443 error ("code model %qs not supported in the %s bit mode",
3444 "32", "64");
3445 break;
3447 case CM_KERNEL:
3448 if (opts->x_flag_pic)
3450 error ("code model %s does not support PIC mode", "kernel");
3451 opts->x_ix86_cmodel = CM_32;
3453 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3454 error ("code model %qs not supported in the %s bit mode",
3455 "kernel", "32");
3456 break;
3458 default:
3459 gcc_unreachable ();
3462 else
3464 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3465 use of rip-relative addressing. This eliminates fixups that
3466 would otherwise be needed if this object is to be placed in a
3467 DLL, and is essentially just as efficient as direct addressing. */
3468 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3469 && (TARGET_RDOS || TARGET_PECOFF))
3470 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
3471 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3472 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
3473 else
3474 opts->x_ix86_cmodel = CM_32;
3476 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
3478 error ("-masm=intel not supported in this configuration");
3479 opts->x_ix86_asm_dialect = ASM_ATT;
3481 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
3482 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3483 sorry ("%i-bit mode not compiled in",
3484 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3486 for (i = 0; i < pta_size; i++)
3487 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
3489 ix86_schedule = processor_alias_table[i].schedule;
3490 ix86_arch = processor_alias_table[i].processor;
3491 /* Default cpu tuning to the architecture. */
3492 ix86_tune = ix86_arch;
3494 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3495 && !(processor_alias_table[i].flags & PTA_64BIT))
3496 error ("CPU you selected does not support x86-64 "
3497 "instruction set");
3499 if (processor_alias_table[i].flags & PTA_MMX
3500 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3501 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3502 if (processor_alias_table[i].flags & PTA_3DNOW
3503 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3504 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3505 if (processor_alias_table[i].flags & PTA_3DNOW_A
3506 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3507 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3508 if (processor_alias_table[i].flags & PTA_SSE
3509 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3510 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3511 if (processor_alias_table[i].flags & PTA_SSE2
3512 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3513 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3514 if (processor_alias_table[i].flags & PTA_SSE3
3515 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3516 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3517 if (processor_alias_table[i].flags & PTA_SSSE3
3518 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3519 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3520 if (processor_alias_table[i].flags & PTA_SSE4_1
3521 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3522 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3523 if (processor_alias_table[i].flags & PTA_SSE4_2
3524 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3525 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3526 if (processor_alias_table[i].flags & PTA_AVX
3527 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3528 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3529 if (processor_alias_table[i].flags & PTA_AVX2
3530 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3531 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3532 if (processor_alias_table[i].flags & PTA_FMA
3533 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3534 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3535 if (processor_alias_table[i].flags & PTA_SSE4A
3536 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3537 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3538 if (processor_alias_table[i].flags & PTA_FMA4
3539 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3540 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3541 if (processor_alias_table[i].flags & PTA_XOP
3542 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3543 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3544 if (processor_alias_table[i].flags & PTA_LWP
3545 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3546 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3547 if (processor_alias_table[i].flags & PTA_ABM
3548 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3549 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3550 if (processor_alias_table[i].flags & PTA_BMI
3551 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3552 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3553 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3554 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3555 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3556 if (processor_alias_table[i].flags & PTA_TBM
3557 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3558 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3559 if (processor_alias_table[i].flags & PTA_BMI2
3560 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3561 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3562 if (processor_alias_table[i].flags & PTA_CX16
3563 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3564 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3565 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3566 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3567 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3568 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
3569 && (processor_alias_table[i].flags & PTA_NO_SAHF))
3570 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3571 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3572 if (processor_alias_table[i].flags & PTA_MOVBE
3573 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3574 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3575 if (processor_alias_table[i].flags & PTA_AES
3576 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3577 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES;
3578 if (processor_alias_table[i].flags & PTA_PCLMUL
3579 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3580 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3581 if (processor_alias_table[i].flags & PTA_FSGSBASE
3582 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3583 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3584 if (processor_alias_table[i].flags & PTA_RDRND
3585 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3586 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3587 if (processor_alias_table[i].flags & PTA_F16C
3588 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3589 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3590 if (processor_alias_table[i].flags & PTA_RTM
3591 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3592 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3593 if (processor_alias_table[i].flags & PTA_HLE
3594 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3595 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3596 if (processor_alias_table[i].flags & PTA_PRFCHW
3597 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3598 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3599 if (processor_alias_table[i].flags & PTA_RDSEED
3600 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3601 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3602 if (processor_alias_table[i].flags & PTA_ADX
3603 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3604 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3605 if (processor_alias_table[i].flags & PTA_FXSR
3606 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3607 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3608 if (processor_alias_table[i].flags & PTA_XSAVE
3609 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3610 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3611 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3612 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3613 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3614 if (processor_alias_table[i].flags & PTA_AVX512F
3615 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
3616 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
3617 if (processor_alias_table[i].flags & PTA_AVX512ER
3618 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
3619 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
3620 if (processor_alias_table[i].flags & PTA_AVX512PF
3621 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
3622 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
3623 if (processor_alias_table[i].flags & PTA_AVX512CD
3624 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
3625 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
3626 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3627 x86_prefetch_sse = true;
3629 break;
3632 if (!strcmp (opts->x_ix86_arch_string, "generic"))
3633 error ("generic CPU can be used only for %stune=%s %s",
3634 prefix, suffix, sw);
3635 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
3636 error ("intel CPU can be used only for %stune=%s %s",
3637 prefix, suffix, sw);
3638 else if (!strncmp (opts->x_ix86_arch_string, "generic", 7) || i == pta_size)
3639 error ("bad value (%s) for %sarch=%s %s",
3640 opts->x_ix86_arch_string, prefix, suffix, sw);
3642 ix86_arch_mask = 1u << ix86_arch;
3643 for (i = 0; i < X86_ARCH_LAST; ++i)
3644 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3646 for (i = 0; i < pta_size; i++)
3647 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
3649 ix86_schedule = processor_alias_table[i].schedule;
3650 ix86_tune = processor_alias_table[i].processor;
3651 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3653 if (!(processor_alias_table[i].flags & PTA_64BIT))
3655 if (ix86_tune_defaulted)
3657 opts->x_ix86_tune_string = "x86-64";
3658 for (i = 0; i < pta_size; i++)
3659 if (! strcmp (opts->x_ix86_tune_string,
3660 processor_alias_table[i].name))
3661 break;
3662 ix86_schedule = processor_alias_table[i].schedule;
3663 ix86_tune = processor_alias_table[i].processor;
3665 else
3666 error ("CPU you selected does not support x86-64 "
3667 "instruction set");
3670 /* Intel CPUs have always interpreted SSE prefetch instructions as
3671 NOPs; so, we can enable SSE prefetch instructions even when
3672 -mtune (rather than -march) points us to a processor that has them.
3673 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3674 higher processors. */
3675 if (TARGET_CMOV
3676 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3677 x86_prefetch_sse = true;
3678 break;
3681 if (ix86_tune_specified && i == pta_size)
3682 error ("bad value (%s) for %stune=%s %s",
3683 opts->x_ix86_tune_string, prefix, suffix, sw);
3685 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
3687 #ifndef USE_IX86_FRAME_POINTER
3688 #define USE_IX86_FRAME_POINTER 0
3689 #endif
3691 #ifndef USE_X86_64_FRAME_POINTER
3692 #define USE_X86_64_FRAME_POINTER 0
3693 #endif
3695 /* Set the default values for switches whose default depends on TARGET_64BIT
3696 in case they weren't overwritten by command line options. */
3697 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3699 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
3700 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3701 if (opts->x_flag_asynchronous_unwind_tables
3702 && !opts_set->x_flag_unwind_tables
3703 && TARGET_64BIT_MS_ABI)
3704 opts->x_flag_unwind_tables = 1;
3705 if (opts->x_flag_asynchronous_unwind_tables == 2)
3706 opts->x_flag_unwind_tables
3707 = opts->x_flag_asynchronous_unwind_tables = 1;
3708 if (opts->x_flag_pcc_struct_return == 2)
3709 opts->x_flag_pcc_struct_return = 0;
3711 else
3713 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
3714 opts->x_flag_omit_frame_pointer
3715 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
3716 if (opts->x_flag_asynchronous_unwind_tables == 2)
3717 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3718 if (opts->x_flag_pcc_struct_return == 2)
3719 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3722 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3723 if (opts->x_optimize_size)
3724 ix86_cost = &ix86_size_cost;
3725 else
3726 ix86_cost = ix86_tune_cost;
3728 /* Arrange to set up i386_stack_locals for all functions. */
3729 init_machine_status = ix86_init_machine_status;
3731 /* Validate -mregparm= value. */
3732 if (opts_set->x_ix86_regparm)
3734 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3735 warning (0, "-mregparm is ignored in 64-bit mode");
3736 if (opts->x_ix86_regparm > REGPARM_MAX)
3738 error ("-mregparm=%d is not between 0 and %d",
3739 opts->x_ix86_regparm, REGPARM_MAX);
3740 opts->x_ix86_regparm = 0;
3743 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3744 opts->x_ix86_regparm = REGPARM_MAX;
3746 /* Default align_* from the processor table. */
3747 if (opts->x_align_loops == 0)
3749 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
3750 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3752 if (opts->x_align_jumps == 0)
3754 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
3755 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3757 if (opts->x_align_functions == 0)
3759 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
3762 /* Provide default for -mbranch-cost= value. */
3763 if (!opts_set->x_ix86_branch_cost)
3764 opts->x_ix86_branch_cost = ix86_cost->branch_cost;
3766 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3768 opts->x_target_flags
3769 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
3771 /* Enable by default the SSE and MMX builtins. Do allow the user to
3772 explicitly disable any of these. In particular, disabling SSE and
3773 MMX for kernel code is extremely useful. */
3774 if (!ix86_arch_specified)
3775 opts->x_ix86_isa_flags
3776 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3777 | TARGET_SUBTARGET64_ISA_DEFAULT)
3778 & ~opts->x_ix86_isa_flags_explicit);
3780 if (TARGET_RTD_P (opts->x_target_flags))
3781 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3783 else
3785 opts->x_target_flags
3786 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
3788 if (!ix86_arch_specified)
3789 opts->x_ix86_isa_flags
3790 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
3792 /* The i386 ABI does not specify a red zone. It still makes sense to use
3793 one when the programmer takes care to keep the stack from being clobbered. */
3794 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
3795 opts->x_target_flags |= MASK_NO_RED_ZONE;
3798 /* Keep nonleaf frame pointers. */
3799 if (opts->x_flag_omit_frame_pointer)
3800 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3801 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
3802 opts->x_flag_omit_frame_pointer = 1;
3804 /* If we're doing fast math, we don't care about comparison order
3805 wrt NaNs. This lets us use a shorter comparison sequence. */
3806 if (opts->x_flag_finite_math_only)
3807 opts->x_target_flags &= ~MASK_IEEE_FP;
3809 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3810 since the insns won't need emulation. */
3811 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
3812 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
3814 /* Likewise, if the target doesn't have a 387, or we've specified
3815 software floating point, don't use 387 inline intrinsics. */
3816 if (!TARGET_80387_P (opts->x_target_flags))
3817 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
3819 /* Turn on MMX builtins for -msse. */
3820 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
3821 opts->x_ix86_isa_flags
3822 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
3824 /* Enable SSE prefetch. */
3825 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
3826 || (TARGET_PRFCHW && !TARGET_3DNOW_P (opts->x_ix86_isa_flags)))
3827 x86_prefetch_sse = true;
3829 /* Enable prefetch{,w} instructions for -m3dnow. */
3830 if (TARGET_3DNOW_P (opts->x_ix86_isa_flags))
3831 opts->x_ix86_isa_flags
3832 |= OPTION_MASK_ISA_PRFCHW & ~opts->x_ix86_isa_flags_explicit;
3834 /* Enable popcnt instruction for -msse4.2 or -mabm. */
3835 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
3836 || TARGET_ABM_P (opts->x_ix86_isa_flags))
3837 opts->x_ix86_isa_flags
3838 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
3840 /* Enable lzcnt instruction for -mabm. */
3841 if (TARGET_ABM_P (opts->x_ix86_isa_flags))
3842 opts->x_ix86_isa_flags
3843 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
3845 /* Validate -mpreferred-stack-boundary= value or default it to
3846 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3847 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3848 if (opts_set->x_ix86_preferred_stack_boundary_arg)
3850 int min = (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3851 ? (TARGET_SSE_P (opts->x_ix86_isa_flags) ? 4 : 3) : 2);
3852 int max = (TARGET_SEH ? 4 : 12);
3854 if (opts->x_ix86_preferred_stack_boundary_arg < min
3855 || opts->x_ix86_preferred_stack_boundary_arg > max)
3857 if (min == max)
3858 error ("-mpreferred-stack-boundary is not supported "
3859 "for this target");
3860 else
3861 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3862 opts->x_ix86_preferred_stack_boundary_arg, min, max);
3864 else
3865 ix86_preferred_stack_boundary
3866 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
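/* Illustrative sketch (not part of the build): the -mpreferred-stack-boundary
   argument is the log2 of the alignment in bytes, so the conversion above
   yields a bit count; e.g. an argument of 4 gives (1 << 4) * 8 = 128 bits,
   i.e. 16-byte stack alignment.  */
static unsigned int
example_boundary_arg_to_bits (unsigned int arg)
{
  const unsigned int bits_per_byte = 8;   /* assumes 8-bit bytes */
  return (1u << arg) * bits_per_byte;
}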
3869 /* Set the default value for -mstackrealign. */
3870 if (opts->x_ix86_force_align_arg_pointer == -1)
3871 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3873 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3875 /* Validate -mincoming-stack-boundary= value or default it to
3876 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3877 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3878 if (opts_set->x_ix86_incoming_stack_boundary_arg)
3880 if (opts->x_ix86_incoming_stack_boundary_arg
3881 < (TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 4 : 2)
3882 || opts->x_ix86_incoming_stack_boundary_arg > 12)
3883 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3884 opts->x_ix86_incoming_stack_boundary_arg,
3885 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 4 : 2);
3886 else
3888 ix86_user_incoming_stack_boundary
3889 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3890 ix86_incoming_stack_boundary
3891 = ix86_user_incoming_stack_boundary;
3895 /* Accept -msseregparm only if at least SSE support is enabled. */
3896 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
3897 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
3898 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3900 if (opts_set->x_ix86_fpmath)
3902 if (opts->x_ix86_fpmath & FPMATH_SSE)
3904 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
3906 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3907 opts->x_ix86_fpmath = FPMATH_387;
3909 else if ((opts->x_ix86_fpmath & FPMATH_387)
3910 && !TARGET_80387_P (opts->x_target_flags))
3912 warning (0, "387 instruction set disabled, using SSE arithmetics");
3913 opts->x_ix86_fpmath = FPMATH_SSE;
3917 /* For all chips supporting SSE2, -mfpmath=sse performs better than
3918 -mfpmath=387. The latter is nevertheless the default on many targets,
3919 since the extra 80-bit precision of temporaries is considered part of the ABI.
3920 Overwrite the default at least for -ffast-math.
3921 TODO: -mfpmath=both seems to produce similarly performing code with
3922 slightly smaller binaries. It is however not clear whether register
3923 allocation is ready for this setting.
3924 Also, -mfpmath=387 is overall much more compact (about 4-5%) than SSE
3925 codegen. We may switch to 387 with -ffast-math for size-optimized
3926 functions. */
3927 else if (fast_math_flags_set_p (&global_options)
3928 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
3929 opts->x_ix86_fpmath = FPMATH_SSE;
3930 else
3931 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
3933 /* If the i387 is disabled, then do not return values in it. */
3934 if (!TARGET_80387_P (opts->x_target_flags))
3935 opts->x_target_flags &= ~MASK_FLOAT_RETURNS;
3937 /* Use external vectorized library in vectorizing intrinsics. */
3938 if (opts_set->x_ix86_veclibabi_type)
3939 switch (opts->x_ix86_veclibabi_type)
3941 case ix86_veclibabi_type_svml:
3942 ix86_veclib_handler = ix86_veclibabi_svml;
3943 break;
3945 case ix86_veclibabi_type_acml:
3946 ix86_veclib_handler = ix86_veclibabi_acml;
3947 break;
3949 default:
3950 gcc_unreachable ();
3953 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
3954 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
3955 && !opts->x_optimize_size)
3956 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3958 /* If stack probes are required, the space used for large function
3959 arguments on the stack must also be probed, so enable
3960 -maccumulate-outgoing-args so this happens in the prologue. */
3961 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
3962 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3964 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
3965 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3966 "for correctness", prefix, suffix);
3967 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3970 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3972 char *p;
3973 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3974 p = strchr (internal_label_prefix, 'X');
3975 internal_label_prefix_len = p - internal_label_prefix;
3976 *p = '\0';
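/* Illustrative sketch (plain string in place of the target's label
   generation macro; not part of the build): format a dummy internal label
   containing the marker character 'X' and cut the buffer there, so only
   the target's label prefix and its length remain, as done above.  */
#include <string.h>
static size_t
example_label_prefix_len (char *buf)     /* buf holds e.g. ".LX0"        */
{
  char *p = strchr (buf, 'X');           /* assume the marker is present */
  *p = '\0';                             /* keep just the prefix         */
  return (size_t) (p - buf);             /* e.g. 2 for ".L"              */
}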
3979 /* When scheduling description is not available, disable scheduler pass
3980 so it won't slow down the compilation and make x87 code slower. */
3981 if (!TARGET_SCHEDULE)
3982 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
3984 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3985 ix86_tune_cost->simultaneous_prefetches,
3986 opts->x_param_values,
3987 opts_set->x_param_values);
3988 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3989 ix86_tune_cost->prefetch_block,
3990 opts->x_param_values,
3991 opts_set->x_param_values);
3992 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3993 ix86_tune_cost->l1_cache_size,
3994 opts->x_param_values,
3995 opts_set->x_param_values);
3996 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3997 ix86_tune_cost->l2_cache_size,
3998 opts->x_param_values,
3999 opts_set->x_param_values);
4001 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
4002 if (opts->x_flag_prefetch_loop_arrays < 0
4003 && HAVE_prefetch
4004 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
4005 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
4006 opts->x_flag_prefetch_loop_arrays = 1;
4008 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
4009 can be optimized to ap = __builtin_next_arg (0). */
4010 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
4011 targetm.expand_builtin_va_start = NULL;
4013 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4015 ix86_gen_leave = gen_leave_rex64;
4016 if (Pmode == DImode)
4018 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
4019 ix86_gen_tls_local_dynamic_base_64
4020 = gen_tls_local_dynamic_base_64_di;
4022 else
4024 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
4025 ix86_gen_tls_local_dynamic_base_64
4026 = gen_tls_local_dynamic_base_64_si;
4029 else
4030 ix86_gen_leave = gen_leave;
4032 if (Pmode == DImode)
4034 ix86_gen_add3 = gen_adddi3;
4035 ix86_gen_sub3 = gen_subdi3;
4036 ix86_gen_sub3_carry = gen_subdi3_carry;
4037 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
4038 ix86_gen_andsp = gen_anddi3;
4039 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
4040 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
4041 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
4042 ix86_gen_monitor = gen_sse3_monitor_di;
4044 else
4046 ix86_gen_add3 = gen_addsi3;
4047 ix86_gen_sub3 = gen_subsi3;
4048 ix86_gen_sub3_carry = gen_subsi3_carry;
4049 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
4050 ix86_gen_andsp = gen_andsi3;
4051 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
4052 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
4053 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
4054 ix86_gen_monitor = gen_sse3_monitor_si;
4057 #ifdef USE_IX86_CLD
4058 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
4059 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
4060 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
4061 #endif
4063 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic)
4065 if (opts->x_flag_fentry > 0)
4066 sorry ("-mfentry isn%'t supported for 32-bit in combination "
4067 "with -fpic");
4068 opts->x_flag_fentry = 0;
4070 else if (TARGET_SEH)
4072 if (opts->x_flag_fentry == 0)
4073 sorry ("-mno-fentry isn%'t compatible with SEH");
4074 opts->x_flag_fentry = 1;
4076 else if (opts->x_flag_fentry < 0)
4078 #if defined(PROFILE_BEFORE_PROLOGUE)
4079 opts->x_flag_fentry = 1;
4080 #else
4081 opts->x_flag_fentry = 0;
4082 #endif
4085 /* When not optimizing for size, enable vzeroupper optimization for
4086 TARGET_AVX with -fexpensive-optimizations and split 32-byte
4087 AVX unaligned loads/stores. */
4088 if (!opts->x_optimize_size)
4090 if (flag_expensive_optimizations
4091 && !(opts_set->x_target_flags & MASK_VZEROUPPER))
4092 opts->x_target_flags |= MASK_VZEROUPPER;
4093 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
4094 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4095 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4096 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
4097 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4098 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4099 /* Enable 128-bit AVX instruction generation
4100 for the auto-vectorizer. */
4101 if (TARGET_AVX128_OPTIMAL
4102 && !(opts_set->x_target_flags & MASK_PREFER_AVX128))
4103 opts->x_target_flags |= MASK_PREFER_AVX128;
4106 if (opts->x_ix86_recip_name)
4108 char *p = ASTRDUP (opts->x_ix86_recip_name);
4109 char *q;
4110 unsigned int mask, i;
4111 bool invert;
4113 while ((q = strtok (p, ",")) != NULL)
4115 p = NULL;
4116 if (*q == '!')
4118 invert = true;
4119 q++;
4121 else
4122 invert = false;
4124 if (!strcmp (q, "default"))
4125 mask = RECIP_MASK_ALL;
4126 else
4128 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4129 if (!strcmp (q, recip_options[i].string))
4131 mask = recip_options[i].mask;
4132 break;
4135 if (i == ARRAY_SIZE (recip_options))
4137 error ("unknown option for -mrecip=%s", q);
4138 invert = false;
4139 mask = RECIP_MASK_NONE;
4143 opts->x_recip_mask_explicit |= mask;
4144 if (invert)
4145 opts->x_recip_mask &= ~mask;
4146 else
4147 opts->x_recip_mask |= mask;
4151 if (TARGET_RECIP_P (opts->x_target_flags))
4152 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
4153 else if (opts_set->x_target_flags & MASK_RECIP)
4154 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
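/* Illustrative sketch (hypothetical masks and a two-entry table; not part
   of the build): the comma/'!' parsing used for -mrecip= above.  For the
   string "all,!div" the first token sets every bit and the second clears
   the division bit again.  */
#include <string.h>
static unsigned int
example_parse_recip (char *str)          /* modified in place by strtok */
{
  static const struct { const char *name; unsigned int mask; } table[] =
    { { "all", 0xffffffffu }, { "div", 0x1u } };
  unsigned int result = 0;
  char *q;
  while ((q = strtok (str, ",")) != NULL)
    {
      unsigned int mask = 0;
      int invert = (*q == '!');
      size_t i;
      str = NULL;                        /* let strtok continue the scan */
      if (invert)
        q++;
      for (i = 0; i < sizeof (table) / sizeof (table[0]); i++)
        if (strcmp (q, table[i].name) == 0)
          mask = table[i].mask;
      if (invert)
        result &= ~mask;
      else
        result |= mask;
    }
  return result;
}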
4156 /* Default long double to 64-bit for Bionic. */
4157 if (TARGET_HAS_BIONIC
4158 && !(opts_set->x_target_flags & MASK_LONG_DOUBLE_64))
4159 opts->x_target_flags |= MASK_LONG_DOUBLE_64;
4161 /* Save the initial options in case the user does function specific
4162 options. */
4163 if (main_args_p)
4164 target_option_default_node = target_option_current_node
4165 = build_target_option_node (opts);
4167 /* Handle stack protector */
4168 if (!opts_set->x_ix86_stack_protector_guard)
4169 opts->x_ix86_stack_protector_guard
4170 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
4172 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
4173 if (opts->x_ix86_tune_memcpy_strategy)
4175 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
4176 ix86_parse_stringop_strategy_string (str, false);
4177 free (str);
4180 if (opts->x_ix86_tune_memset_strategy)
4182 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
4183 ix86_parse_stringop_strategy_string (str, true);
4184 free (str);
4188 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4190 static void
4191 ix86_option_override (void)
4193 opt_pass *pass_insert_vzeroupper = make_pass_insert_vzeroupper (g);
4194 static struct register_pass_info insert_vzeroupper_info
4195 = { pass_insert_vzeroupper, "reload",
4196 1, PASS_POS_INSERT_AFTER
4199 ix86_option_override_internal (true, &global_options, &global_options_set);
4202 /* This needs to be done at start up. It's convenient to do it here. */
4203 register_pass (&insert_vzeroupper_info);
4206 /* Update register usage after having seen the compiler flags. */
4208 static void
4209 ix86_conditional_register_usage (void)
4211 int i, c_mask;
4212 unsigned int j;
4214 /* The PIC register, if it exists, is fixed. */
4215 j = PIC_OFFSET_TABLE_REGNUM;
4216 if (j != INVALID_REGNUM)
4217 fixed_regs[j] = call_used_regs[j] = 1;
4219 /* For 32-bit targets, squash the REX registers. */
4220 if (! TARGET_64BIT)
4222 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4223 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4224 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4225 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4226 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4227 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4230 /* See the definition of CALL_USED_REGISTERS in i386.h. */
4231 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
4232 : TARGET_64BIT ? (1 << 2)
4233 : (1 << 1));
4235 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4237 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4239 /* Set/reset conditionally defined registers from
4240 CALL_USED_REGISTERS initializer. */
4241 if (call_used_regs[i] > 1)
4242 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
4244 /* Calculate registers of CLOBBERED_REGS register set
4245 as call used registers from GENERAL_REGS register set. */
4246 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4247 && call_used_regs[i])
4248 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
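/* Illustrative sketch (hypothetical entry values; not part of the build):
   a table entry greater than 1 is a per-ABI bit set and is resolved
   against the c_mask selected for the current ABI, exactly as the loop
   above does for conditionally call-used registers.  */
static int
example_resolve_call_used (int table_entry, int c_mask)
{
  if (table_entry > 1)
    return (table_entry & c_mask) != 0;  /* call-used only under this ABI */
  return table_entry;                    /* unconditional 0 or 1          */
}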
4251 /* If MMX is disabled, squash the registers. */
4252 if (! TARGET_MMX)
4253 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4254 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4255 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4257 /* If SSE is disabled, squash the registers. */
4258 if (! TARGET_SSE)
4259 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4260 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4261 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4263 /* If the FPU is disabled, squash the registers. */
4264 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4265 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4266 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4267 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4269 /* If AVX512F is disabled, squash the registers. */
4270 if (! TARGET_AVX512F)
4272 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4273 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4275 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
4276 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4281 /* Save the current options */
4283 static void
4284 ix86_function_specific_save (struct cl_target_option *ptr,
4285 struct gcc_options *opts)
4287 ptr->arch = ix86_arch;
4288 ptr->schedule = ix86_schedule;
4289 ptr->tune = ix86_tune;
4290 ptr->branch_cost = ix86_branch_cost;
4291 ptr->tune_defaulted = ix86_tune_defaulted;
4292 ptr->arch_specified = ix86_arch_specified;
4293 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
4294 ptr->x_ix86_target_flags_explicit = opts->x_ix86_target_flags_explicit;
4295 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
4296 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
4297 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
4298 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
4299 ptr->x_ix86_abi = opts->x_ix86_abi;
4300 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
4301 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
4302 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
4303 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
4304 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
4305 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
4306 ptr->x_ix86_pmode = opts->x_ix86_pmode;
4307 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
4308 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
4309 ptr->x_ix86_regparm = opts->x_ix86_regparm;
4310 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
4311 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
4312 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
4313 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
4314 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
4315 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
4316 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
4317 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
4318 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
4319 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
4321 /* The fields are char but the variables are not; make sure the
4322 values fit in the fields. */
4323 gcc_assert (ptr->arch == ix86_arch);
4324 gcc_assert (ptr->schedule == ix86_schedule);
4325 gcc_assert (ptr->tune == ix86_tune);
4326 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4329 /* Restore the current options */
4331 static void
4332 ix86_function_specific_restore (struct gcc_options *opts,
4333 struct cl_target_option *ptr)
4335 enum processor_type old_tune = ix86_tune;
4336 enum processor_type old_arch = ix86_arch;
4337 unsigned int ix86_arch_mask;
4338 int i;
4340 /* We don't change -fPIC. */
4341 opts->x_flag_pic = flag_pic;
4343 ix86_arch = (enum processor_type) ptr->arch;
4344 ix86_schedule = (enum attr_cpu) ptr->schedule;
4345 ix86_tune = (enum processor_type) ptr->tune;
4346 opts->x_ix86_branch_cost = ptr->branch_cost;
4347 ix86_tune_defaulted = ptr->tune_defaulted;
4348 ix86_arch_specified = ptr->arch_specified;
4349 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4350 opts->x_ix86_target_flags_explicit = ptr->x_ix86_target_flags_explicit;
4351 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
4352 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
4353 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
4354 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
4355 opts->x_ix86_abi = ptr->x_ix86_abi;
4356 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
4357 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
4358 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
4359 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
4360 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
4361 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
4362 opts->x_ix86_pmode = ptr->x_ix86_pmode;
4363 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
4364 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
4365 opts->x_ix86_regparm = ptr->x_ix86_regparm;
4366 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
4367 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
4368 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
4369 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
4370 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
4371 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
4372 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
4373 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
4374 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
4375 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
4377 /* Recreate the arch feature tests if the arch changed */
4378 if (old_arch != ix86_arch)
4380 ix86_arch_mask = 1u << ix86_arch;
4381 for (i = 0; i < X86_ARCH_LAST; ++i)
4382 ix86_arch_features[i]
4383 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4386 /* Recreate the tune optimization tests */
4387 if (old_tune != ix86_tune)
4388 set_ix86_tune_features (ix86_tune, false);
4391 /* Print the current options */
4393 static void
4394 ix86_function_specific_print (FILE *file, int indent,
4395 struct cl_target_option *ptr)
4397 char *target_string
4398 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4399 NULL, NULL, ptr->x_ix86_fpmath, false);
4401 fprintf (file, "%*sarch = %d (%s)\n",
4402 indent, "",
4403 ptr->arch,
4404 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4405 ? cpu_names[ptr->arch]
4406 : "<unknown>"));
4408 fprintf (file, "%*stune = %d (%s)\n",
4409 indent, "",
4410 ptr->tune,
4411 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4412 ? cpu_names[ptr->tune]
4413 : "<unknown>"));
4415 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4417 if (target_string)
4419 fprintf (file, "%*s%s\n", indent, "", target_string);
4420 free (target_string);
4425 /* Inner function to process the attribute((target(...))): take an argument
4426 and set the current options from it. If the argument is a list, recursively
4427 process each element. */
4429 static bool
4430 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4431 struct gcc_options *opts,
4432 struct gcc_options *opts_set,
4433 struct gcc_options *enum_opts_set)
4435 char *next_optstr;
4436 bool ret = true;
4438 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4439 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4440 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4441 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4442 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4444 enum ix86_opt_type
4446 ix86_opt_unknown,
4447 ix86_opt_yes,
4448 ix86_opt_no,
4449 ix86_opt_str,
4450 ix86_opt_enum,
4451 ix86_opt_isa
4454 static const struct
4456 const char *string;
4457 size_t len;
4458 enum ix86_opt_type type;
4459 int opt;
4460 int mask;
4461 } attrs[] = {
4462 /* isa options */
4463 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4464 IX86_ATTR_ISA ("abm", OPT_mabm),
4465 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4466 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4467 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4468 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4469 IX86_ATTR_ISA ("aes", OPT_maes),
4470 IX86_ATTR_ISA ("avx", OPT_mavx),
4471 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4472 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
4473 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
4474 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
4475 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
4476 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4477 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4478 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4479 IX86_ATTR_ISA ("sse", OPT_msse),
4480 IX86_ATTR_ISA ("sse2", OPT_msse2),
4481 IX86_ATTR_ISA ("sse3", OPT_msse3),
4482 IX86_ATTR_ISA ("sse4", OPT_msse4),
4483 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4484 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4485 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4486 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4487 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4488 IX86_ATTR_ISA ("fma", OPT_mfma),
4489 IX86_ATTR_ISA ("xop", OPT_mxop),
4490 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4491 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4492 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4493 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4494 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4495 IX86_ATTR_ISA ("hle", OPT_mhle),
4496 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4497 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4498 IX86_ATTR_ISA ("adx", OPT_madx),
4499 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
4500 IX86_ATTR_ISA ("xsave", OPT_mxsave),
4501 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
4503 /* enum options */
4504 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4506 /* string options */
4507 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4508 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4510 /* flag options */
4511 IX86_ATTR_YES ("cld",
4512 OPT_mcld,
4513 MASK_CLD),
4515 IX86_ATTR_NO ("fancy-math-387",
4516 OPT_mfancy_math_387,
4517 MASK_NO_FANCY_MATH_387),
4519 IX86_ATTR_YES ("ieee-fp",
4520 OPT_mieee_fp,
4521 MASK_IEEE_FP),
4523 IX86_ATTR_YES ("inline-all-stringops",
4524 OPT_minline_all_stringops,
4525 MASK_INLINE_ALL_STRINGOPS),
4527 IX86_ATTR_YES ("inline-stringops-dynamically",
4528 OPT_minline_stringops_dynamically,
4529 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4531 IX86_ATTR_NO ("align-stringops",
4532 OPT_mno_align_stringops,
4533 MASK_NO_ALIGN_STRINGOPS),
4535 IX86_ATTR_YES ("recip",
4536 OPT_mrecip,
4537 MASK_RECIP),
4541 /* If this is a list, recurse to get the options. */
4542 if (TREE_CODE (args) == TREE_LIST)
4544 bool ret = true;
4546 for (; args; args = TREE_CHAIN (args))
4547 if (TREE_VALUE (args)
4548 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4549 p_strings, opts, opts_set,
4550 enum_opts_set))
4551 ret = false;
4553 return ret;
4556 else if (TREE_CODE (args) != STRING_CST)
4558 error ("attribute %<target%> argument not a string");
4559 return false;
4562 /* Handle multiple arguments separated by commas. */
4563 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4565 while (next_optstr && *next_optstr != '\0')
4567 char *p = next_optstr;
4568 char *orig_p = p;
4569 char *comma = strchr (next_optstr, ',');
4570 const char *opt_string;
4571 size_t len, opt_len;
4572 int opt;
4573 bool opt_set_p;
4574 char ch;
4575 unsigned i;
4576 enum ix86_opt_type type = ix86_opt_unknown;
4577 int mask = 0;
4579 if (comma)
4581 *comma = '\0';
4582 len = comma - next_optstr;
4583 next_optstr = comma + 1;
4585 else
4587 len = strlen (p);
4588 next_optstr = NULL;
4591 /* Recognize no-xxx. */
4592 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4594 opt_set_p = false;
4595 p += 3;
4596 len -= 3;
4598 else
4599 opt_set_p = true;
4601 /* Find the option. */
4602 ch = *p;
4603 opt = N_OPTS;
4604 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4606 type = attrs[i].type;
4607 opt_len = attrs[i].len;
4608 if (ch == attrs[i].string[0]
4609 && ((type != ix86_opt_str && type != ix86_opt_enum)
4610 ? len == opt_len
4611 : len > opt_len)
4612 && memcmp (p, attrs[i].string, opt_len) == 0)
4614 opt = attrs[i].opt;
4615 mask = attrs[i].mask;
4616 opt_string = attrs[i].string;
4617 break;
4621 /* Process the option. */
4622 if (opt == N_OPTS)
4624 error ("attribute(target(\"%s\")) is unknown", orig_p);
4625 ret = false;
4628 else if (type == ix86_opt_isa)
4630 struct cl_decoded_option decoded;
4632 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4633 ix86_handle_option (opts, opts_set,
4634 &decoded, input_location);
4637 else if (type == ix86_opt_yes || type == ix86_opt_no)
4639 if (type == ix86_opt_no)
4640 opt_set_p = !opt_set_p;
4642 if (opt_set_p)
4643 opts->x_target_flags |= mask;
4644 else
4645 opts->x_target_flags &= ~mask;
4648 else if (type == ix86_opt_str)
4650 if (p_strings[opt])
4652 error ("option(\"%s\") was already specified", opt_string);
4653 ret = false;
4655 else
4656 p_strings[opt] = xstrdup (p + opt_len);
4659 else if (type == ix86_opt_enum)
4661 bool arg_ok;
4662 int value;
4664 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4665 if (arg_ok)
4666 set_option (opts, enum_opts_set, opt, value,
4667 p + opt_len, DK_UNSPECIFIED, input_location,
4668 global_dc);
4669 else
4671 error ("attribute(target(\"%s\")) is unknown", orig_p);
4672 ret = false;
4676 else
4677 gcc_unreachable ();
4680 return ret;
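/* Illustrative sketch, not part of the compiler proper: the parser above
   handles comma-separated strings such as the (hypothetical) declaration

     __attribute__((target ("sse4.2,no-avx,fpmath=sse,arch=core2")))
     int hot_loop (int x);

   "sse4.2" and "no-avx" are isa options routed through ix86_handle_option,
   "fpmath=" is an enum option, and "arch=" is a string option recorded in
   p_strings; the "no-" prefix simply inverts opt_set_p.  */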
4683 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4685 tree
4686 ix86_valid_target_attribute_tree (tree args,
4687 struct gcc_options *opts,
4688 struct gcc_options *opts_set)
4690 const char *orig_arch_string = opts->x_ix86_arch_string;
4691 const char *orig_tune_string = opts->x_ix86_tune_string;
4692 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
4693 int orig_tune_defaulted = ix86_tune_defaulted;
4694 int orig_arch_specified = ix86_arch_specified;
4695 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4696 tree t = NULL_TREE;
4697 int i;
4698 struct cl_target_option *def
4699 = TREE_TARGET_OPTION (target_option_default_node);
4700 struct gcc_options enum_opts_set;
4702 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4704 /* Process each of the options on the chain. */
4705 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
4706 opts_set, &enum_opts_set))
4707 return error_mark_node;
4709 /* If the changed options are different from the default, rerun
4710 ix86_option_override_internal, and then save the options away.
4711 The string options are attribute options, and will be undone
4712 when we copy the save structure. */
4713 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
4714 || opts->x_target_flags != def->x_target_flags
4715 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4716 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4717 || enum_opts_set.x_ix86_fpmath)
4719 /* If we are using the default tune= or arch=, undo the string assigned,
4720 and use the default. */
4721 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4722 opts->x_ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4723 else if (!orig_arch_specified)
4724 opts->x_ix86_arch_string = NULL;
4726 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4727 opts->x_ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4728 else if (orig_tune_defaulted)
4729 opts->x_ix86_tune_string = NULL;
4731 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4732 if (enum_opts_set.x_ix86_fpmath)
4733 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
4734 else if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4735 && TARGET_SSE_P (opts->x_ix86_isa_flags))
4737 opts->x_ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4738 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
4741 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4742 ix86_option_override_internal (false, opts, opts_set);
4744 /* Add any builtin functions with the new isa if any. */
4745 ix86_add_new_builtins (opts->x_ix86_isa_flags);
4747 /* Save the current options unless we are validating options for
4748 #pragma. */
4749 t = build_target_option_node (opts);
4751 opts->x_ix86_arch_string = orig_arch_string;
4752 opts->x_ix86_tune_string = orig_tune_string;
4753 opts_set->x_ix86_fpmath = orig_fpmath_set;
4755 /* Free up memory allocated to hold the strings */
4756 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4757 free (option_strings[i]);
4760 return t;
4763 /* Hook to validate attribute((target("string"))). */
4765 static bool
4766 ix86_valid_target_attribute_p (tree fndecl,
4767 tree ARG_UNUSED (name),
4768 tree args,
4769 int ARG_UNUSED (flags))
4771 struct gcc_options func_options;
4772 tree new_target, new_optimize;
4773 bool ret = true;
4775 /* attribute((target("default"))) does nothing, beyond
4776 affecting multi-versioning. */
4777 if (TREE_VALUE (args)
4778 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
4779 && TREE_CHAIN (args) == NULL_TREE
4780 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
4781 return true;
4783 tree old_optimize = build_optimization_node (&global_options);
4785 /* Get the optimization options of the current function. */
4786 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4788 if (!func_optimize)
4789 func_optimize = old_optimize;
4791 /* Init func_options. */
4792 memset (&func_options, 0, sizeof (func_options));
4793 init_options_struct (&func_options, NULL);
4794 lang_hooks.init_options_struct (&func_options);
4796 cl_optimization_restore (&func_options,
4797 TREE_OPTIMIZATION (func_optimize));
4799 /* Initialize func_options to the default before its target options can
4800 be set. */
4801 cl_target_option_restore (&func_options,
4802 TREE_TARGET_OPTION (target_option_default_node));
4804 new_target = ix86_valid_target_attribute_tree (args, &func_options,
4805 &global_options_set);
4807 new_optimize = build_optimization_node (&func_options);
4809 if (new_target == error_mark_node)
4810 ret = false;
4812 else if (fndecl && new_target)
4814 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4816 if (old_optimize != new_optimize)
4817 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4820 return ret;
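/* Illustrative example with hypothetical functions: the early return for
   "default" above is what allows a multi-versioned pair such as

     __attribute__((target ("default"))) int fn (void);
     __attribute__((target ("avx2")))    int fn (void);

   to accept the "default" variant without building a target-option node
   for it; only the "avx2" variant goes through
   ix86_valid_target_attribute_tree.  */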
4824 /* Hook to determine if one function can safely inline another. */
4826 static bool
4827 ix86_can_inline_p (tree caller, tree callee)
4829 bool ret = false;
4830 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4831 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4833 /* If callee has no option attributes, then it is ok to inline. */
4834 if (!callee_tree)
4835 ret = true;
4837 /* If caller has no option attributes, but callee does then it is not ok to
4838 inline. */
4839 else if (!caller_tree)
4840 ret = false;
4842 else
4844 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4845 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4847 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4 function
4848 can inline an SSE2 function but an SSE2 function can't inline an SSE4
4849 function. */
4850 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4851 != callee_opts->x_ix86_isa_flags)
4852 ret = false;
4854 /* See if we have the same non-isa options. */
4855 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4856 ret = false;
4858 /* See if arch, tune, etc. are the same. */
4859 else if (caller_opts->arch != callee_opts->arch)
4860 ret = false;
4862 else if (caller_opts->tune != callee_opts->tune)
4863 ret = false;
4865 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4866 ret = false;
4868 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4869 ret = false;
4871 else
4872 ret = true;
4875 return ret;
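/* A hedged example with hypothetical functions: under the subset rule above,

     __attribute__((target ("sse4.2"))) int caller (void);
     __attribute__((target ("sse2")))   int callee (void);

   caller may inline callee, because SSE2's isa flags are contained in the
   caller's SSE4.2 flags, but callee could not inline caller; differing
   arch=, tune= or fpmath= settings likewise block inlining.  */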
4879 /* Remember the last target of ix86_set_current_function. */
4880 static GTY(()) tree ix86_previous_fndecl;
4882 /* Invalidate ix86_previous_fndecl cache. */
4883 void
4884 ix86_reset_previous_fndecl (void)
4886 ix86_previous_fndecl = NULL_TREE;
4889 /* Establish appropriate back-end context for processing the function
4890 FNDECL. The argument might be NULL to indicate processing at top
4891 level, outside of any function scope. */
4892 static void
4893 ix86_set_current_function (tree fndecl)
4895 /* Only change the context if the function changes. This hook is called
4896 several times in the course of compiling a function, and we don't want to
4897 slow things down too much or call target_reinit when it isn't safe. */
4898 if (fndecl && fndecl != ix86_previous_fndecl)
4900 tree old_tree = (ix86_previous_fndecl
4901 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4902 : NULL_TREE);
4904 tree new_tree = (fndecl
4905 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4906 : NULL_TREE);
4908 ix86_previous_fndecl = fndecl;
4909 if (old_tree == new_tree)
4912 else if (new_tree)
4914 cl_target_option_restore (&global_options,
4915 TREE_TARGET_OPTION (new_tree));
4916 target_reinit ();
4919 else if (old_tree)
4921 struct cl_target_option *def
4922 = TREE_TARGET_OPTION (target_option_current_node);
4924 cl_target_option_restore (&global_options, def);
4925 target_reinit ();
4931 /* Return true if this goes in large data/bss. */
4933 static bool
4934 ix86_in_large_data_p (tree exp)
4936 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4937 return false;
4939 /* Functions are never large data. */
4940 if (TREE_CODE (exp) == FUNCTION_DECL)
4941 return false;
4943 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4945 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4946 if (strcmp (section, ".ldata") == 0
4947 || strcmp (section, ".lbss") == 0)
4948 return true;
4949 return false;
4951 else
4953 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4955 /* If this is an incomplete type with size 0, then we can't put it
4956 in data because it might be too big when completed. */
4957 if (!size || size > ix86_section_threshold)
4958 return true;
4961 return false;
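/* For illustration only (hypothetical definition): when compiling with
   -mcmodel=medium and the default -mlarge-data-threshold of 65536 bytes,

     static char big_buffer[1 << 20];

   exceeds the threshold and is treated as large data (placed in .lbss),
   while an ordinary scalar stays below the threshold and keeps its normal
   section.  */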
4964 /* Switch to the appropriate section for output of DECL.
4965 DECL is either a `VAR_DECL' node or a constant of some sort.
4966 RELOC indicates whether forming the initial value of DECL requires
4967 link-time relocations. */
4969 ATTRIBUTE_UNUSED static section *
4970 x86_64_elf_select_section (tree decl, int reloc,
4971 unsigned HOST_WIDE_INT align)
4973 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4974 && ix86_in_large_data_p (decl))
4976 const char *sname = NULL;
4977 unsigned int flags = SECTION_WRITE;
4978 switch (categorize_decl_for_section (decl, reloc))
4980 case SECCAT_DATA:
4981 sname = ".ldata";
4982 break;
4983 case SECCAT_DATA_REL:
4984 sname = ".ldata.rel";
4985 break;
4986 case SECCAT_DATA_REL_LOCAL:
4987 sname = ".ldata.rel.local";
4988 break;
4989 case SECCAT_DATA_REL_RO:
4990 sname = ".ldata.rel.ro";
4991 break;
4992 case SECCAT_DATA_REL_RO_LOCAL:
4993 sname = ".ldata.rel.ro.local";
4994 break;
4995 case SECCAT_BSS:
4996 sname = ".lbss";
4997 flags |= SECTION_BSS;
4998 break;
4999 case SECCAT_RODATA:
5000 case SECCAT_RODATA_MERGE_STR:
5001 case SECCAT_RODATA_MERGE_STR_INIT:
5002 case SECCAT_RODATA_MERGE_CONST:
5003 sname = ".lrodata";
5004 flags = 0;
5005 break;
5006 case SECCAT_SRODATA:
5007 case SECCAT_SDATA:
5008 case SECCAT_SBSS:
5009 gcc_unreachable ();
5010 case SECCAT_TEXT:
5011 case SECCAT_TDATA:
5012 case SECCAT_TBSS:
5013 /* We don't split these for the medium model. Place them into
5014 default sections and hope for the best. */
5015 break;
5017 if (sname)
5019 /* We might get called with string constants, but get_named_section
5020 doesn't like them as they are not DECLs. Also, we need to set
5021 flags in that case. */
5022 if (!DECL_P (decl))
5023 return get_section (sname, flags, NULL);
5024 return get_named_section (decl, sname, reloc);
5027 return default_elf_select_section (decl, reloc, align);
5030 /* Select a set of attributes for section NAME based on the properties
5031 of DECL and whether or not RELOC indicates that DECL's initializer
5032 might contain runtime relocations. */
5034 static unsigned int ATTRIBUTE_UNUSED
5035 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
5037 unsigned int flags = default_section_type_flags (decl, name, reloc);
5039 if (decl == NULL_TREE
5040 && (strcmp (name, ".ldata.rel.ro") == 0
5041 || strcmp (name, ".ldata.rel.ro.local") == 0))
5042 flags |= SECTION_RELRO;
5044 if (strcmp (name, ".lbss") == 0
5045 || strncmp (name, ".lbss.", sizeof (".lbss.") - 1) == 0
5046 || strncmp (name, ".gnu.linkonce.lb.", sizeof (".gnu.linkonce.lb.") - 1) == 0)
5047 flags |= SECTION_BSS;
5049 return flags;
5052 /* Build up a unique section name, expressed as a
5053 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
5054 RELOC indicates whether the initial value of EXP requires
5055 link-time relocations. */
5057 static void ATTRIBUTE_UNUSED
5058 x86_64_elf_unique_section (tree decl, int reloc)
5060 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
5061 && ix86_in_large_data_p (decl))
5063 const char *prefix = NULL;
5064 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
5065 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
5067 switch (categorize_decl_for_section (decl, reloc))
5069 case SECCAT_DATA:
5070 case SECCAT_DATA_REL:
5071 case SECCAT_DATA_REL_LOCAL:
5072 case SECCAT_DATA_REL_RO:
5073 case SECCAT_DATA_REL_RO_LOCAL:
5074 prefix = one_only ? ".ld" : ".ldata";
5075 break;
5076 case SECCAT_BSS:
5077 prefix = one_only ? ".lb" : ".lbss";
5078 break;
5079 case SECCAT_RODATA:
5080 case SECCAT_RODATA_MERGE_STR:
5081 case SECCAT_RODATA_MERGE_STR_INIT:
5082 case SECCAT_RODATA_MERGE_CONST:
5083 prefix = one_only ? ".lr" : ".lrodata";
5084 break;
5085 case SECCAT_SRODATA:
5086 case SECCAT_SDATA:
5087 case SECCAT_SBSS:
5088 gcc_unreachable ();
5089 case SECCAT_TEXT:
5090 case SECCAT_TDATA:
5091 case SECCAT_TBSS:
5092 /* We don't split these for the medium model. Place them into
5093 default sections and hope for the best. */
5094 break;
5096 if (prefix)
5098 const char *name, *linkonce;
5099 char *string;
5101 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
5102 name = targetm.strip_name_encoding (name);
5104 /* If we're using one_only, then there needs to be a .gnu.linkonce
5105 prefix to the section name. */
5106 linkonce = one_only ? ".gnu.linkonce" : "";
5108 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
5110 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
5111 return;
5114 default_unique_section (decl, reloc);
5117 #ifdef COMMON_ASM_OP
5118 /* This says how to output assembler code to declare an
5119 uninitialized external linkage data object.
5121 For medium model x86-64 we need to use the .largecomm directive for
5122 large objects. */
5123 void
5124 x86_elf_aligned_common (FILE *file,
5125 const char *name, unsigned HOST_WIDE_INT size,
5126 int align)
5128 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
5129 && size > (unsigned int)ix86_section_threshold)
5130 fputs (".largecomm\t", file);
5131 else
5132 fputs (COMMON_ASM_OP, file);
5133 assemble_name (file, name);
5134 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
5135 size, align / BITS_PER_UNIT);
5137 #endif
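/* A sketch of the resulting assembly, assuming a hypothetical 128 KiB
   common symbol "buf" aligned to 32 bytes under -mcmodel=medium:

     .largecomm	buf,131072,32

   Smaller objects fall back to the ordinary COMMON_ASM_OP (".comm")
   directive emitted by the else branch.  */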
5139 /* Utility function for targets to use in implementing
5140 ASM_OUTPUT_ALIGNED_BSS. */
5142 void
5143 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
5144 const char *name, unsigned HOST_WIDE_INT size,
5145 int align)
5147 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
5148 && size > (unsigned int)ix86_section_threshold)
5149 switch_to_section (get_named_section (decl, ".lbss", 0));
5150 else
5151 switch_to_section (bss_section);
5152 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
5153 #ifdef ASM_DECLARE_OBJECT_NAME
5154 last_assemble_variable_decl = decl;
5155 ASM_DECLARE_OBJECT_NAME (file, name, decl);
5156 #else
5157 /* Standard thing is just output label for the object. */
5158 ASM_OUTPUT_LABEL (file, name);
5159 #endif /* ASM_DECLARE_OBJECT_NAME */
5160 ASM_OUTPUT_SKIP (file, size ? size : 1);
5163 /* Decide whether we must probe the stack before any space allocation
5164 on this target. It's essentially TARGET_STACK_PROBE except when
5165 -fstack-check causes the stack to be already probed differently. */
5167 bool
5168 ix86_target_stack_probe (void)
5170 /* Do not probe the stack twice if static stack checking is enabled. */
5171 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5172 return false;
5174 return TARGET_STACK_PROBE;
5177 /* Decide whether we can make a sibling call to a function. DECL is the
5178 declaration of the function being targeted by the call and EXP is the
5179 CALL_EXPR representing the call. */
5181 static bool
5182 ix86_function_ok_for_sibcall (tree decl, tree exp)
5184 tree type, decl_or_type;
5185 rtx a, b;
5187 /* If we are generating position-independent code, we cannot sibcall
5188 optimize any indirect call, or a direct call to a global function,
5189 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
5190 if (!TARGET_MACHO
5191 && !TARGET_64BIT
5192 && flag_pic
5193 && (!decl || !targetm.binds_local_p (decl)))
5194 return false;
5196 /* If we need to align the outgoing stack, then sibcalling would
5197 unalign the stack, which may break the called function. */
5198 if (ix86_minimum_incoming_stack_boundary (true)
5199 < PREFERRED_STACK_BOUNDARY)
5200 return false;
5202 if (decl)
5204 decl_or_type = decl;
5205 type = TREE_TYPE (decl);
5207 else
5209 /* We're looking at the CALL_EXPR, we need the type of the function. */
5210 type = CALL_EXPR_FN (exp); /* pointer expression */
5211 type = TREE_TYPE (type); /* pointer type */
5212 type = TREE_TYPE (type); /* function type */
5213 decl_or_type = type;
5216 /* Check that the return value locations are the same. Like
5217 if we are returning floats on the 80387 register stack, we cannot
5218 make a sibcall from a function that doesn't return a float to a
5219 function that does or, conversely, from a function that does return
5220 a float to a function that doesn't; the necessary stack adjustment
5221 would not be executed. This is also the place we notice
5222 differences in the return value ABI. Note that it is ok for one
5223 of the functions to have void return type as long as the return
5224 value of the other is passed in a register. */
5225 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
5226 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
5227 cfun->decl, false);
5228 if (STACK_REG_P (a) || STACK_REG_P (b))
5230 if (!rtx_equal_p (a, b))
5231 return false;
5233 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
5235 else if (!rtx_equal_p (a, b))
5236 return false;
5238 if (TARGET_64BIT)
5240 /* The SYSV ABI has more call-clobbered registers;
5241 disallow sibcalls from MS to SYSV. */
5242 if (cfun->machine->call_abi == MS_ABI
5243 && ix86_function_type_abi (type) == SYSV_ABI)
5244 return false;
5246 else
5248 /* If this call is indirect, we'll need to be able to use a
5249 call-clobbered register for the address of the target function.
5250 Make sure that all such registers are not used for passing
5251 parameters. Note that DLLIMPORT functions are indirect. */
5252 if (!decl
5253 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
5255 if (ix86_function_regparm (type, NULL) >= 3)
5257 /* ??? Need to count the actual number of registers to be used,
5258 not the possible number of registers. Fix later. */
5259 return false;
5264 /* Otherwise okay. That also includes certain types of indirect calls. */
5265 return true;
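/* Illustrative example (hypothetical code): in 32-bit PIC code,

     extern int helper (int);
     int wrapper (int x) { return helper (x); }

   the tail call to the global helper is rejected by the first test above,
   since calling through the PLT needs %ebx live; a static helper that
   binds locally could still be turned into a sibcall.  */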
5268 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
5269 and "sseregparm" calling convention attributes;
5270 arguments as in struct attribute_spec.handler. */
5272 static tree
5273 ix86_handle_cconv_attribute (tree *node, tree name,
5274 tree args,
5275 int flags ATTRIBUTE_UNUSED,
5276 bool *no_add_attrs)
5278 if (TREE_CODE (*node) != FUNCTION_TYPE
5279 && TREE_CODE (*node) != METHOD_TYPE
5280 && TREE_CODE (*node) != FIELD_DECL
5281 && TREE_CODE (*node) != TYPE_DECL)
5283 warning (OPT_Wattributes, "%qE attribute only applies to functions",
5284 name);
5285 *no_add_attrs = true;
5286 return NULL_TREE;
5289 /* Can combine regparm with all attributes but fastcall, and thiscall. */
5290 if (is_attribute_p ("regparm", name))
5292 tree cst;
5294 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5296 error ("fastcall and regparm attributes are not compatible");
5299 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5301 error ("regparam and thiscall attributes are not compatible");
5304 cst = TREE_VALUE (args);
5305 if (TREE_CODE (cst) != INTEGER_CST)
5307 warning (OPT_Wattributes,
5308 "%qE attribute requires an integer constant argument",
5309 name);
5310 *no_add_attrs = true;
5312 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
5314 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
5315 name, REGPARM_MAX);
5316 *no_add_attrs = true;
5319 return NULL_TREE;
5322 if (TARGET_64BIT)
5324 /* Do not warn when emulating the MS ABI. */
5325 if ((TREE_CODE (*node) != FUNCTION_TYPE
5326 && TREE_CODE (*node) != METHOD_TYPE)
5327 || ix86_function_type_abi (*node) != MS_ABI)
5328 warning (OPT_Wattributes, "%qE attribute ignored",
5329 name);
5330 *no_add_attrs = true;
5331 return NULL_TREE;
5334 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
5335 if (is_attribute_p ("fastcall", name))
5337 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5339 error ("fastcall and cdecl attributes are not compatible");
5341 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5343 error ("fastcall and stdcall attributes are not compatible");
5345 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5347 error ("fastcall and regparm attributes are not compatible");
5349 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5351 error ("fastcall and thiscall attributes are not compatible");
5355 /* Can combine stdcall with fastcall (redundant), regparm and
5356 sseregparm. */
5357 else if (is_attribute_p ("stdcall", name))
5359 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5361 error ("stdcall and cdecl attributes are not compatible");
5363 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5365 error ("stdcall and fastcall attributes are not compatible");
5367 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5369 error ("stdcall and thiscall attributes are not compatible");
5373 /* Can combine cdecl with regparm and sseregparm. */
5374 else if (is_attribute_p ("cdecl", name))
5376 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5378 error ("stdcall and cdecl attributes are not compatible");
5380 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5382 error ("fastcall and cdecl attributes are not compatible");
5384 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5386 error ("cdecl and thiscall attributes are not compatible");
5389 else if (is_attribute_p ("thiscall", name))
5391 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5392 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5393 name);
5394 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5396 error ("stdcall and thiscall attributes are not compatible");
5398 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5400 error ("fastcall and thiscall attributes are not compatible");
5402 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5404 error ("cdecl and thiscall attributes are not compatible");
5408 /* Can combine sseregparm with all attributes. */
5410 return NULL_TREE;
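/* For illustration, with hypothetical prototypes: the handler rejects

     void __attribute__((fastcall, regparm (3))) f (int, int, int);

   with "fastcall and regparm attributes are not compatible", while

     void __attribute__((stdcall, sseregparm)) g (float);

   is accepted, since sseregparm combines with any of the conventions.  */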
5413 /* The transactional memory builtins are implicitly regparm or fastcall
5414 depending on the ABI. Override the generic do-nothing attribute that
5415 these builtins were declared with, and replace it with one of the two
5416 attributes that we expect elsewhere. */
5418 static tree
5419 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5420 tree args ATTRIBUTE_UNUSED,
5421 int flags, bool *no_add_attrs)
5423 tree alt;
5425 /* In no case do we want to add the placeholder attribute. */
5426 *no_add_attrs = true;
5428 /* The 64-bit ABI is unchanged for transactional memory. */
5429 if (TARGET_64BIT)
5430 return NULL_TREE;
5432 /* ??? Is there a better way to validate 32-bit windows? We have
5433 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5434 if (CHECK_STACK_LIMIT > 0)
5435 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5436 else
5438 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5439 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5441 decl_attributes (node, alt, flags);
5443 return NULL_TREE;
5446 /* This function determines from TYPE the calling-convention. */
5448 unsigned int
5449 ix86_get_callcvt (const_tree type)
5451 unsigned int ret = 0;
5452 bool is_stdarg;
5453 tree attrs;
5455 if (TARGET_64BIT)
5456 return IX86_CALLCVT_CDECL;
5458 attrs = TYPE_ATTRIBUTES (type);
5459 if (attrs != NULL_TREE)
5461 if (lookup_attribute ("cdecl", attrs))
5462 ret |= IX86_CALLCVT_CDECL;
5463 else if (lookup_attribute ("stdcall", attrs))
5464 ret |= IX86_CALLCVT_STDCALL;
5465 else if (lookup_attribute ("fastcall", attrs))
5466 ret |= IX86_CALLCVT_FASTCALL;
5467 else if (lookup_attribute ("thiscall", attrs))
5468 ret |= IX86_CALLCVT_THISCALL;
5470 /* Regparm isn't allowed for thiscall and fastcall. */
5471 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5473 if (lookup_attribute ("regparm", attrs))
5474 ret |= IX86_CALLCVT_REGPARM;
5475 if (lookup_attribute ("sseregparm", attrs))
5476 ret |= IX86_CALLCVT_SSEREGPARM;
5479 if (IX86_BASE_CALLCVT(ret) != 0)
5480 return ret;
5483 is_stdarg = stdarg_p (type);
5484 if (TARGET_RTD && !is_stdarg)
5485 return IX86_CALLCVT_STDCALL | ret;
5487 if (ret != 0
5488 || is_stdarg
5489 || TREE_CODE (type) != METHOD_TYPE
5490 || ix86_function_type_abi (type) != MS_ABI)
5491 return IX86_CALLCVT_CDECL | ret;
5493 return IX86_CALLCVT_THISCALL;
5496 /* Return 0 if the attributes for two types are incompatible, 1 if they
5497 are compatible, and 2 if they are nearly compatible (which causes a
5498 warning to be generated). */
5500 static int
5501 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5503 unsigned int ccvt1, ccvt2;
5505 if (TREE_CODE (type1) != FUNCTION_TYPE
5506 && TREE_CODE (type1) != METHOD_TYPE)
5507 return 1;
5509 ccvt1 = ix86_get_callcvt (type1);
5510 ccvt2 = ix86_get_callcvt (type2);
5511 if (ccvt1 != ccvt2)
5512 return 0;
5513 if (ix86_function_regparm (type1, NULL)
5514 != ix86_function_regparm (type2, NULL))
5515 return 0;
5517 return 1;
5520 /* Return the regparm value for a function with the indicated TYPE and DECL.
5521 DECL may be NULL when calling function indirectly
5522 or considering a libcall. */
5524 static int
5525 ix86_function_regparm (const_tree type, const_tree decl)
5527 tree attr;
5528 int regparm;
5529 unsigned int ccvt;
5531 if (TARGET_64BIT)
5532 return (ix86_function_type_abi (type) == SYSV_ABI
5533 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5534 ccvt = ix86_get_callcvt (type);
5535 regparm = ix86_regparm;
5537 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5539 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5540 if (attr)
5542 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5543 return regparm;
5546 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5547 return 2;
5548 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5549 return 1;
5551 /* Use register calling convention for local functions when possible. */
5552 if (decl
5553 && TREE_CODE (decl) == FUNCTION_DECL
5554 && optimize
5555 && !(profile_flag && !flag_fentry))
5557 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5558 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5559 if (i && i->local && i->can_change_signature)
5561 int local_regparm, globals = 0, regno;
5563 /* Make sure no regparm register is taken by a
5564 fixed register variable. */
5565 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5566 if (fixed_regs[local_regparm])
5567 break;
5569 /* We don't want to use regparm(3) for nested functions as
5570 these use a static chain pointer in the third argument. */
5571 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5572 local_regparm = 2;
5574 /* In 32-bit mode save a register for the split stack. */
5575 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5576 local_regparm = 2;
5578 /* Each fixed register usage increases register pressure,
5579 so fewer registers should be used for argument passing.
5580 This functionality can be overridden by an explicit
5581 regparm value. */
5582 for (regno = AX_REG; regno <= DI_REG; regno++)
5583 if (fixed_regs[regno])
5584 globals++;
5586 local_regparm
5587 = globals < local_regparm ? local_regparm - globals : 0;
5589 if (local_regparm > regparm)
5590 regparm = local_regparm;
5594 return regparm;
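/* Worked example with a hypothetical prototype: in 32-bit code,

     int __attribute__((regparm (3))) add3 (int a, int b, int c);

   makes this function return 3, so the arguments travel in %eax, %edx and
   %ecx; without the attribute the default ix86_regparm (0 unless -mregparm
   is given) is used and the arguments stay on the stack.  */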
5597 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5598 DFmode (2) arguments in SSE registers for a function with the
5599 indicated TYPE and DECL. DECL may be NULL when calling function
5600 indirectly or considering a libcall. Otherwise return 0. */
5602 static int
5603 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5605 gcc_assert (!TARGET_64BIT);
5607 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5608 by the sseregparm attribute. */
5609 if (TARGET_SSEREGPARM
5610 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5612 if (!TARGET_SSE)
5614 if (warn)
5616 if (decl)
5617 error ("calling %qD with attribute sseregparm without "
5618 "SSE/SSE2 enabled", decl);
5619 else
5620 error ("calling %qT with attribute sseregparm without "
5621 "SSE/SSE2 enabled", type);
5623 return 0;
5626 return 2;
5629 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5630 (and DFmode for SSE2) arguments in SSE registers. */
5631 if (decl && TARGET_SSE_MATH && optimize
5632 && !(profile_flag && !flag_fentry))
5634 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5635 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5636 if (i && i->local && i->can_change_signature)
5637 return TARGET_SSE2 ? 2 : 1;
5640 return 0;
5643 /* Return true if EAX is live at the start of the function. Used by
5644 ix86_expand_prologue to determine if we need special help before
5645 calling allocate_stack_worker. */
5647 static bool
5648 ix86_eax_live_at_start_p (void)
5650 /* Cheat. Don't bother working forward from ix86_function_regparm
5651 to the function type to whether an actual argument is located in
5652 eax. Instead just look at cfg info, which is still close enough
5653 to correct at this point. This gives false positives for broken
5654 functions that might use uninitialized data that happens to be
5655 allocated in eax, but who cares? */
5656 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
5659 static bool
5660 ix86_keep_aggregate_return_pointer (tree fntype)
5662 tree attr;
5664 if (!TARGET_64BIT)
5666 attr = lookup_attribute ("callee_pop_aggregate_return",
5667 TYPE_ATTRIBUTES (fntype));
5668 if (attr)
5669 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5671 /* For 32-bit MS-ABI the default is to keep aggregate
5672 return pointer. */
5673 if (ix86_function_type_abi (fntype) == MS_ABI)
5674 return true;
5676 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5679 /* Value is the number of bytes of arguments automatically
5680 popped when returning from a subroutine call.
5681 FUNDECL is the declaration node of the function (as a tree),
5682 FUNTYPE is the data type of the function (as a tree),
5683 or for a library call it is an identifier node for the subroutine name.
5684 SIZE is the number of bytes of arguments passed on the stack.
5686 On the 80386, the RTD insn may be used to pop them if the number
5687 of args is fixed, but if the number is variable then the caller
5688 must pop them all. RTD can't be used for library calls now
5689 because the library is compiled with the Unix compiler.
5690 Use of RTD is a selectable option, since it is incompatible with
5691 standard Unix calling sequences. If the option is not selected,
5692 the caller must always pop the args.
5694 The attribute stdcall is equivalent to RTD on a per module basis. */
5696 static int
5697 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5699 unsigned int ccvt;
5701 /* None of the 64-bit ABIs pop arguments. */
5702 if (TARGET_64BIT)
5703 return 0;
5705 ccvt = ix86_get_callcvt (funtype);
5707 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5708 | IX86_CALLCVT_THISCALL)) != 0
5709 && ! stdarg_p (funtype))
5710 return size;
5712 /* Lose any fake structure return argument if it is passed on the stack. */
5713 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5714 && !ix86_keep_aggregate_return_pointer (funtype))
5716 int nregs = ix86_function_regparm (funtype, fundecl);
5717 if (nregs == 0)
5718 return GET_MODE_SIZE (Pmode);
5721 return 0;
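/* Illustrative example with a hypothetical prototype: a 32-bit callback

     void __attribute__((stdcall)) cb (int a, int b, int c);

   has 12 bytes of stack arguments and is not stdarg, so this hook returns
   12 and the callee pops its own arguments ("ret $12"); a plain cdecl or
   varargs function returns 0 and leaves the popping to the caller.  */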
5724 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5726 static bool
5727 ix86_legitimate_combined_insn (rtx insn)
5729 /* Check operand constraints in case hard registers were propagated
5730 into insn pattern. This check prevents combine pass from
5731 generating insn patterns with invalid hard register operands.
5732 These invalid insns can eventually confuse reload to error out
5733 with a spill failure. See also PRs 46829 and 46843. */
5734 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5736 int i;
5738 extract_insn (insn);
5739 preprocess_constraints ();
5741 for (i = 0; i < recog_data.n_operands; i++)
5743 rtx op = recog_data.operand[i];
5744 enum machine_mode mode = GET_MODE (op);
5745 struct operand_alternative *op_alt;
5746 int offset = 0;
5747 bool win;
5748 int j;
5750 /* For pre-AVX disallow unaligned loads/stores where the
5751 instructions don't support it. */
5752 if (!TARGET_AVX
5753 && VECTOR_MODE_P (GET_MODE (op))
5754 && misaligned_operand (op, GET_MODE (op)))
5756 int min_align = get_attr_ssememalign (insn);
5757 if (min_align == 0)
5758 return false;
5761 /* A unary operator may be accepted by the predicate, but it
5762 is irrelevant for matching constraints. */
5763 if (UNARY_P (op))
5764 op = XEXP (op, 0);
5766 if (GET_CODE (op) == SUBREG)
5768 if (REG_P (SUBREG_REG (op))
5769 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5770 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5771 GET_MODE (SUBREG_REG (op)),
5772 SUBREG_BYTE (op),
5773 GET_MODE (op));
5774 op = SUBREG_REG (op);
5777 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5778 continue;
5780 op_alt = recog_op_alt[i];
5782 /* Operand has no constraints, anything is OK. */
5783 win = !recog_data.n_alternatives;
5785 for (j = 0; j < recog_data.n_alternatives; j++)
5787 if (op_alt[j].anything_ok
5788 || (op_alt[j].matches != -1
5789 && operands_match_p
5790 (recog_data.operand[i],
5791 recog_data.operand[op_alt[j].matches]))
5792 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5794 win = true;
5795 break;
5799 if (!win)
5800 return false;
5804 return true;
5807 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
5809 static unsigned HOST_WIDE_INT
5810 ix86_asan_shadow_offset (void)
5812 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
5813 : HOST_WIDE_INT_C (0x7fff8000))
5814 : (HOST_WIDE_INT_1 << 29);
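/* A short worked example (addresses are only illustrative):
   AddressSanitizer checks an application address ADDR through the shadow
   byte at (ADDR >> 3) + ix86_asan_shadow_offset ().  With the x86-64 Linux
   offset 0x7fff8000 returned above, ADDR == 0x602000 maps to
   (0x602000 >> 3) + 0x7fff8000 == 0xc0400 + 0x7fff8000 == 0x800b8400.  */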
5817 /* Argument support functions. */
5819 /* Return true when register may be used to pass function parameters. */
5820 bool
5821 ix86_function_arg_regno_p (int regno)
5823 int i;
5824 const int *parm_regs;
5826 if (!TARGET_64BIT)
5828 if (TARGET_MACHO)
5829 return (regno < REGPARM_MAX
5830 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5831 else
5832 return (regno < REGPARM_MAX
5833 || (TARGET_MMX && MMX_REGNO_P (regno)
5834 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5835 || (TARGET_SSE && SSE_REGNO_P (regno)
5836 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5839 if (TARGET_SSE && SSE_REGNO_P (regno)
5840 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5841 return true;
5843 /* TODO: The function should depend on current function ABI but
5844 builtins.c would need updating then. Therefore we use the
5845 default ABI. */
5847 /* RAX is used as hidden argument to va_arg functions. */
5848 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5849 return true;
5851 if (ix86_abi == MS_ABI)
5852 parm_regs = x86_64_ms_abi_int_parameter_registers;
5853 else
5854 parm_regs = x86_64_int_parameter_registers;
5855 for (i = 0; i < (ix86_abi == MS_ABI
5856 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5857 if (regno == parm_regs[i])
5858 return true;
5859 return false;
5862 /* Return if we do not know how to pass TYPE solely in registers. */
5864 static bool
5865 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5867 if (must_pass_in_stack_var_size_or_pad (mode, type))
5868 return true;
5870 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5871 The layout_type routine is crafty and tries to trick us into passing
5872 currently unsupported vector types on the stack by using TImode. */
5873 return (!TARGET_64BIT && mode == TImode
5874 && type && TREE_CODE (type) != VECTOR_TYPE);
5877 /* Return the size, in bytes, of the area reserved for arguments passed
5878 in registers for the function represented by FNDECL, depending on the
5879 ABI format used. */
5881 ix86_reg_parm_stack_space (const_tree fndecl)
5883 enum calling_abi call_abi = SYSV_ABI;
5884 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5885 call_abi = ix86_function_abi (fndecl);
5886 else
5887 call_abi = ix86_function_type_abi (fndecl);
5888 if (TARGET_64BIT && call_abi == MS_ABI)
5889 return 32;
5890 return 0;
5893 /* Returns value SYSV_ABI, MS_ABI dependent on fntype, specifying the
5894 call abi used. */
5895 enum calling_abi
5896 ix86_function_type_abi (const_tree fntype)
5898 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5900 enum calling_abi abi = ix86_abi;
5901 if (abi == SYSV_ABI)
5903 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5904 abi = MS_ABI;
5906 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5907 abi = SYSV_ABI;
5908 return abi;
5910 return ix86_abi;
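/* Illustrative example (hypothetical prototype): on a SYSV_ABI default
   target,

     int __attribute__((ms_abi)) win_entry (int a, int b);

   yields MS_ABI from this function, so A and B are passed in %rcx and %rdx
   rather than the SysV %rdi and %rsi.  */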
5913 /* We add this as a workaround in order to use libc_has_function
5914 hook in i386.md. */
5915 bool
5916 ix86_libc_has_function (enum function_class fn_class)
5918 return targetm.libc_has_function (fn_class);
5921 static bool
5922 ix86_function_ms_hook_prologue (const_tree fn)
5924 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5926 if (decl_function_context (fn) != NULL_TREE)
5927 error_at (DECL_SOURCE_LOCATION (fn),
5928 "ms_hook_prologue is not compatible with nested function");
5929 else
5930 return true;
5932 return false;
5935 static enum calling_abi
5936 ix86_function_abi (const_tree fndecl)
5938 if (! fndecl)
5939 return ix86_abi;
5940 return ix86_function_type_abi (TREE_TYPE (fndecl));
5943 /* Returns value SYSV_ABI, MS_ABI dependent on cfun, specifying the
5944 call abi used. */
5945 enum calling_abi
5946 ix86_cfun_abi (void)
5948 if (! cfun)
5949 return ix86_abi;
5950 return cfun->machine->call_abi;
5953 /* Write the extra assembler code needed to declare a function properly. */
5955 void
5956 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5957 tree decl)
5959 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5961 if (is_ms_hook)
5963 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5964 unsigned int filler_cc = 0xcccccccc;
5966 for (i = 0; i < filler_count; i += 4)
5967 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5970 #ifdef SUBTARGET_ASM_UNWIND_INIT
5971 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5972 #endif
5974 ASM_OUTPUT_LABEL (asm_out_file, fname);
5976 /* Output magic byte marker, if hot-patch attribute is set. */
5977 if (is_ms_hook)
5979 if (TARGET_64BIT)
5981 /* leaq [%rsp + 0], %rsp */
5982 asm_fprintf (asm_out_file, ASM_BYTE
5983 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5985 else
5987 /* movl.s %edi, %edi
5988 push %ebp
5989 movl.s %esp, %ebp */
5990 asm_fprintf (asm_out_file, ASM_BYTE
5991 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5996 /* regclass.c */
5997 extern void init_regs (void);
5999 /* Implementation of call abi switching target hook. Specific to FNDECL
6000 the specific call register sets are set. See also
6001 ix86_conditional_register_usage for more details. */
6002 void
6003 ix86_call_abi_override (const_tree fndecl)
6005 if (fndecl == NULL_TREE)
6006 cfun->machine->call_abi = ix86_abi;
6007 else
6008 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
6011 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers. Avoid
6012 expensive re-initialization of init_regs each time we switch function context
6013 since this is needed only during RTL expansion. */
6014 static void
6015 ix86_maybe_switch_abi (void)
6017 if (TARGET_64BIT &&
6018 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
6019 reinit_regs ();
6022 /* Initialize a variable CUM of type CUMULATIVE_ARGS
6023 for a call to a function whose data type is FNTYPE.
6024 For a library call, FNTYPE is 0. */
6026 void
6027 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
6028 tree fntype, /* tree ptr for function decl */
6029 rtx libname, /* SYMBOL_REF of library name or 0 */
6030 tree fndecl,
6031 int caller)
6033 struct cgraph_local_info *i;
6035 memset (cum, 0, sizeof (*cum));
6037 if (fndecl)
6039 i = cgraph_local_info (fndecl);
6040 cum->call_abi = ix86_function_abi (fndecl);
6042 else
6044 i = NULL;
6045 cum->call_abi = ix86_function_type_abi (fntype);
6048 cum->caller = caller;
6050 /* Set up the number of registers to use for passing arguments. */
6052 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
6053 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
6054 "or subtarget optimization implying it");
6055 cum->nregs = ix86_regparm;
6056 if (TARGET_64BIT)
6058 cum->nregs = (cum->call_abi == SYSV_ABI
6059 ? X86_64_REGPARM_MAX
6060 : X86_64_MS_REGPARM_MAX);
6062 if (TARGET_SSE)
6064 cum->sse_nregs = SSE_REGPARM_MAX;
6065 if (TARGET_64BIT)
6067 cum->sse_nregs = (cum->call_abi == SYSV_ABI
6068 ? X86_64_SSE_REGPARM_MAX
6069 : X86_64_MS_SSE_REGPARM_MAX);
6072 if (TARGET_MMX)
6073 cum->mmx_nregs = MMX_REGPARM_MAX;
6074 cum->warn_avx = true;
6075 cum->warn_sse = true;
6076 cum->warn_mmx = true;
6078 /* Because the type might mismatch between caller and callee, we need to
6079 use the actual type of the function for local calls.
6080 FIXME: cgraph_analyze can be told to actually record if function uses
6081 va_start so for local functions maybe_vaarg can be made more aggressive,
6082 helping K&R code.
6083 FIXME: once the type system is fixed, we won't need this code anymore. */
6084 if (i && i->local && i->can_change_signature)
6085 fntype = TREE_TYPE (fndecl);
6086 cum->maybe_vaarg = (fntype
6087 ? (!prototype_p (fntype) || stdarg_p (fntype))
6088 : !libname);
6090 if (!TARGET_64BIT)
6092 /* If there are variable arguments, then we won't pass anything
6093 in registers in 32-bit mode. */
6094 if (stdarg_p (fntype))
6096 cum->nregs = 0;
6097 cum->sse_nregs = 0;
6098 cum->mmx_nregs = 0;
6099 cum->warn_avx = 0;
6100 cum->warn_sse = 0;
6101 cum->warn_mmx = 0;
6102 return;
6105 /* Use ecx and edx registers if function has fastcall attribute,
6106 else look for regparm information. */
6107 if (fntype)
6109 unsigned int ccvt = ix86_get_callcvt (fntype);
6110 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
6112 cum->nregs = 1;
6113 cum->fastcall = 1; /* Same first register as in fastcall. */
6115 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
6117 cum->nregs = 2;
6118 cum->fastcall = 1;
6120 else
6121 cum->nregs = ix86_function_regparm (fntype, fndecl);
6124 /* Set up the number of SSE registers used for passing SFmode
6125 and DFmode arguments. Warn for mismatching ABI. */
6126 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
6130 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
6131 But in the case of vector types, it is some vector mode.
6133 When we have only some of our vector isa extensions enabled, then there
6134 are some modes for which vector_mode_supported_p is false. For these
6135 modes, the generic vector support in gcc will choose some non-vector mode
6136 in order to implement the type. By computing the natural mode, we'll
6137 select the proper ABI location for the operand and not depend on whatever
6138 the middle-end decides to do with these vector types.
6140 The middle-end can't deal with vector types > 16 bytes. In this
6141 case, we return the original mode and warn ABI change if CUM isn't
6142 NULL. */
6144 static enum machine_mode
6145 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
6147 enum machine_mode mode = TYPE_MODE (type);
6149 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
6151 HOST_WIDE_INT size = int_size_in_bytes (type);
6152 if ((size == 8 || size == 16 || size == 32)
6153 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
6154 && TYPE_VECTOR_SUBPARTS (type) > 1)
6156 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
6158 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
6159 mode = MIN_MODE_VECTOR_FLOAT;
6160 else
6161 mode = MIN_MODE_VECTOR_INT;
6163 /* Get the mode which has this inner mode and number of units. */
6164 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
6165 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
6166 && GET_MODE_INNER (mode) == innermode)
6168 if (size == 32 && !TARGET_AVX)
6170 static bool warnedavx;
6172 if (cum
6173 && !warnedavx
6174 && cum->warn_avx)
6176 warnedavx = true;
6177 warning (0, "AVX vector argument without AVX "
6178 "enabled changes the ABI");
6180 return TYPE_MODE (type);
6182 else if (((size == 8 && TARGET_64BIT) || size == 16)
6183 && !TARGET_SSE)
6185 static bool warnedsse;
6187 if (cum
6188 && !warnedsse
6189 && cum->warn_sse)
6191 warnedsse = true;
6192 warning (0, "SSE vector argument without SSE "
6193 "enabled changes the ABI");
6196 else if ((size == 8 && !TARGET_64BIT) && !TARGET_MMX)
6198 static bool warnedmmx;
6200 if (cum
6201 && !warnedmmx
6202 && cum->warn_mmx)
6204 warnedmmx = true;
6205 warning (0, "MMX vector argument without MMX "
6206 "enabled changes the ABI");
6209 return mode;
6212 gcc_unreachable ();
6216 return mode;
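/* For illustration (hypothetical declarations): when AVX is not enabled,

     typedef float v8sf __attribute__((vector_size (32)));
     void consume (v8sf v);

   hits the 32-byte case above, emits the "AVX vector argument without AVX
   enabled changes the ABI" warning once, and falls back to TYPE_MODE
   instead of a 256-bit vector mode.  */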
6219 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
6220 this may not agree with the mode that the type system has chosen for the
6221 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
6222 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
6224 static rtx
6225 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
6226 unsigned int regno)
6228 rtx tmp;
6230 if (orig_mode != BLKmode)
6231 tmp = gen_rtx_REG (orig_mode, regno);
6232 else
6234 tmp = gen_rtx_REG (mode, regno);
6235 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
6236 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
6239 return tmp;
6242 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
6243 of this code is to classify each 8bytes of incoming argument by the register
6244 class and assign registers accordingly. */
6246 /* Return the union class of CLASS1 and CLASS2.
6247 See the x86-64 PS ABI for details. */
6249 static enum x86_64_reg_class
6250 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
6252 /* Rule #1: If both classes are equal, this is the resulting class. */
6253 if (class1 == class2)
6254 return class1;
6256 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
6257 the other class. */
6258 if (class1 == X86_64_NO_CLASS)
6259 return class2;
6260 if (class2 == X86_64_NO_CLASS)
6261 return class1;
6263 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
6264 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
6265 return X86_64_MEMORY_CLASS;
6267 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
6268 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
6269 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
6270 return X86_64_INTEGERSI_CLASS;
6271 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
6272 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
6273 return X86_64_INTEGER_CLASS;
6275 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
6276 MEMORY is used. */
6277 if (class1 == X86_64_X87_CLASS
6278 || class1 == X86_64_X87UP_CLASS
6279 || class1 == X86_64_COMPLEX_X87_CLASS
6280 || class2 == X86_64_X87_CLASS
6281 || class2 == X86_64_X87UP_CLASS
6282 || class2 == X86_64_COMPLEX_X87_CLASS)
6283 return X86_64_MEMORY_CLASS;
6285 /* Rule #6: Otherwise class SSE is used. */
6286 return X86_64_SSE_CLASS;
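/* A few worked instances of the rules above: X86_64_NO_CLASS merged with
   X86_64_SSE_CLASS keeps X86_64_SSE_CLASS (rule #2), X86_64_INTEGERSI_CLASS
   merged with X86_64_SSESF_CLASS gives X86_64_INTEGERSI_CLASS (rule #4),
   and X86_64_SSE_CLASS merged with X86_64_X87_CLASS falls back to
   X86_64_MEMORY_CLASS (rule #5).  */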
6289 /* Classify the argument of type TYPE and mode MODE.
6290 CLASSES will be filled by the register class used to pass each word
6291 of the operand. The number of words is returned. In case the parameter
6292 should be passed in memory, 0 is returned. As a special case for zero
6293 sized containers, classes[0] will be NO_CLASS and 1 is returned.
6295 BIT_OFFSET is used internally for handling records and specifies the
6296 offset in bits modulo 256 to avoid overflow cases.
6298 See the x86-64 PS ABI for details.
6301 static int
6302 classify_argument (enum machine_mode mode, const_tree type,
6303 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
6305 HOST_WIDE_INT bytes =
6306 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6307 int words
6308 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6310 /* Variable sized entities are always passed/returned in memory. */
6311 if (bytes < 0)
6312 return 0;
6314 if (mode != VOIDmode
6315 && targetm.calls.must_pass_in_stack (mode, type))
6316 return 0;
6318 if (type && AGGREGATE_TYPE_P (type))
6320 int i;
6321 tree field;
6322 enum x86_64_reg_class subclasses[MAX_CLASSES];
6324 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
6325 if (bytes > 32)
6326 return 0;
6328 for (i = 0; i < words; i++)
6329 classes[i] = X86_64_NO_CLASS;
6331 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
6332 signal the memory class, so handle it as a special case. */
6333 if (!words)
6335 classes[0] = X86_64_NO_CLASS;
6336 return 1;
6339 /* Classify each field of record and merge classes. */
6340 switch (TREE_CODE (type))
6342 case RECORD_TYPE:
6343 /* And now merge the fields of structure. */
6344 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6346 if (TREE_CODE (field) == FIELD_DECL)
6348 int num;
6350 if (TREE_TYPE (field) == error_mark_node)
6351 continue;
6353 /* Bitfields are always classified as integer. Handle them
6354 early, since later code would consider them to be
6355 misaligned integers. */
6356 if (DECL_BIT_FIELD (field))
6358 for (i = (int_bit_position (field)
6359 + (bit_offset % 64)) / 8 / 8;
6360 i < ((int_bit_position (field) + (bit_offset % 64))
6361 + tree_to_shwi (DECL_SIZE (field))
6362 + 63) / 8 / 8; i++)
6363 classes[i] =
6364 merge_classes (X86_64_INTEGER_CLASS,
6365 classes[i]);
6367 else
6369 int pos;
6371 type = TREE_TYPE (field);
6373 /* Flexible array member is ignored. */
6374 if (TYPE_MODE (type) == BLKmode
6375 && TREE_CODE (type) == ARRAY_TYPE
6376 && TYPE_SIZE (type) == NULL_TREE
6377 && TYPE_DOMAIN (type) != NULL_TREE
6378 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6379 == NULL_TREE))
6381 static bool warned;
6383 if (!warned && warn_psabi)
6385 warned = true;
6386 inform (input_location,
6387 "the ABI of passing struct with"
6388 " a flexible array member has"
6389 " changed in GCC 4.4");
6391 continue;
6393 num = classify_argument (TYPE_MODE (type), type,
6394 subclasses,
6395 (int_bit_position (field)
6396 + bit_offset) % 256);
6397 if (!num)
6398 return 0;
6399 pos = (int_bit_position (field)
6400 + (bit_offset % 64)) / 8 / 8;
6401 for (i = 0; i < num && (i + pos) < words; i++)
6402 classes[i + pos] =
6403 merge_classes (subclasses[i], classes[i + pos]);
6407 break;
6409 case ARRAY_TYPE:
6410 /* Arrays are handled as small records. */
6412 int num;
6413 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6414 TREE_TYPE (type), subclasses, bit_offset);
6415 if (!num)
6416 return 0;
6418 /* The partial classes are now full classes. */
6419 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6420 subclasses[0] = X86_64_SSE_CLASS;
6421 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6422 && !((bit_offset % 64) == 0 && bytes == 4))
6423 subclasses[0] = X86_64_INTEGER_CLASS;
6425 for (i = 0; i < words; i++)
6426 classes[i] = subclasses[i % num];
6428 break;
6430 case UNION_TYPE:
6431 case QUAL_UNION_TYPE:
6432 /* Unions are similar to RECORD_TYPE but offset is always 0.
6434 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6436 if (TREE_CODE (field) == FIELD_DECL)
6438 int num;
6440 if (TREE_TYPE (field) == error_mark_node)
6441 continue;
6443 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6444 TREE_TYPE (field), subclasses,
6445 bit_offset);
6446 if (!num)
6447 return 0;
6448 for (i = 0; i < num; i++)
6449 classes[i] = merge_classes (subclasses[i], classes[i]);
6452 break;
6454 default:
6455 gcc_unreachable ();
6458 if (words > 2)
6460 /* When size > 16 bytes, if the first class isn't
6461 X86_64_SSE_CLASS or any of the remaining classes isn't
6462 X86_64_SSEUP_CLASS, everything should be passed in
6463 memory. */
6464 if (classes[0] != X86_64_SSE_CLASS)
6465 return 0;
6467 for (i = 1; i < words; i++)
6468 if (classes[i] != X86_64_SSEUP_CLASS)
6469 return 0;
6472 /* Final merger cleanup. */
6473 for (i = 0; i < words; i++)
6475 /* If one class is MEMORY, everything should be passed in
6476 memory. */
6477 if (classes[i] == X86_64_MEMORY_CLASS)
6478 return 0;
6480 The X86_64_SSEUP_CLASS should always be preceded by
6481 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6482 if (classes[i] == X86_64_SSEUP_CLASS
6483 && classes[i - 1] != X86_64_SSE_CLASS
6484 && classes[i - 1] != X86_64_SSEUP_CLASS)
6486 /* The first one should never be X86_64_SSEUP_CLASS. */
6487 gcc_assert (i != 0);
6488 classes[i] = X86_64_SSE_CLASS;
6491 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6492 everything should be passed in memory. */
6493 if (classes[i] == X86_64_X87UP_CLASS
6494 && (classes[i - 1] != X86_64_X87_CLASS))
6496 static bool warned;
6498 /* The first one should never be X86_64_X87UP_CLASS. */
6499 gcc_assert (i != 0);
6500 if (!warned && warn_psabi)
6502 warned = true;
6503 inform (input_location,
6504 "the ABI of passing union with long double"
6505 " has changed in GCC 4.4");
6507 return 0;
6510 return words;
6513 /* Compute alignment needed. We align all types to natural boundaries with
6514 the exception of XFmode, which is aligned to 64 bits. */
6515 if (mode != VOIDmode && mode != BLKmode)
6517 int mode_alignment = GET_MODE_BITSIZE (mode);
6519 if (mode == XFmode)
6520 mode_alignment = 128;
6521 else if (mode == XCmode)
6522 mode_alignment = 256;
6523 if (COMPLEX_MODE_P (mode))
6524 mode_alignment /= 2;
6525 /* Misaligned fields are always returned in memory. */
6526 if (bit_offset % mode_alignment)
6527 return 0;
6530 /* for V1xx modes, just use the base mode */
6531 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6532 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6533 mode = GET_MODE_INNER (mode);
6535 /* Classification of atomic types. */
6536 switch (mode)
6538 case SDmode:
6539 case DDmode:
6540 classes[0] = X86_64_SSE_CLASS;
6541 return 1;
6542 case TDmode:
6543 classes[0] = X86_64_SSE_CLASS;
6544 classes[1] = X86_64_SSEUP_CLASS;
6545 return 2;
6546 case DImode:
6547 case SImode:
6548 case HImode:
6549 case QImode:
6550 case CSImode:
6551 case CHImode:
6552 case CQImode:
6554 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6556 if (size <= 32)
6558 classes[0] = X86_64_INTEGERSI_CLASS;
6559 return 1;
6561 else if (size <= 64)
6563 classes[0] = X86_64_INTEGER_CLASS;
6564 return 1;
6566 else if (size <= 64+32)
6568 classes[0] = X86_64_INTEGER_CLASS;
6569 classes[1] = X86_64_INTEGERSI_CLASS;
6570 return 2;
6572 else if (size <= 64+64)
6574 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6575 return 2;
6577 else
6578 gcc_unreachable ();
6580 case CDImode:
6581 case TImode:
6582 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6583 return 2;
6584 case COImode:
6585 case OImode:
6586 /* OImode shouldn't be used directly. */
6587 gcc_unreachable ();
6588 case CTImode:
6589 return 0;
6590 case SFmode:
6591 if (!(bit_offset % 64))
6592 classes[0] = X86_64_SSESF_CLASS;
6593 else
6594 classes[0] = X86_64_SSE_CLASS;
6595 return 1;
6596 case DFmode:
6597 classes[0] = X86_64_SSEDF_CLASS;
6598 return 1;
6599 case XFmode:
6600 classes[0] = X86_64_X87_CLASS;
6601 classes[1] = X86_64_X87UP_CLASS;
6602 return 2;
6603 case TFmode:
6604 classes[0] = X86_64_SSE_CLASS;
6605 classes[1] = X86_64_SSEUP_CLASS;
6606 return 2;
6607 case SCmode:
6608 classes[0] = X86_64_SSE_CLASS;
6609 if (!(bit_offset % 64))
6610 return 1;
6611 else
6613 static bool warned;
6615 if (!warned && warn_psabi)
6617 warned = true;
6618 inform (input_location,
6619 "the ABI of passing structure with complex float"
6620 " member has changed in GCC 4.4");
6622 classes[1] = X86_64_SSESF_CLASS;
6623 return 2;
6625 case DCmode:
6626 classes[0] = X86_64_SSEDF_CLASS;
6627 classes[1] = X86_64_SSEDF_CLASS;
6628 return 2;
6629 case XCmode:
6630 classes[0] = X86_64_COMPLEX_X87_CLASS;
6631 return 1;
6632 case TCmode:
6633 /* This mode is larger than 16 bytes. */
6634 return 0;
6635 case V8SFmode:
6636 case V8SImode:
6637 case V32QImode:
6638 case V16HImode:
6639 case V4DFmode:
6640 case V4DImode:
6641 classes[0] = X86_64_SSE_CLASS;
6642 classes[1] = X86_64_SSEUP_CLASS;
6643 classes[2] = X86_64_SSEUP_CLASS;
6644 classes[3] = X86_64_SSEUP_CLASS;
6645 return 4;
6646 case V4SFmode:
6647 case V4SImode:
6648 case V16QImode:
6649 case V8HImode:
6650 case V2DFmode:
6651 case V2DImode:
6652 classes[0] = X86_64_SSE_CLASS;
6653 classes[1] = X86_64_SSEUP_CLASS;
6654 return 2;
6655 case V1TImode:
6656 case V1DImode:
6657 case V2SFmode:
6658 case V2SImode:
6659 case V4HImode:
6660 case V8QImode:
6661 classes[0] = X86_64_SSE_CLASS;
6662 return 1;
6663 case BLKmode:
6664 case VOIDmode:
6665 return 0;
6666 default:
6667 gcc_assert (VECTOR_MODE_P (mode));
6669 if (bytes > 16)
6670 return 0;
6672 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6674 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6675 classes[0] = X86_64_INTEGERSI_CLASS;
6676 else
6677 classes[0] = X86_64_INTEGER_CLASS;
6678 classes[1] = X86_64_INTEGER_CLASS;
6679 return 1 + (bytes > 8);
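/* Rough illustration (following the x86-64 psABI that the code above
   implements) of how some common C types map onto eightbyte classes:

     struct { double d; long l; }  ->  { SSE, INTEGER }        one XMM + one GPR
     struct { long a, b; }         ->  { INTEGER, INTEGER }    two GPRs
     __int128 / TImode             ->  { INTEGER, INTEGER }    two GPRs
     __m256 (V8SFmode)             ->  { SSE, SSEUP, SSEUP, SSEUP }  one YMM
     long double (XFmode)          ->  { X87, X87UP }          memory as argument
     24-byte struct of three longs ->  classification fails    memory  */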
6683 /* Examine the argument and set the number of registers required in each
6684 class. Return 0 iff the parameter should be passed in memory. */
6685 static int
6686 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6687 int *int_nregs, int *sse_nregs)
6689 enum x86_64_reg_class regclass[MAX_CLASSES];
6690 int n = classify_argument (mode, type, regclass, 0);
6692 *int_nregs = 0;
6693 *sse_nregs = 0;
6694 if (!n)
6695 return 0;
6696 for (n--; n >= 0; n--)
6697 switch (regclass[n])
6699 case X86_64_INTEGER_CLASS:
6700 case X86_64_INTEGERSI_CLASS:
6701 (*int_nregs)++;
6702 break;
6703 case X86_64_SSE_CLASS:
6704 case X86_64_SSESF_CLASS:
6705 case X86_64_SSEDF_CLASS:
6706 (*sse_nregs)++;
6707 break;
6708 case X86_64_NO_CLASS:
6709 case X86_64_SSEUP_CLASS:
6710 break;
6711 case X86_64_X87_CLASS:
6712 case X86_64_X87UP_CLASS:
6713 if (!in_return)
6714 return 0;
6715 break;
6716 case X86_64_COMPLEX_X87_CLASS:
6717 return in_return ? 2 : 0;
6718 case X86_64_MEMORY_CLASS:
6719 gcc_unreachable ();
6721 return 1;
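/* For illustration: with struct { double d; long l; } the classes are
   { SSE, INTEGER }, so examine_argument reports *sse_nregs = 1 and
   *int_nregs = 1 and returns nonzero.  With long double the X87 class
   makes it return 0 for arguments (passed in memory) but nonzero for
   return values, which come back in %st(0).  */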
6724 /* Construct container for the argument used by GCC interface. See
6725 FUNCTION_ARG for the detailed description. */
6727 static rtx
6728 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6729 const_tree type, int in_return, int nintregs, int nsseregs,
6730 const int *intreg, int sse_regno)
6732 /* The following variables hold the static issued_error state. */
6733 static bool issued_sse_arg_error;
6734 static bool issued_sse_ret_error;
6735 static bool issued_x87_ret_error;
6737 enum machine_mode tmpmode;
6738 int bytes =
6739 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6740 enum x86_64_reg_class regclass[MAX_CLASSES];
6741 int n;
6742 int i;
6743 int nexps = 0;
6744 int needed_sseregs, needed_intregs;
6745 rtx exp[MAX_CLASSES];
6746 rtx ret;
6748 n = classify_argument (mode, type, regclass, 0);
6749 if (!n)
6750 return NULL;
6751 if (!examine_argument (mode, type, in_return, &needed_intregs,
6752 &needed_sseregs))
6753 return NULL;
6754 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6755 return NULL;
6757 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6758 some less clueful developer tries to use floating-point anyway. */
6759 if (needed_sseregs && !TARGET_SSE)
6761 if (in_return)
6763 if (!issued_sse_ret_error)
6765 error ("SSE register return with SSE disabled");
6766 issued_sse_ret_error = true;
6769 else if (!issued_sse_arg_error)
6771 error ("SSE register argument with SSE disabled");
6772 issued_sse_arg_error = true;
6774 return NULL;
6777 /* Likewise, error if the ABI requires us to return values in the
6778 x87 registers and the user specified -mno-80387. */
6779 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
6780 for (i = 0; i < n; i++)
6781 if (regclass[i] == X86_64_X87_CLASS
6782 || regclass[i] == X86_64_X87UP_CLASS
6783 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6785 if (!issued_x87_ret_error)
6787 error ("x87 register return with x87 disabled");
6788 issued_x87_ret_error = true;
6790 return NULL;
6793 /* First construct simple cases. Avoid SCmode, since we want to use
6794 a single register to pass this type. */
6795 if (n == 1 && mode != SCmode)
6796 switch (regclass[0])
6798 case X86_64_INTEGER_CLASS:
6799 case X86_64_INTEGERSI_CLASS:
6800 return gen_rtx_REG (mode, intreg[0]);
6801 case X86_64_SSE_CLASS:
6802 case X86_64_SSESF_CLASS:
6803 case X86_64_SSEDF_CLASS:
6804 if (mode != BLKmode)
6805 return gen_reg_or_parallel (mode, orig_mode,
6806 SSE_REGNO (sse_regno));
6807 break;
6808 case X86_64_X87_CLASS:
6809 case X86_64_COMPLEX_X87_CLASS:
6810 return gen_rtx_REG (mode, FIRST_STACK_REG);
6811 case X86_64_NO_CLASS:
6812 /* Zero sized array, struct or class. */
6813 return NULL;
6814 default:
6815 gcc_unreachable ();
6817 if (n == 2
6818 && regclass[0] == X86_64_SSE_CLASS
6819 && regclass[1] == X86_64_SSEUP_CLASS
6820 && mode != BLKmode)
6821 return gen_reg_or_parallel (mode, orig_mode,
6822 SSE_REGNO (sse_regno));
6823 if (n == 4
6824 && regclass[0] == X86_64_SSE_CLASS
6825 && regclass[1] == X86_64_SSEUP_CLASS
6826 && regclass[2] == X86_64_SSEUP_CLASS
6827 && regclass[3] == X86_64_SSEUP_CLASS
6828 && mode != BLKmode)
6829 return gen_reg_or_parallel (mode, orig_mode,
6830 SSE_REGNO (sse_regno));
6831 if (n == 2
6832 && regclass[0] == X86_64_X87_CLASS
6833 && regclass[1] == X86_64_X87UP_CLASS)
6834 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6836 if (n == 2
6837 && regclass[0] == X86_64_INTEGER_CLASS
6838 && regclass[1] == X86_64_INTEGER_CLASS
6839 && (mode == CDImode || mode == TImode || mode == TFmode)
6840 && intreg[0] + 1 == intreg[1])
6841 return gen_rtx_REG (mode, intreg[0]);
6843 /* Otherwise figure out the entries of the PARALLEL. */
6844 for (i = 0; i < n; i++)
6846 int pos;
6848 switch (regclass[i])
6850 case X86_64_NO_CLASS:
6851 break;
6852 case X86_64_INTEGER_CLASS:
6853 case X86_64_INTEGERSI_CLASS:
6854 /* Merge TImodes on aligned occasions here too. */
6855 if (i * 8 + 8 > bytes)
6856 tmpmode
6857 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6858 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6859 tmpmode = SImode;
6860 else
6861 tmpmode = DImode;
6862 /* We've requested 24 bytes for which we
6863 don't have a mode. Use DImode. */
6864 if (tmpmode == BLKmode)
6865 tmpmode = DImode;
6866 exp [nexps++]
6867 = gen_rtx_EXPR_LIST (VOIDmode,
6868 gen_rtx_REG (tmpmode, *intreg),
6869 GEN_INT (i*8));
6870 intreg++;
6871 break;
6872 case X86_64_SSESF_CLASS:
6873 exp [nexps++]
6874 = gen_rtx_EXPR_LIST (VOIDmode,
6875 gen_rtx_REG (SFmode,
6876 SSE_REGNO (sse_regno)),
6877 GEN_INT (i*8));
6878 sse_regno++;
6879 break;
6880 case X86_64_SSEDF_CLASS:
6881 exp [nexps++]
6882 = gen_rtx_EXPR_LIST (VOIDmode,
6883 gen_rtx_REG (DFmode,
6884 SSE_REGNO (sse_regno)),
6885 GEN_INT (i*8));
6886 sse_regno++;
6887 break;
6888 case X86_64_SSE_CLASS:
6889 pos = i;
6890 switch (n)
6892 case 1:
6893 tmpmode = DImode;
6894 break;
6895 case 2:
6896 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6898 tmpmode = TImode;
6899 i++;
6901 else
6902 tmpmode = DImode;
6903 break;
6904 case 4:
6905 gcc_assert (i == 0
6906 && regclass[1] == X86_64_SSEUP_CLASS
6907 && regclass[2] == X86_64_SSEUP_CLASS
6908 && regclass[3] == X86_64_SSEUP_CLASS);
6909 tmpmode = OImode;
6910 i += 3;
6911 break;
6912 default:
6913 gcc_unreachable ();
6915 exp [nexps++]
6916 = gen_rtx_EXPR_LIST (VOIDmode,
6917 gen_rtx_REG (tmpmode,
6918 SSE_REGNO (sse_regno)),
6919 GEN_INT (pos*8));
6920 sse_regno++;
6921 break;
6922 default:
6923 gcc_unreachable ();
6927 /* Empty aligned struct, union or class. */
6928 if (nexps == 0)
6929 return NULL;
6931 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6932 for (i = 0; i < nexps; i++)
6933 XVECEXP (ret, 0, i) = exp [i];
6934 return ret;
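/* A sketch (assuming the struct is the first named argument and no
   registers have been consumed yet) of the PARALLEL built above for
   struct { double d; long l; } under the 64-bit SysV ABI:

     (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                (expr_list (reg:DI di)   (const_int 8))])

   i.e. the first eightbyte travels in %xmm0 and the second in %rdi.  */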
6937 /* Update the data in CUM to advance over an argument of mode MODE
6938 and data type TYPE. (TYPE is null for libcalls where that information
6939 may not be available.) */
6941 static void
6942 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6943 const_tree type, HOST_WIDE_INT bytes,
6944 HOST_WIDE_INT words)
6946 switch (mode)
6948 default:
6949 break;
6951 case BLKmode:
6952 if (bytes < 0)
6953 break;
6954 /* FALLTHRU */
6956 case DImode:
6957 case SImode:
6958 case HImode:
6959 case QImode:
6960 cum->words += words;
6961 cum->nregs -= words;
6962 cum->regno += words;
6964 if (cum->nregs <= 0)
6966 cum->nregs = 0;
6967 cum->regno = 0;
6969 break;
6971 case OImode:
6972 /* OImode shouldn't be used directly. */
6973 gcc_unreachable ();
6975 case DFmode:
6976 if (cum->float_in_sse < 2)
6977 break;
6978 case SFmode:
6979 if (cum->float_in_sse < 1)
6980 break;
6981 /* FALLTHRU */
6983 case V8SFmode:
6984 case V8SImode:
6985 case V32QImode:
6986 case V16HImode:
6987 case V4DFmode:
6988 case V4DImode:
6989 case TImode:
6990 case V16QImode:
6991 case V8HImode:
6992 case V4SImode:
6993 case V2DImode:
6994 case V4SFmode:
6995 case V2DFmode:
6996 if (!type || !AGGREGATE_TYPE_P (type))
6998 cum->sse_words += words;
6999 cum->sse_nregs -= 1;
7000 cum->sse_regno += 1;
7001 if (cum->sse_nregs <= 0)
7003 cum->sse_nregs = 0;
7004 cum->sse_regno = 0;
7007 break;
7009 case V8QImode:
7010 case V4HImode:
7011 case V2SImode:
7012 case V2SFmode:
7013 case V1TImode:
7014 case V1DImode:
7015 if (!type || !AGGREGATE_TYPE_P (type))
7017 cum->mmx_words += words;
7018 cum->mmx_nregs -= 1;
7019 cum->mmx_regno += 1;
7020 if (cum->mmx_nregs <= 0)
7022 cum->mmx_nregs = 0;
7023 cum->mmx_regno = 0;
7026 break;
7030 static void
7031 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
7032 const_tree type, HOST_WIDE_INT words, bool named)
7034 int int_nregs, sse_nregs;
7036 /* Unnamed 256bit vector mode parameters are passed on stack. */
7037 if (!named && VALID_AVX256_REG_MODE (mode))
7038 return;
7040 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
7041 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
7043 cum->nregs -= int_nregs;
7044 cum->sse_nregs -= sse_nregs;
7045 cum->regno += int_nregs;
7046 cum->sse_regno += sse_nregs;
7048 else
7050 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
7051 cum->words = (cum->words + align - 1) & ~(align - 1);
7052 cum->words += words;
7056 static void
7057 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
7058 HOST_WIDE_INT words)
7060 /* Otherwise, this should be passed indirect. */
7061 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
7063 cum->words += words;
7064 if (cum->nregs > 0)
7066 cum->nregs -= 1;
7067 cum->regno += 1;
7071 /* Update the data in CUM to advance over an argument of mode MODE and
7072 data type TYPE. (TYPE is null for libcalls where that information
7073 may not be available.) */
7075 static void
7076 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
7077 const_tree type, bool named)
7079 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7080 HOST_WIDE_INT bytes, words;
7082 if (mode == BLKmode)
7083 bytes = int_size_in_bytes (type);
7084 else
7085 bytes = GET_MODE_SIZE (mode);
7086 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7088 if (type)
7089 mode = type_natural_mode (type, NULL);
7091 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7092 function_arg_advance_ms_64 (cum, bytes, words);
7093 else if (TARGET_64BIT)
7094 function_arg_advance_64 (cum, mode, type, words, named);
7095 else
7096 function_arg_advance_32 (cum, mode, type, bytes, words);
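/* As an example of how the cumulative state evolves for
   void f (int a, double b, char *c) on the 64-bit SysV ABI:
   after a,  nregs 6->5, regno 0->1 (%rdi used);
   after b,  sse_nregs 8->7, sse_regno 0->1 (%xmm0 used);
   after c,  nregs 5->4, regno 1->2 (%rsi used).  */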
7099 /* Define where to put the arguments to a function.
7100 Value is zero to push the argument on the stack,
7101 or a hard register in which to store the argument.
7103 MODE is the argument's machine mode.
7104 TYPE is the data type of the argument (as a tree).
7105 This is null for libcalls where that information may
7106 not be available.
7107 CUM is a variable of type CUMULATIVE_ARGS which gives info about
7108 the preceding args and about the function being called.
7109 NAMED is nonzero if this argument is a named parameter
7110 (otherwise it is an extra parameter matching an ellipsis). */
7112 static rtx
7113 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7114 enum machine_mode orig_mode, const_tree type,
7115 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
7117 static bool warnedsse, warnedmmx;
7119 /* Avoid the AL settings for the Unix64 ABI. */
7120 if (mode == VOIDmode)
7121 return constm1_rtx;
7123 switch (mode)
7125 default:
7126 break;
7128 case BLKmode:
7129 if (bytes < 0)
7130 break;
7131 /* FALLTHRU */
7132 case DImode:
7133 case SImode:
7134 case HImode:
7135 case QImode:
7136 if (words <= cum->nregs)
7138 int regno = cum->regno;
7140 /* Fastcall allocates the first two DWORD (SImode) or
7141 smaller arguments to ECX and EDX if the argument isn't an
7142 aggregate type. */
7143 if (cum->fastcall)
7145 if (mode == BLKmode
7146 || mode == DImode
7147 || (type && AGGREGATE_TYPE_P (type)))
7148 break;
7150 /* ECX, not EAX, is the first allocated register. */
7151 if (regno == AX_REG)
7152 regno = CX_REG;
7154 return gen_rtx_REG (mode, regno);
7156 break;
7158 case DFmode:
7159 if (cum->float_in_sse < 2)
7160 break;
7161 case SFmode:
7162 if (cum->float_in_sse < 1)
7163 break;
7164 /* FALLTHRU */
7165 case TImode:
7166 /* In 32bit, we pass TImode in xmm registers. */
7167 case V16QImode:
7168 case V8HImode:
7169 case V4SImode:
7170 case V2DImode:
7171 case V4SFmode:
7172 case V2DFmode:
7173 if (!type || !AGGREGATE_TYPE_P (type))
7175 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
7177 warnedsse = true;
7178 warning (0, "SSE vector argument without SSE enabled "
7179 "changes the ABI");
7181 if (cum->sse_nregs)
7182 return gen_reg_or_parallel (mode, orig_mode,
7183 cum->sse_regno + FIRST_SSE_REG);
7185 break;
7187 case OImode:
7188 /* OImode shouldn't be used directly. */
7189 gcc_unreachable ();
7191 case V8SFmode:
7192 case V8SImode:
7193 case V32QImode:
7194 case V16HImode:
7195 case V4DFmode:
7196 case V4DImode:
7197 if (!type || !AGGREGATE_TYPE_P (type))
7199 if (cum->sse_nregs)
7200 return gen_reg_or_parallel (mode, orig_mode,
7201 cum->sse_regno + FIRST_SSE_REG);
7203 break;
7205 case V8QImode:
7206 case V4HImode:
7207 case V2SImode:
7208 case V2SFmode:
7209 case V1TImode:
7210 case V1DImode:
7211 if (!type || !AGGREGATE_TYPE_P (type))
7213 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
7215 warnedmmx = true;
7216 warning (0, "MMX vector argument without MMX enabled "
7217 "changes the ABI");
7219 if (cum->mmx_nregs)
7220 return gen_reg_or_parallel (mode, orig_mode,
7221 cum->mmx_regno + FIRST_MMX_REG);
7223 break;
7226 return NULL_RTX;
7229 static rtx
7230 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7231 enum machine_mode orig_mode, const_tree type, bool named)
7233 /* Handle a hidden AL argument containing number of registers
7234 for varargs x86-64 functions. */
7235 if (mode == VOIDmode)
7236 return GEN_INT (cum->maybe_vaarg
7237 ? (cum->sse_nregs < 0
7238 ? X86_64_SSE_REGPARM_MAX
7239 : cum->sse_regno)
7240 : -1);
7242 switch (mode)
7244 default:
7245 break;
7247 case V8SFmode:
7248 case V8SImode:
7249 case V32QImode:
7250 case V16HImode:
7251 case V4DFmode:
7252 case V4DImode:
7253 /* Unnamed 256bit vector mode parameters are passed on stack. */
7254 if (!named)
7255 return NULL;
7256 break;
7259 return construct_container (mode, orig_mode, type, 0, cum->nregs,
7260 cum->sse_nregs,
7261 &x86_64_int_parameter_registers [cum->regno],
7262 cum->sse_regno);
7265 static rtx
7266 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7267 enum machine_mode orig_mode, bool named,
7268 HOST_WIDE_INT bytes)
7270 unsigned int regno;
7272 /* We need to add clobber for MS_ABI->SYSV ABI calls in expand_call.
7273 We use value of -2 to specify that current function call is MSABI. */
7274 if (mode == VOIDmode)
7275 return GEN_INT (-2);
7277 /* If we've run out of registers, it goes on the stack. */
7278 if (cum->nregs == 0)
7279 return NULL_RTX;
7281 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
7283 /* Only floating point modes are passed in anything but integer regs. */
7284 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
7286 if (named)
7287 regno = cum->regno + FIRST_SSE_REG;
7288 else
7290 rtx t1, t2;
7292 /* Unnamed floating parameters are passed in both the
7293 SSE and integer registers. */
7294 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
7295 t2 = gen_rtx_REG (mode, regno);
7296 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
7297 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
7298 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
7301 /* Handle aggregate types passed in registers. */
7302 if (orig_mode == BLKmode)
7304 if (bytes > 0 && bytes <= 8)
7305 mode = (bytes > 4 ? DImode : SImode);
7306 if (mode == BLKmode)
7307 mode = DImode;
7310 return gen_reg_or_parallel (mode, orig_mode, regno);
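/* Under the Microsoft x64 convention handled above, argument slots are
   strictly positional: slot N goes in RCX/RDX/R8/R9 for integers and
   pointers, or in XMM0..XMM3 for SFmode/DFmode values, and aggregates
   whose size is not 1, 2, 4 or 8 bytes are passed by reference (see
   ix86_pass_by_reference below).  */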
7313 /* Return where to put the arguments to a function.
7314 Return zero to push the argument on the stack, or a hard register in which to store the argument.
7316 MODE is the argument's machine mode. TYPE is the data type of the
7317 argument. It is null for libcalls where that information may not be
7318 available. CUM gives information about the preceding args and about
7319 the function being called. NAMED is nonzero if this argument is a
7320 named parameter (otherwise it is an extra parameter matching an
7321 ellipsis). */
7323 static rtx
7324 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
7325 const_tree type, bool named)
7327 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7328 enum machine_mode mode = omode;
7329 HOST_WIDE_INT bytes, words;
7330 rtx arg;
7332 if (mode == BLKmode)
7333 bytes = int_size_in_bytes (type);
7334 else
7335 bytes = GET_MODE_SIZE (mode);
7336 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7338 /* To simplify the code below, represent vector types with a vector mode
7339 even if MMX/SSE are not active. */
7340 if (type && TREE_CODE (type) == VECTOR_TYPE)
7341 mode = type_natural_mode (type, cum);
7343 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7344 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
7345 else if (TARGET_64BIT)
7346 arg = function_arg_64 (cum, mode, omode, type, named);
7347 else
7348 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
7350 return arg;
7353 /* A C expression that indicates when an argument must be passed by
7354 reference. If nonzero for an argument, a copy of that argument is
7355 made in memory and a pointer to the argument is passed instead of
7356 the argument itself. The pointer is passed in whatever way is
7357 appropriate for passing a pointer to that type. */
7359 static bool
7360 ix86_pass_by_reference (cumulative_args_t cum_v, enum machine_mode mode,
7361 const_tree type, bool named ATTRIBUTE_UNUSED)
7363 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7365 /* See Windows x64 Software Convention. */
7366 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7368 int msize = (int) GET_MODE_SIZE (mode);
7369 if (type)
7371 /* Arrays are passed by reference. */
7372 if (TREE_CODE (type) == ARRAY_TYPE)
7373 return true;
7375 if (AGGREGATE_TYPE_P (type))
7377 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7378 are passed by reference. */
7379 msize = int_size_in_bytes (type);
7383 /* __m128 is passed by reference. */
7384 switch (msize) {
7385 case 1: case 2: case 4: case 8:
7386 break;
7387 default:
7388 return true;
7391 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7392 return 1;
7394 return 0;
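/* For instance, under the MS ABI a 12-byte struct, an __m128 value and
   any array type all take the by-reference path above, while an 8-byte
   struct is passed by value in a general register; on the 64-bit SysV
   side only variable-sized types are passed by reference here.  */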
7397 /* Return true when TYPE should be 128bit aligned for 32bit argument
7398 passing ABI. XXX: This function is obsolete and is only used for
7399 checking psABI compatibility with previous versions of GCC. */
7401 static bool
7402 ix86_compat_aligned_value_p (const_tree type)
7404 enum machine_mode mode = TYPE_MODE (type);
7405 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7406 || mode == TDmode
7407 || mode == TFmode
7408 || mode == TCmode)
7409 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7410 return true;
7411 if (TYPE_ALIGN (type) < 128)
7412 return false;
7414 if (AGGREGATE_TYPE_P (type))
7416 /* Walk the aggregates recursively. */
7417 switch (TREE_CODE (type))
7419 case RECORD_TYPE:
7420 case UNION_TYPE:
7421 case QUAL_UNION_TYPE:
7423 tree field;
7425 /* Walk all the structure fields. */
7426 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7428 if (TREE_CODE (field) == FIELD_DECL
7429 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7430 return true;
7432 break;
7435 case ARRAY_TYPE:
7436 /* Just in case some language passes arrays by value. */
7437 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7438 return true;
7439 break;
7441 default:
7442 gcc_unreachable ();
7445 return false;
7448 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7449 XXX: This function is obsolete and is only used for checking psABI
7450 compatibility with previous versions of GCC. */
7452 static unsigned int
7453 ix86_compat_function_arg_boundary (enum machine_mode mode,
7454 const_tree type, unsigned int align)
7456 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7457 natural boundaries. */
7458 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7460 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7461 make an exception for SSE modes since these require 128bit
7462 alignment.
7464 The handling here differs from field_alignment. ICC aligns MMX
7465 arguments to 4 byte boundaries, while structure fields are aligned
7466 to 8 byte boundaries. */
7467 if (!type)
7469 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7470 align = PARM_BOUNDARY;
7472 else
7474 if (!ix86_compat_aligned_value_p (type))
7475 align = PARM_BOUNDARY;
7478 if (align > BIGGEST_ALIGNMENT)
7479 align = BIGGEST_ALIGNMENT;
7480 return align;
7483 /* Return true when TYPE should be 128bit aligned for 32bit argument
7484 passing ABI. */
7486 static bool
7487 ix86_contains_aligned_value_p (const_tree type)
7489 enum machine_mode mode = TYPE_MODE (type);
7491 if (mode == XFmode || mode == XCmode)
7492 return false;
7494 if (TYPE_ALIGN (type) < 128)
7495 return false;
7497 if (AGGREGATE_TYPE_P (type))
7499 /* Walk the aggregates recursively. */
7500 switch (TREE_CODE (type))
7502 case RECORD_TYPE:
7503 case UNION_TYPE:
7504 case QUAL_UNION_TYPE:
7506 tree field;
7508 /* Walk all the structure fields. */
7509 for (field = TYPE_FIELDS (type);
7510 field;
7511 field = DECL_CHAIN (field))
7513 if (TREE_CODE (field) == FIELD_DECL
7514 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7515 return true;
7517 break;
7520 case ARRAY_TYPE:
7521 /* Just in case some language passes arrays by value. */
7522 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7523 return true;
7524 break;
7526 default:
7527 gcc_unreachable ();
7530 else
7531 return TYPE_ALIGN (type) >= 128;
7533 return false;
7536 /* Gives the alignment boundary, in bits, of an argument with the
7537 specified mode and type. */
7539 static unsigned int
7540 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7542 unsigned int align;
7543 if (type)
7545 /* Since the main variant type is used for the call, convert TYPE
7546 to its main variant type. */
7547 type = TYPE_MAIN_VARIANT (type);
7548 align = TYPE_ALIGN (type);
7550 else
7551 align = GET_MODE_ALIGNMENT (mode);
7552 if (align < PARM_BOUNDARY)
7553 align = PARM_BOUNDARY;
7554 else
7556 static bool warned;
7557 unsigned int saved_align = align;
7559 if (!TARGET_64BIT)
7561 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7562 if (!type)
7564 if (mode == XFmode || mode == XCmode)
7565 align = PARM_BOUNDARY;
7567 else if (!ix86_contains_aligned_value_p (type))
7568 align = PARM_BOUNDARY;
7570 if (align < 128)
7571 align = PARM_BOUNDARY;
7574 if (warn_psabi
7575 && !warned
7576 && align != ix86_compat_function_arg_boundary (mode, type,
7577 saved_align))
7579 warned = true;
7580 inform (input_location,
7581 "The ABI for passing parameters with %d-byte"
7582 " alignment has changed in GCC 4.6",
7583 align / BITS_PER_UNIT);
7587 return align;
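/* As an example, on ia32 a plain double argument ends up with the 32-bit
   PARM_BOUNDARY, while _Decimal128, __float128 and aggregates that
   contain a 16-byte aligned member get a 128-bit boundary; the psABI
   note above fires when this result differs from the pre-GCC-4.6
   computation.  */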
7590 /* Return true if N is a possible register number of function value. */
7592 static bool
7593 ix86_function_value_regno_p (const unsigned int regno)
7595 switch (regno)
7597 case AX_REG:
7598 case DX_REG:
7599 return true;
7600 case DI_REG:
7601 case SI_REG:
7602 return TARGET_64BIT && ix86_abi != MS_ABI;
7604 /* Complex values are returned in %st(0)/%st(1) pair. */
7605 case ST0_REG:
7606 case ST1_REG:
7607 /* TODO: The function should depend on current function ABI but
7608 builtins.c would need updating then. Therefore we use the
7609 default ABI. */
7610 if (TARGET_64BIT && ix86_abi == MS_ABI)
7611 return false;
7612 return TARGET_FLOAT_RETURNS_IN_80387;
7614 /* Complex values are returned in %xmm0/%xmm1 pair. */
7615 case XMM0_REG:
7616 case XMM1_REG:
7617 return TARGET_SSE;
7619 case MM0_REG:
7620 if (TARGET_MACHO || TARGET_64BIT)
7621 return false;
7622 return TARGET_MMX;
7625 return false;
7628 /* Define how to find the value returned by a function.
7629 VALTYPE is the data type of the value (as a tree).
7630 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7631 otherwise, FUNC is 0. */
7633 static rtx
7634 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7635 const_tree fntype, const_tree fn)
7637 unsigned int regno;
7639 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7640 we normally prevent this case when mmx is not available. However
7641 some ABIs may require the result to be returned like DImode. */
7642 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7643 regno = FIRST_MMX_REG;
7645 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7646 we prevent this case when sse is not available. However some ABIs
7647 may require the result to be returned like integer TImode. */
7648 else if (mode == TImode
7649 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7650 regno = FIRST_SSE_REG;
7652 /* 32-byte vector modes in %ymm0. */
7653 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7654 regno = FIRST_SSE_REG;
7656 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7657 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7658 regno = FIRST_FLOAT_REG;
7659 else
7660 /* Most things go in %eax. */
7661 regno = AX_REG;
7663 /* Override FP return register with %xmm0 for local functions when
7664 SSE math is enabled or for functions with sseregparm attribute. */
7665 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7667 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7668 if ((sse_level >= 1 && mode == SFmode)
7669 || (sse_level == 2 && mode == DFmode))
7670 regno = FIRST_SSE_REG;
7673 /* OImode shouldn't be used directly. */
7674 gcc_assert (mode != OImode);
7676 return gen_rtx_REG (orig_mode, regno);
7679 static rtx
7680 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7681 const_tree valtype)
7683 rtx ret;
7685 /* Handle libcalls, which don't provide a type node. */
7686 if (valtype == NULL)
7688 unsigned int regno;
7690 switch (mode)
7692 case SFmode:
7693 case SCmode:
7694 case DFmode:
7695 case DCmode:
7696 case TFmode:
7697 case SDmode:
7698 case DDmode:
7699 case TDmode:
7700 regno = FIRST_SSE_REG;
7701 break;
7702 case XFmode:
7703 case XCmode:
7704 regno = FIRST_FLOAT_REG;
7705 break;
7706 case TCmode:
7707 return NULL;
7708 default:
7709 regno = AX_REG;
7712 return gen_rtx_REG (mode, regno);
7714 else if (POINTER_TYPE_P (valtype))
7716 /* Pointers are always returned in word_mode. */
7717 mode = word_mode;
7720 ret = construct_container (mode, orig_mode, valtype, 1,
7721 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7722 x86_64_int_return_registers, 0);
7724 /* For zero sized structures, construct_container returns NULL, but we
7725 need to keep the rest of the compiler happy by returning a meaningful value. */
7726 if (!ret)
7727 ret = gen_rtx_REG (orig_mode, AX_REG);
7729 return ret;
7732 static rtx
7733 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode,
7734 const_tree valtype)
7736 unsigned int regno = AX_REG;
7738 if (TARGET_SSE)
7740 switch (GET_MODE_SIZE (mode))
7742 case 16:
7743 if (valtype != NULL_TREE
7744 && !VECTOR_INTEGER_TYPE_P (valtype)
7746 && !INTEGRAL_TYPE_P (valtype)
7747 && !VECTOR_FLOAT_TYPE_P (valtype))
7748 break;
7749 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7750 && !COMPLEX_MODE_P (mode))
7751 regno = FIRST_SSE_REG;
7752 break;
7753 case 8:
7754 case 4:
7755 if (mode == SFmode || mode == DFmode)
7756 regno = FIRST_SSE_REG;
7757 break;
7758 default:
7759 break;
7762 return gen_rtx_REG (orig_mode, regno);
7765 static rtx
7766 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7767 enum machine_mode orig_mode, enum machine_mode mode)
7769 const_tree fn, fntype;
7771 fn = NULL_TREE;
7772 if (fntype_or_decl && DECL_P (fntype_or_decl))
7773 fn = fntype_or_decl;
7774 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7776 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7777 return function_value_ms_64 (orig_mode, mode, valtype);
7778 else if (TARGET_64BIT)
7779 return function_value_64 (orig_mode, mode, valtype);
7780 else
7781 return function_value_32 (orig_mode, mode, fntype, fn);
7784 static rtx
7785 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7786 bool outgoing ATTRIBUTE_UNUSED)
7788 enum machine_mode mode, orig_mode;
7790 orig_mode = TYPE_MODE (valtype);
7791 mode = type_natural_mode (valtype, NULL);
7792 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7795 /* Pointer function arguments and return values are promoted to
7796 word_mode. */
7798 static enum machine_mode
7799 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7800 int *punsignedp, const_tree fntype,
7801 int for_return)
7803 if (type != NULL_TREE && POINTER_TYPE_P (type))
7805 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7806 return word_mode;
7808 return default_promote_function_mode (type, mode, punsignedp, fntype,
7809 for_return);
7812 /* Return true if a structure, union or array with MODE containing FIELD
7813 should be accessed using BLKmode. */
7815 static bool
7816 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7818 /* Union with XFmode must be in BLKmode. */
7819 return (mode == XFmode
7820 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7821 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7825 ix86_libcall_value (enum machine_mode mode)
7827 return ix86_function_value_1 (NULL, NULL, mode, mode);
7830 /* Return true iff type is returned in memory. */
7832 static bool ATTRIBUTE_UNUSED
7833 return_in_memory_32 (const_tree type, enum machine_mode mode)
7835 HOST_WIDE_INT size;
7837 if (mode == BLKmode)
7838 return true;
7840 size = int_size_in_bytes (type);
7842 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7843 return false;
7845 if (VECTOR_MODE_P (mode) || mode == TImode)
7847 /* User-created vectors small enough to fit in EAX. */
7848 if (size < 8)
7849 return false;
7851 /* MMX/3dNow values are returned in MM0,
7852 except when it doesn't exist or the ABI prescribes otherwise. */
7853 if (size == 8)
7854 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7856 /* SSE values are returned in XMM0, except when it doesn't exist. */
7857 if (size == 16)
7858 return !TARGET_SSE;
7860 /* AVX values are returned in YMM0, except when it doesn't exist. */
7861 if (size == 32)
7862 return !TARGET_AVX;
7865 if (mode == XFmode)
7866 return false;
7868 if (size > 12)
7869 return true;
7871 /* OImode shouldn't be used directly. */
7872 gcc_assert (mode != OImode);
7874 return false;
7877 static bool ATTRIBUTE_UNUSED
7878 return_in_memory_64 (const_tree type, enum machine_mode mode)
7880 int needed_intregs, needed_sseregs;
7881 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7884 static bool ATTRIBUTE_UNUSED
7885 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7887 HOST_WIDE_INT size = int_size_in_bytes (type);
7889 /* __m128 is returned in xmm0. */
7890 if ((!type || VECTOR_INTEGER_TYPE_P (type) || INTEGRAL_TYPE_P (type)
7891 || VECTOR_FLOAT_TYPE_P (type))
7892 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7893 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7894 return false;
7896 /* Otherwise, the size must be exactly 1, 2, 4, or 8 bytes. */
7897 return size != 1 && size != 2 && size != 4 && size != 8;
7900 static bool
7901 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7903 #ifdef SUBTARGET_RETURN_IN_MEMORY
7904 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7905 #else
7906 const enum machine_mode mode = type_natural_mode (type, NULL);
7908 if (TARGET_64BIT)
7910 if (ix86_function_type_abi (fntype) == MS_ABI)
7911 return return_in_memory_ms_64 (type, mode);
7912 else
7913 return return_in_memory_64 (type, mode);
7915 else
7916 return return_in_memory_32 (type, mode);
7917 #endif
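/* For example, on the 64-bit SysV ABI struct { long a, b; } is returned
   in RAX:RDX while a 24-byte struct of three longs is returned in
   memory; under the MS ABI anything whose size is not 1, 2, 4, 8 or a
   16-byte vector goes to memory.  */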
7920 /* When returning SSE vector types, we have a choice of either
7921 (1) being abi incompatible with a -march switch, or
7922 (2) generating an error.
7923 Given no good solution, I think the safest thing is one warning.
7924 The user won't be able to use -Werror, but....
7926 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7927 called in response to actually generating a caller or callee that
7928 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7929 via aggregate_value_p for general type probing from tree-ssa. */
7931 static rtx
7932 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7934 static bool warnedsse, warnedmmx;
7936 if (!TARGET_64BIT && type)
7938 /* Look at the return type of the function, not the function type. */
7939 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7941 if (!TARGET_SSE && !warnedsse)
7943 if (mode == TImode
7944 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7946 warnedsse = true;
7947 warning (0, "SSE vector return without SSE enabled "
7948 "changes the ABI");
7952 if (!TARGET_MMX && !warnedmmx)
7954 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7956 warnedmmx = true;
7957 warning (0, "MMX vector return without MMX enabled "
7958 "changes the ABI");
7963 return NULL;
7967 /* Create the va_list data type. */
7969 /* Returns the calling convention specific va_list data type.
7970 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7972 static tree
7973 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7975 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7977 /* For i386 we use a plain pointer to the argument area. */
7978 if (!TARGET_64BIT || abi == MS_ABI)
7979 return build_pointer_type (char_type_node);
7981 record = lang_hooks.types.make_type (RECORD_TYPE);
7982 type_decl = build_decl (BUILTINS_LOCATION,
7983 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7985 f_gpr = build_decl (BUILTINS_LOCATION,
7986 FIELD_DECL, get_identifier ("gp_offset"),
7987 unsigned_type_node);
7988 f_fpr = build_decl (BUILTINS_LOCATION,
7989 FIELD_DECL, get_identifier ("fp_offset"),
7990 unsigned_type_node);
7991 f_ovf = build_decl (BUILTINS_LOCATION,
7992 FIELD_DECL, get_identifier ("overflow_arg_area"),
7993 ptr_type_node);
7994 f_sav = build_decl (BUILTINS_LOCATION,
7995 FIELD_DECL, get_identifier ("reg_save_area"),
7996 ptr_type_node);
7998 va_list_gpr_counter_field = f_gpr;
7999 va_list_fpr_counter_field = f_fpr;
8001 DECL_FIELD_CONTEXT (f_gpr) = record;
8002 DECL_FIELD_CONTEXT (f_fpr) = record;
8003 DECL_FIELD_CONTEXT (f_ovf) = record;
8004 DECL_FIELD_CONTEXT (f_sav) = record;
8006 TYPE_STUB_DECL (record) = type_decl;
8007 TYPE_NAME (record) = type_decl;
8008 TYPE_FIELDS (record) = f_gpr;
8009 DECL_CHAIN (f_gpr) = f_fpr;
8010 DECL_CHAIN (f_fpr) = f_ovf;
8011 DECL_CHAIN (f_ovf) = f_sav;
8013 layout_type (record);
8015 /* The correct type is an array type of one element. */
8016 return build_array_type (record, build_index_type (size_zero_node));
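/* The record built above corresponds to the psABI declaration

     typedef struct __va_list_tag {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __va_list_tag;
     typedef __va_list_tag va_list[1];

   which is why the returned type is a one-element array of the record.  */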
8019 /* Setup the builtin va_list data type and for 64-bit the additional
8020 calling convention specific va_list data types. */
8022 static tree
8023 ix86_build_builtin_va_list (void)
8025 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
8027 /* Initialize abi specific va_list builtin types. */
8028 if (TARGET_64BIT)
8030 tree t;
8031 if (ix86_abi == MS_ABI)
8033 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
8034 if (TREE_CODE (t) != RECORD_TYPE)
8035 t = build_variant_type_copy (t);
8036 sysv_va_list_type_node = t;
8038 else
8040 t = ret;
8041 if (TREE_CODE (t) != RECORD_TYPE)
8042 t = build_variant_type_copy (t);
8043 sysv_va_list_type_node = t;
8045 if (ix86_abi != MS_ABI)
8047 t = ix86_build_builtin_va_list_abi (MS_ABI);
8048 if (TREE_CODE (t) != RECORD_TYPE)
8049 t = build_variant_type_copy (t);
8050 ms_va_list_type_node = t;
8052 else
8054 t = ret;
8055 if (TREE_CODE (t) != RECORD_TYPE)
8056 t = build_variant_type_copy (t);
8057 ms_va_list_type_node = t;
8061 return ret;
8064 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
8066 static void
8067 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
8069 rtx save_area, mem;
8070 alias_set_type set;
8071 int i, max;
8073 /* GPR size of varargs save area. */
8074 if (cfun->va_list_gpr_size)
8075 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
8076 else
8077 ix86_varargs_gpr_size = 0;
8079 /* FPR size of varargs save area. We don't need it if we don't pass
8080 anything in SSE registers. */
8081 if (TARGET_SSE && cfun->va_list_fpr_size)
8082 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
8083 else
8084 ix86_varargs_fpr_size = 0;
8086 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
8087 return;
8089 save_area = frame_pointer_rtx;
8090 set = get_varargs_alias_set ();
8092 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
8093 if (max > X86_64_REGPARM_MAX)
8094 max = X86_64_REGPARM_MAX;
8096 for (i = cum->regno; i < max; i++)
8098 mem = gen_rtx_MEM (word_mode,
8099 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
8100 MEM_NOTRAP_P (mem) = 1;
8101 set_mem_alias_set (mem, set);
8102 emit_move_insn (mem,
8103 gen_rtx_REG (word_mode,
8104 x86_64_int_parameter_registers[i]));
8107 if (ix86_varargs_fpr_size)
8109 enum machine_mode smode;
8110 rtx label, test;
8112 /* Now emit code to save SSE registers. The AX parameter contains number
8113 of SSE parameter registers used to call this function, though all we
8114 actually check here is the zero/non-zero status. */
8116 label = gen_label_rtx ();
8117 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
8118 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
8119 label));
8121 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
8122 we used movdqa (i.e. TImode) instead? Perhaps even better would
8123 be if we could determine the real mode of the data, via a hook
8124 into pass_stdarg. Ignore all that for now. */
8125 smode = V4SFmode;
8126 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
8127 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
8129 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
8130 if (max > X86_64_SSE_REGPARM_MAX)
8131 max = X86_64_SSE_REGPARM_MAX;
8133 for (i = cum->sse_regno; i < max; ++i)
8135 mem = plus_constant (Pmode, save_area,
8136 i * 16 + ix86_varargs_gpr_size);
8137 mem = gen_rtx_MEM (smode, mem);
8138 MEM_NOTRAP_P (mem) = 1;
8139 set_mem_alias_set (mem, set);
8140 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
8142 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
8145 emit_label (label);
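/* In the usual case (both parts saved) the resulting register save area
   has the psABI layout, relative to reg_save_area: bytes 0..47 hold the
   six integer argument registers rdi, rsi, rdx, rcx, r8, r9 and bytes
   48..175 hold xmm0..xmm7 in 16-byte slots; gp_offset and fp_offset in
   the va_list index into this block.  */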
8149 static void
8150 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
8152 alias_set_type set = get_varargs_alias_set ();
8153 int i;
8155 /* Reset to zero, as there might be a sysv va_arg used
8156 before. */
8157 ix86_varargs_gpr_size = 0;
8158 ix86_varargs_fpr_size = 0;
8160 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
8162 rtx reg, mem;
8164 mem = gen_rtx_MEM (Pmode,
8165 plus_constant (Pmode, virtual_incoming_args_rtx,
8166 i * UNITS_PER_WORD));
8167 MEM_NOTRAP_P (mem) = 1;
8168 set_mem_alias_set (mem, set);
8170 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
8171 emit_move_insn (mem, reg);
8175 static void
8176 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
8177 tree type, int *pretend_size ATTRIBUTE_UNUSED,
8178 int no_rtl)
8180 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8181 CUMULATIVE_ARGS next_cum;
8182 tree fntype;
8184 /* This argument doesn't appear to be used anymore. Which is good,
8185 because the old code here didn't suppress rtl generation. */
8186 gcc_assert (!no_rtl);
8188 if (!TARGET_64BIT)
8189 return;
8191 fntype = TREE_TYPE (current_function_decl);
8193 /* For varargs, we do not want to skip the dummy va_dcl argument.
8194 For stdargs, we do want to skip the last named argument. */
8195 next_cum = *cum;
8196 if (stdarg_p (fntype))
8197 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
8198 true);
8200 if (cum->call_abi == MS_ABI)
8201 setup_incoming_varargs_ms_64 (&next_cum);
8202 else
8203 setup_incoming_varargs_64 (&next_cum);
8206 /* Return true if TYPE is the plain char * kind of va_list. */
8208 static bool
8209 is_va_list_char_pointer (tree type)
8211 tree canonic;
8213 /* For 32-bit it is always true. */
8214 if (!TARGET_64BIT)
8215 return true;
8216 canonic = ix86_canonical_va_list_type (type);
8217 return (canonic == ms_va_list_type_node
8218 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
8221 /* Implement va_start. */
8223 static void
8224 ix86_va_start (tree valist, rtx nextarg)
8226 HOST_WIDE_INT words, n_gpr, n_fpr;
8227 tree f_gpr, f_fpr, f_ovf, f_sav;
8228 tree gpr, fpr, ovf, sav, t;
8229 tree type;
8230 rtx ovf_rtx;
8232 if (flag_split_stack
8233 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8235 unsigned int scratch_regno;
8237 /* When we are splitting the stack, we can't refer to the stack
8238 arguments using internal_arg_pointer, because they may be on
8239 the old stack. The split stack prologue will arrange to
8240 leave a pointer to the old stack arguments in a scratch
8241 register, which we here copy to a pseudo-register. The split
8242 stack prologue can't set the pseudo-register directly because
8243 it (the prologue) runs before any registers have been saved. */
8245 scratch_regno = split_stack_prologue_scratch_regno ();
8246 if (scratch_regno != INVALID_REGNUM)
8248 rtx reg, seq;
8250 reg = gen_reg_rtx (Pmode);
8251 cfun->machine->split_stack_varargs_pointer = reg;
8253 start_sequence ();
8254 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
8255 seq = get_insns ();
8256 end_sequence ();
8258 push_topmost_sequence ();
8259 emit_insn_after (seq, entry_of_function ());
8260 pop_topmost_sequence ();
8264 /* Only 64bit target needs something special. */
8265 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8267 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8268 std_expand_builtin_va_start (valist, nextarg);
8269 else
8271 rtx va_r, next;
8273 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
8274 next = expand_binop (ptr_mode, add_optab,
8275 cfun->machine->split_stack_varargs_pointer,
8276 crtl->args.arg_offset_rtx,
8277 NULL_RTX, 0, OPTAB_LIB_WIDEN);
8278 convert_move (va_r, next, 0);
8280 return;
8283 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8284 f_fpr = DECL_CHAIN (f_gpr);
8285 f_ovf = DECL_CHAIN (f_fpr);
8286 f_sav = DECL_CHAIN (f_ovf);
8288 valist = build_simple_mem_ref (valist);
8289 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
8290 /* The following should be folded into the MEM_REF offset. */
8291 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
8292 f_gpr, NULL_TREE);
8293 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
8294 f_fpr, NULL_TREE);
8295 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
8296 f_ovf, NULL_TREE);
8297 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
8298 f_sav, NULL_TREE);
8300 /* Count number of gp and fp argument registers used. */
8301 words = crtl->args.info.words;
8302 n_gpr = crtl->args.info.regno;
8303 n_fpr = crtl->args.info.sse_regno;
8305 if (cfun->va_list_gpr_size)
8307 type = TREE_TYPE (gpr);
8308 t = build2 (MODIFY_EXPR, type,
8309 gpr, build_int_cst (type, n_gpr * 8));
8310 TREE_SIDE_EFFECTS (t) = 1;
8311 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8314 if (TARGET_SSE && cfun->va_list_fpr_size)
8316 type = TREE_TYPE (fpr);
8317 t = build2 (MODIFY_EXPR, type, fpr,
8318 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
8319 TREE_SIDE_EFFECTS (t) = 1;
8320 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8323 /* Find the overflow area. */
8324 type = TREE_TYPE (ovf);
8325 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8326 ovf_rtx = crtl->args.internal_arg_pointer;
8327 else
8328 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
8329 t = make_tree (type, ovf_rtx);
8330 if (words != 0)
8331 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
8332 t = build2 (MODIFY_EXPR, type, ovf, t);
8333 TREE_SIDE_EFFECTS (t) = 1;
8334 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8336 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
8338 /* Find the register save area.
8339 The function prologue saves it right above the stack frame. */
8340 type = TREE_TYPE (sav);
8341 t = make_tree (type, frame_pointer_rtx);
8342 if (!ix86_varargs_gpr_size)
8343 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
8344 t = build2 (MODIFY_EXPR, type, sav, t);
8345 TREE_SIDE_EFFECTS (t) = 1;
8346 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
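/* For a prototype like int f (const char *fmt, ...) the code above
   initializes the va_list roughly as:
       ap->gp_offset = 8;          (fmt consumed %rdi)
       ap->fp_offset = 48;         (no SSE registers used yet)
       ap->overflow_arg_area = address of the first stack argument;
       ap->reg_save_area = base of the register save area;  */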
8350 /* Implement va_arg. */
8352 static tree
8353 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
8354 gimple_seq *post_p)
8356 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
8357 tree f_gpr, f_fpr, f_ovf, f_sav;
8358 tree gpr, fpr, ovf, sav, t;
8359 int size, rsize;
8360 tree lab_false, lab_over = NULL_TREE;
8361 tree addr, t2;
8362 rtx container;
8363 int indirect_p = 0;
8364 tree ptrtype;
8365 enum machine_mode nat_mode;
8366 unsigned int arg_boundary;
8368 /* Only 64bit target needs something special. */
8369 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8370 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
8372 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8373 f_fpr = DECL_CHAIN (f_gpr);
8374 f_ovf = DECL_CHAIN (f_fpr);
8375 f_sav = DECL_CHAIN (f_ovf);
8377 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
8378 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
8379 valist = build_va_arg_indirect_ref (valist);
8380 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
8381 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
8382 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
8384 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
8385 if (indirect_p)
8386 type = build_pointer_type (type);
8387 size = int_size_in_bytes (type);
8388 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
8390 nat_mode = type_natural_mode (type, NULL);
8391 switch (nat_mode)
8393 case V8SFmode:
8394 case V8SImode:
8395 case V32QImode:
8396 case V16HImode:
8397 case V4DFmode:
8398 case V4DImode:
8399 /* Unnamed 256bit vector mode parameters are passed on stack. */
8400 if (!TARGET_64BIT_MS_ABI)
8402 container = NULL;
8403 break;
8406 default:
8407 container = construct_container (nat_mode, TYPE_MODE (type),
8408 type, 0, X86_64_REGPARM_MAX,
8409 X86_64_SSE_REGPARM_MAX, intreg,
8411 break;
8414 /* Pull the value out of the saved registers. */
8416 addr = create_tmp_var (ptr_type_node, "addr");
8418 if (container)
8420 int needed_intregs, needed_sseregs;
8421 bool need_temp;
8422 tree int_addr, sse_addr;
8424 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8425 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8427 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8429 need_temp = (!REG_P (container)
8430 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8431 || TYPE_ALIGN (type) > 128));
8433 /* If we are passing a structure, verify that it is a consecutive block
8434 in the register save area. If not, we need to do moves. */
8435 if (!need_temp && !REG_P (container))
8437 /* Verify that all registers are strictly consecutive.  */
8438 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8440 int i;
8442 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8444 rtx slot = XVECEXP (container, 0, i);
8445 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8446 || INTVAL (XEXP (slot, 1)) != i * 16)
8447 need_temp = 1;
8450 else
8452 int i;
8454 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8456 rtx slot = XVECEXP (container, 0, i);
8457 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8458 || INTVAL (XEXP (slot, 1)) != i * 8)
8459 need_temp = 1;
8463 if (!need_temp)
8465 int_addr = addr;
8466 sse_addr = addr;
8468 else
8470 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8471 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8474 /* First ensure that we fit completely in registers. */
8475 if (needed_intregs)
8477 t = build_int_cst (TREE_TYPE (gpr),
8478 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8479 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8480 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8481 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8482 gimplify_and_add (t, pre_p);
8484 if (needed_sseregs)
8486 t = build_int_cst (TREE_TYPE (fpr),
8487 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8488 + X86_64_REGPARM_MAX * 8);
8489 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8490 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8491 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8492 gimplify_and_add (t, pre_p);
8495 /* Compute index to start of area used for integer regs. */
8496 if (needed_intregs)
8498 /* int_addr = gpr + sav; */
8499 t = fold_build_pointer_plus (sav, gpr);
8500 gimplify_assign (int_addr, t, pre_p);
8502 if (needed_sseregs)
8504 /* sse_addr = fpr + sav; */
8505 t = fold_build_pointer_plus (sav, fpr);
8506 gimplify_assign (sse_addr, t, pre_p);
8508 if (need_temp)
8510 int i, prev_size = 0;
8511 tree temp = create_tmp_var (type, "va_arg_tmp");
8513 /* addr = &temp; */
8514 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8515 gimplify_assign (addr, t, pre_p);
8517 for (i = 0; i < XVECLEN (container, 0); i++)
8519 rtx slot = XVECEXP (container, 0, i);
8520 rtx reg = XEXP (slot, 0);
8521 enum machine_mode mode = GET_MODE (reg);
8522 tree piece_type;
8523 tree addr_type;
8524 tree daddr_type;
8525 tree src_addr, src;
8526 int src_offset;
8527 tree dest_addr, dest;
8528 int cur_size = GET_MODE_SIZE (mode);
8530 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8531 prev_size = INTVAL (XEXP (slot, 1));
8532 if (prev_size + cur_size > size)
8534 cur_size = size - prev_size;
8535 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8536 if (mode == BLKmode)
8537 mode = QImode;
8539 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8540 if (mode == GET_MODE (reg))
8541 addr_type = build_pointer_type (piece_type);
8542 else
8543 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8544 true);
8545 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8546 true);
8548 if (SSE_REGNO_P (REGNO (reg)))
8550 src_addr = sse_addr;
8551 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8553 else
8555 src_addr = int_addr;
8556 src_offset = REGNO (reg) * 8;
8558 src_addr = fold_convert (addr_type, src_addr);
8559 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8561 dest_addr = fold_convert (daddr_type, addr);
8562 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8563 if (cur_size == GET_MODE_SIZE (mode))
8565 src = build_va_arg_indirect_ref (src_addr);
8566 dest = build_va_arg_indirect_ref (dest_addr);
8568 gimplify_assign (dest, src, pre_p);
8570 else
8572 tree copy
8573 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8574 3, dest_addr, src_addr,
8575 size_int (cur_size));
8576 gimplify_and_add (copy, pre_p);
8578 prev_size += cur_size;
8582 if (needed_intregs)
8584 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8585 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8586 gimplify_assign (gpr, t, pre_p);
8589 if (needed_sseregs)
8591 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8592 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8593 gimplify_assign (fpr, t, pre_p);
8596 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8598 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8601 /* ... otherwise out of the overflow area. */
8603 /* When we align a parameter on the stack for the caller, if its
8604 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
8605 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee
8606 here with the caller. */
8607 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8608 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8609 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8611 /* Care for on-stack alignment if needed. */
8612 if (arg_boundary <= 64 || size == 0)
8613 t = ovf;
8614 else
8616 HOST_WIDE_INT align = arg_boundary / 8;
8617 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8618 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8619 build_int_cst (TREE_TYPE (t), -align));
8622 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8623 gimplify_assign (addr, t, pre_p);
8625 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8626 gimplify_assign (unshare_expr (ovf), t, pre_p);
8628 if (container)
8629 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8631 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8632 addr = fold_convert (ptrtype, addr);
8634 if (indirect_p)
8635 addr = build_va_arg_indirect_ref (addr);
8636 return build_va_arg_indirect_ref (addr);
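/* The GIMPLE emitted above for va_arg (ap, int) is roughly equivalent to:

     if (ap->gp_offset < 48)
       {
         addr = ap->reg_save_area + ap->gp_offset;
         ap->gp_offset += 8;
       }
     else
       {
         addr = ap->overflow_arg_area;
         ap->overflow_arg_area += 8;
       }
     result = *(int *) addr;

   with the analogous fp_offset test (bound 48 + 8 * 16) for SSE-class
   arguments and a memcpy-based spill to a temporary when a multi-register
   aggregate is not laid out contiguously in the save area.  */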
8639 /* Return true if OPNUM's MEM should be matched
8640 in movabs* patterns. */
8642 bool
8643 ix86_check_movabs (rtx insn, int opnum)
8645 rtx set, mem;
8647 set = PATTERN (insn);
8648 if (GET_CODE (set) == PARALLEL)
8649 set = XVECEXP (set, 0, 0);
8650 gcc_assert (GET_CODE (set) == SET);
8651 mem = XEXP (set, opnum);
8652 while (GET_CODE (mem) == SUBREG)
8653 mem = SUBREG_REG (mem);
8654 gcc_assert (MEM_P (mem));
8655 return volatile_ok || !MEM_VOLATILE_P (mem);
8658 /* Initialize the table of extra 80387 mathematical constants. */
8660 static void
8661 init_ext_80387_constants (void)
8663 static const char * cst[5] =
8665 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8666 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8667 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8668 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8669 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8671 int i;
8673 for (i = 0; i < 5; i++)
8675 real_from_string (&ext_80387_constants_table[i], cst[i]);
8676 /* Ensure each constant is rounded to XFmode precision. */
8677 real_convert (&ext_80387_constants_table[i],
8678 XFmode, &ext_80387_constants_table[i]);
8681 ext_80387_constants_init = 1;
8684 /* Return non-zero if the constant is something that
8685 can be loaded with a special instruction. */
8688 standard_80387_constant_p (rtx x)
8690 enum machine_mode mode = GET_MODE (x);
8692 REAL_VALUE_TYPE r;
8694 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8695 return -1;
8697 if (x == CONST0_RTX (mode))
8698 return 1;
8699 if (x == CONST1_RTX (mode))
8700 return 2;
8702 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8704 /* For XFmode constants, try to find a special 80387 instruction when
8705 optimizing for size or on those CPUs that benefit from them. */
8706 if (mode == XFmode
8707 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8709 int i;
8711 if (! ext_80387_constants_init)
8712 init_ext_80387_constants ();
8714 for (i = 0; i < 5; i++)
8715 if (real_identical (&r, &ext_80387_constants_table[i]))
8716 return i + 3;
8719 /* A load of the constant -0.0 or -1.0 will be split into an
8720 fldz;fchs or fld1;fchs sequence. */
8721 if (real_isnegzero (&r))
8722 return 8;
8723 if (real_identical (&r, &dconstm1))
8724 return 9;
8726 return 0;
8729 /* Return the opcode of the special instruction to be used to load
8730 the constant X. */
8732 const char *
8733 standard_80387_constant_opcode (rtx x)
8735 switch (standard_80387_constant_p (x))
8737 case 1:
8738 return "fldz";
8739 case 2:
8740 return "fld1";
8741 case 3:
8742 return "fldlg2";
8743 case 4:
8744 return "fldln2";
8745 case 5:
8746 return "fldl2e";
8747 case 6:
8748 return "fldl2t";
8749 case 7:
8750 return "fldpi";
8751 case 8:
8752 case 9:
8753 return "#";
8754 default:
8755 gcc_unreachable ();
8759 /* Return the CONST_DOUBLE representing the 80387 constant that is
8760 loaded by the specified special instruction. The argument IDX
8761 matches the return value from standard_80387_constant_p. */
8764 standard_80387_constant_rtx (int idx)
8766 int i;
8768 if (! ext_80387_constants_init)
8769 init_ext_80387_constants ();
8771 switch (idx)
8773 case 3:
8774 case 4:
8775 case 5:
8776 case 6:
8777 case 7:
8778 i = idx - 3;
8779 break;
8781 default:
8782 gcc_unreachable ();
8785 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8786 XFmode);
8789 /* Return 1 if X is all 0s and 2 if X is all 1s
8790 in a supported SSE/AVX vector mode. */
8793 standard_sse_constant_p (rtx x)
8795 enum machine_mode mode = GET_MODE (x);
8797 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8798 return 1;
8799 if (vector_all_ones_operand (x, mode))
8800 switch (mode)
8802 case V16QImode:
8803 case V8HImode:
8804 case V4SImode:
8805 case V2DImode:
8806 if (TARGET_SSE2)
8807 return 2;
8808 case V32QImode:
8809 case V16HImode:
8810 case V8SImode:
8811 case V4DImode:
8812 if (TARGET_AVX2)
8813 return 2;
8814 default:
8815 break;
8818 return 0;
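/* Usage sketch (illustrative): CONST0_RTX (V4SFmode) yields 1 (loadable
   with a zeroing xorps/pxor), an all-ones V4SImode vector yields 2 when
   TARGET_SSE2 is set (loadable with pcmpeqd), and anything else yields 0.  */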
8821 /* Return the opcode of the special instruction to be used to load
8822 the constant X. */
8824 const char *
8825 standard_sse_constant_opcode (rtx insn, rtx x)
8827 switch (standard_sse_constant_p (x))
8829 case 1:
8830 switch (get_attr_mode (insn))
8832 case MODE_TI:
8833 return "%vpxor\t%0, %d0";
8834 case MODE_V2DF:
8835 return "%vxorpd\t%0, %d0";
8836 case MODE_V4SF:
8837 return "%vxorps\t%0, %d0";
8839 case MODE_OI:
8840 return "vpxor\t%x0, %x0, %x0";
8841 case MODE_V4DF:
8842 return "vxorpd\t%x0, %x0, %x0";
8843 case MODE_V8SF:
8844 return "vxorps\t%x0, %x0, %x0";
8846 default:
8847 break;
8850 case 2:
8851 if (get_attr_mode (insn) == MODE_XI
8852 || get_attr_mode (insn) == MODE_V8DF
8853 || get_attr_mode (insn) == MODE_V16SF)
8854 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
8855 if (TARGET_AVX)
8856 return "vpcmpeqd\t%0, %0, %0";
8857 else
8858 return "pcmpeqd\t%0, %0";
8860 default:
8861 break;
8863 gcc_unreachable ();
8866 /* Returns true if OP contains a symbol reference. */
8868 bool
8869 symbolic_reference_mentioned_p (rtx op)
8871 const char *fmt;
8872 int i;
8874 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8875 return true;
8877 fmt = GET_RTX_FORMAT (GET_CODE (op));
8878 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8880 if (fmt[i] == 'E')
8882 int j;
8884 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8885 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8886 return true;
8889 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8890 return true;
8893 return false;
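/* For example (illustrative), (plus:SI (symbol_ref:SI ("foo")) (const_int 4))
   mentions a symbolic reference, while (plus:SI (reg:SI 0) (const_int 4))
   does not.  */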
8896 /* Return true if it is appropriate to emit `ret' instructions in the
8897 body of a function. Do this only if the epilogue is simple, needing a
8898 couple of insns. Prior to reloading, we can't tell how many registers
8899 must be saved, so return false then. Return false if there is no frame
8900 marker to de-allocate. */
8902 bool
8903 ix86_can_use_return_insn_p (void)
8905 struct ix86_frame frame;
8907 if (! reload_completed || frame_pointer_needed)
8908 return 0;
8910 /* Don't allow more than 32k pop, since that's all we can do
8911 with one instruction. */
8912 if (crtl->args.pops_args && crtl->args.size >= 32768)
8913 return 0;
8915 ix86_compute_frame_layout (&frame);
8916 return (frame.stack_pointer_offset == UNITS_PER_WORD
8917 && (frame.nregs + frame.nsseregs) == 0);
8920 /* Value should be nonzero if functions must have frame pointers.
8921 Zero means the frame pointer need not be set up (and parms may
8922 be accessed via the stack pointer) in functions that seem suitable. */
8924 static bool
8925 ix86_frame_pointer_required (void)
8927 /* If we accessed previous frames, then the generated code expects
8928 to be able to access the saved ebp value in our frame. */
8929 if (cfun->machine->accesses_prev_frame)
8930 return true;
8932 /* Several x86 OSes need a frame pointer for other reasons,
8933 usually pertaining to setjmp. */
8934 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8935 return true;
8937 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
8938 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8939 return true;
8941 /* With Win64 SEH, very large frames need a frame pointer, as the maximum
8942 stack allocation is 4GB. */
8943 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8944 return true;
8946 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8947 turns off the frame pointer by default. Turn it back on now if
8948 we've not got a leaf function. */
8949 if (TARGET_OMIT_LEAF_FRAME_POINTER
8950 && (!crtl->is_leaf
8951 || ix86_current_function_calls_tls_descriptor))
8952 return true;
8954 if (crtl->profile && !flag_fentry)
8955 return true;
8957 return false;
8960 /* Record that the current function accesses previous call frames. */
8962 void
8963 ix86_setup_frame_addresses (void)
8965 cfun->machine->accesses_prev_frame = 1;
8968 #ifndef USE_HIDDEN_LINKONCE
8969 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8970 # define USE_HIDDEN_LINKONCE 1
8971 # else
8972 # define USE_HIDDEN_LINKONCE 0
8973 # endif
8974 #endif
8976 static int pic_labels_used;
8978 /* Fills in the label name that should be used for a pc thunk for
8979 the given register. */
8981 static void
8982 get_pc_thunk_name (char name[32], unsigned int regno)
8984 gcc_assert (!TARGET_64BIT);
8986 if (USE_HIDDEN_LINKONCE)
8987 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8988 else
8989 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
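/* For example (illustrative), for %ebx with USE_HIDDEN_LINKONCE this
   produces the thunk name "__x86.get_pc_thunk.bx".  */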
8993 /* This function generates code for -fpic that loads %ebx with
8994 the return address of the caller and then returns. */
8996 static void
8997 ix86_code_end (void)
8999 rtx xops[2];
9000 int regno;
9002 for (regno = AX_REG; regno <= SP_REG; regno++)
9004 char name[32];
9005 tree decl;
9007 if (!(pic_labels_used & (1 << regno)))
9008 continue;
9010 get_pc_thunk_name (name, regno);
9012 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
9013 get_identifier (name),
9014 build_function_type_list (void_type_node, NULL_TREE));
9015 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
9016 NULL_TREE, void_type_node);
9017 TREE_PUBLIC (decl) = 1;
9018 TREE_STATIC (decl) = 1;
9019 DECL_IGNORED_P (decl) = 1;
9021 #if TARGET_MACHO
9022 if (TARGET_MACHO)
9024 switch_to_section (darwin_sections[text_coal_section]);
9025 fputs ("\t.weak_definition\t", asm_out_file);
9026 assemble_name (asm_out_file, name);
9027 fputs ("\n\t.private_extern\t", asm_out_file);
9028 assemble_name (asm_out_file, name);
9029 putc ('\n', asm_out_file);
9030 ASM_OUTPUT_LABEL (asm_out_file, name);
9031 DECL_WEAK (decl) = 1;
9033 else
9034 #endif
9035 if (USE_HIDDEN_LINKONCE)
9037 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
9039 targetm.asm_out.unique_section (decl, 0);
9040 switch_to_section (get_named_section (decl, NULL, 0));
9042 targetm.asm_out.globalize_label (asm_out_file, name);
9043 fputs ("\t.hidden\t", asm_out_file);
9044 assemble_name (asm_out_file, name);
9045 putc ('\n', asm_out_file);
9046 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
9048 else
9050 switch_to_section (text_section);
9051 ASM_OUTPUT_LABEL (asm_out_file, name);
9054 DECL_INITIAL (decl) = make_node (BLOCK);
9055 current_function_decl = decl;
9056 init_function_start (decl);
9057 first_function_block_is_cold = false;
9058 /* Make sure unwind info is emitted for the thunk if needed. */
9059 final_start_function (emit_barrier (), asm_out_file, 1);
9061 /* Pad stack IP move with 4 instructions (two NOPs count
9062 as one instruction). */
9063 if (TARGET_PAD_SHORT_FUNCTION)
9065 int i = 8;
9067 while (i--)
9068 fputs ("\tnop\n", asm_out_file);
9071 xops[0] = gen_rtx_REG (Pmode, regno);
9072 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
9073 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
9074 fputs ("\tret\n", asm_out_file);
9075 final_end_function ();
9076 init_insn_lengths ();
9077 free_after_compilation (cfun);
9078 set_cfun (NULL);
9079 current_function_decl = NULL;
9082 if (flag_split_stack)
9083 file_end_indicate_split_stack ();
9086 /* Emit code for the SET_GOT patterns. */
9088 const char *
9089 output_set_got (rtx dest, rtx label)
9091 rtx xops[3];
9093 xops[0] = dest;
9095 if (TARGET_VXWORKS_RTP && flag_pic)
9097 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
9098 xops[2] = gen_rtx_MEM (Pmode,
9099 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
9100 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
9102 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
9103 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
9104 an unadorned address. */
9105 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
9106 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
9107 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
9108 return "";
9111 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
9113 if (!flag_pic)
9115 if (TARGET_MACHO)
9116 /* We don't need a pic base, we're not producing pic. */
9117 gcc_unreachable ();
9119 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
9120 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
9121 targetm.asm_out.internal_label (asm_out_file, "L",
9122 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
9124 else
9126 char name[32];
9127 get_pc_thunk_name (name, REGNO (dest));
9128 pic_labels_used |= 1 << REGNO (dest);
9130 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
9131 xops[2] = gen_rtx_MEM (QImode, xops[2]);
9132 output_asm_insn ("call\t%X2", xops);
9134 #if TARGET_MACHO
9135 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
9136 This is what will be referenced by the Mach-O PIC subsystem. */
9137 if (machopic_should_output_picbase_label () || !label)
9138 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
9140 /* When we are restoring the pic base at the site of a nonlocal label,
9141 and we decided to emit the pic base above, we will still output a
9142 local label used for calculating the correction offset (even though
9143 the offset will be 0 in that case). */
9144 if (label)
9145 targetm.asm_out.internal_label (asm_out_file, "L",
9146 CODE_LABEL_NUMBER (label));
9147 #endif
9150 if (!TARGET_MACHO)
9151 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
9153 return "";
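/* Illustrative sketch of the common ELF -fpic case with DEST == %ebx:
   the sequence emitted above is roughly
       call  __x86.get_pc_thunk.bx
       addl  $_GLOBAL_OFFSET_TABLE_, %ebx
   which leaves the GOT pointer in the destination register.  */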
9156 /* Generate a "push" pattern for input ARG. */
9158 static rtx
9159 gen_push (rtx arg)
9161 struct machine_function *m = cfun->machine;
9163 if (m->fs.cfa_reg == stack_pointer_rtx)
9164 m->fs.cfa_offset += UNITS_PER_WORD;
9165 m->fs.sp_offset += UNITS_PER_WORD;
9167 if (REG_P (arg) && GET_MODE (arg) != word_mode)
9168 arg = gen_rtx_REG (word_mode, REGNO (arg));
9170 return gen_rtx_SET (VOIDmode,
9171 gen_rtx_MEM (word_mode,
9172 gen_rtx_PRE_DEC (Pmode,
9173 stack_pointer_rtx)),
9174 arg);
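/* Sketch of the RTL produced (illustrative, 64-bit case where word_mode
   and Pmode are both DImode):
       (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI <arg>))  */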
9177 /* Generate a "pop" pattern for input ARG. */
9179 static rtx
9180 gen_pop (rtx arg)
9182 if (REG_P (arg) && GET_MODE (arg) != word_mode)
9183 arg = gen_rtx_REG (word_mode, REGNO (arg));
9185 return gen_rtx_SET (VOIDmode,
9186 arg,
9187 gen_rtx_MEM (word_mode,
9188 gen_rtx_POST_INC (Pmode,
9189 stack_pointer_rtx)));
9192 /* Return >= 0 if there is an unused call-clobbered register available
9193 for the entire function. */
9195 static unsigned int
9196 ix86_select_alt_pic_regnum (void)
9198 if (crtl->is_leaf
9199 && !crtl->profile
9200 && !ix86_current_function_calls_tls_descriptor)
9202 int i, drap;
9203 /* Can't use the same register for both PIC and DRAP. */
9204 if (crtl->drap_reg)
9205 drap = REGNO (crtl->drap_reg);
9206 else
9207 drap = -1;
9208 for (i = 2; i >= 0; --i)
9209 if (i != drap && !df_regs_ever_live_p (i))
9210 return i;
9213 return INVALID_REGNUM;
9216 /* Return TRUE if we need to save REGNO. */
9218 static bool
9219 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
9221 if (pic_offset_table_rtx
9222 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
9223 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
9224 || crtl->profile
9225 || crtl->calls_eh_return
9226 || crtl->uses_const_pool
9227 || cfun->has_nonlocal_label))
9228 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
9230 if (crtl->calls_eh_return && maybe_eh_return)
9232 unsigned i;
9233 for (i = 0; ; i++)
9235 unsigned test = EH_RETURN_DATA_REGNO (i);
9236 if (test == INVALID_REGNUM)
9237 break;
9238 if (test == regno)
9239 return true;
9243 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
9244 return true;
9246 return (df_regs_ever_live_p (regno)
9247 && !call_used_regs[regno]
9248 && !fixed_regs[regno]
9249 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
9252 /* Return the number of saved general purpose registers. */
9254 static int
9255 ix86_nsaved_regs (void)
9257 int nregs = 0;
9258 int regno;
9260 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9261 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9262 nregs ++;
9263 return nregs;
9266 /* Return the number of saved SSE registers. */
9268 static int
9269 ix86_nsaved_sseregs (void)
9271 int nregs = 0;
9272 int regno;
9274 if (!TARGET_64BIT_MS_ABI)
9275 return 0;
9276 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9277 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9278 nregs ++;
9279 return nregs;
9282 /* Given FROM and TO register numbers, say whether this elimination is
9283 allowed. If stack alignment is needed, we can only replace argument
9284 pointer with hard frame pointer, or replace frame pointer with stack
9285 pointer. Otherwise, frame pointer elimination is automatically
9286 handled and all other eliminations are valid. */
9288 static bool
9289 ix86_can_eliminate (const int from, const int to)
9291 if (stack_realign_fp)
9292 return ((from == ARG_POINTER_REGNUM
9293 && to == HARD_FRAME_POINTER_REGNUM)
9294 || (from == FRAME_POINTER_REGNUM
9295 && to == STACK_POINTER_REGNUM));
9296 else
9297 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
9300 /* Return the offset between two registers, one to be eliminated, and the other
9301 its replacement, at the start of a routine. */
9303 HOST_WIDE_INT
9304 ix86_initial_elimination_offset (int from, int to)
9306 struct ix86_frame frame;
9307 ix86_compute_frame_layout (&frame);
9309 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
9310 return frame.hard_frame_pointer_offset;
9311 else if (from == FRAME_POINTER_REGNUM
9312 && to == HARD_FRAME_POINTER_REGNUM)
9313 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
9314 else
9316 gcc_assert (to == STACK_POINTER_REGNUM);
9318 if (from == ARG_POINTER_REGNUM)
9319 return frame.stack_pointer_offset;
9321 gcc_assert (from == FRAME_POINTER_REGNUM);
9322 return frame.stack_pointer_offset - frame.frame_pointer_offset;
9326 /* In a dynamically-aligned function, we can't know the offset from
9327 stack pointer to frame pointer, so we must ensure that setjmp
9328 eliminates fp against the hard fp (%ebp) rather than trying to
9329 index from %esp up to the top of the frame across a gap that is
9330 of unknown (at compile-time) size. */
9331 static rtx
9332 ix86_builtin_setjmp_frame_value (void)
9334 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
9337 /* When using -fsplit-stack, the allocation routines set a field in
9338 the TCB to the bottom of the stack plus this much space, measured
9339 in bytes. */
9341 #define SPLIT_STACK_AVAILABLE 256
9343 /* Fill structure ix86_frame about frame of currently computed function. */
9345 static void
9346 ix86_compute_frame_layout (struct ix86_frame *frame)
9348 unsigned HOST_WIDE_INT stack_alignment_needed;
9349 HOST_WIDE_INT offset;
9350 unsigned HOST_WIDE_INT preferred_alignment;
9351 HOST_WIDE_INT size = get_frame_size ();
9352 HOST_WIDE_INT to_allocate;
9354 frame->nregs = ix86_nsaved_regs ();
9355 frame->nsseregs = ix86_nsaved_sseregs ();
9357 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
9358 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
9360 /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
9361 except for function prologues and leaf functions. */
9362 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
9363 && (!crtl->is_leaf || cfun->calls_alloca != 0
9364 || ix86_current_function_calls_tls_descriptor))
9366 preferred_alignment = 16;
9367 stack_alignment_needed = 16;
9368 crtl->preferred_stack_boundary = 128;
9369 crtl->stack_alignment_needed = 128;
9372 gcc_assert (!size || stack_alignment_needed);
9373 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
9374 gcc_assert (preferred_alignment <= stack_alignment_needed);
9376 /* For SEH we have to limit the amount of code movement into the prologue.
9377 At present we do this via a BLOCKAGE, at which point there's very little
9378 scheduling that can be done, which means that there's very little point
9379 in doing anything except PUSHs. */
9380 if (TARGET_SEH)
9381 cfun->machine->use_fast_prologue_epilogue = false;
9383 /* During a reload iteration the number of registers saved can change.
9384 Recompute the value as needed. Do not recompute when the number of registers
9385 didn't change, as reload makes multiple calls to this function and does not
9386 expect the decision to change within a single iteration. */
9387 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun))
9388 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
9390 int count = frame->nregs;
9391 struct cgraph_node *node = cgraph_get_node (current_function_decl);
9393 cfun->machine->use_fast_prologue_epilogue_nregs = count;
9395 /* The fast prologue uses move instead of push to save registers. This
9396 is significantly longer, but also executes faster as modern hardware
9397 can execute the moves in parallel, but can't do that for push/pop.
9399 Be careful about choosing which prologue to emit: when the function takes
9400 many instructions to execute we may use the slow version, as well as
9401 when the function is known to be outside a hot spot (this is known with
9402 feedback only). Weight the size of the function by the number of registers
9403 to save, as it is cheap to use one or two push instructions but very
9404 slow to use many of them. */
9405 if (count)
9406 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9407 if (node->frequency < NODE_FREQUENCY_NORMAL
9408 || (flag_branch_probabilities
9409 && node->frequency < NODE_FREQUENCY_HOT))
9410 cfun->machine->use_fast_prologue_epilogue = false;
9411 else
9412 cfun->machine->use_fast_prologue_epilogue
9413 = !expensive_function_p (count);
9416 frame->save_regs_using_mov
9417 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9418 /* If static stack checking is enabled and done with probes,
9419 the registers need to be saved before allocating the frame. */
9420 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9422 /* Skip return address. */
9423 offset = UNITS_PER_WORD;
9425 /* Skip pushed static chain. */
9426 if (ix86_static_chain_on_stack)
9427 offset += UNITS_PER_WORD;
9429 /* Skip saved base pointer. */
9430 if (frame_pointer_needed)
9431 offset += UNITS_PER_WORD;
9432 frame->hfp_save_offset = offset;
9434 /* The traditional frame pointer location is at the top of the frame. */
9435 frame->hard_frame_pointer_offset = offset;
9437 /* Register save area */
9438 offset += frame->nregs * UNITS_PER_WORD;
9439 frame->reg_save_offset = offset;
9441 /* On SEH target, registers are pushed just before the frame pointer
9442 location. */
9443 if (TARGET_SEH)
9444 frame->hard_frame_pointer_offset = offset;
9446 /* Align and set SSE register save area. */
9447 if (frame->nsseregs)
9449 /* The only ABI that has saved SSE registers (Win64) also has a
9450 16-byte aligned default stack, and thus we don't need to be
9451 within the re-aligned local stack frame to save them. */
9452 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9453 offset = (offset + 16 - 1) & -16;
9454 offset += frame->nsseregs * 16;
9456 frame->sse_reg_save_offset = offset;
9458 /* The re-aligned stack starts here. Values before this point are not
9459 directly comparable with values below this point. In order to make
9460 sure that no value happens to be the same before and after, force
9461 the alignment computation below to add a non-zero value. */
9462 if (stack_realign_fp)
9463 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9465 /* Va-arg area */
9466 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9467 offset += frame->va_arg_size;
9469 /* Align start of frame for local function. */
9470 if (stack_realign_fp
9471 || offset != frame->sse_reg_save_offset
9472 || size != 0
9473 || !crtl->is_leaf
9474 || cfun->calls_alloca
9475 || ix86_current_function_calls_tls_descriptor)
9476 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9478 /* Frame pointer points here. */
9479 frame->frame_pointer_offset = offset;
9481 offset += size;
9483 /* Add outgoing arguments area. Can be skipped if we eliminated
9484 all the function calls as dead code.
9485 Skipping is however impossible when function calls alloca. Alloca
9486 expander assumes that last crtl->outgoing_args_size
9487 of stack frame are unused. */
9488 if (ACCUMULATE_OUTGOING_ARGS
9489 && (!crtl->is_leaf || cfun->calls_alloca
9490 || ix86_current_function_calls_tls_descriptor))
9492 offset += crtl->outgoing_args_size;
9493 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9495 else
9496 frame->outgoing_arguments_size = 0;
9498 /* Align stack boundary. Only needed if we're calling another function
9499 or using alloca. */
9500 if (!crtl->is_leaf || cfun->calls_alloca
9501 || ix86_current_function_calls_tls_descriptor)
9502 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9504 /* We've reached end of stack frame. */
9505 frame->stack_pointer_offset = offset;
9507 /* Size prologue needs to allocate. */
9508 to_allocate = offset - frame->sse_reg_save_offset;
9510 if ((!to_allocate && frame->nregs <= 1)
9511 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9512 frame->save_regs_using_mov = false;
9514 if (ix86_using_red_zone ()
9515 && crtl->sp_is_unchanging
9516 && crtl->is_leaf
9517 && !ix86_current_function_calls_tls_descriptor)
9519 frame->red_zone_size = to_allocate;
9520 if (frame->save_regs_using_mov)
9521 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9522 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9523 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9525 else
9526 frame->red_zone_size = 0;
9527 frame->stack_pointer_offset -= frame->red_zone_size;
9529 /* The SEH frame pointer location is near the bottom of the frame.
9530 This is enforced by the fact that the difference between the
9531 stack pointer and the frame pointer is limited to 240 bytes in
9532 the unwind data structure. */
9533 if (TARGET_SEH)
9535 HOST_WIDE_INT diff;
9537 /* If we can leave the frame pointer where it is, do so. This also returns
9538 the establisher frame for __builtin_frame_address (0). */
9539 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9540 if (diff <= SEH_MAX_FRAME_SIZE
9541 && (diff > 240 || (diff & 15) != 0)
9542 && !crtl->accesses_prior_frames)
9544 /* Ideally we'd determine what portion of the local stack frame
9545 (within the constraint of the lowest 240) is most heavily used.
9546 But without that complication, simply bias the frame pointer
9547 by 128 bytes so as to maximize the amount of the local stack
9548 frame that is addressable with 8-bit offsets. */
9549 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9554 /* This is semi-inlined memory_address_length, but simplified
9555 since we know that we're always dealing with reg+offset, and
9556 to avoid having to create and discard all that rtl. */
9558 static inline int
9559 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9561 int len = 4;
9563 if (offset == 0)
9565 /* EBP and R13 cannot be encoded without an offset. */
9566 len = (regno == BP_REG || regno == R13_REG);
9568 else if (IN_RANGE (offset, -128, 127))
9569 len = 1;
9571 /* ESP and R12 must be encoded with a SIB byte. */
9572 if (regno == SP_REG || regno == R12_REG)
9573 len++;
9575 return len;
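/* Worked examples (illustrative): (AX_REG, 0) -> 0 address bytes,
   (BP_REG, 0) -> 1 (EBP needs a disp8 of zero), (SP_REG, 16) -> 2
   (SIB byte plus disp8), (AX_REG, 1024) -> 4 (disp32).  */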
9578 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9579 The valid base registers are taken from CFUN->MACHINE->FS. */
9581 static rtx
9582 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9584 const struct machine_function *m = cfun->machine;
9585 rtx base_reg = NULL;
9586 HOST_WIDE_INT base_offset = 0;
9588 if (m->use_fast_prologue_epilogue)
9590 /* Choose the base register most likely to allow the most scheduling
9591 opportunities. Generally FP is valid throughout the function,
9592 while DRAP must be reloaded within the epilogue. But choose either
9593 over the SP due to increased encoding size. */
9595 if (m->fs.fp_valid)
9597 base_reg = hard_frame_pointer_rtx;
9598 base_offset = m->fs.fp_offset - cfa_offset;
9600 else if (m->fs.drap_valid)
9602 base_reg = crtl->drap_reg;
9603 base_offset = 0 - cfa_offset;
9605 else if (m->fs.sp_valid)
9607 base_reg = stack_pointer_rtx;
9608 base_offset = m->fs.sp_offset - cfa_offset;
9611 else
9613 HOST_WIDE_INT toffset;
9614 int len = 16, tlen;
9616 /* Choose the base register with the smallest address encoding.
9617 With a tie, choose FP > DRAP > SP. */
9618 if (m->fs.sp_valid)
9620 base_reg = stack_pointer_rtx;
9621 base_offset = m->fs.sp_offset - cfa_offset;
9622 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9624 if (m->fs.drap_valid)
9626 toffset = 0 - cfa_offset;
9627 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9628 if (tlen <= len)
9630 base_reg = crtl->drap_reg;
9631 base_offset = toffset;
9632 len = tlen;
9635 if (m->fs.fp_valid)
9637 toffset = m->fs.fp_offset - cfa_offset;
9638 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9639 if (tlen <= len)
9641 base_reg = hard_frame_pointer_rtx;
9642 base_offset = toffset;
9643 len = tlen;
9647 gcc_assert (base_reg != NULL);
9649 return plus_constant (Pmode, base_reg, base_offset);
9652 /* Emit code to save registers in the prologue. */
9654 static void
9655 ix86_emit_save_regs (void)
9657 unsigned int regno;
9658 rtx insn;
9660 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9661 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9663 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9664 RTX_FRAME_RELATED_P (insn) = 1;
9668 /* Emit a single register save at CFA - CFA_OFFSET. */
9670 static void
9671 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9672 HOST_WIDE_INT cfa_offset)
9674 struct machine_function *m = cfun->machine;
9675 rtx reg = gen_rtx_REG (mode, regno);
9676 rtx mem, addr, base, insn;
9678 addr = choose_baseaddr (cfa_offset);
9679 mem = gen_frame_mem (mode, addr);
9681 /* For SSE saves, we need to indicate the 128-bit alignment. */
9682 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9684 insn = emit_move_insn (mem, reg);
9685 RTX_FRAME_RELATED_P (insn) = 1;
9687 base = addr;
9688 if (GET_CODE (base) == PLUS)
9689 base = XEXP (base, 0);
9690 gcc_checking_assert (REG_P (base));
9692 /* When saving registers into a re-aligned local stack frame, avoid
9693 any tricky guessing by dwarf2out. */
9694 if (m->fs.realigned)
9696 gcc_checking_assert (stack_realign_drap);
9698 if (regno == REGNO (crtl->drap_reg))
9700 /* A bit of a hack. We force the DRAP register to be saved in
9701 the re-aligned stack frame, which provides us with a copy
9702 of the CFA that will last past the prologue. Install it. */
9703 gcc_checking_assert (cfun->machine->fs.fp_valid);
9704 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9705 cfun->machine->fs.fp_offset - cfa_offset);
9706 mem = gen_rtx_MEM (mode, addr);
9707 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9709 else
9711 /* The frame pointer is a stable reference within the
9712 aligned frame. Use it. */
9713 gcc_checking_assert (cfun->machine->fs.fp_valid);
9714 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9715 cfun->machine->fs.fp_offset - cfa_offset);
9716 mem = gen_rtx_MEM (mode, addr);
9717 add_reg_note (insn, REG_CFA_EXPRESSION,
9718 gen_rtx_SET (VOIDmode, mem, reg));
9722 /* The memory may not be relative to the current CFA register,
9723 which means that we may need to generate a new pattern for
9724 use by the unwind info. */
9725 else if (base != m->fs.cfa_reg)
9727 addr = plus_constant (Pmode, m->fs.cfa_reg,
9728 m->fs.cfa_offset - cfa_offset);
9729 mem = gen_rtx_MEM (mode, addr);
9730 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9734 /* Emit code to save registers using MOV insns.
9735 First register is stored at CFA - CFA_OFFSET. */
9736 static void
9737 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9739 unsigned int regno;
9741 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9742 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9744 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9745 cfa_offset -= UNITS_PER_WORD;
9749 /* Emit code to save SSE registers using MOV insns.
9750 First register is stored at CFA - CFA_OFFSET. */
9751 static void
9752 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9754 unsigned int regno;
9756 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9757 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9759 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9760 cfa_offset -= 16;
9764 static GTY(()) rtx queued_cfa_restores;
9766 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
9767 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9768 Don't add the note if the previously saved value will be left untouched
9769 within stack red-zone till return, as unwinders can find the same value
9770 in the register and on the stack. */
9772 static void
9773 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9775 if (!crtl->shrink_wrapped
9776 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9777 return;
9779 if (insn)
9781 add_reg_note (insn, REG_CFA_RESTORE, reg);
9782 RTX_FRAME_RELATED_P (insn) = 1;
9784 else
9785 queued_cfa_restores
9786 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9789 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9791 static void
9792 ix86_add_queued_cfa_restore_notes (rtx insn)
9794 rtx last;
9795 if (!queued_cfa_restores)
9796 return;
9797 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9799 XEXP (last, 1) = REG_NOTES (insn);
9800 REG_NOTES (insn) = queued_cfa_restores;
9801 queued_cfa_restores = NULL_RTX;
9802 RTX_FRAME_RELATED_P (insn) = 1;
9805 /* Expand prologue or epilogue stack adjustment.
9806 The pattern exists to put a dependency on all ebp-based memory accesses.
9807 STYLE should be negative if instructions should be marked as frame related,
9808 zero if %r11 register is live and cannot be freely used and positive
9809 otherwise. */
9811 static void
9812 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9813 int style, bool set_cfa)
9815 struct machine_function *m = cfun->machine;
9816 rtx insn;
9817 bool add_frame_related_expr = false;
9819 if (Pmode == SImode)
9820 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9821 else if (x86_64_immediate_operand (offset, DImode))
9822 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9823 else
9825 rtx tmp;
9826 /* r11 is used by indirect sibcall return as well, set before the
9827 epilogue and used after the epilogue. */
9828 if (style)
9829 tmp = gen_rtx_REG (DImode, R11_REG);
9830 else
9832 gcc_assert (src != hard_frame_pointer_rtx
9833 && dest != hard_frame_pointer_rtx);
9834 tmp = hard_frame_pointer_rtx;
9836 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9837 if (style < 0)
9838 add_frame_related_expr = true;
9840 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9843 insn = emit_insn (insn);
9844 if (style >= 0)
9845 ix86_add_queued_cfa_restore_notes (insn);
9847 if (set_cfa)
9849 rtx r;
9851 gcc_assert (m->fs.cfa_reg == src);
9852 m->fs.cfa_offset += INTVAL (offset);
9853 m->fs.cfa_reg = dest;
9855 r = gen_rtx_PLUS (Pmode, src, offset);
9856 r = gen_rtx_SET (VOIDmode, dest, r);
9857 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9858 RTX_FRAME_RELATED_P (insn) = 1;
9860 else if (style < 0)
9862 RTX_FRAME_RELATED_P (insn) = 1;
9863 if (add_frame_related_expr)
9865 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9866 r = gen_rtx_SET (VOIDmode, dest, r);
9867 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9871 if (dest == stack_pointer_rtx)
9873 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9874 bool valid = m->fs.sp_valid;
9876 if (src == hard_frame_pointer_rtx)
9878 valid = m->fs.fp_valid;
9879 ooffset = m->fs.fp_offset;
9881 else if (src == crtl->drap_reg)
9883 valid = m->fs.drap_valid;
9884 ooffset = 0;
9886 else
9888 /* Else there are two possibilities: SP itself, which we set
9889 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
9890 taken care of by hand along the eh_return path. */
9891 gcc_checking_assert (src == stack_pointer_rtx
9892 || offset == const0_rtx);
9895 m->fs.sp_offset = ooffset - INTVAL (offset);
9896 m->fs.sp_valid = valid;
9900 /* Find an available register to be used as the dynamic realign argument
9901 pointer register. Such a register will be written in the prologue and
9902 used at the beginning of the body, so it must not be
9903 1. a parameter passing register.
9904 2. the GOT pointer.
9905 We reuse the static-chain register if it is available. Otherwise, we
9906 use DI for i386 and R13 for x86-64. We chose R13 since it has a
9907 shorter encoding.
9909 Return: the regno of the chosen register. */
9911 static unsigned int
9912 find_drap_reg (void)
9914 tree decl = cfun->decl;
9916 if (TARGET_64BIT)
9918 /* Use R13 for a nested function or a function that needs a static chain.
9919 Since a function with a tail call may use any caller-saved
9920 registers in the epilogue, DRAP must not use a caller-saved
9921 register in such a case. */
9922 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9923 return R13_REG;
9925 return R10_REG;
9927 else
9929 /* Use DI for a nested function or a function that needs a static chain.
9930 Since a function with a tail call may use any caller-saved
9931 registers in the epilogue, DRAP must not use a caller-saved
9932 register in such a case. */
9933 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9934 return DI_REG;
9936 /* Reuse static chain register if it isn't used for parameter
9937 passing. */
9938 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9940 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9941 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9942 return CX_REG;
9944 return DI_REG;
9948 /* Return minimum incoming stack alignment. */
9950 static unsigned int
9951 ix86_minimum_incoming_stack_boundary (bool sibcall)
9953 unsigned int incoming_stack_boundary;
9955 /* Prefer the one specified at command line. */
9956 if (ix86_user_incoming_stack_boundary)
9957 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9958 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9959 if -mstackrealign is used, this isn't a sibcall check, and the
9960 estimated stack alignment is 128 bits. */
9961 else if (!sibcall
9962 && !TARGET_64BIT
9963 && ix86_force_align_arg_pointer
9964 && crtl->stack_alignment_estimated == 128)
9965 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9966 else
9967 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9969 /* Incoming stack alignment can be changed on individual functions
9970 via force_align_arg_pointer attribute. We use the smallest
9971 incoming stack boundary. */
9972 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9973 && lookup_attribute (ix86_force_align_arg_pointer_string,
9974 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9975 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9977 /* The incoming stack frame has to be aligned at least at
9978 parm_stack_boundary. */
9979 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9980 incoming_stack_boundary = crtl->parm_stack_boundary;
9982 /* Stack at entrance of main is aligned by runtime. We use the
9983 smallest incoming stack boundary. */
9984 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9985 && DECL_NAME (current_function_decl)
9986 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9987 && DECL_FILE_SCOPE_P (current_function_decl))
9988 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9990 return incoming_stack_boundary;
9993 /* Update incoming stack boundary and estimated stack alignment. */
9995 static void
9996 ix86_update_stack_boundary (void)
9998 ix86_incoming_stack_boundary
9999 = ix86_minimum_incoming_stack_boundary (false);
10001 /* x86_64 varargs need 16-byte stack alignment for the register save
10002 area. */
10003 if (TARGET_64BIT
10004 && cfun->stdarg
10005 && crtl->stack_alignment_estimated < 128)
10006 crtl->stack_alignment_estimated = 128;
10009 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
10010 needed or an rtx for DRAP otherwise. */
10012 static rtx
10013 ix86_get_drap_rtx (void)
10015 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
10016 crtl->need_drap = true;
10018 if (stack_realign_drap)
10020 /* Assign DRAP to vDRAP and return vDRAP. */
10021 unsigned int regno = find_drap_reg ();
10022 rtx drap_vreg;
10023 rtx arg_ptr;
10024 rtx seq, insn;
10026 arg_ptr = gen_rtx_REG (Pmode, regno);
10027 crtl->drap_reg = arg_ptr;
10029 start_sequence ();
10030 drap_vreg = copy_to_reg (arg_ptr);
10031 seq = get_insns ();
10032 end_sequence ();
10034 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
10035 if (!optimize)
10037 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
10038 RTX_FRAME_RELATED_P (insn) = 1;
10040 return drap_vreg;
10042 else
10043 return NULL;
10046 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
10048 static rtx
10049 ix86_internal_arg_pointer (void)
10051 return virtual_incoming_args_rtx;
10054 struct scratch_reg {
10055 rtx reg;
10056 bool saved;
10059 /* Return a short-lived scratch register for use on function entry.
10060 In 32-bit mode, it is valid only after the registers are saved
10061 in the prologue. This register must be released by means of
10062 release_scratch_register_on_entry once it is dead. */
10064 static void
10065 get_scratch_register_on_entry (struct scratch_reg *sr)
10067 int regno;
10069 sr->saved = false;
10071 if (TARGET_64BIT)
10073 /* We always use R11 in 64-bit mode. */
10074 regno = R11_REG;
10076 else
10078 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
10079 bool fastcall_p
10080 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
10081 bool thiscall_p
10082 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
10083 bool static_chain_p = DECL_STATIC_CHAIN (decl);
10084 int regparm = ix86_function_regparm (fntype, decl);
10085 int drap_regno
10086 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
10088 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
10089 for the static chain register. */
10090 if ((regparm < 1 || (fastcall_p && !static_chain_p))
10091 && drap_regno != AX_REG)
10092 regno = AX_REG;
10093 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
10094 for the static chain register. */
10095 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
10096 regno = AX_REG;
10097 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
10098 regno = DX_REG;
10099 /* ecx is the static chain register. */
10100 else if (regparm < 3 && !fastcall_p && !thiscall_p
10101 && !static_chain_p
10102 && drap_regno != CX_REG)
10103 regno = CX_REG;
10104 else if (ix86_save_reg (BX_REG, true))
10105 regno = BX_REG;
10106 /* esi is the static chain register. */
10107 else if (!(regparm == 3 && static_chain_p)
10108 && ix86_save_reg (SI_REG, true))
10109 regno = SI_REG;
10110 else if (ix86_save_reg (DI_REG, true))
10111 regno = DI_REG;
10112 else
10114 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
10115 sr->saved = true;
10119 sr->reg = gen_rtx_REG (Pmode, regno);
10120 if (sr->saved)
10122 rtx insn = emit_insn (gen_push (sr->reg));
10123 RTX_FRAME_RELATED_P (insn) = 1;
10127 /* Release a scratch register obtained from the preceding function. */
10129 static void
10130 release_scratch_register_on_entry (struct scratch_reg *sr)
10132 if (sr->saved)
10134 struct machine_function *m = cfun->machine;
10135 rtx x, insn = emit_insn (gen_pop (sr->reg));
10137 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
10138 RTX_FRAME_RELATED_P (insn) = 1;
10139 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
10140 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10141 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
10142 m->fs.sp_offset -= UNITS_PER_WORD;
10146 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
10148 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
10150 static void
10151 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
10153 /* We skip the probe for the first interval + a small dope of 4 words and
10154 probe that many bytes past the specified size to maintain a protection
10155 area at the bottom of the stack. */
10156 const int dope = 4 * UNITS_PER_WORD;
10157 rtx size_rtx = GEN_INT (size), last;
10159 /* See if we have a constant small number of probes to generate. If so,
10160 that's the easy case. The run-time loop is made up of 11 insns in the
10161 generic case while the compile-time loop is made up of 3+2*(n-1) insns
10162 for n # of intervals. */
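/* For instance (illustrative), with size == 3 * PROBE_INTERVAL the
   unrolled form costs 3 + 2*(3-1) = 7 insns, still cheaper than the
   11-insn run-time loop, which is why sizes up to 5 intervals are
   expanded inline below.  */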
10163 if (size <= 5 * PROBE_INTERVAL)
10165 HOST_WIDE_INT i, adjust;
10166 bool first_probe = true;
10168 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
10169 values of N from 1 until it exceeds SIZE. If only one probe is
10170 needed, this will not generate any code. Then adjust and probe
10171 to PROBE_INTERVAL + SIZE. */
10172 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
10174 if (first_probe)
10176 adjust = 2 * PROBE_INTERVAL + dope;
10177 first_probe = false;
10179 else
10180 adjust = PROBE_INTERVAL;
10182 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10183 plus_constant (Pmode, stack_pointer_rtx,
10184 -adjust)));
10185 emit_stack_probe (stack_pointer_rtx);
10188 if (first_probe)
10189 adjust = size + PROBE_INTERVAL + dope;
10190 else
10191 adjust = size + PROBE_INTERVAL - i;
10193 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10194 plus_constant (Pmode, stack_pointer_rtx,
10195 -adjust)));
10196 emit_stack_probe (stack_pointer_rtx);
10198 /* Adjust back to account for the additional first interval. */
10199 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10200 plus_constant (Pmode, stack_pointer_rtx,
10201 PROBE_INTERVAL + dope)));
10204 /* Otherwise, do the same as above, but in a loop. Note that we must be
10205 extra careful with variables wrapping around because we might be at
10206 the very top (or the very bottom) of the address space and we have
10207 to be able to handle this case properly; in particular, we use an
10208 equality test for the loop condition. */
10209 else
10211 HOST_WIDE_INT rounded_size;
10212 struct scratch_reg sr;
10214 get_scratch_register_on_entry (&sr);
10217 /* Step 1: round SIZE to the previous multiple of the interval. */
10219 rounded_size = size & -PROBE_INTERVAL;
10222 /* Step 2: compute initial and final value of the loop counter. */
10224 /* SP = SP_0 + PROBE_INTERVAL. */
10225 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10226 plus_constant (Pmode, stack_pointer_rtx,
10227 - (PROBE_INTERVAL + dope))));
10229 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
10230 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
10231 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
10232 gen_rtx_PLUS (Pmode, sr.reg,
10233 stack_pointer_rtx)));
10236 /* Step 3: the loop
10238 while (SP != LAST_ADDR)
10240 SP = SP + PROBE_INTERVAL
10241 probe at SP
10244 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
10245 values of N from 1 until it is equal to ROUNDED_SIZE. */
10247 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
10250 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
10251 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
10253 if (size != rounded_size)
10255 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10256 plus_constant (Pmode, stack_pointer_rtx,
10257 rounded_size - size)));
10258 emit_stack_probe (stack_pointer_rtx);
10261 /* Adjust back to account for the additional first interval. */
10262 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10263 plus_constant (Pmode, stack_pointer_rtx,
10264 PROBE_INTERVAL + dope)));
10266 release_scratch_register_on_entry (&sr);
10269 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
10271 /* Even if the stack pointer isn't the CFA register, we need to correctly
10272 describe the adjustments made to it, in particular differentiate the
10273 frame-related ones from the frame-unrelated ones. */
10274 if (size > 0)
10276 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
10277 XVECEXP (expr, 0, 0)
10278 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10279 plus_constant (Pmode, stack_pointer_rtx, -size));
10280 XVECEXP (expr, 0, 1)
10281 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10282 plus_constant (Pmode, stack_pointer_rtx,
10283 PROBE_INTERVAL + dope + size));
10284 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
10285 RTX_FRAME_RELATED_P (last) = 1;
10287 cfun->machine->fs.sp_offset += size;
10290 /* Make sure nothing is scheduled before we are done. */
10291 emit_insn (gen_blockage ());
10294 /* Adjust the stack pointer up to REG while probing it. */
10296 const char *
10297 output_adjust_stack_and_probe (rtx reg)
10299 static int labelno = 0;
10300 char loop_lab[32], end_lab[32];
10301 rtx xops[2];
10303 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10304 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10306 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10308 /* Jump to END_LAB if SP == LAST_ADDR. */
10309 xops[0] = stack_pointer_rtx;
10310 xops[1] = reg;
10311 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10312 fputs ("\tje\t", asm_out_file);
10313 assemble_name_raw (asm_out_file, end_lab);
10314 fputc ('\n', asm_out_file);
10316 /* SP = SP + PROBE_INTERVAL. */
10317 xops[1] = GEN_INT (PROBE_INTERVAL);
10318 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10320 /* Probe at SP. */
10321 xops[1] = const0_rtx;
10322 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
10324 fprintf (asm_out_file, "\tjmp\t");
10325 assemble_name_raw (asm_out_file, loop_lab);
10326 fputc ('\n', asm_out_file);
10328 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10330 return "";
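/* Assuming the usual 4096-byte PROBE_INTERVAL, the loop emitted above
   corresponds to assembly of roughly this shape (illustrative, AT&T
   syntax, 32-bit, internal label spellings vary by target):
       .LPSRL0: cmpl %eax, %esp
                je   .LPSRE0
                subl $4096, %esp
                orl  $0, (%esp)
                jmp  .LPSRL0
       .LPSRE0:  */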
10333 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
10334 inclusive. These are offsets from the current stack pointer. */
10336 static void
10337 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
10339 /* See if we have a constant small number of probes to generate. If so,
10340 that's the easy case. The run-time loop is made up of 7 insns in the
10341 generic case while the compile-time loop is made up of n insns for n #
10342 of intervals. */
10343 if (size <= 7 * PROBE_INTERVAL)
10345 HOST_WIDE_INT i;
10347 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
10348 it exceeds SIZE. If only one probe is needed, this will not
10349 generate any code. Then probe at FIRST + SIZE. */
10350 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
10351 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10352 -(first + i)));
10354 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10355 -(first + size)));
10358 /* Otherwise, do the same as above, but in a loop. Note that we must be
10359 extra careful with variables wrapping around because we might be at
10360 the very top (or the very bottom) of the address space and we have
10361 to be able to handle this case properly; in particular, we use an
10362 equality test for the loop condition. */
10363 else
10365 HOST_WIDE_INT rounded_size, last;
10366 struct scratch_reg sr;
10368 get_scratch_register_on_entry (&sr);
10371 /* Step 1: round SIZE to the previous multiple of the interval. */
10373 rounded_size = size & -PROBE_INTERVAL;
10376 /* Step 2: compute initial and final value of the loop counter. */
10378 /* TEST_OFFSET = FIRST. */
10379 emit_move_insn (sr.reg, GEN_INT (-first));
10381 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
10382 last = first + rounded_size;
10385 /* Step 3: the loop
10387 while (TEST_ADDR != LAST_ADDR)
10389 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
10390 probe at TEST_ADDR
10393 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
10394 until it is equal to ROUNDED_SIZE. */
10396 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
10399 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
10400 that SIZE is equal to ROUNDED_SIZE. */
10402 if (size != rounded_size)
10403 emit_stack_probe (plus_constant (Pmode,
10404 gen_rtx_PLUS (Pmode,
10405 stack_pointer_rtx,
10406 sr.reg),
10407 rounded_size - size));
10409 release_scratch_register_on_entry (&sr);
10412 /* Make sure nothing is scheduled before we are done. */
10413 emit_insn (gen_blockage ());
10416 /* Probe a range of stack addresses from REG to END, inclusive. These are
10417 offsets from the current stack pointer. */
10419 const char *
10420 output_probe_stack_range (rtx reg, rtx end)
10422 static int labelno = 0;
10423 char loop_lab[32], end_lab[32];
10424 rtx xops[3];
10426 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10427 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10429 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10431 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10432 xops[0] = reg;
10433 xops[1] = end;
10434 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10435 fputs ("\tje\t", asm_out_file);
10436 assemble_name_raw (asm_out_file, end_lab);
10437 fputc ('\n', asm_out_file);
10439 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10440 xops[1] = GEN_INT (PROBE_INTERVAL);
10441 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10443 /* Probe at TEST_ADDR. */
10444 xops[0] = stack_pointer_rtx;
10445 xops[1] = reg;
10446 xops[2] = const0_rtx;
10447 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10449 fprintf (asm_out_file, "\tjmp\t");
10450 assemble_name_raw (asm_out_file, loop_lab);
10451 fputc ('\n', asm_out_file);
10453 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10455 return "";
10458 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
10459 to be generated in correct form. */
10460 static void
10461 ix86_finalize_stack_realign_flags (void)
10463 /* Check if stack realignment is really needed after reload, and
10464 store the result in cfun. */
10465 unsigned int incoming_stack_boundary
10466 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10467 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10468 unsigned int stack_realign = (incoming_stack_boundary
10469 < (crtl->is_leaf
10470 ? crtl->max_used_stack_slot_alignment
10471 : crtl->stack_alignment_needed));
10473 if (crtl->stack_realign_finalized)
10475 /* After stack_realign_needed is finalized, we can no longer
10476 change it. */
10477 gcc_assert (crtl->stack_realign_needed == stack_realign);
10478 return;
10481 /* If the only reason for frame_pointer_needed is that we conservatively
10482 assumed stack realignment might be needed, but in the end nothing that
10483 needed the stack alignment had been spilled, clear frame_pointer_needed
10484 and say we don't need stack realignment. */
10485 if (stack_realign
10486 && !crtl->need_drap
10487 && frame_pointer_needed
10488 && crtl->is_leaf
10489 && flag_omit_frame_pointer
10490 && crtl->sp_is_unchanging
10491 && !ix86_current_function_calls_tls_descriptor
10492 && !crtl->accesses_prior_frames
10493 && !cfun->calls_alloca
10494 && !crtl->calls_eh_return
10495 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10496 && !ix86_frame_pointer_required ()
10497 && get_frame_size () == 0
10498 && ix86_nsaved_sseregs () == 0
10499 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10501 HARD_REG_SET set_up_by_prologue, prologue_used;
10502 basic_block bb;
10504 CLEAR_HARD_REG_SET (prologue_used);
10505 CLEAR_HARD_REG_SET (set_up_by_prologue);
10506 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10507 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10508 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10509 HARD_FRAME_POINTER_REGNUM);
10510 FOR_EACH_BB_FN (bb, cfun)
10512 rtx insn;
10513 FOR_BB_INSNS (bb, insn)
10514 if (NONDEBUG_INSN_P (insn)
10515 && requires_stack_frame_p (insn, prologue_used,
10516 set_up_by_prologue))
10518 crtl->stack_realign_needed = stack_realign;
10519 crtl->stack_realign_finalized = true;
10520 return;
10524 frame_pointer_needed = false;
10525 stack_realign = false;
10526 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10527 crtl->stack_alignment_needed = incoming_stack_boundary;
10528 crtl->stack_alignment_estimated = incoming_stack_boundary;
10529 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10530 crtl->preferred_stack_boundary = incoming_stack_boundary;
10531 df_finish_pass (true);
10532 df_scan_alloc (NULL);
10533 df_scan_blocks ();
10534 df_compute_regs_ever_live (true);
10535 df_analyze ();
10538 crtl->stack_realign_needed = stack_realign;
10539 crtl->stack_realign_finalized = true;
10542 /* Expand the prologue into a bunch of separate insns. */
10544 void
10545 ix86_expand_prologue (void)
10547 struct machine_function *m = cfun->machine;
10548 rtx insn, t;
10549 bool pic_reg_used;
10550 struct ix86_frame frame;
10551 HOST_WIDE_INT allocate;
10552 bool int_registers_saved;
10553 bool sse_registers_saved;
10555 ix86_finalize_stack_realign_flags ();
10557 /* DRAP should not coexist with stack_realign_fp */
10558 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10560 memset (&m->fs, 0, sizeof (m->fs));
10562 /* Initialize CFA state for before the prologue. */
10563 m->fs.cfa_reg = stack_pointer_rtx;
10564 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10566 /* Track SP offset to the CFA. We continue tracking this after we've
10567 swapped the CFA register away from SP. In the case of re-alignment
10568 this is fudged; we're interested in offsets within the local frame. */
10569 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10570 m->fs.sp_valid = true;
10572 ix86_compute_frame_layout (&frame);
10574 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10576 /* We should have already generated an error for any use of
10577 ms_hook on a nested function. */
10578 gcc_checking_assert (!ix86_static_chain_on_stack);
10580 /* Check whether profiling is active and we would use the profiling-before-
10581 prologue variant. If so, issue a sorry. */
10582 if (crtl->profile && flag_fentry != 0)
10583 sorry ("ms_hook_prologue attribute isn%'t compatible "
10584 "with -mfentry for 32-bit");
10586 /* In ix86_asm_output_function_label we emitted:
10587 8b ff movl.s %edi,%edi
10588 55 push %ebp
10589 8b ec movl.s %esp,%ebp
10591 This matches the hookable function prologue in Win32 API
10592 functions in Microsoft Windows XP Service Pack 2 and newer.
10593 Wine uses this to enable Windows apps to hook the Win32 API
10594 functions provided by Wine.
10596 What that means is that we've already set up the frame pointer. */
10598 if (frame_pointer_needed
10599 && !(crtl->drap_reg && crtl->stack_realign_needed))
10601 rtx push, mov;
10603 /* We've decided to use the frame pointer already set up.
10604 Describe this to the unwinder by pretending that both
10605 push and mov insns happen right here.
10607 Putting the unwind info here at the end of the ms_hook
10608 is done so that we can make absolutely certain we get
10609 the required byte sequence at the start of the function,
10610 rather than relying on an assembler that can produce
10611 the exact encoding required.
10613 However it does mean (in the unpatched case) that we have
10614 a 1 insn window where the asynchronous unwind info is
10615 incorrect. However, if we placed the unwind info at
10616 its correct location we would have incorrect unwind info
10617 in the patched case. Which is probably all moot since
10618 I don't expect Wine generates dwarf2 unwind info for the
10619 system libraries that use this feature. */
10621 insn = emit_insn (gen_blockage ());
10623 push = gen_push (hard_frame_pointer_rtx);
10624 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10625 stack_pointer_rtx);
10626 RTX_FRAME_RELATED_P (push) = 1;
10627 RTX_FRAME_RELATED_P (mov) = 1;
10629 RTX_FRAME_RELATED_P (insn) = 1;
10630 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10631 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10633 /* Note that gen_push incremented m->fs.cfa_offset, even
10634 though we didn't emit the push insn here. */
10635 m->fs.cfa_reg = hard_frame_pointer_rtx;
10636 m->fs.fp_offset = m->fs.cfa_offset;
10637 m->fs.fp_valid = true;
10639 else
10641 /* The frame pointer is not needed so pop %ebp again.
10642 This leaves us with a pristine state. */
10643 emit_insn (gen_pop (hard_frame_pointer_rtx));
10647 /* The first insn of a function that accepts its static chain on the
10648 stack is to push the register that would be filled in by a direct
10649 call. This insn will be skipped by the trampoline. */
10650 else if (ix86_static_chain_on_stack)
10652 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10653 emit_insn (gen_blockage ());
10655 /* We don't want to interpret this push insn as a register save,
10656 only as a stack adjustment. The real copy of the register as
10657 a save will be done later, if needed. */
10658 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10659 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10660 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10661 RTX_FRAME_RELATED_P (insn) = 1;
10664 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10665 DRAP is needed and stack realignment is really needed after reload. */
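/* DRAP is the dynamic realign argument pointer: a register that preserves
   the incoming argument-pointer value so that incoming arguments remain
   addressable after the stack pointer has been realigned.  */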
10666 if (stack_realign_drap)
10668 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10670 /* Only need to push parameter pointer reg if it is caller saved. */
10671 if (!call_used_regs[REGNO (crtl->drap_reg)])
10673 /* Push arg pointer reg */
10674 insn = emit_insn (gen_push (crtl->drap_reg));
10675 RTX_FRAME_RELATED_P (insn) = 1;
10678 /* Grab the argument pointer. */
10679 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10680 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10681 RTX_FRAME_RELATED_P (insn) = 1;
10682 m->fs.cfa_reg = crtl->drap_reg;
10683 m->fs.cfa_offset = 0;
10685 /* Align the stack. */
10686 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10687 stack_pointer_rtx,
10688 GEN_INT (-align_bytes)));
10689 RTX_FRAME_RELATED_P (insn) = 1;
10691 /* Replicate the return address on the stack so that the return
10692 address can be reached via the (argp - 1) slot. This is needed
10693 to implement the macro RETURN_ADDR_RTX and the intrinsic function
10694 expand_builtin_return_addr, etc. */
10695 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10696 t = gen_frame_mem (word_mode, t);
10697 insn = emit_insn (gen_push (t));
10698 RTX_FRAME_RELATED_P (insn) = 1;
10700 /* For the purposes of frame and register save area addressing,
10701 we've started over with a new frame. */
10702 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10703 m->fs.realigned = true;
10706 int_registers_saved = (frame.nregs == 0);
10707 sse_registers_saved = (frame.nsseregs == 0);
10709 if (frame_pointer_needed && !m->fs.fp_valid)
10711 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10712 slower on all targets. Also sdb doesn't like it. */
10713 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10714 RTX_FRAME_RELATED_P (insn) = 1;
10716 /* Push registers now, before setting the frame pointer
10717 on SEH target. */
10718 if (!int_registers_saved
10719 && TARGET_SEH
10720 && !frame.save_regs_using_mov)
10722 ix86_emit_save_regs ();
10723 int_registers_saved = true;
10724 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10727 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10729 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10730 RTX_FRAME_RELATED_P (insn) = 1;
10732 if (m->fs.cfa_reg == stack_pointer_rtx)
10733 m->fs.cfa_reg = hard_frame_pointer_rtx;
10734 m->fs.fp_offset = m->fs.sp_offset;
10735 m->fs.fp_valid = true;
10739 if (!int_registers_saved)
10741 /* If saving registers via PUSH, do so now. */
10742 if (!frame.save_regs_using_mov)
10744 ix86_emit_save_regs ();
10745 int_registers_saved = true;
10746 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10749 /* When using the red zone we may start register saving before allocating
10750 the stack frame, saving one cycle of the prologue. However, avoid
10751 doing this if we have to probe the stack; at least on x86_64 the
10752 stack probe can turn into a call that clobbers a red zone location. */
10753 else if (ix86_using_red_zone ()
10754 && (! TARGET_STACK_PROBE
10755 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10757 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10758 int_registers_saved = true;
10762 if (stack_realign_fp)
10764 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10765 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10767 /* The computation of the size of the re-aligned stack frame means
10768 that we must allocate the size of the register save area before
10769 performing the actual alignment. Otherwise we cannot guarantee
10770 that there's enough storage above the realignment point. */
10771 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10772 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10773 GEN_INT (m->fs.sp_offset
10774 - frame.sse_reg_save_offset),
10775 -1, false);
10777 /* Align the stack. */
10778 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10779 stack_pointer_rtx,
10780 GEN_INT (-align_bytes)));
10782 /* For the purposes of register save area addressing, the stack
10783 pointer is no longer valid. As for the value of sp_offset,
10784 see ix86_compute_frame_layout, which we need to match in order
10785 to pass verification of stack_pointer_offset at the end. */
10786 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10787 m->fs.sp_valid = false;
10790 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10792 if (flag_stack_usage_info)
10794 /* We start to count from ARG_POINTER. */
10795 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10797 /* If it was realigned, take into account the fake frame. */
10798 if (stack_realign_drap)
10800 if (ix86_static_chain_on_stack)
10801 stack_size += UNITS_PER_WORD;
10803 if (!call_used_regs[REGNO (crtl->drap_reg)])
10804 stack_size += UNITS_PER_WORD;
10806 /* This over-estimates by 1 minimal-stack-alignment-unit but
10807 mitigates that by counting in the new return address slot. */
10808 current_function_dynamic_stack_size
10809 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10812 current_function_static_stack_size = stack_size;
10815 /* On SEH target with very large frame size, allocate an area to save
10816 SSE registers (as the very large allocation won't be described). */
10817 if (TARGET_SEH
10818 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10819 && !sse_registers_saved)
10821 HOST_WIDE_INT sse_size =
10822 frame.sse_reg_save_offset - frame.reg_save_offset;
10824 gcc_assert (int_registers_saved);
10826 /* No need to do stack checking as the area will be immediately
10827 written. */
10828 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10829 GEN_INT (-sse_size), -1,
10830 m->fs.cfa_reg == stack_pointer_rtx);
10831 allocate -= sse_size;
10832 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10833 sse_registers_saved = true;
10836 /* The stack has already been decremented by the instruction calling us
10837 so probe if the size is non-negative to preserve the protection area. */
10838 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10840 /* We expect the registers to be saved when probes are used. */
10841 gcc_assert (int_registers_saved);
10843 if (STACK_CHECK_MOVING_SP)
10845 if (!(crtl->is_leaf && !cfun->calls_alloca
10846 && allocate <= PROBE_INTERVAL))
10848 ix86_adjust_stack_and_probe (allocate);
10849 allocate = 0;
10852 else
10854 HOST_WIDE_INT size = allocate;
10856 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10857 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10859 if (TARGET_STACK_PROBE)
10861 if (crtl->is_leaf && !cfun->calls_alloca)
10863 if (size > PROBE_INTERVAL)
10864 ix86_emit_probe_stack_range (0, size);
10866 else
10867 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10869 else
10871 if (crtl->is_leaf && !cfun->calls_alloca)
10873 if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT)
10874 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT,
10875 size - STACK_CHECK_PROTECT);
10877 else
10878 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10883 if (allocate == 0)
10885 else if (!ix86_target_stack_probe ()
10886 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10888 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10889 GEN_INT (-allocate), -1,
10890 m->fs.cfa_reg == stack_pointer_rtx);
10892 else
10894 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10895 rtx r10 = NULL;
10896 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10897 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
10898 bool eax_live = false;
10899 bool r10_live = false;
10901 if (TARGET_64BIT)
10902 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10903 if (!TARGET_64BIT_MS_ABI)
10904 eax_live = ix86_eax_live_at_start_p ();
10906 /* Note that SEH directives need to continue tracking the stack
10907 pointer even after the frame pointer has been set up. */
10908 if (eax_live)
10910 insn = emit_insn (gen_push (eax));
10911 allocate -= UNITS_PER_WORD;
10912 if (sp_is_cfa_reg || TARGET_SEH)
10914 if (sp_is_cfa_reg)
10915 m->fs.cfa_offset += UNITS_PER_WORD;
10916 RTX_FRAME_RELATED_P (insn) = 1;
10920 if (r10_live)
10922 r10 = gen_rtx_REG (Pmode, R10_REG);
10923 insn = emit_insn (gen_push (r10));
10924 allocate -= UNITS_PER_WORD;
10925 if (sp_is_cfa_reg || TARGET_SEH)
10927 if (sp_is_cfa_reg)
10928 m->fs.cfa_offset += UNITS_PER_WORD;
10929 RTX_FRAME_RELATED_P (insn) = 1;
10933 emit_move_insn (eax, GEN_INT (allocate));
10934 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10936 /* Use the fact that AX still contains ALLOCATE. */
10937 adjust_stack_insn = (Pmode == DImode
10938 ? gen_pro_epilogue_adjust_stack_di_sub
10939 : gen_pro_epilogue_adjust_stack_si_sub);
10941 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10942 stack_pointer_rtx, eax));
10944 if (sp_is_cfa_reg || TARGET_SEH)
10946 if (sp_is_cfa_reg)
10947 m->fs.cfa_offset += allocate;
10948 RTX_FRAME_RELATED_P (insn) = 1;
10949 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10950 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10951 plus_constant (Pmode, stack_pointer_rtx,
10952 -allocate)));
10954 m->fs.sp_offset += allocate;
10956 /* Use stack_pointer_rtx for relative addressing so that code
10957 works for realigned stack, too. */
10958 if (r10_live && eax_live)
10960 t = plus_constant (Pmode, stack_pointer_rtx, allocate);
10961 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10962 gen_frame_mem (word_mode, t));
10963 t = plus_constant (Pmode, stack_pointer_rtx,
10964 allocate - UNITS_PER_WORD);
10965 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10966 gen_frame_mem (word_mode, t));
10968 else if (eax_live || r10_live)
10970 t = plus_constant (Pmode, stack_pointer_rtx, allocate);
10971 emit_move_insn (gen_rtx_REG (word_mode,
10972 (eax_live ? AX_REG : R10_REG)),
10973 gen_frame_mem (word_mode, t));
10976 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10978 /* If we haven't already set up the frame pointer, do so now. */
10979 if (frame_pointer_needed && !m->fs.fp_valid)
10981 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10982 GEN_INT (frame.stack_pointer_offset
10983 - frame.hard_frame_pointer_offset));
10984 insn = emit_insn (insn);
10985 RTX_FRAME_RELATED_P (insn) = 1;
10986 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10988 if (m->fs.cfa_reg == stack_pointer_rtx)
10989 m->fs.cfa_reg = hard_frame_pointer_rtx;
10990 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10991 m->fs.fp_valid = true;
10994 if (!int_registers_saved)
10995 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10996 if (!sse_registers_saved)
10997 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10999 pic_reg_used = false;
11000 /* We don't use pic-register for pe-coff target. */
11001 if (pic_offset_table_rtx
11002 && !TARGET_PECOFF
11003 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
11004 || crtl->profile))
11006 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
11008 if (alt_pic_reg_used != INVALID_REGNUM)
11009 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
11011 pic_reg_used = true;
11014 if (pic_reg_used)
11016 if (TARGET_64BIT)
11018 if (ix86_cmodel == CM_LARGE_PIC)
11020 rtx label, tmp_reg;
11022 gcc_assert (Pmode == DImode);
11023 label = gen_label_rtx ();
11024 emit_label (label);
11025 LABEL_PRESERVE_P (label) = 1;
11026 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
11027 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
11028 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
11029 label));
11030 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
11031 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
11032 pic_offset_table_rtx, tmp_reg));
11034 else
11035 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
11037 else
11039 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
11040 RTX_FRAME_RELATED_P (insn) = 1;
11041 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
11045 /* In the pic_reg_used case, make sure that the got load isn't deleted
11046 when mcount needs it. Blockage to avoid call movement across mcount
11047 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
11048 note. */
11049 if (crtl->profile && !flag_fentry && pic_reg_used)
11050 emit_insn (gen_prologue_use (pic_offset_table_rtx));
11052 if (crtl->drap_reg && !crtl->stack_realign_needed)
11054 /* vDRAP is set up, but after reload it turns out stack realignment
11055 isn't necessary; here we emit the prologue to set up DRAP
11056 without the stack realignment adjustment. */
11057 t = choose_baseaddr (0);
11058 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
11061 /* Prevent instructions from being scheduled into the register save push
11062 sequence when access to the red-zone area is done through the frame pointer.
11063 The offset between the frame pointer and the stack pointer is calculated
11064 relative to the value of the stack pointer at the end of the function
11065 prologue, and moving instructions that access the red-zone area via the frame
11066 pointer inside the push sequence violates this assumption. */
11067 if (frame_pointer_needed && frame.red_zone_size)
11068 emit_insn (gen_memory_blockage ());
11070 /* Emit cld instruction if stringops are used in the function. */
11071 if (TARGET_CLD && ix86_current_function_needs_cld)
11072 emit_insn (gen_cld ());
11074 /* SEH requires that the prologue end within 256 bytes of the start of
11075 the function. Prevent instruction schedules that would extend that.
11076 Further, prevent alloca modifications to the stack pointer from being
11077 combined with prologue modifications. */
11078 if (TARGET_SEH)
11079 emit_insn (gen_prologue_use (stack_pointer_rtx));
11082 /* Emit code to restore REG using a POP insn. */
11084 static void
11085 ix86_emit_restore_reg_using_pop (rtx reg)
11087 struct machine_function *m = cfun->machine;
11088 rtx insn = emit_insn (gen_pop (reg));
11090 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
11091 m->fs.sp_offset -= UNITS_PER_WORD;
11093 if (m->fs.cfa_reg == crtl->drap_reg
11094 && REGNO (reg) == REGNO (crtl->drap_reg))
11096 /* Previously we'd represented the CFA as an expression
11097 like *(%ebp - 8). We've just popped that value from
11098 the stack, which means we need to reset the CFA to
11099 the drap register. This will remain until we restore
11100 the stack pointer. */
11101 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
11102 RTX_FRAME_RELATED_P (insn) = 1;
11104 /* This means that the DRAP register is valid for addressing too. */
11105 m->fs.drap_valid = true;
11106 return;
11109 if (m->fs.cfa_reg == stack_pointer_rtx)
11111 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
11112 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
11113 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
11114 RTX_FRAME_RELATED_P (insn) = 1;
11116 m->fs.cfa_offset -= UNITS_PER_WORD;
11119 /* When the frame pointer is the CFA, and we pop it, we are
11120 swapping back to the stack pointer as the CFA. This happens
11121 for stack frames that don't allocate other data, so we assume
11122 the stack pointer is now pointing at the return address, i.e.
11123 the function entry state, which makes the offset be 1 word. */
11124 if (reg == hard_frame_pointer_rtx)
11126 m->fs.fp_valid = false;
11127 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
11129 m->fs.cfa_reg = stack_pointer_rtx;
11130 m->fs.cfa_offset -= UNITS_PER_WORD;
11132 add_reg_note (insn, REG_CFA_DEF_CFA,
11133 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11134 GEN_INT (m->fs.cfa_offset)));
11135 RTX_FRAME_RELATED_P (insn) = 1;
11140 /* Emit code to restore saved registers using POP insns. */
11142 static void
11143 ix86_emit_restore_regs_using_pop (void)
11145 unsigned int regno;
11147 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11148 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
11149 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
11152 /* Emit code and notes for the LEAVE instruction. */
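/* Note: "leave" is equivalent to "mov %ebp, %esp; pop %ebp" (the movq/popq
   forms with %rbp/%rsp in 64-bit mode), which is why after it the stack
   pointer is valid again at fp_offset - UNITS_PER_WORD and the frame
   pointer no longer is.  */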
11154 static void
11155 ix86_emit_leave (void)
11157 struct machine_function *m = cfun->machine;
11158 rtx insn = emit_insn (ix86_gen_leave ());
11160 ix86_add_queued_cfa_restore_notes (insn);
11162 gcc_assert (m->fs.fp_valid);
11163 m->fs.sp_valid = true;
11164 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
11165 m->fs.fp_valid = false;
11167 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
11169 m->fs.cfa_reg = stack_pointer_rtx;
11170 m->fs.cfa_offset = m->fs.sp_offset;
11172 add_reg_note (insn, REG_CFA_DEF_CFA,
11173 plus_constant (Pmode, stack_pointer_rtx,
11174 m->fs.sp_offset));
11175 RTX_FRAME_RELATED_P (insn) = 1;
11177 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
11178 m->fs.fp_offset);
11181 /* Emit code to restore saved registers using MOV insns.
11182 First register is restored from CFA - CFA_OFFSET. */
11183 static void
11184 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
11185 bool maybe_eh_return)
11187 struct machine_function *m = cfun->machine;
11188 unsigned int regno;
11190 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11191 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
11193 rtx reg = gen_rtx_REG (word_mode, regno);
11194 rtx insn, mem;
11196 mem = choose_baseaddr (cfa_offset);
11197 mem = gen_frame_mem (word_mode, mem);
11198 insn = emit_move_insn (reg, mem);
11200 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
11202 /* Previously we'd represented the CFA as an expression
11203 like *(%ebp - 8). We've just loaded that value from
11204 the stack, which means we need to reset the CFA to
11205 the drap register. This will remain until we restore
11206 the stack pointer. */
11207 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
11208 RTX_FRAME_RELATED_P (insn) = 1;
11210 /* This means that the DRAP register is valid for addressing. */
11211 m->fs.drap_valid = true;
11213 else
11214 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
11216 cfa_offset -= UNITS_PER_WORD;
11220 /* Emit code to restore saved registers using MOV insns.
11221 First register is restored from CFA - CFA_OFFSET. */
11222 static void
11223 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
11224 bool maybe_eh_return)
11226 unsigned int regno;
11228 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11229 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
11231 rtx reg = gen_rtx_REG (V4SFmode, regno);
11232 rtx mem;
11234 mem = choose_baseaddr (cfa_offset);
11235 mem = gen_rtx_MEM (V4SFmode, mem);
11236 set_mem_align (mem, 128);
11237 emit_move_insn (reg, mem);
11239 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
11241 cfa_offset -= 16;
11245 /* Restore function stack, frame, and registers. */
11247 void
11248 ix86_expand_epilogue (int style)
11250 struct machine_function *m = cfun->machine;
11251 struct machine_frame_state frame_state_save = m->fs;
11252 struct ix86_frame frame;
11253 bool restore_regs_via_mov;
11254 bool using_drap;
11256 ix86_finalize_stack_realign_flags ();
11257 ix86_compute_frame_layout (&frame);
11259 m->fs.sp_valid = (!frame_pointer_needed
11260 || (crtl->sp_is_unchanging
11261 && !stack_realign_fp));
11262 gcc_assert (!m->fs.sp_valid
11263 || m->fs.sp_offset == frame.stack_pointer_offset);
11265 /* The FP must be valid if the frame pointer is present. */
11266 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
11267 gcc_assert (!m->fs.fp_valid
11268 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
11270 /* We must have *some* valid pointer to the stack frame. */
11271 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
11273 /* The DRAP is never valid at this point. */
11274 gcc_assert (!m->fs.drap_valid);
11276 /* See the comment about red zone and frame
11277 pointer usage in ix86_expand_prologue. */
11278 if (frame_pointer_needed && frame.red_zone_size)
11279 emit_insn (gen_memory_blockage ());
11281 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
11282 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
11284 /* Determine the CFA offset of the end of the red-zone. */
11285 m->fs.red_zone_offset = 0;
11286 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
11288 /* The red-zone begins below the return address. */
11289 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
11291 /* When the register save area is in the aligned portion of
11292 the stack, determine the maximum runtime displacement that
11293 matches up with the aligned frame. */
11294 if (stack_realign_drap)
11295 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
11296 + UNITS_PER_WORD);
11299 /* Special care must be taken for the normal return case of a function
11300 using eh_return: the eax and edx registers are marked as saved, but
11301 not restored along this path. Adjust the save location to match. */
11302 if (crtl->calls_eh_return && style != 2)
11303 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
11305 /* EH_RETURN requires the use of moves to function properly. */
11306 if (crtl->calls_eh_return)
11307 restore_regs_via_mov = true;
11308 /* SEH requires the use of pops to identify the epilogue. */
11309 else if (TARGET_SEH)
11310 restore_regs_via_mov = false;
11311 /* If we're only restoring one register and sp is not valid, then
11312 use a move instruction to restore the register, since it's
11313 less work than reloading sp and popping the register. */
11314 else if (!m->fs.sp_valid && frame.nregs <= 1)
11315 restore_regs_via_mov = true;
11316 else if (TARGET_EPILOGUE_USING_MOVE
11317 && cfun->machine->use_fast_prologue_epilogue
11318 && (frame.nregs > 1
11319 || m->fs.sp_offset != frame.reg_save_offset))
11320 restore_regs_via_mov = true;
11321 else if (frame_pointer_needed
11322 && !frame.nregs
11323 && m->fs.sp_offset != frame.reg_save_offset)
11324 restore_regs_via_mov = true;
11325 else if (frame_pointer_needed
11326 && TARGET_USE_LEAVE
11327 && cfun->machine->use_fast_prologue_epilogue
11328 && frame.nregs == 1)
11329 restore_regs_via_mov = true;
11330 else
11331 restore_regs_via_mov = false;
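/* In short: eh_return forces moves and SEH forces pops; otherwise moves are
   chosen when the stack pointer is unusable or when the tuning and frame
   shape make a single stack adjustment plus moves cheaper than repositioning
   the stack pointer and popping.  */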
11333 if (restore_regs_via_mov || frame.nsseregs)
11335 /* Ensure that the entire register save area is addressable via
11336 the stack pointer, if we will restore via sp. */
11337 if (TARGET_64BIT
11338 && m->fs.sp_offset > 0x7fffffff
11339 && !(m->fs.fp_valid || m->fs.drap_valid)
11340 && (frame.nsseregs + frame.nregs) != 0)
11342 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11343 GEN_INT (m->fs.sp_offset
11344 - frame.sse_reg_save_offset),
11345 style,
11346 m->fs.cfa_reg == stack_pointer_rtx);
11350 /* If there are any SSE registers to restore, then we have to do it
11351 via moves, since there's obviously no pop for SSE regs. */
11352 if (frame.nsseregs)
11353 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
11354 style == 2);
11356 if (restore_regs_via_mov)
11358 rtx t;
11360 if (frame.nregs)
11361 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
11363 /* eh_return epilogues need %ecx added to the stack pointer. */
11364 if (style == 2)
11366 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
11368 /* Stack align doesn't work with eh_return. */
11369 gcc_assert (!stack_realign_drap);
11370 /* Neither do regparm nested functions. */
11371 gcc_assert (!ix86_static_chain_on_stack);
11373 if (frame_pointer_needed)
11375 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
11376 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
11377 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
11379 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
11380 insn = emit_move_insn (hard_frame_pointer_rtx, t);
11382 /* Note that we use SA as a temporary CFA, as the return
11383 address is at the proper place relative to it. We
11384 pretend this happens at the FP restore insn because
11385 prior to this insn the FP would be stored at the wrong
11386 offset relative to SA, and after this insn we have no
11387 other reasonable register to use for the CFA. We don't
11388 bother resetting the CFA to the SP for the duration of
11389 the return insn. */
11390 add_reg_note (insn, REG_CFA_DEF_CFA,
11391 plus_constant (Pmode, sa, UNITS_PER_WORD));
11392 ix86_add_queued_cfa_restore_notes (insn);
11393 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
11394 RTX_FRAME_RELATED_P (insn) = 1;
11396 m->fs.cfa_reg = sa;
11397 m->fs.cfa_offset = UNITS_PER_WORD;
11398 m->fs.fp_valid = false;
11400 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
11401 const0_rtx, style, false);
11403 else
11405 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
11406 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
11407 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
11408 ix86_add_queued_cfa_restore_notes (insn);
11410 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
11411 if (m->fs.cfa_offset != UNITS_PER_WORD)
11413 m->fs.cfa_offset = UNITS_PER_WORD;
11414 add_reg_note (insn, REG_CFA_DEF_CFA,
11415 plus_constant (Pmode, stack_pointer_rtx,
11416 UNITS_PER_WORD));
11417 RTX_FRAME_RELATED_P (insn) = 1;
11420 m->fs.sp_offset = UNITS_PER_WORD;
11421 m->fs.sp_valid = true;
11424 else
11426 /* SEH requires that the function end with (1) a stack adjustment
11427 if necessary, (2) a sequence of pops, and (3) a return or
11428 jump instruction. Prevent insns from the function body from
11429 being scheduled into this sequence. */
11430 if (TARGET_SEH)
11432 /* Prevent a catch region from being adjacent to the standard
11433 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
11434 several other flags that would be interesting to test are
11435 yet set up. */
11436 if (flag_non_call_exceptions)
11437 emit_insn (gen_nops (const1_rtx));
11438 else
11439 emit_insn (gen_blockage ());
11442 /* The first step is to deallocate the stack frame so that we can
11443 pop the registers. Also do it on SEH targets for a very large
11444 frame, as the emitted instructions aren't allowed by the ABI in
11445 epilogues. */
11446 if (!m->fs.sp_valid
11447 || (TARGET_SEH
11448 && (m->fs.sp_offset - frame.reg_save_offset
11449 >= SEH_MAX_FRAME_SIZE)))
11451 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11452 GEN_INT (m->fs.fp_offset
11453 - frame.reg_save_offset),
11454 style, false);
11456 else if (m->fs.sp_offset != frame.reg_save_offset)
11458 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11459 GEN_INT (m->fs.sp_offset
11460 - frame.reg_save_offset),
11461 style,
11462 m->fs.cfa_reg == stack_pointer_rtx);
11465 ix86_emit_restore_regs_using_pop ();
11468 /* If we used a frame pointer and haven't already got rid of it,
11469 then do so now. */
11470 if (m->fs.fp_valid)
11472 /* If the stack pointer is valid and pointing at the frame
11473 pointer store address, then we only need a pop. */
11474 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11475 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11476 /* Leave results in shorter dependency chains on CPUs that are
11477 able to grok it fast. */
11478 else if (TARGET_USE_LEAVE
11479 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
11480 || !cfun->machine->use_fast_prologue_epilogue)
11481 ix86_emit_leave ();
11482 else
11484 pro_epilogue_adjust_stack (stack_pointer_rtx,
11485 hard_frame_pointer_rtx,
11486 const0_rtx, style, !using_drap);
11487 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11491 if (using_drap)
11493 int param_ptr_offset = UNITS_PER_WORD;
11494 rtx insn;
11496 gcc_assert (stack_realign_drap);
11498 if (ix86_static_chain_on_stack)
11499 param_ptr_offset += UNITS_PER_WORD;
11500 if (!call_used_regs[REGNO (crtl->drap_reg)])
11501 param_ptr_offset += UNITS_PER_WORD;
11503 insn = emit_insn (gen_rtx_SET
11504 (VOIDmode, stack_pointer_rtx,
11505 gen_rtx_PLUS (Pmode,
11506 crtl->drap_reg,
11507 GEN_INT (-param_ptr_offset))));
11508 m->fs.cfa_reg = stack_pointer_rtx;
11509 m->fs.cfa_offset = param_ptr_offset;
11510 m->fs.sp_offset = param_ptr_offset;
11511 m->fs.realigned = false;
11513 add_reg_note (insn, REG_CFA_DEF_CFA,
11514 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11515 GEN_INT (param_ptr_offset)));
11516 RTX_FRAME_RELATED_P (insn) = 1;
11518 if (!call_used_regs[REGNO (crtl->drap_reg)])
11519 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11522 /* At this point the stack pointer must be valid, and we must have
11523 restored all of the registers. We may not have deallocated the
11524 entire stack frame. We've delayed this until now because it may
11525 be possible to merge the local stack deallocation with the
11526 deallocation forced by ix86_static_chain_on_stack. */
11527 gcc_assert (m->fs.sp_valid);
11528 gcc_assert (!m->fs.fp_valid);
11529 gcc_assert (!m->fs.realigned);
11530 if (m->fs.sp_offset != UNITS_PER_WORD)
11532 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11533 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11534 style, true);
11536 else
11537 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11539 /* Sibcall epilogues don't want a return instruction. */
11540 if (style == 0)
11542 m->fs = frame_state_save;
11543 return;
11546 if (crtl->args.pops_args && crtl->args.size)
11548 rtx popc = GEN_INT (crtl->args.pops_args);
11550 /* i386 can only pop 64K bytes. If asked to pop more, pop return
11551 address, do explicit add, and jump indirectly to the caller. */
11553 if (crtl->args.pops_args >= 65536)
11555 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11556 rtx insn;
11558 /* There is no "pascal" calling convention in any 64bit ABI. */
11559 gcc_assert (!TARGET_64BIT);
11561 insn = emit_insn (gen_pop (ecx));
11562 m->fs.cfa_offset -= UNITS_PER_WORD;
11563 m->fs.sp_offset -= UNITS_PER_WORD;
11565 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11566 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11567 add_reg_note (insn, REG_CFA_REGISTER,
11568 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11569 RTX_FRAME_RELATED_P (insn) = 1;
11571 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11572 popc, -1, true);
11573 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11575 else
11576 emit_jump_insn (gen_simple_return_pop_internal (popc));
11578 else
11579 emit_jump_insn (gen_simple_return_internal ());
11581 /* Restore the state back to the state from the prologue,
11582 so that it's correct for the next epilogue. */
11583 m->fs = frame_state_save;
11586 /* Reset from the function's potential modifications. */
11588 static void
11589 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11590 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11592 if (pic_offset_table_rtx)
11593 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11594 #if TARGET_MACHO
11595 /* Mach-O doesn't support labels at the end of objects, so if
11596 it looks like we might want one, insert a NOP. */
11598 rtx insn = get_last_insn ();
11599 rtx deleted_debug_label = NULL_RTX;
11600 while (insn
11601 && NOTE_P (insn)
11602 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11604 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11605 notes; only set their CODE_LABEL_NUMBER to -1,
11606 otherwise there would be code generation differences
11607 between -g and -g0. */
11608 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11609 deleted_debug_label = insn;
11610 insn = PREV_INSN (insn);
11612 if (insn
11613 && (LABEL_P (insn)
11614 || (NOTE_P (insn)
11615 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11616 fputs ("\tnop\n", file);
11617 else if (deleted_debug_label)
11618 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11619 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11620 CODE_LABEL_NUMBER (insn) = -1;
11622 #endif
11626 /* Return a scratch register to use in the split stack prologue. The
11627 split stack prologue is used for -fsplit-stack. It is the first
11628 instructions in the function, even before the regular prologue.
11629 The scratch register can be any caller-saved register which is not
11630 used for parameters or for the static chain. */
11632 static unsigned int
11633 split_stack_prologue_scratch_regno (void)
11635 if (TARGET_64BIT)
11636 return R11_REG;
11637 else
11639 bool is_fastcall, is_thiscall;
11640 int regparm;
11642 is_fastcall = (lookup_attribute ("fastcall",
11643 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11644 != NULL);
11645 is_thiscall = (lookup_attribute ("thiscall",
11646 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11647 != NULL);
11648 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11650 if (is_fastcall)
11652 if (DECL_STATIC_CHAIN (cfun->decl))
11654 sorry ("-fsplit-stack does not support fastcall with "
11655 "nested function");
11656 return INVALID_REGNUM;
11658 return AX_REG;
11660 else if (is_thiscall)
11662 if (!DECL_STATIC_CHAIN (cfun->decl))
11663 return DX_REG;
11664 return AX_REG;
11666 else if (regparm < 3)
11668 if (!DECL_STATIC_CHAIN (cfun->decl))
11669 return CX_REG;
11670 else
11672 if (regparm >= 2)
11674 sorry ("-fsplit-stack does not support 2 register "
11675 " parameters for a nested function");
11676 return INVALID_REGNUM;
11678 return DX_REG;
11681 else
11683 /* FIXME: We could make this work by pushing a register
11684 around the addition and comparison. */
11685 sorry ("-fsplit-stack does not support 3 register parameters");
11686 return INVALID_REGNUM;
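/* Summary of the choices above: 64-bit uses %r11; fastcall uses %eax (nested
   functions are not supported); thiscall uses %edx, or %eax with a static
   chain; regparm < 3 uses %ecx, or %edx when a static chain is used with
   regparm < 2; everything else is not supported.  */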
11691 /* A SYMBOL_REF for the function which allocates new stack space for
11692 -fsplit-stack. */
11694 static GTY(()) rtx split_stack_fn;
11696 /* A SYMBOL_REF for the more stack function when using the large
11697 model. */
11699 static GTY(()) rtx split_stack_fn_large;
11701 /* Handle -fsplit-stack. These are the first instructions in the
11702 function, even before the regular prologue. */
11704 void
11705 ix86_expand_split_stack_prologue (void)
11707 struct ix86_frame frame;
11708 HOST_WIDE_INT allocate;
11709 unsigned HOST_WIDE_INT args_size;
11710 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11711 rtx scratch_reg = NULL_RTX;
11712 rtx varargs_label = NULL_RTX;
11713 rtx fn;
11715 gcc_assert (flag_split_stack && reload_completed);
11717 ix86_finalize_stack_realign_flags ();
11718 ix86_compute_frame_layout (&frame);
11719 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11721 /* This is the label we will branch to if we have enough stack
11722 space. We expect the basic block reordering pass to reverse this
11723 branch if optimizing, so that we branch in the unlikely case. */
11724 label = gen_label_rtx ();
11726 /* We need to compare the stack pointer minus the frame size with
11727 the stack boundary in the TCB. The stack boundary always gives
11728 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11729 can compare directly. Otherwise we need to do an addition. */
11731 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11732 UNSPEC_STACK_CHECK);
11733 limit = gen_rtx_CONST (Pmode, limit);
11734 limit = gen_rtx_MEM (Pmode, limit);
11735 if (allocate < SPLIT_STACK_AVAILABLE)
11736 current = stack_pointer_rtx;
11737 else
11739 unsigned int scratch_regno;
11740 rtx offset;
11742 /* We need a scratch register to hold the stack pointer minus
11743 the required frame size. Since this is the very start of the
11744 function, the scratch register can be any caller-saved
11745 register which is not used for parameters. */
11746 offset = GEN_INT (- allocate);
11747 scratch_regno = split_stack_prologue_scratch_regno ();
11748 if (scratch_regno == INVALID_REGNUM)
11749 return;
11750 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11751 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11753 /* We don't use ix86_gen_add3 in this case because it will
11754 want to split to lea, but when not optimizing the insn
11755 will not be split after this point. */
11756 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11757 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11758 offset)));
11760 else
11762 emit_move_insn (scratch_reg, offset);
11763 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11764 stack_pointer_rtx));
11766 current = scratch_reg;
11769 ix86_expand_branch (GEU, current, limit, label);
11770 jump_insn = get_last_insn ();
11771 JUMP_LABEL (jump_insn) = label;
11773 /* Mark the jump as very likely to be taken. */
11774 add_int_reg_note (jump_insn, REG_BR_PROB,
11775 REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100);
11777 if (split_stack_fn == NULL_RTX)
11778 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11779 fn = split_stack_fn;
11781 /* Get more stack space. We pass in the desired stack space and the
11782 size of the arguments to copy to the new stack. In 32-bit mode
11783 we push the parameters; __morestack will return on a new stack
11784 anyhow. In 64-bit mode we pass the parameters in r10 and
11785 r11. */
11786 allocate_rtx = GEN_INT (allocate);
11787 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11788 call_fusage = NULL_RTX;
11789 if (TARGET_64BIT)
11791 rtx reg10, reg11;
11793 reg10 = gen_rtx_REG (Pmode, R10_REG);
11794 reg11 = gen_rtx_REG (Pmode, R11_REG);
11796 /* If this function uses a static chain, it will be in %r10.
11797 Preserve it across the call to __morestack. */
11798 if (DECL_STATIC_CHAIN (cfun->decl))
11800 rtx rax;
11802 rax = gen_rtx_REG (word_mode, AX_REG);
11803 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11804 use_reg (&call_fusage, rax);
11807 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11808 && !TARGET_PECOFF)
11810 HOST_WIDE_INT argval;
11812 gcc_assert (Pmode == DImode);
11813 /* When using the large model we need to load the address
11814 into a register, and we've run out of registers. So we
11815 switch to a different calling convention, and we call a
11816 different function: __morestack_large_model. We pass the
11817 argument size in the upper 32 bits of r10 and pass the
11818 frame size in the lower 32 bits. */
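/* For example (illustrative values only): with args_size == 0x18 and
   allocate == 0x2000, the value built below is (0x18 << 32) + 0x2000
   == 0x0000001800002000, which is loaded into %r10.  */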
11819 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11820 gcc_assert ((args_size & 0xffffffff) == args_size);
11822 if (split_stack_fn_large == NULL_RTX)
11823 split_stack_fn_large =
11824 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11826 if (ix86_cmodel == CM_LARGE_PIC)
11828 rtx label, x;
11830 label = gen_label_rtx ();
11831 emit_label (label);
11832 LABEL_PRESERVE_P (label) = 1;
11833 emit_insn (gen_set_rip_rex64 (reg10, label));
11834 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11835 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11836 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11837 UNSPEC_GOT);
11838 x = gen_rtx_CONST (Pmode, x);
11839 emit_move_insn (reg11, x);
11840 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11841 x = gen_const_mem (Pmode, x);
11842 emit_move_insn (reg11, x);
11844 else
11845 emit_move_insn (reg11, split_stack_fn_large);
11847 fn = reg11;
11849 argval = ((args_size << 16) << 16) + allocate;
11850 emit_move_insn (reg10, GEN_INT (argval));
11852 else
11854 emit_move_insn (reg10, allocate_rtx);
11855 emit_move_insn (reg11, GEN_INT (args_size));
11856 use_reg (&call_fusage, reg11);
11859 use_reg (&call_fusage, reg10);
11861 else
11863 emit_insn (gen_push (GEN_INT (args_size)));
11864 emit_insn (gen_push (allocate_rtx));
11866 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11867 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11868 NULL_RTX, false);
11869 add_function_usage_to (call_insn, call_fusage);
11871 /* In order to make call/return prediction work right, we now need
11872 to execute a return instruction. See
11873 libgcc/config/i386/morestack.S for the details on how this works.
11875 For flow purposes gcc must not see this as a return
11876 instruction--we need control flow to continue at the subsequent
11877 label. Therefore, we use an unspec. */
11878 gcc_assert (crtl->args.pops_args < 65536);
11879 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11881 /* If we are in 64-bit mode and this function uses a static chain,
11882 we saved %r10 in %rax before calling __morestack. */
11883 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11884 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11885 gen_rtx_REG (word_mode, AX_REG));
11887 /* If this function calls va_start, we need to store a pointer to
11888 the arguments on the old stack, because they may not have been
11889 all copied to the new stack. At this point the old stack can be
11890 found at the frame pointer value used by __morestack, because
11891 __morestack has set that up before calling back to us. Here we
11892 store that pointer in a scratch register, and in
11893 ix86_expand_prologue we store the scratch register in a stack
11894 slot. */
11895 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11897 unsigned int scratch_regno;
11898 rtx frame_reg;
11899 int words;
11901 scratch_regno = split_stack_prologue_scratch_regno ();
11902 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11903 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11905 /* 64-bit:
11906 fp -> old fp value
11907 return address within this function
11908 return address of caller of this function
11909 stack arguments
11910 So we add three words to get to the stack arguments.
11912 32-bit:
11913 fp -> old fp value
11914 return address within this function
11915 first argument to __morestack
11916 second argument to __morestack
11917 return address of caller of this function
11918 stack arguments
11919 So we add five words to get to the stack arguments.
11921 words = TARGET_64BIT ? 3 : 5;
11922 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11923 gen_rtx_PLUS (Pmode, frame_reg,
11924 GEN_INT (words * UNITS_PER_WORD))));
11926 varargs_label = gen_label_rtx ();
11927 emit_jump_insn (gen_jump (varargs_label));
11928 JUMP_LABEL (get_last_insn ()) = varargs_label;
11930 emit_barrier ();
11933 emit_label (label);
11934 LABEL_NUSES (label) = 1;
11936 /* If this function calls va_start, we now have to set the scratch
11937 register for the case where we do not call __morestack. In this
11938 case we need to set it based on the stack pointer. */
11939 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11941 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11942 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11943 GEN_INT (UNITS_PER_WORD))));
11945 emit_label (varargs_label);
11946 LABEL_NUSES (varargs_label) = 1;
11950 /* We may have to tell the dataflow pass that the split stack prologue
11951 is initializing a scratch register. */
11953 static void
11954 ix86_live_on_entry (bitmap regs)
11956 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11958 gcc_assert (flag_split_stack);
11959 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11963 /* Extract the parts of an RTL expression that is a valid memory address
11964 for an instruction. Return 0 if the structure of the address is
11965 grossly off. Return -1 if the address contains ASHIFT, so it is not
11966 strictly valid, but is still used for computing the length of the lea instruction. */
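/* For illustration: an address such as %ebx + %esi*4 + 12 decomposes into
   base = %ebx, index = %esi, scale = 4, disp = 12 and seg = SEG_DEFAULT,
   while a TLS address based on the thread pointer sets seg to the TLS
   segment register.  */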
11969 ix86_decompose_address (rtx addr, struct ix86_address *out)
11971 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11972 rtx base_reg, index_reg;
11973 HOST_WIDE_INT scale = 1;
11974 rtx scale_rtx = NULL_RTX;
11975 rtx tmp;
11976 int retval = 1;
11977 enum ix86_address_seg seg = SEG_DEFAULT;
11979 /* Allow zero-extended SImode addresses;
11980 they will be emitted with the addr32 prefix. */
11981 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11983 if (GET_CODE (addr) == ZERO_EXTEND
11984 && GET_MODE (XEXP (addr, 0)) == SImode)
11986 addr = XEXP (addr, 0);
11987 if (CONST_INT_P (addr))
11988 return 0;
11990 else if (GET_CODE (addr) == AND
11991 && const_32bit_mask (XEXP (addr, 1), DImode))
11993 addr = simplify_gen_subreg (SImode, XEXP (addr, 0), DImode, 0);
11994 if (addr == NULL_RTX)
11995 return 0;
11997 if (CONST_INT_P (addr))
11998 return 0;
12002 /* Allow SImode subregs of DImode addresses;
12003 they will be emitted with the addr32 prefix. */
12004 if (TARGET_64BIT && GET_MODE (addr) == SImode)
12006 if (GET_CODE (addr) == SUBREG
12007 && GET_MODE (SUBREG_REG (addr)) == DImode)
12009 addr = SUBREG_REG (addr);
12010 if (CONST_INT_P (addr))
12011 return 0;
12015 if (REG_P (addr))
12016 base = addr;
12017 else if (GET_CODE (addr) == SUBREG)
12019 if (REG_P (SUBREG_REG (addr)))
12020 base = addr;
12021 else
12022 return 0;
12024 else if (GET_CODE (addr) == PLUS)
12026 rtx addends[4], op;
12027 int n = 0, i;
12029 op = addr;
12032 if (n >= 4)
12033 return 0;
12034 addends[n++] = XEXP (op, 1);
12035 op = XEXP (op, 0);
12037 while (GET_CODE (op) == PLUS);
12038 if (n >= 4)
12039 return 0;
12040 addends[n] = op;
12042 for (i = n; i >= 0; --i)
12044 op = addends[i];
12045 switch (GET_CODE (op))
12047 case MULT:
12048 if (index)
12049 return 0;
12050 index = XEXP (op, 0);
12051 scale_rtx = XEXP (op, 1);
12052 break;
12054 case ASHIFT:
12055 if (index)
12056 return 0;
12057 index = XEXP (op, 0);
12058 tmp = XEXP (op, 1);
12059 if (!CONST_INT_P (tmp))
12060 return 0;
12061 scale = INTVAL (tmp);
12062 if ((unsigned HOST_WIDE_INT) scale > 3)
12063 return 0;
12064 scale = 1 << scale;
12065 break;
12067 case ZERO_EXTEND:
12068 op = XEXP (op, 0);
12069 if (GET_CODE (op) != UNSPEC)
12070 return 0;
12071 /* FALLTHRU */
12073 case UNSPEC:
12074 if (XINT (op, 1) == UNSPEC_TP
12075 && TARGET_TLS_DIRECT_SEG_REFS
12076 && seg == SEG_DEFAULT)
12077 seg = DEFAULT_TLS_SEG_REG;
12078 else
12079 return 0;
12080 break;
12082 case SUBREG:
12083 if (!REG_P (SUBREG_REG (op)))
12084 return 0;
12085 /* FALLTHRU */
12087 case REG:
12088 if (!base)
12089 base = op;
12090 else if (!index)
12091 index = op;
12092 else
12093 return 0;
12094 break;
12096 case CONST:
12097 case CONST_INT:
12098 case SYMBOL_REF:
12099 case LABEL_REF:
12100 if (disp)
12101 return 0;
12102 disp = op;
12103 break;
12105 default:
12106 return 0;
12110 else if (GET_CODE (addr) == MULT)
12112 index = XEXP (addr, 0); /* index*scale */
12113 scale_rtx = XEXP (addr, 1);
12115 else if (GET_CODE (addr) == ASHIFT)
12117 /* We're called for lea too, which implements ashift on occasion. */
12118 index = XEXP (addr, 0);
12119 tmp = XEXP (addr, 1);
12120 if (!CONST_INT_P (tmp))
12121 return 0;
12122 scale = INTVAL (tmp);
12123 if ((unsigned HOST_WIDE_INT) scale > 3)
12124 return 0;
12125 scale = 1 << scale;
12126 retval = -1;
12128 else
12129 disp = addr; /* displacement */
12131 if (index)
12133 if (REG_P (index))
12135 else if (GET_CODE (index) == SUBREG
12136 && REG_P (SUBREG_REG (index)))
12138 else
12139 return 0;
12142 /* Extract the integral value of scale. */
12143 if (scale_rtx)
12145 if (!CONST_INT_P (scale_rtx))
12146 return 0;
12147 scale = INTVAL (scale_rtx);
12150 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
12151 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
12153 /* Avoid useless 0 displacement. */
12154 if (disp == const0_rtx && (base || index))
12155 disp = NULL_RTX;
12157 /* Allow the arg pointer and stack pointer as index if there is no scaling. */
12158 if (base_reg && index_reg && scale == 1
12159 && (index_reg == arg_pointer_rtx
12160 || index_reg == frame_pointer_rtx
12161 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
12163 rtx tmp;
12164 tmp = base, base = index, index = tmp;
12165 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
12168 /* Special case: %ebp cannot be encoded as a base without a displacement.
12169 Similarly %r13. */
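/* Encoding background: with mod = 00 in the ModR/M byte (and likewise for
   the base field of a SIB byte), the register value 101b (EBP/R13) does not
   mean "base register with no displacement" but selects a displacement-only
   form, so a zero displacement byte has to be emitted explicitly.  */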
12170 if (!disp
12171 && base_reg
12172 && (base_reg == hard_frame_pointer_rtx
12173 || base_reg == frame_pointer_rtx
12174 || base_reg == arg_pointer_rtx
12175 || (REG_P (base_reg)
12176 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
12177 || REGNO (base_reg) == R13_REG))))
12178 disp = const0_rtx;
12180 /* Special case: on K6, [%esi] causes the instruction to be vector decoded.
12181 Avoid this by transforming to [%esi+0].
12182 Reload calls address legitimization without cfun defined, so we need
12183 to test cfun for being non-NULL. */
12184 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
12185 && base_reg && !index_reg && !disp
12186 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
12187 disp = const0_rtx;
12189 /* Special case: encode reg+reg instead of reg*2. */
12190 if (!base && index && scale == 2)
12191 base = index, base_reg = index_reg, scale = 1;
12193 /* Special case: scaling cannot be encoded without base or displacement. */
12194 if (!base && !disp && index && scale != 1)
12195 disp = const0_rtx;
12197 out->base = base;
12198 out->index = index;
12199 out->disp = disp;
12200 out->scale = scale;
12201 out->seg = seg;
12203 return retval;
12206 /* Return cost of the memory address x.
12207 For i386, it is better to use a complex address than let gcc copy
12208 the address into a reg and make a new pseudo. But not if the address
12209 requires two regs - that would mean more pseudos with longer
12210 lifetimes. */
12211 static int
12212 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
12213 addr_space_t as ATTRIBUTE_UNUSED,
12214 bool speed ATTRIBUTE_UNUSED)
12216 struct ix86_address parts;
12217 int cost = 1;
12218 int ok = ix86_decompose_address (x, &parts);
12220 gcc_assert (ok);
12222 if (parts.base && GET_CODE (parts.base) == SUBREG)
12223 parts.base = SUBREG_REG (parts.base);
12224 if (parts.index && GET_CODE (parts.index) == SUBREG)
12225 parts.index = SUBREG_REG (parts.index);
12227 /* Attempt to minimize number of registers in the address. */
12228 if ((parts.base
12229 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
12230 || (parts.index
12231 && (!REG_P (parts.index)
12232 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
12233 cost++;
12235 if (parts.base
12236 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
12237 && parts.index
12238 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
12239 && parts.base != parts.index)
12240 cost++;
12242 /* AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
12243 since its predecode logic can't detect the length of instructions
12244 and it degenerates to vector decoding. Increase the cost of such
12245 addresses here. The penalty is at least 2 cycles. It may be worthwhile
12246 to split such addresses or even refuse such addresses at all.
12248 The following addressing modes are affected:
12249 [base+scale*index]
12250 [scale*index+disp]
12251 [base+index]
12253 The first and last case may be avoidable by explicitly coding the zero in
12254 the memory address, but I don't have an AMD-K6 machine handy to check this
12255 theory. */
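/* For illustration, in AT&T syntax the affected forms look like
   (%ebx,%esi,4) for [base+scale*index], 16(,%esi,4) for [scale*index+disp]
   and (%ebx,%esi) for [base+index]; each of these uses the 00_xxx_100b
   ModR/M encoding followed by a SIB byte.  */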
12257 if (TARGET_K6
12258 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
12259 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
12260 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
12261 cost += 10;
12263 return cost;
12266 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
12267 this is used to form addresses to local data when -fPIC is in
12268 use. */
12270 static bool
12271 darwin_local_data_pic (rtx disp)
12273 return (GET_CODE (disp) == UNSPEC
12274 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
12277 /* Determine if a given RTX is a valid constant. We already know this
12278 satisfies CONSTANT_P. */
12280 static bool
12281 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
12283 switch (GET_CODE (x))
12285 case CONST:
12286 x = XEXP (x, 0);
12288 if (GET_CODE (x) == PLUS)
12290 if (!CONST_INT_P (XEXP (x, 1)))
12291 return false;
12292 x = XEXP (x, 0);
12295 if (TARGET_MACHO && darwin_local_data_pic (x))
12296 return true;
12298 /* Only some unspecs are valid as "constants". */
12299 if (GET_CODE (x) == UNSPEC)
12300 switch (XINT (x, 1))
12302 case UNSPEC_GOT:
12303 case UNSPEC_GOTOFF:
12304 case UNSPEC_PLTOFF:
12305 return TARGET_64BIT;
12306 case UNSPEC_TPOFF:
12307 case UNSPEC_NTPOFF:
12308 x = XVECEXP (x, 0, 0);
12309 return (GET_CODE (x) == SYMBOL_REF
12310 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12311 case UNSPEC_DTPOFF:
12312 x = XVECEXP (x, 0, 0);
12313 return (GET_CODE (x) == SYMBOL_REF
12314 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
12315 default:
12316 return false;
12319 /* We must have drilled down to a symbol. */
12320 if (GET_CODE (x) == LABEL_REF)
12321 return true;
12322 if (GET_CODE (x) != SYMBOL_REF)
12323 return false;
12324 /* FALLTHRU */
12326 case SYMBOL_REF:
12327 /* TLS symbols are never valid. */
12328 if (SYMBOL_REF_TLS_MODEL (x))
12329 return false;
12331 /* DLLIMPORT symbols are never valid. */
12332 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
12333 && SYMBOL_REF_DLLIMPORT_P (x))
12334 return false;
12336 #if TARGET_MACHO
12337 /* mdynamic-no-pic */
12338 if (MACHO_DYNAMIC_NO_PIC_P)
12339 return machopic_symbol_defined_p (x);
12340 #endif
12341 break;
12343 case CONST_DOUBLE:
12344 if (GET_MODE (x) == TImode
12345 && x != CONST0_RTX (TImode)
12346 && !TARGET_64BIT)
12347 return false;
12348 break;
12350 case CONST_VECTOR:
12351 if (!standard_sse_constant_p (x))
12352 return false;
12354 default:
12355 break;
12358 /* Otherwise we handle everything else in the move patterns. */
12359 return true;
12362 /* Determine if it's legal to put X into the constant pool. This
12363 is not possible for the address of thread-local symbols, which
12364 is checked above. */
12366 static bool
12367 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
12369 /* We can always put integral constants and vectors in memory. */
12370 switch (GET_CODE (x))
12372 case CONST_INT:
12373 case CONST_DOUBLE:
12374 case CONST_VECTOR:
12375 return false;
12377 default:
12378 break;
12380 return !ix86_legitimate_constant_p (mode, x);
12383 /* Nonzero if the symbol is marked as dllimport, or as a stub variable;
12384 otherwise zero. */
12386 static bool
12387 is_imported_p (rtx x)
12389 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
12390 || GET_CODE (x) != SYMBOL_REF)
12391 return false;
12393 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
12397 /* Nonzero if the constant value X is a legitimate general operand
12398 when generating PIC code. It is given that flag_pic is on and
12399 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
12401 bool
12402 legitimate_pic_operand_p (rtx x)
12404 rtx inner;
12406 switch (GET_CODE (x))
12408 case CONST:
12409 inner = XEXP (x, 0);
12410 if (GET_CODE (inner) == PLUS
12411 && CONST_INT_P (XEXP (inner, 1)))
12412 inner = XEXP (inner, 0);
12414 /* Only some unspecs are valid as "constants". */
12415 if (GET_CODE (inner) == UNSPEC)
12416 switch (XINT (inner, 1))
12418 case UNSPEC_GOT:
12419 case UNSPEC_GOTOFF:
12420 case UNSPEC_PLTOFF:
12421 return TARGET_64BIT;
12422 case UNSPEC_TPOFF:
12423 x = XVECEXP (inner, 0, 0);
12424 return (GET_CODE (x) == SYMBOL_REF
12425 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12426 case UNSPEC_MACHOPIC_OFFSET:
12427 return legitimate_pic_address_disp_p (x);
12428 default:
12429 return false;
12431 /* FALLTHRU */
12433 case SYMBOL_REF:
12434 case LABEL_REF:
12435 return legitimate_pic_address_disp_p (x);
12437 default:
12438 return true;
12442 /* Determine if a given CONST RTX is a valid memory displacement
12443 in PIC mode. */
12445 bool
12446 legitimate_pic_address_disp_p (rtx disp)
12448 bool saw_plus;
12450 /* In 64bit mode we can allow direct addresses of symbols and labels
12451 when they are not dynamic symbols. */
12452 if (TARGET_64BIT)
12454 rtx op0 = disp, op1;
12456 switch (GET_CODE (disp))
12458 case LABEL_REF:
12459 return true;
12461 case CONST:
12462 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12463 break;
12464 op0 = XEXP (XEXP (disp, 0), 0);
12465 op1 = XEXP (XEXP (disp, 0), 1);
12466 if (!CONST_INT_P (op1)
12467 || INTVAL (op1) >= 16*1024*1024
12468 || INTVAL (op1) < -16*1024*1024)
12469 break;
12470 if (GET_CODE (op0) == LABEL_REF)
12471 return true;
12472 if (GET_CODE (op0) == CONST
12473 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12474 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12475 return true;
12476 if (GET_CODE (op0) == UNSPEC
12477 && XINT (op0, 1) == UNSPEC_PCREL)
12478 return true;
12479 if (GET_CODE (op0) != SYMBOL_REF)
12480 break;
12481 /* FALLTHRU */
12483 case SYMBOL_REF:
12484 /* TLS references should always be enclosed in UNSPEC.
12485 A dllimported symbol always needs to be resolved. */
12486 if (SYMBOL_REF_TLS_MODEL (op0)
12487 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
12488 return false;
12490 if (TARGET_PECOFF)
12492 if (is_imported_p (op0))
12493 return true;
12495 if (SYMBOL_REF_FAR_ADDR_P (op0)
12496 || !SYMBOL_REF_LOCAL_P (op0))
12497 break;
12499 /* Function symbols need to be resolved only for
12500 the large model.
12501 For the small model we don't need to resolve anything
12502 here. */
12503 if ((ix86_cmodel != CM_LARGE_PIC
12504 && SYMBOL_REF_FUNCTION_P (op0))
12505 || ix86_cmodel == CM_SMALL_PIC)
12506 return true;
12507 /* Non-external symbols don't need to be resolved for
12508 the large and medium models. */
12509 if ((ix86_cmodel == CM_LARGE_PIC
12510 || ix86_cmodel == CM_MEDIUM_PIC)
12511 && !SYMBOL_REF_EXTERNAL_P (op0))
12512 return true;
12514 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
12515 && SYMBOL_REF_LOCAL_P (op0)
12516 && ix86_cmodel != CM_LARGE_PIC)
12517 return true;
12518 break;
12520 default:
12521 break;
12524 if (GET_CODE (disp) != CONST)
12525 return false;
12526 disp = XEXP (disp, 0);
12528 if (TARGET_64BIT)
12530 /* It is unsafe to allow PLUS expressions here; this limits the allowed
12531 distance of GOT table references. We should not need these anyway. */
12532 if (GET_CODE (disp) != UNSPEC
12533 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12534 && XINT (disp, 1) != UNSPEC_GOTOFF
12535 && XINT (disp, 1) != UNSPEC_PCREL
12536 && XINT (disp, 1) != UNSPEC_PLTOFF))
12537 return false;
12539 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12540 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12541 return false;
12542 return true;
12545 saw_plus = false;
12546 if (GET_CODE (disp) == PLUS)
12548 if (!CONST_INT_P (XEXP (disp, 1)))
12549 return false;
12550 disp = XEXP (disp, 0);
12551 saw_plus = true;
12554 if (TARGET_MACHO && darwin_local_data_pic (disp))
12555 return true;
12557 if (GET_CODE (disp) != UNSPEC)
12558 return false;
12560 switch (XINT (disp, 1))
12562 case UNSPEC_GOT:
12563 if (saw_plus)
12564 return false;
12565 /* We need to check for both symbols and labels because VxWorks loads
12566 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12567 details. */
12568 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12569 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12570 case UNSPEC_GOTOFF:
12571 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12572 While the ABI also specifies a 32bit relocation, we don't produce it in
12573 the small PIC model at all. */
12574 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12575 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12576 && !TARGET_64BIT)
12577 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12578 return false;
12579 case UNSPEC_GOTTPOFF:
12580 case UNSPEC_GOTNTPOFF:
12581 case UNSPEC_INDNTPOFF:
12582 if (saw_plus)
12583 return false;
12584 disp = XVECEXP (disp, 0, 0);
12585 return (GET_CODE (disp) == SYMBOL_REF
12586 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12587 case UNSPEC_NTPOFF:
12588 disp = XVECEXP (disp, 0, 0);
12589 return (GET_CODE (disp) == SYMBOL_REF
12590 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12591 case UNSPEC_DTPOFF:
12592 disp = XVECEXP (disp, 0, 0);
12593 return (GET_CODE (disp) == SYMBOL_REF
12594 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12597 return false;
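/* Illustrative sketch, not part of the build: with a hypothetical symbol
   "foo", displacements accepted above look like

     (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF))     ; 32bit @GOTOFF
     (const (unspec [(symbol_ref "foo")] UNSPEC_GOTPCREL))   ; 64bit @GOTPCREL

   while on the 32bit path anything that is not a CONST wrapping one of the
   recognized UNSPECs (plus an optional CONST_INT offset) is rejected.  */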
12600 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns true if the
12601 invalid parts of the memory address X were pushed for reload (in which
12602 case the calling macro should goto WIN), false if the address was left
12603 untouched. */
12605 bool
12606 ix86_legitimize_reload_address (rtx x,
12607 enum machine_mode mode ATTRIBUTE_UNUSED,
12608 int opnum, int type,
12609 int ind_levels ATTRIBUTE_UNUSED)
12611 /* Reload can generate:
12613 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12614 (reg:DI 97))
12615 (reg:DI 2 cx))
12617 This RTX is rejected by ix86_legitimate_address_p due to
12618 non-strictness of base register 97. Following this rejection,
12619 reload pushes all three components into separate registers,
12620 creating an invalid memory address RTX.
12622 The following code reloads only the invalid part of the
12623 memory address RTX. */
12625 if (GET_CODE (x) == PLUS
12626 && REG_P (XEXP (x, 1))
12627 && GET_CODE (XEXP (x, 0)) == PLUS
12628 && REG_P (XEXP (XEXP (x, 0), 1)))
12630 rtx base, index;
12631 bool something_reloaded = false;
12633 base = XEXP (XEXP (x, 0), 1);
12634 if (!REG_OK_FOR_BASE_STRICT_P (base))
12636 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12637 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12638 opnum, (enum reload_type) type);
12639 something_reloaded = true;
12642 index = XEXP (x, 1);
12643 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12645 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12646 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12647 opnum, (enum reload_type) type);
12648 something_reloaded = true;
12651 gcc_assert (something_reloaded);
12652 return true;
12655 return false;
12658 /* Determine if OP is a suitable RTX for an address register.
12659 Return the naked register if a register or a register subreg is
12660 found, otherwise return NULL_RTX. */
12662 static rtx
12663 ix86_validate_address_register (rtx op)
12665 enum machine_mode mode = GET_MODE (op);
12667 /* Only SImode or DImode registers can form the address. */
12668 if (mode != SImode && mode != DImode)
12669 return NULL_RTX;
12671 if (REG_P (op))
12672 return op;
12673 else if (GET_CODE (op) == SUBREG)
12675 rtx reg = SUBREG_REG (op);
12677 if (!REG_P (reg))
12678 return NULL_RTX;
12680 mode = GET_MODE (reg);
12682 /* Don't allow SUBREGs that span more than a word. It can
12683 lead to spill failures when the register is one word out
12684 of a two word structure. */
12685 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
12686 return NULL_RTX;
12688 /* Allow only SUBREGs of non-eliminable hard registers. */
12689 if (register_no_elim_operand (reg, mode))
12690 return reg;
12693 /* Op is not a register. */
12694 return NULL_RTX;
12697 /* Recognizes RTL expressions that are valid memory addresses for an
12698 instruction. The MODE argument is the machine mode for the MEM
12699 expression that wants to use this address.
12701 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12702 convert common non-canonical forms to canonical form so that they will
12703 be recognized. */
12705 static bool
12706 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12707 rtx addr, bool strict)
12709 struct ix86_address parts;
12710 rtx base, index, disp;
12711 HOST_WIDE_INT scale;
12712 enum ix86_address_seg seg;
12714 if (ix86_decompose_address (addr, &parts) <= 0)
12715 /* Decomposition failed. */
12716 return false;
12718 base = parts.base;
12719 index = parts.index;
12720 disp = parts.disp;
12721 scale = parts.scale;
12722 seg = parts.seg;
12724 /* Validate base register. */
12725 if (base)
12727 rtx reg = ix86_validate_address_register (base);
12729 if (reg == NULL_RTX)
12730 return false;
12732 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12733 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12734 /* Base is not valid. */
12735 return false;
12738 /* Validate index register. */
12739 if (index)
12741 rtx reg = ix86_validate_address_register (index);
12743 if (reg == NULL_RTX)
12744 return false;
12746 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12747 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12748 /* Index is not valid. */
12749 return false;
12752 /* Index and base should have the same mode. */
12753 if (base && index
12754 && GET_MODE (base) != GET_MODE (index))
12755 return false;
12757 /* Address override works only on the (%reg) part of %fs:(%reg). */
12758 if (seg != SEG_DEFAULT
12759 && ((base && GET_MODE (base) != word_mode)
12760 || (index && GET_MODE (index) != word_mode)))
12761 return false;
12763 /* Validate scale factor. */
12764 if (scale != 1)
12766 if (!index)
12767 /* Scale without index. */
12768 return false;
12770 if (scale != 2 && scale != 4 && scale != 8)
12771 /* Scale is not a valid multiplier. */
12772 return false;
12775 /* Validate displacement. */
12776 if (disp)
12778 if (GET_CODE (disp) == CONST
12779 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12780 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12781 switch (XINT (XEXP (disp, 0), 1))
12783 /* Refuse GOTOFF and GOT in 64bit mode since they are always 64bit when
12784 used. While the ABI also specifies 32bit relocations, we don't produce
12785 them at all and use IP-relative addressing instead. */
12786 case UNSPEC_GOT:
12787 case UNSPEC_GOTOFF:
12788 gcc_assert (flag_pic);
12789 if (!TARGET_64BIT)
12790 goto is_legitimate_pic;
12792 /* 64bit address unspec. */
12793 return false;
12795 case UNSPEC_GOTPCREL:
12796 case UNSPEC_PCREL:
12797 gcc_assert (flag_pic);
12798 goto is_legitimate_pic;
12800 case UNSPEC_GOTTPOFF:
12801 case UNSPEC_GOTNTPOFF:
12802 case UNSPEC_INDNTPOFF:
12803 case UNSPEC_NTPOFF:
12804 case UNSPEC_DTPOFF:
12805 break;
12807 case UNSPEC_STACK_CHECK:
12808 gcc_assert (flag_split_stack);
12809 break;
12811 default:
12812 /* Invalid address unspec. */
12813 return false;
12816 else if (SYMBOLIC_CONST (disp)
12817 && (flag_pic
12818 || (TARGET_MACHO
12819 #if TARGET_MACHO
12820 && MACHOPIC_INDIRECT
12821 && !machopic_operand_p (disp)
12822 #endif
12826 is_legitimate_pic:
12827 if (TARGET_64BIT && (index || base))
12829 /* foo@dtpoff(%rX) is ok. */
12830 if (GET_CODE (disp) != CONST
12831 || GET_CODE (XEXP (disp, 0)) != PLUS
12832 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12833 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12834 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12835 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12836 /* Non-constant pic memory reference. */
12837 return false;
12839 else if ((!TARGET_MACHO || flag_pic)
12840 && ! legitimate_pic_address_disp_p (disp))
12841 /* Displacement is an invalid pic construct. */
12842 return false;
12843 #if TARGET_MACHO
12844 else if (MACHO_DYNAMIC_NO_PIC_P
12845 && !ix86_legitimate_constant_p (Pmode, disp))
12846 /* Displacement must be referenced via non_lazy_pointer. */
12847 return false;
12848 #endif
12850 /* This code used to verify that a symbolic pic displacement
12851 includes the pic_offset_table_rtx register.
12853 While this is a good idea, unfortunately these constructs may
12854 be created by the "adds using lea" optimization for incorrect
12855 code like:
12857 int a;
12858 int foo(int i)
12860 return *(&a+i);
12863 This code is nonsensical, but results in addressing the
12864 GOT table with a pic_offset_table_rtx base. We can't
12865 just refuse it easily, since it gets matched by the
12866 "addsi3" pattern, which later gets split to lea when the
12867 output register differs from the input. While this
12868 could be handled by a separate addsi pattern for this case
12869 that never results in lea, disabling this test seems to be
12870 the easier and correct fix for the crash. */
12872 else if (GET_CODE (disp) != LABEL_REF
12873 && !CONST_INT_P (disp)
12874 && (GET_CODE (disp) != CONST
12875 || !ix86_legitimate_constant_p (Pmode, disp))
12876 && (GET_CODE (disp) != SYMBOL_REF
12877 || !ix86_legitimate_constant_p (Pmode, disp)))
12878 /* Displacement is not constant. */
12879 return false;
12880 else if (TARGET_64BIT
12881 && !x86_64_immediate_operand (disp, VOIDmode))
12882 /* Displacement is out of range. */
12883 return false;
12884 /* In x32 mode, constant addresses are sign extended to 64bit, so
12885 we have to prevent addresses from 0x80000000 to 0xffffffff. */
12886 else if (TARGET_X32 && !(index || base)
12887 && CONST_INT_P (disp)
12888 && val_signbit_known_set_p (SImode, INTVAL (disp)))
12889 return false;
12892 /* Everything looks valid. */
12893 return true;
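/* Illustrative sketch, not part of the build: a hypothetical canonical
   address such as

     (plus:SI (plus:SI (mult:SI (reg:SI 3 bx) (const_int 4))
                       (reg:SI 0 ax))
              (const_int 8))

   decomposes into base = %eax, index = %ebx, scale = 4, disp = 8 and passes
   the checks above, whereas a scale of 3, an index whose mode differs from
   the base, or a symbolic displacement that fails the PIC/constant tests
   would each be rejected by the corresponding check.  */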
12896 /* Determine if a given RTX is a valid constant address. */
12898 bool
12899 constant_address_p (rtx x)
12901 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12904 /* Return a unique alias set for the GOT. */
12906 static alias_set_type
12907 ix86_GOT_alias_set (void)
12909 static alias_set_type set = -1;
12910 if (set == -1)
12911 set = new_alias_set ();
12912 return set;
12915 /* Return a legitimate reference for ORIG (an address) using the
12916 register REG. If REG is 0, a new pseudo is generated.
12918 There are two types of references that must be handled:
12920 1. Global data references must load the address from the GOT, via
12921 the PIC reg. An insn is emitted to do this load, and the reg is
12922 returned.
12924 2. Static data references, constant pool addresses, and code labels
12925 compute the address as an offset from the GOT, whose base is in
12926 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12927 differentiate them from global data objects. The returned
12928 address is the PIC reg + an unspec constant.
12930 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12931 reg also appears in the address. */
12933 static rtx
12934 legitimize_pic_address (rtx orig, rtx reg)
12936 rtx addr = orig;
12937 rtx new_rtx = orig;
12939 #if TARGET_MACHO
12940 if (TARGET_MACHO && !TARGET_64BIT)
12942 if (reg == 0)
12943 reg = gen_reg_rtx (Pmode);
12944 /* Use the generic Mach-O PIC machinery. */
12945 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12947 #endif
12949 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12951 rtx tmp = legitimize_pe_coff_symbol (addr, true);
12952 if (tmp)
12953 return tmp;
12956 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12957 new_rtx = addr;
12958 else if (TARGET_64BIT && !TARGET_PECOFF
12959 && ix86_cmodel != CM_SMALL_PIC && gotoff_operand (addr, Pmode))
12961 rtx tmpreg;
12962 /* This symbol may be referenced via a displacement from the PIC
12963 base address (@GOTOFF). */
12965 if (reload_in_progress)
12966 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12967 if (GET_CODE (addr) == CONST)
12968 addr = XEXP (addr, 0);
12969 if (GET_CODE (addr) == PLUS)
12971 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12972 UNSPEC_GOTOFF);
12973 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12975 else
12976 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12977 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12978 if (!reg)
12979 tmpreg = gen_reg_rtx (Pmode);
12980 else
12981 tmpreg = reg;
12982 emit_move_insn (tmpreg, new_rtx);
12984 if (reg != 0)
12986 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12987 tmpreg, 1, OPTAB_DIRECT);
12988 new_rtx = reg;
12990 else
12991 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12993 else if (!TARGET_64BIT && !TARGET_PECOFF && gotoff_operand (addr, Pmode))
12995 /* This symbol may be referenced via a displacement from the PIC
12996 base address (@GOTOFF). */
12998 if (reload_in_progress)
12999 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13000 if (GET_CODE (addr) == CONST)
13001 addr = XEXP (addr, 0);
13002 if (GET_CODE (addr) == PLUS)
13004 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
13005 UNSPEC_GOTOFF);
13006 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
13008 else
13009 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
13010 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
13011 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
13013 if (reg != 0)
13015 emit_move_insn (reg, new_rtx);
13016 new_rtx = reg;
13019 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
13020 /* We can't use @GOTOFF for text labels on VxWorks;
13021 see gotoff_operand. */
13022 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
13024 rtx tmp = legitimize_pe_coff_symbol (addr, true);
13025 if (tmp)
13026 return tmp;
13028 /* For x64 PE-COFF there is no GOT table, so we use the address
13029 directly. */
13030 if (TARGET_64BIT && TARGET_PECOFF)
13032 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
13033 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
13035 if (reg == 0)
13036 reg = gen_reg_rtx (Pmode);
13037 emit_move_insn (reg, new_rtx);
13038 new_rtx = reg;
13040 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
13042 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
13043 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
13044 new_rtx = gen_const_mem (Pmode, new_rtx);
13045 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
13047 if (reg == 0)
13048 reg = gen_reg_rtx (Pmode);
13049 /* Use gen_movsi directly, otherwise the address is loaded
13050 into a register for CSE. We don't want to CSE these addresses;
13051 instead we CSE addresses from the GOT table, so skip this. */
13052 emit_insn (gen_movsi (reg, new_rtx));
13053 new_rtx = reg;
13055 else
13057 /* This symbol must be referenced via a load from the
13058 Global Offset Table (@GOT). */
13060 if (reload_in_progress)
13061 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13062 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
13063 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
13064 if (TARGET_64BIT)
13065 new_rtx = force_reg (Pmode, new_rtx);
13066 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
13067 new_rtx = gen_const_mem (Pmode, new_rtx);
13068 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
13070 if (reg == 0)
13071 reg = gen_reg_rtx (Pmode);
13072 emit_move_insn (reg, new_rtx);
13073 new_rtx = reg;
13076 else
13078 if (CONST_INT_P (addr)
13079 && !x86_64_immediate_operand (addr, VOIDmode))
13081 if (reg)
13083 emit_move_insn (reg, addr);
13084 new_rtx = reg;
13086 else
13087 new_rtx = force_reg (Pmode, addr);
13089 else if (GET_CODE (addr) == CONST)
13091 addr = XEXP (addr, 0);
13093 /* We must match stuff we generate before. Assume the only
13094 unspecs that can get here are ours. Not that we could do
13095 anything with them anyway.... */
13096 if (GET_CODE (addr) == UNSPEC
13097 || (GET_CODE (addr) == PLUS
13098 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
13099 return orig;
13100 gcc_assert (GET_CODE (addr) == PLUS);
13102 if (GET_CODE (addr) == PLUS)
13104 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
13106 /* Check first to see if this is a constant offset from a @GOTOFF
13107 symbol reference. */
13108 if (!TARGET_PECOFF && gotoff_operand (op0, Pmode)
13109 && CONST_INT_P (op1))
13111 if (!TARGET_64BIT)
13113 if (reload_in_progress)
13114 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13115 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
13116 UNSPEC_GOTOFF);
13117 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
13118 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
13119 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
13121 if (reg != 0)
13123 emit_move_insn (reg, new_rtx);
13124 new_rtx = reg;
13127 else
13129 if (INTVAL (op1) < -16*1024*1024
13130 || INTVAL (op1) >= 16*1024*1024)
13132 if (!x86_64_immediate_operand (op1, Pmode))
13133 op1 = force_reg (Pmode, op1);
13134 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
13138 else
13140 rtx base = legitimize_pic_address (op0, reg);
13141 enum machine_mode mode = GET_MODE (base);
13142 new_rtx
13143 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
13145 if (CONST_INT_P (new_rtx))
13147 if (INTVAL (new_rtx) < -16*1024*1024
13148 || INTVAL (new_rtx) >= 16*1024*1024)
13150 if (!x86_64_immediate_operand (new_rtx, mode))
13151 new_rtx = force_reg (mode, new_rtx);
13152 new_rtx
13153 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
13155 else
13156 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
13158 else
13160 if (GET_CODE (new_rtx) == PLUS
13161 && CONSTANT_P (XEXP (new_rtx, 1)))
13163 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
13164 new_rtx = XEXP (new_rtx, 1);
13166 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
13171 return new_rtx;
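/* Illustrative sketch, not part of the build: in a hypothetical 32bit PIC
   compilation, legitimize_pic_address rewrites

     (symbol_ref "glob")    ; global data
       -> (mem (plus pic_offset_table_rtx
                     (const (unspec [(symbol_ref "glob")] UNSPEC_GOT))))

     (symbol_ref "loc")     ; local/static data
       -> (plus pic_offset_table_rtx
                (const (unspec [(symbol_ref "loc")] UNSPEC_GOTOFF)))

   i.e. a load from the GOT in the first case and a PIC-register-relative
   offset in the second, matching the two cases described in the function's
   header comment.  */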
13174 /* Load the thread pointer. If TO_REG is true, force it into a register. */
13176 static rtx
13177 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
13179 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
13181 if (GET_MODE (tp) != tp_mode)
13183 gcc_assert (GET_MODE (tp) == SImode);
13184 gcc_assert (tp_mode == DImode);
13186 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
13189 if (to_reg)
13190 tp = copy_to_mode_reg (tp_mode, tp);
13192 return tp;
13195 /* Construct the SYMBOL_REF for the tls_get_addr function. */
13197 static GTY(()) rtx ix86_tls_symbol;
13199 static rtx
13200 ix86_tls_get_addr (void)
13202 if (!ix86_tls_symbol)
13204 const char *sym
13205 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
13206 ? "___tls_get_addr" : "__tls_get_addr");
13208 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
13211 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
13213 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
13214 UNSPEC_PLTOFF);
13215 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
13216 gen_rtx_CONST (Pmode, unspec));
13219 return ix86_tls_symbol;
13222 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
13224 static GTY(()) rtx ix86_tls_module_base_symbol;
13227 ix86_tls_module_base (void)
13229 if (!ix86_tls_module_base_symbol)
13231 ix86_tls_module_base_symbol
13232 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
13234 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
13235 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
13238 return ix86_tls_module_base_symbol;
13241 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
13242 false if we expect this to be used for a memory address and true if
13243 we expect to load the address into a register. */
13245 static rtx
13246 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
13248 rtx dest, base, off;
13249 rtx pic = NULL_RTX, tp = NULL_RTX;
13250 enum machine_mode tp_mode = Pmode;
13251 int type;
13253 switch (model)
13255 case TLS_MODEL_GLOBAL_DYNAMIC:
13256 dest = gen_reg_rtx (Pmode);
13258 if (!TARGET_64BIT)
13260 if (flag_pic && !TARGET_PECOFF)
13261 pic = pic_offset_table_rtx;
13262 else
13264 pic = gen_reg_rtx (Pmode);
13265 emit_insn (gen_set_got (pic));
13269 if (TARGET_GNU2_TLS)
13271 if (TARGET_64BIT)
13272 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
13273 else
13274 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
13276 tp = get_thread_pointer (Pmode, true);
13277 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
13279 if (GET_MODE (x) != Pmode)
13280 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13282 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13284 else
13286 rtx caddr = ix86_tls_get_addr ();
13288 if (TARGET_64BIT)
13290 rtx rax = gen_rtx_REG (Pmode, AX_REG);
13291 rtx insns;
13293 start_sequence ();
13294 emit_call_insn
13295 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
13296 insns = get_insns ();
13297 end_sequence ();
13299 if (GET_MODE (x) != Pmode)
13300 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13302 RTL_CONST_CALL_P (insns) = 1;
13303 emit_libcall_block (insns, dest, rax, x);
13305 else
13306 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
13308 break;
13310 case TLS_MODEL_LOCAL_DYNAMIC:
13311 base = gen_reg_rtx (Pmode);
13313 if (!TARGET_64BIT)
13315 if (flag_pic)
13316 pic = pic_offset_table_rtx;
13317 else
13319 pic = gen_reg_rtx (Pmode);
13320 emit_insn (gen_set_got (pic));
13324 if (TARGET_GNU2_TLS)
13326 rtx tmp = ix86_tls_module_base ();
13328 if (TARGET_64BIT)
13329 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
13330 else
13331 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
13333 tp = get_thread_pointer (Pmode, true);
13334 set_unique_reg_note (get_last_insn (), REG_EQUAL,
13335 gen_rtx_MINUS (Pmode, tmp, tp));
13337 else
13339 rtx caddr = ix86_tls_get_addr ();
13341 if (TARGET_64BIT)
13343 rtx rax = gen_rtx_REG (Pmode, AX_REG);
13344 rtx insns, eqv;
13346 start_sequence ();
13347 emit_call_insn
13348 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
13349 insns = get_insns ();
13350 end_sequence ();
13352 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
13353 share the LD_BASE result with other LD model accesses. */
13354 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
13355 UNSPEC_TLS_LD_BASE);
13357 RTL_CONST_CALL_P (insns) = 1;
13358 emit_libcall_block (insns, base, rax, eqv);
13360 else
13361 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
13364 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
13365 off = gen_rtx_CONST (Pmode, off);
13367 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
13369 if (TARGET_GNU2_TLS)
13371 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
13373 if (GET_MODE (x) != Pmode)
13374 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13376 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13378 break;
13380 case TLS_MODEL_INITIAL_EXEC:
13381 if (TARGET_64BIT)
13383 if (TARGET_SUN_TLS && !TARGET_X32)
13385 /* The Sun linker took the AMD64 TLS spec literally
13386 and can only handle %rax as destination of the
13387 initial executable code sequence. */
13389 dest = gen_reg_rtx (DImode);
13390 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
13391 return dest;
13394 /* Generate DImode references to avoid %fs:(%reg32)
13395 problems and linker IE->LE relaxation bug. */
13396 tp_mode = DImode;
13397 pic = NULL;
13398 type = UNSPEC_GOTNTPOFF;
13400 else if (flag_pic)
13402 if (reload_in_progress)
13403 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13404 pic = pic_offset_table_rtx;
13405 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
13407 else if (!TARGET_ANY_GNU_TLS)
13409 pic = gen_reg_rtx (Pmode);
13410 emit_insn (gen_set_got (pic));
13411 type = UNSPEC_GOTTPOFF;
13413 else
13415 pic = NULL;
13416 type = UNSPEC_INDNTPOFF;
13419 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
13420 off = gen_rtx_CONST (tp_mode, off);
13421 if (pic)
13422 off = gen_rtx_PLUS (tp_mode, pic, off);
13423 off = gen_const_mem (tp_mode, off);
13424 set_mem_alias_set (off, ix86_GOT_alias_set ());
13426 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13428 base = get_thread_pointer (tp_mode,
13429 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13430 off = force_reg (tp_mode, off);
13431 return gen_rtx_PLUS (tp_mode, base, off);
13433 else
13435 base = get_thread_pointer (Pmode, true);
13436 dest = gen_reg_rtx (Pmode);
13437 emit_insn (ix86_gen_sub3 (dest, base, off));
13439 break;
13441 case TLS_MODEL_LOCAL_EXEC:
13442 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
13443 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13444 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
13445 off = gen_rtx_CONST (Pmode, off);
13447 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13449 base = get_thread_pointer (Pmode,
13450 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13451 return gen_rtx_PLUS (Pmode, base, off);
13453 else
13455 base = get_thread_pointer (Pmode, true);
13456 dest = gen_reg_rtx (Pmode);
13457 emit_insn (ix86_gen_sub3 (dest, base, off));
13459 break;
13461 default:
13462 gcc_unreachable ();
13465 return dest;
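/* Illustrative sketch, not part of the build: for a hypothetical local-exec
   TLS variable "x" under the GNU TLS ABI, the code above builds

     (plus tp (const (unspec [(symbol_ref "x")] UNSPEC_NTPOFF)))

   where tp comes from get_thread_pointer; the backend later renders this as
   a segment-relative access such as "movl %gs:x@ntpoff, %eax" (operands made
   up for illustration).  */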
13468 /* Create or return the unique __imp_DECL dllimport symbol corresponding
13469 to symbol DECL if BEIMPORT is true. Otherwise create or return the
13470 unique refptr-DECL symbol corresponding to symbol DECL. */
13472 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
13473 htab_t dllimport_map;
13475 static tree
13476 get_dllimport_decl (tree decl, bool beimport)
13478 struct tree_map *h, in;
13479 void **loc;
13480 const char *name;
13481 const char *prefix;
13482 size_t namelen, prefixlen;
13483 char *imp_name;
13484 tree to;
13485 rtx rtl;
13487 if (!dllimport_map)
13488 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
13490 in.hash = htab_hash_pointer (decl);
13491 in.base.from = decl;
13492 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
13493 h = (struct tree_map *) *loc;
13494 if (h)
13495 return h->to;
13497 *loc = h = ggc_alloc_tree_map ();
13498 h->hash = in.hash;
13499 h->base.from = decl;
13500 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13501 VAR_DECL, NULL, ptr_type_node);
13502 DECL_ARTIFICIAL (to) = 1;
13503 DECL_IGNORED_P (to) = 1;
13504 DECL_EXTERNAL (to) = 1;
13505 TREE_READONLY (to) = 1;
13507 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13508 name = targetm.strip_name_encoding (name);
13509 if (beimport)
13510 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13511 ? "*__imp_" : "*__imp__";
13512 else
13513 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
13514 namelen = strlen (name);
13515 prefixlen = strlen (prefix);
13516 imp_name = (char *) alloca (namelen + prefixlen + 1);
13517 memcpy (imp_name, prefix, prefixlen);
13518 memcpy (imp_name + prefixlen, name, namelen + 1);
13520 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13521 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13522 SET_SYMBOL_REF_DECL (rtl, to);
13523 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
13524 if (!beimport)
13526 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
13527 #ifdef SUB_TARGET_RECORD_STUB
13528 SUB_TARGET_RECORD_STUB (name);
13529 #endif
13532 rtl = gen_const_mem (Pmode, rtl);
13533 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13535 SET_DECL_RTL (to, rtl);
13536 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13538 return to;
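/* Illustrative sketch, not part of the build: for a hypothetical dllimported
   function "bar", get_dllimport_decl creates an artificial VAR_DECL whose
   assembler name is "*__imp_bar" (or "*__imp__bar" when a user label prefix
   is in use) and whose DECL_RTL is

     (mem/u (symbol_ref "*__imp_bar"))

   i.e. a read-only load of the import-table slot; refptr stub variables get
   an analogous "refptr." prefix instead.  */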
13541 /* Expand SYMBOL into its corresponding far-address symbol.
13542 WANT_REG is true if we require the result to be a register. */
13544 static rtx
13545 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
13547 tree imp_decl;
13548 rtx x;
13550 gcc_assert (SYMBOL_REF_DECL (symbol));
13551 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
13553 x = DECL_RTL (imp_decl);
13554 if (want_reg)
13555 x = force_reg (Pmode, x);
13556 return x;
13559 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13560 true if we require the result to be a register. */
13562 static rtx
13563 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13565 tree imp_decl;
13566 rtx x;
13568 gcc_assert (SYMBOL_REF_DECL (symbol));
13569 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
13571 x = DECL_RTL (imp_decl);
13572 if (want_reg)
13573 x = force_reg (Pmode, x);
13574 return x;
13577 /* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
13578 is true if we require the result to be a register. */
13580 static rtx
13581 legitimize_pe_coff_symbol (rtx addr, bool inreg)
13583 if (!TARGET_PECOFF)
13584 return NULL_RTX;
13586 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13588 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
13589 return legitimize_dllimport_symbol (addr, inreg);
13590 if (GET_CODE (addr) == CONST
13591 && GET_CODE (XEXP (addr, 0)) == PLUS
13592 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13593 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
13595 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
13596 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13600 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
13601 return NULL_RTX;
13602 if (GET_CODE (addr) == SYMBOL_REF
13603 && !is_imported_p (addr)
13604 && SYMBOL_REF_EXTERNAL_P (addr)
13605 && SYMBOL_REF_DECL (addr))
13606 return legitimize_pe_coff_extern_decl (addr, inreg);
13608 if (GET_CODE (addr) == CONST
13609 && GET_CODE (XEXP (addr, 0)) == PLUS
13610 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13611 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
13612 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
13613 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
13615 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
13616 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13618 return NULL_RTX;
13621 /* Try machine-dependent ways of modifying an illegitimate address
13622 to be legitimate. If we find one, return the new, valid address.
13623 This macro is used in only one place: `memory_address' in explow.c.
13625 OLDX is the address as it was before break_out_memory_refs was called.
13626 In some cases it is useful to look at this to decide what needs to be done.
13628 It is always safe for this macro to do nothing. It exists to recognize
13629 opportunities to optimize the output.
13631 For the 80386, we handle X+REG by loading X into a register R and
13632 using R+REG. R will go in a general reg and indexing will be used.
13633 However, if REG is a broken-out memory address or multiplication,
13634 nothing needs to be done because REG can certainly go in a general reg.
13636 When -fpic is used, special handling is needed for symbolic references.
13637 See comments by legitimize_pic_address in i386.c for details. */
13639 static rtx
13640 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13641 enum machine_mode mode)
13643 int changed = 0;
13644 unsigned log;
13646 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13647 if (log)
13648 return legitimize_tls_address (x, (enum tls_model) log, false);
13649 if (GET_CODE (x) == CONST
13650 && GET_CODE (XEXP (x, 0)) == PLUS
13651 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13652 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13654 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13655 (enum tls_model) log, false);
13656 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13659 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13661 rtx tmp = legitimize_pe_coff_symbol (x, true);
13662 if (tmp)
13663 return tmp;
13666 if (flag_pic && SYMBOLIC_CONST (x))
13667 return legitimize_pic_address (x, 0);
13669 #if TARGET_MACHO
13670 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13671 return machopic_indirect_data_reference (x, 0);
13672 #endif
13674 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13675 if (GET_CODE (x) == ASHIFT
13676 && CONST_INT_P (XEXP (x, 1))
13677 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13679 changed = 1;
13680 log = INTVAL (XEXP (x, 1));
13681 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13682 GEN_INT (1 << log));
13685 if (GET_CODE (x) == PLUS)
13687 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13689 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13690 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13691 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13693 changed = 1;
13694 log = INTVAL (XEXP (XEXP (x, 0), 1));
13695 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13696 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13697 GEN_INT (1 << log));
13700 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13701 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13702 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13704 changed = 1;
13705 log = INTVAL (XEXP (XEXP (x, 1), 1));
13706 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13707 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13708 GEN_INT (1 << log));
13711 /* Put multiply first if it isn't already. */
13712 if (GET_CODE (XEXP (x, 1)) == MULT)
13714 rtx tmp = XEXP (x, 0);
13715 XEXP (x, 0) = XEXP (x, 1);
13716 XEXP (x, 1) = tmp;
13717 changed = 1;
13720 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13721 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13722 created by virtual register instantiation, register elimination, and
13723 similar optimizations. */
13724 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13726 changed = 1;
13727 x = gen_rtx_PLUS (Pmode,
13728 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13729 XEXP (XEXP (x, 1), 0)),
13730 XEXP (XEXP (x, 1), 1));
13733 /* Canonicalize
13734 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13735 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13736 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13737 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13738 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13739 && CONSTANT_P (XEXP (x, 1)))
13741 rtx constant;
13742 rtx other = NULL_RTX;
13744 if (CONST_INT_P (XEXP (x, 1)))
13746 constant = XEXP (x, 1);
13747 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13749 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13751 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13752 other = XEXP (x, 1);
13754 else
13755 constant = 0;
13757 if (constant)
13759 changed = 1;
13760 x = gen_rtx_PLUS (Pmode,
13761 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13762 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13763 plus_constant (Pmode, other,
13764 INTVAL (constant)));
13768 if (changed && ix86_legitimate_address_p (mode, x, false))
13769 return x;
13771 if (GET_CODE (XEXP (x, 0)) == MULT)
13773 changed = 1;
13774 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13777 if (GET_CODE (XEXP (x, 1)) == MULT)
13779 changed = 1;
13780 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13783 if (changed
13784 && REG_P (XEXP (x, 1))
13785 && REG_P (XEXP (x, 0)))
13786 return x;
13788 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13790 changed = 1;
13791 x = legitimize_pic_address (x, 0);
13794 if (changed && ix86_legitimate_address_p (mode, x, false))
13795 return x;
13797 if (REG_P (XEXP (x, 0)))
13799 rtx temp = gen_reg_rtx (Pmode);
13800 rtx val = force_operand (XEXP (x, 1), temp);
13801 if (val != temp)
13803 val = convert_to_mode (Pmode, val, 1);
13804 emit_move_insn (temp, val);
13807 XEXP (x, 1) = temp;
13808 return x;
13811 else if (REG_P (XEXP (x, 1)))
13813 rtx temp = gen_reg_rtx (Pmode);
13814 rtx val = force_operand (XEXP (x, 0), temp);
13815 if (val != temp)
13817 val = convert_to_mode (Pmode, val, 1);
13818 emit_move_insn (temp, val);
13821 XEXP (x, 0) = temp;
13822 return x;
13826 return x;
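/* Illustrative sketch, not part of the build: the shift-to-multiply
   canonicalization above turns a hypothetical

     (plus:SI (ashift:SI (reg:SI 90) (const_int 2)) (reg:SI 91))

   into

     (plus:SI (mult:SI (reg:SI 90) (const_int 4)) (reg:SI 91))

   so that it matches the base + scale*index form accepted by
   ix86_legitimate_address_p.  */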
13829 /* Print an integer constant expression in assembler syntax. Addition
13830 and subtraction are the only arithmetic that may appear in these
13831 expressions. FILE is the stdio stream to write to, X is the rtx, and
13832 CODE is the operand print code from the output string. */
13834 static void
13835 output_pic_addr_const (FILE *file, rtx x, int code)
13837 char buf[256];
13839 switch (GET_CODE (x))
13841 case PC:
13842 gcc_assert (flag_pic);
13843 putc ('.', file);
13844 break;
13846 case SYMBOL_REF:
13847 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13848 output_addr_const (file, x);
13849 else
13851 const char *name = XSTR (x, 0);
13853 /* Mark the decl as referenced so that cgraph will
13854 output the function. */
13855 if (SYMBOL_REF_DECL (x))
13856 mark_decl_referenced (SYMBOL_REF_DECL (x));
13858 #if TARGET_MACHO
13859 if (MACHOPIC_INDIRECT
13860 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13861 name = machopic_indirection_name (x, /*stub_p=*/true);
13862 #endif
13863 assemble_name (file, name);
13865 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
13866 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13867 fputs ("@PLT", file);
13868 break;
13870 case LABEL_REF:
13871 x = XEXP (x, 0);
13872 /* FALLTHRU */
13873 case CODE_LABEL:
13874 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13875 assemble_name (asm_out_file, buf);
13876 break;
13878 case CONST_INT:
13879 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13880 break;
13882 case CONST:
13883 /* This used to output parentheses around the expression,
13884 but that does not work on the 386 (either ATT or BSD assembler). */
13885 output_pic_addr_const (file, XEXP (x, 0), code);
13886 break;
13888 case CONST_DOUBLE:
13889 if (GET_MODE (x) == VOIDmode)
13891 /* We can use %d if the number is <32 bits and positive. */
13892 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13893 fprintf (file, "0x%lx%08lx",
13894 (unsigned long) CONST_DOUBLE_HIGH (x),
13895 (unsigned long) CONST_DOUBLE_LOW (x));
13896 else
13897 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13899 else
13900 /* We can't handle floating point constants;
13901 TARGET_PRINT_OPERAND must handle them. */
13902 output_operand_lossage ("floating constant misused");
13903 break;
13905 case PLUS:
13906 /* Some assemblers need integer constants to appear first. */
13907 if (CONST_INT_P (XEXP (x, 0)))
13909 output_pic_addr_const (file, XEXP (x, 0), code);
13910 putc ('+', file);
13911 output_pic_addr_const (file, XEXP (x, 1), code);
13913 else
13915 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13916 output_pic_addr_const (file, XEXP (x, 1), code);
13917 putc ('+', file);
13918 output_pic_addr_const (file, XEXP (x, 0), code);
13920 break;
13922 case MINUS:
13923 if (!TARGET_MACHO)
13924 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13925 output_pic_addr_const (file, XEXP (x, 0), code);
13926 putc ('-', file);
13927 output_pic_addr_const (file, XEXP (x, 1), code);
13928 if (!TARGET_MACHO)
13929 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13930 break;
13932 case UNSPEC:
13933 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13935 bool f = i386_asm_output_addr_const_extra (file, x);
13936 gcc_assert (f);
13937 break;
13940 gcc_assert (XVECLEN (x, 0) == 1);
13941 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13942 switch (XINT (x, 1))
13944 case UNSPEC_GOT:
13945 fputs ("@GOT", file);
13946 break;
13947 case UNSPEC_GOTOFF:
13948 fputs ("@GOTOFF", file);
13949 break;
13950 case UNSPEC_PLTOFF:
13951 fputs ("@PLTOFF", file);
13952 break;
13953 case UNSPEC_PCREL:
13954 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13955 "(%rip)" : "[rip]", file);
13956 break;
13957 case UNSPEC_GOTPCREL:
13958 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13959 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13960 break;
13961 case UNSPEC_GOTTPOFF:
13962 /* FIXME: This might be @TPOFF in Sun ld too. */
13963 fputs ("@gottpoff", file);
13964 break;
13965 case UNSPEC_TPOFF:
13966 fputs ("@tpoff", file);
13967 break;
13968 case UNSPEC_NTPOFF:
13969 if (TARGET_64BIT)
13970 fputs ("@tpoff", file);
13971 else
13972 fputs ("@ntpoff", file);
13973 break;
13974 case UNSPEC_DTPOFF:
13975 fputs ("@dtpoff", file);
13976 break;
13977 case UNSPEC_GOTNTPOFF:
13978 if (TARGET_64BIT)
13979 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13980 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13981 else
13982 fputs ("@gotntpoff", file);
13983 break;
13984 case UNSPEC_INDNTPOFF:
13985 fputs ("@indntpoff", file);
13986 break;
13987 #if TARGET_MACHO
13988 case UNSPEC_MACHOPIC_OFFSET:
13989 putc ('-', file);
13990 machopic_output_function_base_name (file);
13991 break;
13992 #endif
13993 default:
13994 output_operand_lossage ("invalid UNSPEC as operand");
13995 break;
13997 break;
13999 default:
14000 output_operand_lossage ("invalid expression as operand");
14004 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
14005 We need to emit DTP-relative relocations. */
14007 static void ATTRIBUTE_UNUSED
14008 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
14010 fputs (ASM_LONG, file);
14011 output_addr_const (file, x);
14012 fputs ("@dtpoff", file);
14013 switch (size)
14015 case 4:
14016 break;
14017 case 8:
14018 fputs (", 0", file);
14019 break;
14020 default:
14021 gcc_unreachable ();
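/* Illustrative sketch, not part of the build: for a hypothetical TLS
   variable "x", the routine above emits

     .long x@dtpoff         for size 4
     .long x@dtpoff, 0      for size 8

   assuming ASM_LONG expands to ".long".  */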
14025 /* Return true if X is a representation of the PIC register. This copes
14026 with calls from ix86_find_base_term, where the register might have
14027 been replaced by a cselib value. */
14029 static bool
14030 ix86_pic_register_p (rtx x)
14032 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
14033 return (pic_offset_table_rtx
14034 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
14035 else
14036 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
14039 /* Helper function for ix86_delegitimize_address.
14040 Attempt to delegitimize TLS local-exec accesses. */
14042 static rtx
14043 ix86_delegitimize_tls_address (rtx orig_x)
14045 rtx x = orig_x, unspec;
14046 struct ix86_address addr;
14048 if (!TARGET_TLS_DIRECT_SEG_REFS)
14049 return orig_x;
14050 if (MEM_P (x))
14051 x = XEXP (x, 0);
14052 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
14053 return orig_x;
14054 if (ix86_decompose_address (x, &addr) == 0
14055 || addr.seg != DEFAULT_TLS_SEG_REG
14056 || addr.disp == NULL_RTX
14057 || GET_CODE (addr.disp) != CONST)
14058 return orig_x;
14059 unspec = XEXP (addr.disp, 0);
14060 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
14061 unspec = XEXP (unspec, 0);
14062 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
14063 return orig_x;
14064 x = XVECEXP (unspec, 0, 0);
14065 gcc_assert (GET_CODE (x) == SYMBOL_REF);
14066 if (unspec != XEXP (addr.disp, 0))
14067 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
14068 if (addr.index)
14070 rtx idx = addr.index;
14071 if (addr.scale != 1)
14072 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
14073 x = gen_rtx_PLUS (Pmode, idx, x);
14075 if (addr.base)
14076 x = gen_rtx_PLUS (Pmode, addr.base, x);
14077 if (MEM_P (orig_x))
14078 x = replace_equiv_address_nv (orig_x, x);
14079 return x;
14082 /* In the name of slightly smaller debug output, and to cater to
14083 general assembler lossage, recognize PIC+GOTOFF and turn it back
14084 into a direct symbol reference.
14086 On Darwin, this is necessary to avoid a crash, because Darwin
14087 has a different PIC label for each routine but the DWARF debugging
14088 information is not associated with any particular routine, so it's
14089 necessary to remove references to the PIC label from RTL stored by
14090 the DWARF output code. */
14092 static rtx
14093 ix86_delegitimize_address (rtx x)
14095 rtx orig_x = delegitimize_mem_from_attrs (x);
14096 /* addend is NULL or some rtx if x is something+GOTOFF where
14097 something doesn't include the PIC register. */
14098 rtx addend = NULL_RTX;
14099 /* reg_addend is NULL or a multiple of some register. */
14100 rtx reg_addend = NULL_RTX;
14101 /* const_addend is NULL or a const_int. */
14102 rtx const_addend = NULL_RTX;
14103 /* This is the result, or NULL. */
14104 rtx result = NULL_RTX;
14106 x = orig_x;
14108 if (MEM_P (x))
14109 x = XEXP (x, 0);
14111 if (TARGET_64BIT)
14113 if (GET_CODE (x) == CONST
14114 && GET_CODE (XEXP (x, 0)) == PLUS
14115 && GET_MODE (XEXP (x, 0)) == Pmode
14116 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
14117 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
14118 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
14120 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
14121 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
14122 if (MEM_P (orig_x))
14123 x = replace_equiv_address_nv (orig_x, x);
14124 return x;
14127 if (GET_CODE (x) == CONST
14128 && GET_CODE (XEXP (x, 0)) == UNSPEC
14129 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
14130 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
14131 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
14133 x = XVECEXP (XEXP (x, 0), 0, 0);
14134 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
14136 x = simplify_gen_subreg (GET_MODE (orig_x), x,
14137 GET_MODE (x), 0);
14138 if (x == NULL_RTX)
14139 return orig_x;
14141 return x;
14144 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
14145 return ix86_delegitimize_tls_address (orig_x);
14147 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
14148 and -mcmodel=medium -fpic. */
14151 if (GET_CODE (x) != PLUS
14152 || GET_CODE (XEXP (x, 1)) != CONST)
14153 return ix86_delegitimize_tls_address (orig_x);
14155 if (ix86_pic_register_p (XEXP (x, 0)))
14156 /* %ebx + GOT/GOTOFF */
14158 else if (GET_CODE (XEXP (x, 0)) == PLUS)
14160 /* %ebx + %reg * scale + GOT/GOTOFF */
14161 reg_addend = XEXP (x, 0);
14162 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
14163 reg_addend = XEXP (reg_addend, 1);
14164 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
14165 reg_addend = XEXP (reg_addend, 0);
14166 else
14168 reg_addend = NULL_RTX;
14169 addend = XEXP (x, 0);
14172 else
14173 addend = XEXP (x, 0);
14175 x = XEXP (XEXP (x, 1), 0);
14176 if (GET_CODE (x) == PLUS
14177 && CONST_INT_P (XEXP (x, 1)))
14179 const_addend = XEXP (x, 1);
14180 x = XEXP (x, 0);
14183 if (GET_CODE (x) == UNSPEC
14184 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
14185 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
14186 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
14187 && !MEM_P (orig_x) && !addend)))
14188 result = XVECEXP (x, 0, 0);
14190 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
14191 && !MEM_P (orig_x))
14192 result = XVECEXP (x, 0, 0);
14194 if (! result)
14195 return ix86_delegitimize_tls_address (orig_x);
14197 if (const_addend)
14198 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
14199 if (reg_addend)
14200 result = gen_rtx_PLUS (Pmode, reg_addend, result);
14201 if (addend)
14203 /* If the rest of original X doesn't involve the PIC register, add
14204 addend and subtract pic_offset_table_rtx. This can happen e.g.
14205 for code like:
14206 leal (%ebx, %ecx, 4), %ecx
14208 movl foo@GOTOFF(%ecx), %edx
14209 in which case we return (%ecx - %ebx) + foo. */
14210 if (pic_offset_table_rtx)
14211 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
14212 pic_offset_table_rtx),
14213 result);
14214 else
14215 return orig_x;
14217 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
14219 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
14220 if (result == NULL_RTX)
14221 return orig_x;
14223 return result;
14226 /* If X is a machine specific address (i.e. a symbol or label being
14227 referenced as a displacement from the GOT implemented using an
14228 UNSPEC), then return the base term. Otherwise return X. */
14231 ix86_find_base_term (rtx x)
14233 rtx term;
14235 if (TARGET_64BIT)
14237 if (GET_CODE (x) != CONST)
14238 return x;
14239 term = XEXP (x, 0);
14240 if (GET_CODE (term) == PLUS
14241 && (CONST_INT_P (XEXP (term, 1))
14242 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
14243 term = XEXP (term, 0);
14244 if (GET_CODE (term) != UNSPEC
14245 || (XINT (term, 1) != UNSPEC_GOTPCREL
14246 && XINT (term, 1) != UNSPEC_PCREL))
14247 return x;
14249 return XVECEXP (term, 0, 0);
14252 return ix86_delegitimize_address (x);
14255 static void
14256 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
14257 bool fp, FILE *file)
14259 const char *suffix;
14261 if (mode == CCFPmode || mode == CCFPUmode)
14263 code = ix86_fp_compare_code_to_integer (code);
14264 mode = CCmode;
14266 if (reverse)
14267 code = reverse_condition (code);
14269 switch (code)
14271 case EQ:
14272 switch (mode)
14274 case CCAmode:
14275 suffix = "a";
14276 break;
14278 case CCCmode:
14279 suffix = "c";
14280 break;
14282 case CCOmode:
14283 suffix = "o";
14284 break;
14286 case CCSmode:
14287 suffix = "s";
14288 break;
14290 default:
14291 suffix = "e";
14293 break;
14294 case NE:
14295 switch (mode)
14297 case CCAmode:
14298 suffix = "na";
14299 break;
14301 case CCCmode:
14302 suffix = "nc";
14303 break;
14305 case CCOmode:
14306 suffix = "no";
14307 break;
14309 case CCSmode:
14310 suffix = "ns";
14311 break;
14313 default:
14314 suffix = "ne";
14316 break;
14317 case GT:
14318 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
14319 suffix = "g";
14320 break;
14321 case GTU:
14322 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
14323 Those same assemblers have the same but opposite lossage on cmov. */
14324 if (mode == CCmode)
14325 suffix = fp ? "nbe" : "a";
14326 else
14327 gcc_unreachable ();
14328 break;
14329 case LT:
14330 switch (mode)
14332 case CCNOmode:
14333 case CCGOCmode:
14334 suffix = "s";
14335 break;
14337 case CCmode:
14338 case CCGCmode:
14339 suffix = "l";
14340 break;
14342 default:
14343 gcc_unreachable ();
14345 break;
14346 case LTU:
14347 if (mode == CCmode)
14348 suffix = "b";
14349 else if (mode == CCCmode)
14350 suffix = "c";
14351 else
14352 gcc_unreachable ();
14353 break;
14354 case GE:
14355 switch (mode)
14357 case CCNOmode:
14358 case CCGOCmode:
14359 suffix = "ns";
14360 break;
14362 case CCmode:
14363 case CCGCmode:
14364 suffix = "ge";
14365 break;
14367 default:
14368 gcc_unreachable ();
14370 break;
14371 case GEU:
14372 if (mode == CCmode)
14373 suffix = fp ? "nb" : "ae";
14374 else if (mode == CCCmode)
14375 suffix = "nc";
14376 else
14377 gcc_unreachable ();
14378 break;
14379 case LE:
14380 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
14381 suffix = "le";
14382 break;
14383 case LEU:
14384 if (mode == CCmode)
14385 suffix = "be";
14386 else
14387 gcc_unreachable ();
14388 break;
14389 case UNORDERED:
14390 suffix = fp ? "u" : "p";
14391 break;
14392 case ORDERED:
14393 suffix = fp ? "nu" : "np";
14394 break;
14395 default:
14396 gcc_unreachable ();
14398 fputs (suffix, file);
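/* Illustrative sketch, not part of the build: a signed less-than comparison
   whose flags live in CCGCmode reaches the LT/CCGCmode arm above and prints
   the suffix "l", so the caller ends up emitting e.g. "setl", "jl" or
   "cmovl"; the same LT in CCNOmode or CCGOCmode only tests the sign flag
   and prints "s" instead.  */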
14401 /* Print the name of register X to FILE based on its machine mode and number.
14402 If CODE is 'w', pretend the mode is HImode.
14403 If CODE is 'b', pretend the mode is QImode.
14404 If CODE is 'k', pretend the mode is SImode.
14405 If CODE is 'q', pretend the mode is DImode.
14406 If CODE is 'x', pretend the mode is V4SFmode.
14407 If CODE is 't', pretend the mode is V8SFmode.
14408 If CODE is 'g', pretend the mode is V16SFmode.
14409 If CODE is 'h', pretend the reg is the 'high' byte register.
14410 If CODE is 'y', print "st(0)" instead of "st", if the reg is a stack op.
14411 If CODE is 'd', duplicate the operand for an AVX instruction.
14414 void
14415 print_reg (rtx x, int code, FILE *file)
14417 const char *reg;
14418 unsigned int regno;
14419 bool duplicated = code == 'd' && TARGET_AVX;
14421 if (ASSEMBLER_DIALECT == ASM_ATT)
14422 putc ('%', file);
14424 if (x == pc_rtx)
14426 gcc_assert (TARGET_64BIT);
14427 fputs ("rip", file);
14428 return;
14431 regno = true_regnum (x);
14432 gcc_assert (regno != ARG_POINTER_REGNUM
14433 && regno != FRAME_POINTER_REGNUM
14434 && regno != FLAGS_REG
14435 && regno != FPSR_REG
14436 && regno != FPCR_REG);
14438 if (code == 'w' || MMX_REG_P (x))
14439 code = 2;
14440 else if (code == 'b')
14441 code = 1;
14442 else if (code == 'k')
14443 code = 4;
14444 else if (code == 'q')
14445 code = 8;
14446 else if (code == 'y')
14447 code = 3;
14448 else if (code == 'h')
14449 code = 0;
14450 else if (code == 'x')
14451 code = 16;
14452 else if (code == 't')
14453 code = 32;
14454 else if (code == 'g')
14455 code = 64;
14456 else
14457 code = GET_MODE_SIZE (GET_MODE (x));
14459 /* Irritatingly, AMD extended registers use a different naming convention
14460 from the normal registers: "r%d[bwd]". */
14461 if (REX_INT_REGNO_P (regno))
14463 gcc_assert (TARGET_64BIT);
14464 putc ('r', file);
14465 fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
14466 switch (code)
14468 case 0:
14469 error ("extended registers have no high halves");
14470 break;
14471 case 1:
14472 putc ('b', file);
14473 break;
14474 case 2:
14475 putc ('w', file);
14476 break;
14477 case 4:
14478 putc ('d', file);
14479 break;
14480 case 8:
14481 /* no suffix */
14482 break;
14483 default:
14484 error ("unsupported operand size for extended register");
14485 break;
14487 return;
14490 reg = NULL;
14491 switch (code)
14493 case 3:
14494 if (STACK_TOP_P (x))
14496 reg = "st(0)";
14497 break;
14499 /* FALLTHRU */
14500 case 8:
14501 case 4:
14502 case 12:
14503 if (! ANY_FP_REG_P (x))
14504 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
14505 /* FALLTHRU */
14506 case 16:
14507 case 2:
14508 normal:
14509 reg = hi_reg_name[regno];
14510 break;
14511 case 1:
14512 if (regno >= ARRAY_SIZE (qi_reg_name))
14513 goto normal;
14514 reg = qi_reg_name[regno];
14515 break;
14516 case 0:
14517 if (regno >= ARRAY_SIZE (qi_high_reg_name))
14518 goto normal;
14519 reg = qi_high_reg_name[regno];
14520 break;
14521 case 32:
14522 if (SSE_REG_P (x))
14524 gcc_assert (!duplicated);
14525 putc ('y', file);
14526 fputs (hi_reg_name[regno] + 1, file);
14527 return;
14529 case 64:
14530 if (SSE_REG_P (x))
14532 gcc_assert (!duplicated);
14533 putc ('z', file);
14534 fputs (hi_reg_name[REGNO (x)] + 1, file);
14535 return;
14537 break;
14538 default:
14539 gcc_unreachable ();
14542 fputs (reg, file);
14543 if (duplicated)
14545 if (ASSEMBLER_DIALECT == ASM_ATT)
14546 fprintf (file, ", %%%s", reg);
14547 else
14548 fprintf (file, ", %s", reg);
14552 /* Locate some local-dynamic symbol still in use by this function
14553 so that we can print its name in some tls_local_dynamic_base
14554 pattern. */
14556 static int
14557 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
14559 rtx x = *px;
14561 if (GET_CODE (x) == SYMBOL_REF
14562 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
14564 cfun->machine->some_ld_name = XSTR (x, 0);
14565 return 1;
14568 return 0;
14571 static const char *
14572 get_some_local_dynamic_name (void)
14574 rtx insn;
14576 if (cfun->machine->some_ld_name)
14577 return cfun->machine->some_ld_name;
14579 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
14580 if (NONDEBUG_INSN_P (insn)
14581 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
14582 return cfun->machine->some_ld_name;
14584 return NULL;
14587 /* Meaning of CODE:
14588 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14589 C -- print opcode suffix for set/cmov insn.
14590 c -- like C, but print reversed condition
14591 F,f -- likewise, but for floating-point.
14592 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14593 otherwise nothing
14594 R -- print the prefix for register names.
14595 z -- print the opcode suffix for the size of the current operand.
14596 Z -- likewise, with special suffixes for x87 instructions.
14597 * -- print a star (in certain assembler syntax)
14598 A -- print an absolute memory reference.
14599 E -- print address with DImode register names if TARGET_64BIT.
14600 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14601 s -- print a shift double count, followed by the assembler's argument
14602 delimiter.
14603 b -- print the QImode name of the register for the indicated operand.
14604 %b0 would print %al if operands[0] is reg 0.
14605 w -- likewise, print the HImode name of the register.
14606 k -- likewise, print the SImode name of the register.
14607 q -- likewise, print the DImode name of the register.
14608 x -- likewise, print the V4SFmode name of the register.
14609 t -- likewise, print the V8SFmode name of the register.
14610 g -- likewise, print the V16SFmode name of the register.
14611 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14612 y -- print "st(0)" instead of "st" as a register.
14613 d -- print duplicated register operand for AVX instruction.
14614 D -- print condition for SSE cmp instruction.
14615 P -- if PIC, print an @PLT suffix.
14616 p -- print raw symbol name.
14617 X -- don't print any sort of PIC '@' suffix for a symbol.
14618 & -- print some in-use local-dynamic symbol name.
14619 H -- print a memory address offset by 8; used for sse high-parts
14620 Y -- print condition for XOP pcom* instruction.
14621 + -- print a branch hint as 'cs' or 'ds' prefix
14622 ; -- print a semicolon (after prefixes due to bug in older gas).
14623 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14624 @ -- print a segment register of thread base pointer load
14625 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
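/* Illustrative example (hypothetical template, not from the original source):
   in an output template such as "add%z0\t{%1, %0|%0, %1}", the 'z' code would
   print the size suffix derived from operands[0] (e.g. 'l' for SImode), while
   "%k1" would print the SImode name of the register in operands[1]. */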
14628 void
14629 ix86_print_operand (FILE *file, rtx x, int code)
14631 if (code)
14633 switch (code)
14635 case 'A':
14636 switch (ASSEMBLER_DIALECT)
14638 case ASM_ATT:
14639 putc ('*', file);
14640 break;
14642 case ASM_INTEL:
14643 /* Intel syntax. For absolute addresses, registers should not
14644 be surrounded by brackets. */
14645 if (!REG_P (x))
14647 putc ('[', file);
14648 ix86_print_operand (file, x, 0);
14649 putc (']', file);
14650 return;
14652 break;
14654 default:
14655 gcc_unreachable ();
14658 ix86_print_operand (file, x, 0);
14659 return;
14661 case 'E':
14662 /* Wrap address in an UNSPEC to declare special handling. */
14663 if (TARGET_64BIT)
14664 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14666 output_address (x);
14667 return;
14669 case 'L':
14670 if (ASSEMBLER_DIALECT == ASM_ATT)
14671 putc ('l', file);
14672 return;
14674 case 'W':
14675 if (ASSEMBLER_DIALECT == ASM_ATT)
14676 putc ('w', file);
14677 return;
14679 case 'B':
14680 if (ASSEMBLER_DIALECT == ASM_ATT)
14681 putc ('b', file);
14682 return;
14684 case 'Q':
14685 if (ASSEMBLER_DIALECT == ASM_ATT)
14686 putc ('l', file);
14687 return;
14689 case 'S':
14690 if (ASSEMBLER_DIALECT == ASM_ATT)
14691 putc ('s', file);
14692 return;
14694 case 'T':
14695 if (ASSEMBLER_DIALECT == ASM_ATT)
14696 putc ('t', file);
14697 return;
14699 case 'O':
14700 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14701 if (ASSEMBLER_DIALECT != ASM_ATT)
14702 return;
14704 switch (GET_MODE_SIZE (GET_MODE (x)))
14706 case 2:
14707 putc ('w', file);
14708 break;
14710 case 4:
14711 putc ('l', file);
14712 break;
14714 case 8:
14715 putc ('q', file);
14716 break;
14718 default:
14719 output_operand_lossage
14720 ("invalid operand size for operand code 'O'");
14721 return;
14724 putc ('.', file);
14725 #endif
14726 return;
14728 case 'z':
14729 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14731 /* Opcodes don't get size suffixes when using Intel syntax. */
14732 if (ASSEMBLER_DIALECT == ASM_INTEL)
14733 return;
14735 switch (GET_MODE_SIZE (GET_MODE (x)))
14737 case 1:
14738 putc ('b', file);
14739 return;
14741 case 2:
14742 putc ('w', file);
14743 return;
14745 case 4:
14746 putc ('l', file);
14747 return;
14749 case 8:
14750 putc ('q', file);
14751 return;
14753 default:
14754 output_operand_lossage
14755 ("invalid operand size for operand code 'z'");
14756 return;
14760 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14761 warning
14762 (0, "non-integer operand used with operand code 'z'");
14763 /* FALLTHRU */
14765 case 'Z':
14766 /* 387 opcodes don't get size suffixes when using Intel syntax. */
14767 if (ASSEMBLER_DIALECT == ASM_INTEL)
14768 return;
14770 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14772 switch (GET_MODE_SIZE (GET_MODE (x)))
14774 case 2:
14775 #ifdef HAVE_AS_IX86_FILDS
14776 putc ('s', file);
14777 #endif
14778 return;
14780 case 4:
14781 putc ('l', file);
14782 return;
14784 case 8:
14785 #ifdef HAVE_AS_IX86_FILDQ
14786 putc ('q', file);
14787 #else
14788 fputs ("ll", file);
14789 #endif
14790 return;
14792 default:
14793 break;
14796 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14798 /* 387 opcodes don't get size suffixes
14799 if the operands are registers. */
14800 if (STACK_REG_P (x))
14801 return;
14803 switch (GET_MODE_SIZE (GET_MODE (x)))
14805 case 4:
14806 putc ('s', file);
14807 return;
14809 case 8:
14810 putc ('l', file);
14811 return;
14813 case 12:
14814 case 16:
14815 putc ('t', file);
14816 return;
14818 default:
14819 break;
14822 else
14824 output_operand_lossage
14825 ("invalid operand type used with operand code 'Z'");
14826 return;
14829 output_operand_lossage
14830 ("invalid operand size for operand code 'Z'");
14831 return;
14833 case 'd':
14834 case 'b':
14835 case 'w':
14836 case 'k':
14837 case 'q':
14838 case 'h':
14839 case 't':
14840 case 'g':
14841 case 'y':
14842 case 'x':
14843 case 'X':
14844 case 'P':
14845 case 'p':
14846 break;
14848 case 's':
14849 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14851 ix86_print_operand (file, x, 0);
14852 fputs (", ", file);
14854 return;
14856 case 'Y':
14857 switch (GET_CODE (x))
14859 case NE:
14860 fputs ("neq", file);
14861 break;
14862 case EQ:
14863 fputs ("eq", file);
14864 break;
14865 case GE:
14866 case GEU:
14867 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14868 break;
14869 case GT:
14870 case GTU:
14871 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14872 break;
14873 case LE:
14874 case LEU:
14875 fputs ("le", file);
14876 break;
14877 case LT:
14878 case LTU:
14879 fputs ("lt", file);
14880 break;
14881 case UNORDERED:
14882 fputs ("unord", file);
14883 break;
14884 case ORDERED:
14885 fputs ("ord", file);
14886 break;
14887 case UNEQ:
14888 fputs ("ueq", file);
14889 break;
14890 case UNGE:
14891 fputs ("nlt", file);
14892 break;
14893 case UNGT:
14894 fputs ("nle", file);
14895 break;
14896 case UNLE:
14897 fputs ("ule", file);
14898 break;
14899 case UNLT:
14900 fputs ("ult", file);
14901 break;
14902 case LTGT:
14903 fputs ("une", file);
14904 break;
14905 default:
14906 output_operand_lossage ("operand is not a condition code, "
14907 "invalid operand code 'Y'");
14908 return;
14910 return;
14912 case 'D':
14913 /* Little bit of braindamage here. The SSE compare instructions
14914 use completely different names for the comparisons than the
14915 fp conditional moves do. */
14916 switch (GET_CODE (x))
14918 case UNEQ:
14919 if (TARGET_AVX)
14921 fputs ("eq_us", file);
14922 break;
14924 case EQ:
14925 fputs ("eq", file);
14926 break;
14927 case UNLT:
14928 if (TARGET_AVX)
14930 fputs ("nge", file);
14931 break;
14933 case LT:
14934 fputs ("lt", file);
14935 break;
14936 case UNLE:
14937 if (TARGET_AVX)
14939 fputs ("ngt", file);
14940 break;
14942 case LE:
14943 fputs ("le", file);
14944 break;
14945 case UNORDERED:
14946 fputs ("unord", file);
14947 break;
14948 case LTGT:
14949 if (TARGET_AVX)
14951 fputs ("neq_oq", file);
14952 break;
14954 case NE:
14955 fputs ("neq", file);
14956 break;
14957 case GE:
14958 if (TARGET_AVX)
14960 fputs ("ge", file);
14961 break;
14963 case UNGE:
14964 fputs ("nlt", file);
14965 break;
14966 case GT:
14967 if (TARGET_AVX)
14969 fputs ("gt", file);
14970 break;
14972 case UNGT:
14973 fputs ("nle", file);
14974 break;
14975 case ORDERED:
14976 fputs ("ord", file);
14977 break;
14978 default:
14979 output_operand_lossage ("operand is not a condition code, "
14980 "invalid operand code 'D'");
14981 return;
14983 return;
14985 case 'F':
14986 case 'f':
14987 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14988 if (ASSEMBLER_DIALECT == ASM_ATT)
14989 putc ('.', file);
14990 #endif
14992 case 'C':
14993 case 'c':
14994 if (!COMPARISON_P (x))
14996 output_operand_lossage ("operand is not a condition code, "
14997 "invalid operand code '%c'", code);
14998 return;
15000 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
15001 code == 'c' || code == 'f',
15002 code == 'F' || code == 'f',
15003 file);
15004 return;
15006 case 'H':
15007 if (!offsettable_memref_p (x))
15009 output_operand_lossage ("operand is not an offsettable memory "
15010 "reference, invalid operand code 'H'");
15011 return;
15013 /* It doesn't actually matter what mode we use here, as we're
15014 only going to use this for printing. */
15015 x = adjust_address_nv (x, DImode, 8);
15016 /* Output 'qword ptr' for intel assembler dialect. */
15017 if (ASSEMBLER_DIALECT == ASM_INTEL)
15018 code = 'q';
15019 break;
15021 case 'K':
15022 gcc_assert (CONST_INT_P (x));
15024 if (INTVAL (x) & IX86_HLE_ACQUIRE)
15025 #ifdef HAVE_AS_IX86_HLE
15026 fputs ("xacquire ", file);
15027 #else
15028 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
15029 #endif
15030 else if (INTVAL (x) & IX86_HLE_RELEASE)
15031 #ifdef HAVE_AS_IX86_HLE
15032 fputs ("xrelease ", file);
15033 #else
15034 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
15035 #endif
15036 /* We do not want to print the value of the operand. */
15037 return;
15039 case 'N':
15040 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
15041 fputs ("{z}", file);
15042 return;
15044 case '*':
15045 if (ASSEMBLER_DIALECT == ASM_ATT)
15046 putc ('*', file);
15047 return;
15049 case '&':
15051 const char *name = get_some_local_dynamic_name ();
15052 if (name == NULL)
15053 output_operand_lossage ("'%%&' used without any "
15054 "local dynamic TLS references");
15055 else
15056 assemble_name (file, name);
15057 return;
15060 case '+':
15062 rtx x;
15064 if (!optimize
15065 || optimize_function_for_size_p (cfun)
15066 || !TARGET_BRANCH_PREDICTION_HINTS)
15067 return;
15069 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
15070 if (x)
15072 int pred_val = XINT (x, 0);
15074 if (pred_val < REG_BR_PROB_BASE * 45 / 100
15075 || pred_val > REG_BR_PROB_BASE * 55 / 100)
15077 bool taken = pred_val > REG_BR_PROB_BASE / 2;
15078 bool cputaken
15079 = final_forward_branch_p (current_output_insn) == 0;
15081 /* Emit hints only in the case where default branch prediction
15082 heuristics would fail. */
15083 if (taken != cputaken)
15085 /* We use 3e (DS) prefix for taken branches and
15086 2e (CS) prefix for not taken branches. */
15087 if (taken)
15088 fputs ("ds ; ", file);
15089 else
15090 fputs ("cs ; ", file);
15094 return;
15097 case ';':
15098 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
15099 putc (';', file);
15100 #endif
15101 return;
15103 case '@':
15104 if (ASSEMBLER_DIALECT == ASM_ATT)
15105 putc ('%', file);
15107 /* The kernel uses a different segment register for performance
15108 reasons; a system call would not have to trash the userspace
15109 segment register, which would be expensive. */
15110 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
15111 fputs ("fs", file);
15112 else
15113 fputs ("gs", file);
15114 return;
15116 case '~':
15117 putc (TARGET_AVX2 ? 'i' : 'f', file);
15118 return;
15120 case '^':
15121 if (TARGET_64BIT && Pmode != word_mode)
15122 fputs ("addr32 ", file);
15123 return;
15125 default:
15126 output_operand_lossage ("invalid operand code '%c'", code);
15130 if (REG_P (x))
15131 print_reg (x, code, file);
15133 else if (MEM_P (x))
15135 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
15136 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
15137 && GET_MODE (x) != BLKmode)
15139 const char * size;
15140 switch (GET_MODE_SIZE (GET_MODE (x)))
15142 case 1: size = "BYTE"; break;
15143 case 2: size = "WORD"; break;
15144 case 4: size = "DWORD"; break;
15145 case 8: size = "QWORD"; break;
15146 case 12: size = "TBYTE"; break;
15147 case 16:
15148 if (GET_MODE (x) == XFmode)
15149 size = "TBYTE";
15150 else
15151 size = "XMMWORD";
15152 break;
15153 case 32: size = "YMMWORD"; break;
15154 case 64: size = "ZMMWORD"; break;
15155 default:
15156 gcc_unreachable ();
15159 /* Check for explicit size override (codes 'b', 'w', 'k',
15160 'q' and 'x') */
15161 if (code == 'b')
15162 size = "BYTE";
15163 else if (code == 'w')
15164 size = "WORD";
15165 else if (code == 'k')
15166 size = "DWORD";
15167 else if (code == 'q')
15168 size = "QWORD";
15169 else if (code == 'x')
15170 size = "XMMWORD";
15172 fputs (size, file);
15173 fputs (" PTR ", file);
15176 x = XEXP (x, 0);
15177 /* Avoid (%rip) for call operands. */
15178 if (CONSTANT_ADDRESS_P (x) && code == 'P'
15179 && !CONST_INT_P (x))
15180 output_addr_const (file, x);
15181 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
15182 output_operand_lossage ("invalid constraints for operand");
15183 else
15184 output_address (x);
15187 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
15189 REAL_VALUE_TYPE r;
15190 long l;
15192 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
15193 REAL_VALUE_TO_TARGET_SINGLE (r, l);
15195 if (ASSEMBLER_DIALECT == ASM_ATT)
15196 putc ('$', file);
15197 /* Sign extend 32bit SFmode immediate to 8 bytes. */
15198 if (code == 'q')
15199 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
15200 (unsigned long long) (int) l);
15201 else
15202 fprintf (file, "0x%08x", (unsigned int) l);
15205 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
15207 REAL_VALUE_TYPE r;
15208 long l[2];
15210 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
15211 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
15213 if (ASSEMBLER_DIALECT == ASM_ATT)
15214 putc ('$', file);
15215 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
15218 /* These float cases don't actually occur as immediate operands. */
15219 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
15221 char dstr[30];
15223 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
15224 fputs (dstr, file);
15227 else
15229 /* We have patterns that allow zero sets of memory, for instance.
15230 In 64-bit mode, we should probably support all 8-byte vectors,
15231 since we can in fact encode that into an immediate. */
15232 if (GET_CODE (x) == CONST_VECTOR)
15234 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
15235 x = const0_rtx;
15238 if (code != 'P' && code != 'p')
15240 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
15242 if (ASSEMBLER_DIALECT == ASM_ATT)
15243 putc ('$', file);
15245 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
15246 || GET_CODE (x) == LABEL_REF)
15248 if (ASSEMBLER_DIALECT == ASM_ATT)
15249 putc ('$', file);
15250 else
15251 fputs ("OFFSET FLAT:", file);
15254 if (CONST_INT_P (x))
15255 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
15256 else if (flag_pic || MACHOPIC_INDIRECT)
15257 output_pic_addr_const (file, x, code);
15258 else
15259 output_addr_const (file, x);
15263 static bool
15264 ix86_print_operand_punct_valid_p (unsigned char code)
15266 return (code == '@' || code == '*' || code == '+' || code == '&'
15267 || code == ';' || code == '~' || code == '^');
15270 /* Print a memory operand whose address is ADDR. */
15272 static void
15273 ix86_print_operand_address (FILE *file, rtx addr)
15275 struct ix86_address parts;
15276 rtx base, index, disp;
15277 int scale;
15278 int ok;
15279 bool vsib = false;
15280 int code = 0;
15282 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
15284 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15285 gcc_assert (parts.index == NULL_RTX);
15286 parts.index = XVECEXP (addr, 0, 1);
15287 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
15288 addr = XVECEXP (addr, 0, 0);
15289 vsib = true;
15291 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
15293 gcc_assert (TARGET_64BIT);
15294 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15295 code = 'q';
15297 else
15298 ok = ix86_decompose_address (addr, &parts);
15300 gcc_assert (ok);
15302 base = parts.base;
15303 index = parts.index;
15304 disp = parts.disp;
15305 scale = parts.scale;
15307 switch (parts.seg)
15309 case SEG_DEFAULT:
15310 break;
15311 case SEG_FS:
15312 case SEG_GS:
15313 if (ASSEMBLER_DIALECT == ASM_ATT)
15314 putc ('%', file);
15315 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
15316 break;
15317 default:
15318 gcc_unreachable ();
15321 /* Use one byte shorter RIP relative addressing for 64bit mode. */
15322 if (TARGET_64BIT && !base && !index)
15324 rtx symbol = disp;
15326 if (GET_CODE (disp) == CONST
15327 && GET_CODE (XEXP (disp, 0)) == PLUS
15328 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15329 symbol = XEXP (XEXP (disp, 0), 0);
15331 if (GET_CODE (symbol) == LABEL_REF
15332 || (GET_CODE (symbol) == SYMBOL_REF
15333 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
15334 base = pc_rtx;
15336 if (!base && !index)
15338 /* A displacement-only address requires special attention. */
15340 if (CONST_INT_P (disp))
15342 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
15343 fputs ("ds:", file);
15344 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
15346 else if (flag_pic)
15347 output_pic_addr_const (file, disp, 0);
15348 else
15349 output_addr_const (file, disp);
15351 else
15353 /* Print SImode register names to force addr32 prefix. */
15354 if (SImode_address_operand (addr, VOIDmode))
15356 #ifdef ENABLE_CHECKING
15357 gcc_assert (TARGET_64BIT);
15358 switch (GET_CODE (addr))
15360 case SUBREG:
15361 gcc_assert (GET_MODE (addr) == SImode);
15362 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
15363 break;
15364 case ZERO_EXTEND:
15365 case AND:
15366 gcc_assert (GET_MODE (addr) == DImode);
15367 break;
15368 default:
15369 gcc_unreachable ();
15371 #endif
15372 gcc_assert (!code);
15373 code = 'k';
15375 else if (code == 0
15376 && TARGET_X32
15377 && disp
15378 && CONST_INT_P (disp)
15379 && INTVAL (disp) < -16*1024*1024)
15381 /* X32 runs in 64-bit mode, where displacement, DISP, in
15382 address DISP(%r64), is encoded as 32-bit immediate sign-
15383 extended from 32-bit to 64-bit. For -0x40000300(%r64),
15384 address is %r64 + 0xffffffffbffffd00. When %r64 <
15385 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
15386 which is invalid for x32. The correct address is %r64
15387 - 0x40000300 == 0xf7ffdd64. To properly encode
15388 -0x40000300(%r64) for x32, we zero-extend negative
15389 displacement by forcing addr32 prefix which truncates
15390 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
15391 zero-extend all negative displacements, including -1(%rsp).
15392 However, for small negative displacements, sign-extension
15393 won't cause overflow. We only zero-extend negative
15394 displacements if they are < -16*1024*1024, which is also used
15395 to check legitimate address displacements for PIC. */
15396 code = 'k';
15399 if (ASSEMBLER_DIALECT == ASM_ATT)
15401 if (disp)
15403 if (flag_pic)
15404 output_pic_addr_const (file, disp, 0);
15405 else if (GET_CODE (disp) == LABEL_REF)
15406 output_asm_label (disp);
15407 else
15408 output_addr_const (file, disp);
15411 putc ('(', file);
15412 if (base)
15413 print_reg (base, code, file);
15414 if (index)
15416 putc (',', file);
15417 print_reg (index, vsib ? 0 : code, file);
15418 if (scale != 1 || vsib)
15419 fprintf (file, ",%d", scale);
15421 putc (')', file);
15423 else
15425 rtx offset = NULL_RTX;
15427 if (disp)
15429 /* Pull out the offset of a symbol; print any symbol itself. */
15430 if (GET_CODE (disp) == CONST
15431 && GET_CODE (XEXP (disp, 0)) == PLUS
15432 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15434 offset = XEXP (XEXP (disp, 0), 1);
15435 disp = gen_rtx_CONST (VOIDmode,
15436 XEXP (XEXP (disp, 0), 0));
15439 if (flag_pic)
15440 output_pic_addr_const (file, disp, 0);
15441 else if (GET_CODE (disp) == LABEL_REF)
15442 output_asm_label (disp);
15443 else if (CONST_INT_P (disp))
15444 offset = disp;
15445 else
15446 output_addr_const (file, disp);
15449 putc ('[', file);
15450 if (base)
15452 print_reg (base, code, file);
15453 if (offset)
15455 if (INTVAL (offset) >= 0)
15456 putc ('+', file);
15457 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15460 else if (offset)
15461 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15462 else
15463 putc ('0', file);
15465 if (index)
15467 putc ('+', file);
15468 print_reg (index, vsib ? 0 : code, file);
15469 if (scale != 1 || vsib)
15470 fprintf (file, "*%d", scale);
15472 putc (']', file);
15477 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
15479 static bool
15480 i386_asm_output_addr_const_extra (FILE *file, rtx x)
15482 rtx op;
15484 if (GET_CODE (x) != UNSPEC)
15485 return false;
15487 op = XVECEXP (x, 0, 0);
15488 switch (XINT (x, 1))
15490 case UNSPEC_GOTTPOFF:
15491 output_addr_const (file, op);
15492 /* FIXME: This might be @TPOFF in Sun ld. */
15493 fputs ("@gottpoff", file);
15494 break;
15495 case UNSPEC_TPOFF:
15496 output_addr_const (file, op);
15497 fputs ("@tpoff", file);
15498 break;
15499 case UNSPEC_NTPOFF:
15500 output_addr_const (file, op);
15501 if (TARGET_64BIT)
15502 fputs ("@tpoff", file);
15503 else
15504 fputs ("@ntpoff", file);
15505 break;
15506 case UNSPEC_DTPOFF:
15507 output_addr_const (file, op);
15508 fputs ("@dtpoff", file);
15509 break;
15510 case UNSPEC_GOTNTPOFF:
15511 output_addr_const (file, op);
15512 if (TARGET_64BIT)
15513 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
15514 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
15515 else
15516 fputs ("@gotntpoff", file);
15517 break;
15518 case UNSPEC_INDNTPOFF:
15519 output_addr_const (file, op);
15520 fputs ("@indntpoff", file);
15521 break;
15522 #if TARGET_MACHO
15523 case UNSPEC_MACHOPIC_OFFSET:
15524 output_addr_const (file, op);
15525 putc ('-', file);
15526 machopic_output_function_base_name (file);
15527 break;
15528 #endif
15530 case UNSPEC_STACK_CHECK:
15532 int offset;
15534 gcc_assert (flag_split_stack);
15536 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
15537 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
15538 #else
15539 gcc_unreachable ();
15540 #endif
15542 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
15544 break;
15546 default:
15547 return false;
15550 return true;
15553 /* Split one or more double-mode RTL references into pairs of half-mode
15554 references. The RTL can be REG, offsettable MEM, integer constant, or
15555 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
15556 split and "num" is its length. lo_half and hi_half are output arrays
15557 that parallel "operands". */
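/* For example, on little-endian x86 a DImode operand is split so that
   lo_half receives the SImode word at byte offset 0 and hi_half the word at
   byte offset 4 (GET_MODE_SIZE of the half mode). */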
15559 void
15560 split_double_mode (enum machine_mode mode, rtx operands[],
15561 int num, rtx lo_half[], rtx hi_half[])
15563 enum machine_mode half_mode;
15564 unsigned int byte;
15566 switch (mode)
15568 case TImode:
15569 half_mode = DImode;
15570 break;
15571 case DImode:
15572 half_mode = SImode;
15573 break;
15574 default:
15575 gcc_unreachable ();
15578 byte = GET_MODE_SIZE (half_mode);
15580 while (num--)
15582 rtx op = operands[num];
15584 /* simplify_subreg refuses to split volatile memory addresses,
15585 but we still have to handle them. */
15586 if (MEM_P (op))
15588 lo_half[num] = adjust_address (op, half_mode, 0);
15589 hi_half[num] = adjust_address (op, half_mode, byte);
15591 else
15593 lo_half[num] = simplify_gen_subreg (half_mode, op,
15594 GET_MODE (op) == VOIDmode
15595 ? mode : GET_MODE (op), 0);
15596 hi_half[num] = simplify_gen_subreg (half_mode, op,
15597 GET_MODE (op) == VOIDmode
15598 ? mode : GET_MODE (op), byte);
15603 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15604 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15605 is the expression of the binary operation. The output may either be
15606 emitted here, or returned to the caller, like all output_* functions.
15608 There is no guarantee that the operands are the same mode, as they
15609 might be within FLOAT or FLOAT_EXTEND expressions. */
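/* Illustrative example (not from the original source): for an SFmode
   addition with operands[2] in memory, the code below builds "fadd%Z2\t%2",
   which prints as "fadds" plus the memory operand in AT&T syntax; with AVX
   enabled the SSE path instead yields "vaddss\t{%2, %1, %0|%0, %1, %2}". */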
15611 #ifndef SYSV386_COMPAT
15612 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15613 wants to fix the assemblers because that causes incompatibility
15614 with gcc. No-one wants to fix gcc because that causes
15615 incompatibility with assemblers... You can use the option of
15616 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15617 #define SYSV386_COMPAT 1
15618 #endif
15620 const char *
15621 output_387_binary_op (rtx insn, rtx *operands)
15623 static char buf[40];
15624 const char *p;
15625 const char *ssep;
15626 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15628 #ifdef ENABLE_CHECKING
15629 /* Even if we do not want to check the inputs, this documents input
15630 constraints, which helps in understanding the following code. */
15631 if (STACK_REG_P (operands[0])
15632 && ((REG_P (operands[1])
15633 && REGNO (operands[0]) == REGNO (operands[1])
15634 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15635 || (REG_P (operands[2])
15636 && REGNO (operands[0]) == REGNO (operands[2])
15637 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15638 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15639 ; /* ok */
15640 else
15641 gcc_assert (is_sse);
15642 #endif
15644 switch (GET_CODE (operands[3]))
15646 case PLUS:
15647 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15648 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15649 p = "fiadd";
15650 else
15651 p = "fadd";
15652 ssep = "vadd";
15653 break;
15655 case MINUS:
15656 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15657 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15658 p = "fisub";
15659 else
15660 p = "fsub";
15661 ssep = "vsub";
15662 break;
15664 case MULT:
15665 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15666 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15667 p = "fimul";
15668 else
15669 p = "fmul";
15670 ssep = "vmul";
15671 break;
15673 case DIV:
15674 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15675 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15676 p = "fidiv";
15677 else
15678 p = "fdiv";
15679 ssep = "vdiv";
15680 break;
15682 default:
15683 gcc_unreachable ();
15686 if (is_sse)
15688 if (TARGET_AVX)
15690 strcpy (buf, ssep);
15691 if (GET_MODE (operands[0]) == SFmode)
15692 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15693 else
15694 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15696 else
15698 strcpy (buf, ssep + 1);
15699 if (GET_MODE (operands[0]) == SFmode)
15700 strcat (buf, "ss\t{%2, %0|%0, %2}");
15701 else
15702 strcat (buf, "sd\t{%2, %0|%0, %2}");
15704 return buf;
15706 strcpy (buf, p);
15708 switch (GET_CODE (operands[3]))
15710 case MULT:
15711 case PLUS:
15712 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15714 rtx temp = operands[2];
15715 operands[2] = operands[1];
15716 operands[1] = temp;
15719 /* We now know operands[0] == operands[1]. */
15721 if (MEM_P (operands[2]))
15723 p = "%Z2\t%2";
15724 break;
15727 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15729 if (STACK_TOP_P (operands[0]))
15730 /* How is it that we are storing to a dead operand[2]?
15731 Well, presumably operands[1] is dead too. We can't
15732 store the result to st(0) as st(0) gets popped on this
15733 instruction. Instead store to operands[2] (which I
15734 think has to be st(1)). st(1) will be popped later.
15735 gcc <= 2.8.1 didn't have this check and generated
15736 assembly code that the Unixware assembler rejected. */
15737 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15738 else
15739 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15740 break;
15743 if (STACK_TOP_P (operands[0]))
15744 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15745 else
15746 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15747 break;
15749 case MINUS:
15750 case DIV:
15751 if (MEM_P (operands[1]))
15753 p = "r%Z1\t%1";
15754 break;
15757 if (MEM_P (operands[2]))
15759 p = "%Z2\t%2";
15760 break;
15763 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15765 #if SYSV386_COMPAT
15766 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15767 derived assemblers, confusingly reverse the direction of
15768 the operation for fsub{r} and fdiv{r} when the
15769 destination register is not st(0). The Intel assembler
15770 doesn't have this brain damage. Read !SYSV386_COMPAT to
15771 figure out what the hardware really does. */
15772 if (STACK_TOP_P (operands[0]))
15773 p = "{p\t%0, %2|rp\t%2, %0}";
15774 else
15775 p = "{rp\t%2, %0|p\t%0, %2}";
15776 #else
15777 if (STACK_TOP_P (operands[0]))
15778 /* As above for fmul/fadd, we can't store to st(0). */
15779 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15780 else
15781 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15782 #endif
15783 break;
15786 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15788 #if SYSV386_COMPAT
15789 if (STACK_TOP_P (operands[0]))
15790 p = "{rp\t%0, %1|p\t%1, %0}";
15791 else
15792 p = "{p\t%1, %0|rp\t%0, %1}";
15793 #else
15794 if (STACK_TOP_P (operands[0]))
15795 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15796 else
15797 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15798 #endif
15799 break;
15802 if (STACK_TOP_P (operands[0]))
15804 if (STACK_TOP_P (operands[1]))
15805 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15806 else
15807 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15808 break;
15810 else if (STACK_TOP_P (operands[1]))
15812 #if SYSV386_COMPAT
15813 p = "{\t%1, %0|r\t%0, %1}";
15814 #else
15815 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15816 #endif
15818 else
15820 #if SYSV386_COMPAT
15821 p = "{r\t%2, %0|\t%0, %2}";
15822 #else
15823 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15824 #endif
15826 break;
15828 default:
15829 gcc_unreachable ();
15832 strcat (buf, p);
15833 return buf;
15836 /* Check if a 256bit AVX register is referenced inside of EXP. */
15838 static int
15839 ix86_check_avx256_register (rtx *pexp, void *data ATTRIBUTE_UNUSED)
15841 rtx exp = *pexp;
15843 if (GET_CODE (exp) == SUBREG)
15844 exp = SUBREG_REG (exp);
15846 if (REG_P (exp)
15847 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)))
15848 return 1;
15850 return 0;
15853 /* Return needed mode for entity in optimize_mode_switching pass. */
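/* A note on the AVX_U128 states used below (summary inferred from this
   code): AVX_U128_CLEAN means the upper halves of the 256bit registers are
   known to be zero (as after vzeroupper), AVX_U128_DIRTY means they may hold
   live data, and AVX_U128_ANY means the insn does not care either way. */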
15855 static int
15856 ix86_avx_u128_mode_needed (rtx insn)
15858 if (CALL_P (insn))
15860 rtx link;
15862 /* Needed mode is set to AVX_U128_CLEAN if there are
15863 no 256bit modes used in function arguments. */
15864 for (link = CALL_INSN_FUNCTION_USAGE (insn);
15865 link;
15866 link = XEXP (link, 1))
15868 if (GET_CODE (XEXP (link, 0)) == USE)
15870 rtx arg = XEXP (XEXP (link, 0), 0);
15872 if (ix86_check_avx256_register (&arg, NULL))
15873 return AVX_U128_DIRTY;
15877 return AVX_U128_CLEAN;
15880 /* Require DIRTY mode if a 256bit AVX register is referenced. Hardware
15881 changes state only when a 256bit register is written to, but we need
15882 to prevent the compiler from moving the optimal insertion point above
15883 an eventual read from a 256bit register. */
15884 if (for_each_rtx (&PATTERN (insn), ix86_check_avx256_register, NULL))
15885 return AVX_U128_DIRTY;
15887 return AVX_U128_ANY;
15890 /* Return mode that i387 must be switched into
15891 prior to the execution of insn. */
15893 static int
15894 ix86_i387_mode_needed (int entity, rtx insn)
15896 enum attr_i387_cw mode;
15898 /* The mode UNINITIALIZED is used to store the control word after a
15899 function call or ASM pattern. The mode ANY specifies that the function
15900 has no requirements on the control word and makes no changes in the
15901 bits we are interested in. */
15903 if (CALL_P (insn)
15904 || (NONJUMP_INSN_P (insn)
15905 && (asm_noperands (PATTERN (insn)) >= 0
15906 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15907 return I387_CW_UNINITIALIZED;
15909 if (recog_memoized (insn) < 0)
15910 return I387_CW_ANY;
15912 mode = get_attr_i387_cw (insn);
15914 switch (entity)
15916 case I387_TRUNC:
15917 if (mode == I387_CW_TRUNC)
15918 return mode;
15919 break;
15921 case I387_FLOOR:
15922 if (mode == I387_CW_FLOOR)
15923 return mode;
15924 break;
15926 case I387_CEIL:
15927 if (mode == I387_CW_CEIL)
15928 return mode;
15929 break;
15931 case I387_MASK_PM:
15932 if (mode == I387_CW_MASK_PM)
15933 return mode;
15934 break;
15936 default:
15937 gcc_unreachable ();
15940 return I387_CW_ANY;
15943 /* Return mode that entity must be switched into
15944 prior to the execution of insn. */
15947 ix86_mode_needed (int entity, rtx insn)
15949 switch (entity)
15951 case AVX_U128:
15952 return ix86_avx_u128_mode_needed (insn);
15953 case I387_TRUNC:
15954 case I387_FLOOR:
15955 case I387_CEIL:
15956 case I387_MASK_PM:
15957 return ix86_i387_mode_needed (entity, insn);
15958 default:
15959 gcc_unreachable ();
15961 return 0;
15964 /* Check if a 256bit AVX register is referenced in stores. */
15966 static void
15967 ix86_check_avx256_stores (rtx dest, const_rtx set ATTRIBUTE_UNUSED, void *data)
15969 if (ix86_check_avx256_register (&dest, NULL))
15971 bool *used = (bool *) data;
15972 *used = true;
15976 /* Calculate mode of upper 128bit AVX registers after the insn. */
15978 static int
15979 ix86_avx_u128_mode_after (int mode, rtx insn)
15981 rtx pat = PATTERN (insn);
15983 if (vzeroupper_operation (pat, VOIDmode)
15984 || vzeroall_operation (pat, VOIDmode))
15985 return AVX_U128_CLEAN;
15987 /* We know that the state is clean after a CALL insn if there are no
15988 256bit registers used in the function return register. */
15989 if (CALL_P (insn))
15991 bool avx_reg256_found = false;
15992 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
15994 return avx_reg256_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
15997 /* Otherwise, return current mode. Remember that if insn
15998 references AVX 256bit registers, the mode was already changed
15999 to DIRTY from MODE_NEEDED. */
16000 return mode;
16003 /* Return the mode that an insn results in. */
16006 ix86_mode_after (int entity, int mode, rtx insn)
16008 switch (entity)
16010 case AVX_U128:
16011 return ix86_avx_u128_mode_after (mode, insn);
16012 case I387_TRUNC:
16013 case I387_FLOOR:
16014 case I387_CEIL:
16015 case I387_MASK_PM:
16016 return mode;
16017 default:
16018 gcc_unreachable ();
16022 static int
16023 ix86_avx_u128_mode_entry (void)
16025 tree arg;
16027 /* Entry mode is set to AVX_U128_DIRTY if there are
16028 256bit modes used in function arguments. */
16029 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
16030 arg = TREE_CHAIN (arg))
16032 rtx incoming = DECL_INCOMING_RTL (arg);
16034 if (incoming && ix86_check_avx256_register (&incoming, NULL))
16035 return AVX_U128_DIRTY;
16038 return AVX_U128_CLEAN;
16041 /* Return a mode that ENTITY is assumed to be
16042 switched to at function entry. */
16045 ix86_mode_entry (int entity)
16047 switch (entity)
16049 case AVX_U128:
16050 return ix86_avx_u128_mode_entry ();
16051 case I387_TRUNC:
16052 case I387_FLOOR:
16053 case I387_CEIL:
16054 case I387_MASK_PM:
16055 return I387_CW_ANY;
16056 default:
16057 gcc_unreachable ();
16061 static int
16062 ix86_avx_u128_mode_exit (void)
16064 rtx reg = crtl->return_rtx;
16066 /* Exit mode is set to AVX_U128_DIRTY if there are
16067 256bit modes used in the function return register. */
16068 if (reg && ix86_check_avx256_register (&reg, NULL))
16069 return AVX_U128_DIRTY;
16071 return AVX_U128_CLEAN;
16074 /* Return a mode that ENTITY is assumed to be
16075 switched to at function exit. */
16078 ix86_mode_exit (int entity)
16080 switch (entity)
16082 case AVX_U128:
16083 return ix86_avx_u128_mode_exit ();
16084 case I387_TRUNC:
16085 case I387_FLOOR:
16086 case I387_CEIL:
16087 case I387_MASK_PM:
16088 return I387_CW_ANY;
16089 default:
16090 gcc_unreachable ();
16094 /* Output code to initialize control word copies used by trunc?f?i and
16095 rounding patterns. CURRENT_MODE is set to current control word,
16096 while NEW_MODE is set to new control word. */
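/* Background on the x87 control word bits manipulated below: bits 10-11 are
   the rounding control (00 = nearest, 01 = down, 10 = up, 11 = toward zero),
   so 0x0400 selects floor, 0x0800 ceil and 0x0c00 truncation, while bit 5
   (0x0020) masks the precision exception for nearbyint. */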
16098 static void
16099 emit_i387_cw_initialization (int mode)
16101 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
16102 rtx new_mode;
16104 enum ix86_stack_slot slot;
16106 rtx reg = gen_reg_rtx (HImode);
16108 emit_insn (gen_x86_fnstcw_1 (stored_mode));
16109 emit_move_insn (reg, copy_rtx (stored_mode));
16111 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
16112 || optimize_insn_for_size_p ())
16114 switch (mode)
16116 case I387_CW_TRUNC:
16117 /* round toward zero (truncate) */
16118 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
16119 slot = SLOT_CW_TRUNC;
16120 break;
16122 case I387_CW_FLOOR:
16123 /* round down toward -oo */
16124 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
16125 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
16126 slot = SLOT_CW_FLOOR;
16127 break;
16129 case I387_CW_CEIL:
16130 /* round up toward +oo */
16131 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
16132 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
16133 slot = SLOT_CW_CEIL;
16134 break;
16136 case I387_CW_MASK_PM:
16137 /* mask precision exception for nearbyint() */
16138 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
16139 slot = SLOT_CW_MASK_PM;
16140 break;
16142 default:
16143 gcc_unreachable ();
16146 else
16148 switch (mode)
16150 case I387_CW_TRUNC:
16151 /* round toward zero (truncate) */
16152 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
16153 slot = SLOT_CW_TRUNC;
16154 break;
16156 case I387_CW_FLOOR:
16157 /* round down toward -oo */
16158 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
16159 slot = SLOT_CW_FLOOR;
16160 break;
16162 case I387_CW_CEIL:
16163 /* round up toward +oo */
16164 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
16165 slot = SLOT_CW_CEIL;
16166 break;
16168 case I387_CW_MASK_PM:
16169 /* mask precision exception for nearbyint() */
16170 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
16171 slot = SLOT_CW_MASK_PM;
16172 break;
16174 default:
16175 gcc_unreachable ();
16179 gcc_assert (slot < MAX_386_STACK_LOCALS);
16181 new_mode = assign_386_stack_local (HImode, slot);
16182 emit_move_insn (new_mode, reg);
16185 /* Emit vzeroupper. */
16187 void
16188 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
16190 int i;
16192 /* Cancel automatic vzeroupper insertion if there are
16193 live call-saved SSE registers at the insertion point. */
16195 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
16196 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
16197 return;
16199 if (TARGET_64BIT)
16200 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
16201 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
16202 return;
16204 emit_insn (gen_avx_vzeroupper ());
16207 /* Generate one or more insns to set ENTITY to MODE. */
16209 void
16210 ix86_emit_mode_set (int entity, int mode, HARD_REG_SET regs_live)
16212 switch (entity)
16214 case AVX_U128:
16215 if (mode == AVX_U128_CLEAN)
16216 ix86_avx_emit_vzeroupper (regs_live);
16217 break;
16218 case I387_TRUNC:
16219 case I387_FLOOR:
16220 case I387_CEIL:
16221 case I387_MASK_PM:
16222 if (mode != I387_CW_ANY
16223 && mode != I387_CW_UNINITIALIZED)
16224 emit_i387_cw_initialization (mode);
16225 break;
16226 default:
16227 gcc_unreachable ();
16231 /* Output code for INSN to convert a float to a signed int. OPERANDS
16232 are the insn operands. The output may be [HSD]Imode and the input
16233 operand may be [SDX]Fmode. */
16235 const char *
16236 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
16238 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
16239 int dimode_p = GET_MODE (operands[0]) == DImode;
16240 int round_mode = get_attr_i387_cw (insn);
16242 /* Jump through a hoop or two for DImode, since the hardware has no
16243 non-popping instruction. We used to do this a different way, but
16244 that was somewhat fragile and broke with post-reload splitters. */
16245 if ((dimode_p || fisttp) && !stack_top_dies)
16246 output_asm_insn ("fld\t%y1", operands);
16248 gcc_assert (STACK_TOP_P (operands[1]));
16249 gcc_assert (MEM_P (operands[0]));
16250 gcc_assert (GET_MODE (operands[1]) != TFmode);
16252 if (fisttp)
16253 output_asm_insn ("fisttp%Z0\t%0", operands);
16254 else
16256 if (round_mode != I387_CW_ANY)
16257 output_asm_insn ("fldcw\t%3", operands);
16258 if (stack_top_dies || dimode_p)
16259 output_asm_insn ("fistp%Z0\t%0", operands);
16260 else
16261 output_asm_insn ("fist%Z0\t%0", operands);
16262 if (round_mode != I387_CW_ANY)
16263 output_asm_insn ("fldcw\t%2", operands);
16266 return "";
16269 /* Output code for x87 ffreep insn. The OPNO argument, which may only
16270 have the values zero or one, indicates the ffreep insn's operand
16271 from the OPERANDS array. */
16273 static const char *
16274 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
16276 if (TARGET_USE_FFREEP)
16277 #ifdef HAVE_AS_IX86_FFREEP
16278 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
16279 #else
16281 static char retval[32];
16282 int regno = REGNO (operands[opno]);
16284 gcc_assert (STACK_REGNO_P (regno));
16286 regno -= FIRST_STACK_REG;
16288 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
16289 return retval;
16291 #endif
16293 return opno ? "fstp\t%y1" : "fstp\t%y0";
16297 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
16298 should be used. UNORDERED_P is true when fucom should be used. */
16300 const char *
16301 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
16303 int stack_top_dies;
16304 rtx cmp_op0, cmp_op1;
16305 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
16307 if (eflags_p)
16309 cmp_op0 = operands[0];
16310 cmp_op1 = operands[1];
16312 else
16314 cmp_op0 = operands[1];
16315 cmp_op1 = operands[2];
16318 if (is_sse)
16320 if (GET_MODE (operands[0]) == SFmode)
16321 if (unordered_p)
16322 return "%vucomiss\t{%1, %0|%0, %1}";
16323 else
16324 return "%vcomiss\t{%1, %0|%0, %1}";
16325 else
16326 if (unordered_p)
16327 return "%vucomisd\t{%1, %0|%0, %1}";
16328 else
16329 return "%vcomisd\t{%1, %0|%0, %1}";
16332 gcc_assert (STACK_TOP_P (cmp_op0));
16334 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
16336 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
16338 if (stack_top_dies)
16340 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
16341 return output_387_ffreep (operands, 1);
16343 else
16344 return "ftst\n\tfnstsw\t%0";
16347 if (STACK_REG_P (cmp_op1)
16348 && stack_top_dies
16349 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
16350 && REGNO (cmp_op1) != FIRST_STACK_REG)
16352 /* If the top of the 387 stack dies, and the other operand
16353 is also a stack register that dies, then this must be a
16354 `fcompp' float compare. */
16356 if (eflags_p)
16358 /* There is no double popping fcomi variant. Fortunately,
16359 eflags is immune from the fstp's cc clobbering. */
16360 if (unordered_p)
16361 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
16362 else
16363 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
16364 return output_387_ffreep (operands, 0);
16366 else
16368 if (unordered_p)
16369 return "fucompp\n\tfnstsw\t%0";
16370 else
16371 return "fcompp\n\tfnstsw\t%0";
16374 else
16376 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
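/* For example, mask 11 (eflags_p = 1, integer operand = 0, unordered_p = 1,
   stack top dies = 1) selects the "fucomip" alternative below. */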
16378 static const char * const alt[16] =
16380 "fcom%Z2\t%y2\n\tfnstsw\t%0",
16381 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
16382 "fucom%Z2\t%y2\n\tfnstsw\t%0",
16383 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
16385 "ficom%Z2\t%y2\n\tfnstsw\t%0",
16386 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
16387 NULL,
16388 NULL,
16390 "fcomi\t{%y1, %0|%0, %y1}",
16391 "fcomip\t{%y1, %0|%0, %y1}",
16392 "fucomi\t{%y1, %0|%0, %y1}",
16393 "fucomip\t{%y1, %0|%0, %y1}",
16395 NULL,
16396 NULL,
16397 NULL,
16398 NULL
16401 int mask;
16402 const char *ret;
16404 mask = eflags_p << 3;
16405 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
16406 mask |= unordered_p << 1;
16407 mask |= stack_top_dies;
16409 gcc_assert (mask < 16);
16410 ret = alt[mask];
16411 gcc_assert (ret);
16413 return ret;
16417 void
16418 ix86_output_addr_vec_elt (FILE *file, int value)
16420 const char *directive = ASM_LONG;
16422 #ifdef ASM_QUAD
16423 if (TARGET_LP64)
16424 directive = ASM_QUAD;
16425 #else
16426 gcc_assert (!TARGET_64BIT);
16427 #endif
16429 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
16432 void
16433 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
16435 const char *directive = ASM_LONG;
16437 #ifdef ASM_QUAD
16438 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
16439 directive = ASM_QUAD;
16440 #else
16441 gcc_assert (!TARGET_64BIT);
16442 #endif
16443 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
16444 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
16445 fprintf (file, "%s%s%d-%s%d\n",
16446 directive, LPREFIX, value, LPREFIX, rel);
16447 else if (HAVE_AS_GOTOFF_IN_DATA)
16448 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
16449 #if TARGET_MACHO
16450 else if (TARGET_MACHO)
16452 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
16453 machopic_output_function_base_name (file);
16454 putc ('\n', file);
16456 #endif
16457 else
16458 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
16459 GOT_SYMBOL_NAME, LPREFIX, value);
16462 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
16463 for the target. */
16465 void
16466 ix86_expand_clear (rtx dest)
16468 rtx tmp;
16470 /* We play register width games, which are only valid after reload. */
16471 gcc_assert (reload_completed);
16473 /* Avoid HImode and its attendant prefix byte. */
16474 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
16475 dest = gen_rtx_REG (SImode, REGNO (dest));
16476 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
16478 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
16479 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
16481 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16482 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
16485 emit_insn (tmp);
16488 /* X is an unchanging MEM. If it is a constant pool reference, return
16489 the constant pool rtx, else NULL. */
16492 maybe_get_pool_constant (rtx x)
16494 x = ix86_delegitimize_address (XEXP (x, 0));
16496 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
16497 return get_pool_constant (x);
16499 return NULL_RTX;
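/* Expand a scalar move of mode MODE between OPERANDS[0] and OPERANDS[1],
   legitimizing TLS and PIC symbol references and copying awkward operands
   (memory-to-memory moves, large 64-bit immediates, floating-point
   constants) into a form the move patterns accept. */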
16502 void
16503 ix86_expand_move (enum machine_mode mode, rtx operands[])
16505 rtx op0, op1;
16506 enum tls_model model;
16508 op0 = operands[0];
16509 op1 = operands[1];
16511 if (GET_CODE (op1) == SYMBOL_REF)
16513 rtx tmp;
16515 model = SYMBOL_REF_TLS_MODEL (op1);
16516 if (model)
16518 op1 = legitimize_tls_address (op1, model, true);
16519 op1 = force_operand (op1, op0);
16520 if (op1 == op0)
16521 return;
16522 op1 = convert_to_mode (mode, op1, 1);
16524 else if ((tmp = legitimize_pe_coff_symbol (op1, false)) != NULL_RTX)
16525 op1 = tmp;
16527 else if (GET_CODE (op1) == CONST
16528 && GET_CODE (XEXP (op1, 0)) == PLUS
16529 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
16531 rtx addend = XEXP (XEXP (op1, 0), 1);
16532 rtx symbol = XEXP (XEXP (op1, 0), 0);
16533 rtx tmp;
16535 model = SYMBOL_REF_TLS_MODEL (symbol);
16536 if (model)
16537 tmp = legitimize_tls_address (symbol, model, true);
16538 else
16539 tmp = legitimize_pe_coff_symbol (symbol, true);
16541 if (tmp)
16543 tmp = force_operand (tmp, NULL);
16544 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
16545 op0, 1, OPTAB_DIRECT);
16546 if (tmp == op0)
16547 return;
16548 op1 = convert_to_mode (mode, tmp, 1);
16552 if ((flag_pic || MACHOPIC_INDIRECT)
16553 && symbolic_operand (op1, mode))
16555 if (TARGET_MACHO && !TARGET_64BIT)
16557 #if TARGET_MACHO
16558 /* dynamic-no-pic */
16559 if (MACHOPIC_INDIRECT)
16561 rtx temp = ((reload_in_progress
16562 || ((op0 && REG_P (op0))
16563 && mode == Pmode))
16564 ? op0 : gen_reg_rtx (Pmode));
16565 op1 = machopic_indirect_data_reference (op1, temp);
16566 if (MACHOPIC_PURE)
16567 op1 = machopic_legitimize_pic_address (op1, mode,
16568 temp == op1 ? 0 : temp);
16570 if (op0 != op1 && GET_CODE (op0) != MEM)
16572 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
16573 emit_insn (insn);
16574 return;
16576 if (GET_CODE (op0) == MEM)
16577 op1 = force_reg (Pmode, op1);
16578 else
16580 rtx temp = op0;
16581 if (GET_CODE (temp) != REG)
16582 temp = gen_reg_rtx (Pmode);
16583 temp = legitimize_pic_address (op1, temp);
16584 if (temp == op0)
16585 return;
16586 op1 = temp;
16588 /* dynamic-no-pic */
16589 #endif
16591 else
16593 if (MEM_P (op0))
16594 op1 = force_reg (mode, op1);
16595 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
16597 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
16598 op1 = legitimize_pic_address (op1, reg);
16599 if (op0 == op1)
16600 return;
16601 op1 = convert_to_mode (mode, op1, 1);
16605 else
16607 if (MEM_P (op0)
16608 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
16609 || !push_operand (op0, mode))
16610 && MEM_P (op1))
16611 op1 = force_reg (mode, op1);
16613 if (push_operand (op0, mode)
16614 && ! general_no_elim_operand (op1, mode))
16615 op1 = copy_to_mode_reg (mode, op1);
16617 /* Force large constants in 64bit compilation into a register
16618 to get them CSEed. */
16619 if (can_create_pseudo_p ()
16620 && (mode == DImode) && TARGET_64BIT
16621 && immediate_operand (op1, mode)
16622 && !x86_64_zext_immediate_operand (op1, VOIDmode)
16623 && !register_operand (op0, mode)
16624 && optimize)
16625 op1 = copy_to_mode_reg (mode, op1);
16627 if (can_create_pseudo_p ()
16628 && FLOAT_MODE_P (mode)
16629 && GET_CODE (op1) == CONST_DOUBLE)
16631 /* If we are loading a floating point constant to a register,
16632 force the value to memory now, since we'll get better code
16633 out of the back end. */
16635 op1 = validize_mem (force_const_mem (mode, op1));
16636 if (!register_operand (op0, mode))
16638 rtx temp = gen_reg_rtx (mode);
16639 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
16640 emit_move_insn (op0, temp);
16641 return;
16646 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16649 void
16650 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
16652 rtx op0 = operands[0], op1 = operands[1];
16653 unsigned int align = GET_MODE_ALIGNMENT (mode);
16655 /* Force constants other than zero into memory. We do not know how
16656 the instructions used to build constants modify the upper 64 bits
16657 of the register; once we have that information we may be able
16658 to handle some of them more efficiently. */
16659 if (can_create_pseudo_p ()
16660 && register_operand (op0, mode)
16661 && (CONSTANT_P (op1)
16662 || (GET_CODE (op1) == SUBREG
16663 && CONSTANT_P (SUBREG_REG (op1))))
16664 && !standard_sse_constant_p (op1))
16665 op1 = validize_mem (force_const_mem (mode, op1));
16667 /* We need to check memory alignment for SSE mode since attributes
16668 can make operands unaligned. */
16669 if (can_create_pseudo_p ()
16670 && SSE_REG_MODE_P (mode)
16671 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
16672 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
16674 rtx tmp[2];
16676 /* ix86_expand_vector_move_misalign() does not like constants ... */
16677 if (CONSTANT_P (op1)
16678 || (GET_CODE (op1) == SUBREG
16679 && CONSTANT_P (SUBREG_REG (op1))))
16680 op1 = validize_mem (force_const_mem (mode, op1));
16682 /* ... nor both arguments in memory. */
16683 if (!register_operand (op0, mode)
16684 && !register_operand (op1, mode))
16685 op1 = force_reg (mode, op1);
16687 tmp[0] = op0; tmp[1] = op1;
16688 ix86_expand_vector_move_misalign (mode, tmp);
16689 return;
16692 /* Make operand1 a register if it isn't already. */
16693 if (can_create_pseudo_p ()
16694 && !register_operand (op0, mode)
16695 && !register_operand (op1, mode))
16697 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
16698 return;
16701 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16704 /* Split 32-byte AVX unaligned load and store if needed. */
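/* When the relevant tuning flags are set, a misaligned 256bit load is split
   into two 128bit loads recombined with a VEC_CONCAT, and a misaligned store
   into two vextractf128 stores of the low and high halves. */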
16706 static void
16707 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
16709 rtx m;
16710 rtx (*extract) (rtx, rtx, rtx);
16711 rtx (*load_unaligned) (rtx, rtx);
16712 rtx (*store_unaligned) (rtx, rtx);
16713 enum machine_mode mode;
16715 switch (GET_MODE (op0))
16717 default:
16718 gcc_unreachable ();
16719 case V32QImode:
16720 extract = gen_avx_vextractf128v32qi;
16721 load_unaligned = gen_avx_loaddquv32qi;
16722 store_unaligned = gen_avx_storedquv32qi;
16723 mode = V16QImode;
16724 break;
16725 case V8SFmode:
16726 extract = gen_avx_vextractf128v8sf;
16727 load_unaligned = gen_avx_loadups256;
16728 store_unaligned = gen_avx_storeups256;
16729 mode = V4SFmode;
16730 break;
16731 case V4DFmode:
16732 extract = gen_avx_vextractf128v4df;
16733 load_unaligned = gen_avx_loadupd256;
16734 store_unaligned = gen_avx_storeupd256;
16735 mode = V2DFmode;
16736 break;
16739 if (MEM_P (op1))
16741 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16743 rtx r = gen_reg_rtx (mode);
16744 m = adjust_address (op1, mode, 0);
16745 emit_move_insn (r, m);
16746 m = adjust_address (op1, mode, 16);
16747 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16748 emit_move_insn (op0, r);
16750 /* Normal *mov<mode>_internal pattern will handle
16751 unaligned loads just fine if misaligned_operand
16752 is true, and without the UNSPEC it can be combined
16753 with arithmetic instructions. */
16754 else if (misaligned_operand (op1, GET_MODE (op1)))
16755 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16756 else
16757 emit_insn (load_unaligned (op0, op1));
16759 else if (MEM_P (op0))
16761 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16763 m = adjust_address (op0, mode, 0);
16764 emit_insn (extract (m, op1, const0_rtx));
16765 m = adjust_address (op0, mode, 16);
16766 emit_insn (extract (m, op1, const1_rtx));
16768 else
16769 emit_insn (store_unaligned (op0, op1));
16771 else
16772 gcc_unreachable ();
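/* As an illustration (the exact instructions depend on the mode and on
   how the operands are addressed), splitting a misaligned 32-byte
   V8SFmode load typically results in something like
	vmovups	mem, %xmm0
	vinsertf128	$1, mem+16, %ymm0, %ymm0
   while the split store writes the low 128 bits with an ordinary
   16-byte store and the high 128 bits with
	vextractf128	$1, %ymm0, mem+16  */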
16775 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16776 straight to ix86_expand_vector_move. */
16777 /* Code generation for scalar reg-reg moves of single and double precision data:
16778 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
16779 movaps reg, reg
16780 else
16781 movss reg, reg
16782 if (x86_sse_partial_reg_dependency == true)
16783 movapd reg, reg
16784 else
16785 movsd reg, reg
16787 Code generation for scalar loads of double precision data:
16788 if (x86_sse_split_regs == true)
16789 movlpd mem, reg (gas syntax)
16790 else
16791 movsd mem, reg
16793 Code generation for unaligned packed loads of single precision data
16794 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16795 if (x86_sse_unaligned_move_optimal)
16796 movups mem, reg
16798 if (x86_sse_partial_reg_dependency == true)
16800 xorps reg, reg
16801 movlps mem, reg
16802 movhps mem+8, reg
16804 else
16806 movlps mem, reg
16807 movhps mem+8, reg
16810 Code generation for unaligned packed loads of double precision data
16811 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16812 if (x86_sse_unaligned_move_optimal)
16813 movupd mem, reg
16815 if (x86_sse_split_regs == true)
16817 movlpd mem, reg
16818 movhpd mem+8, reg
16820 else
16822 movsd mem, reg
16823 movhpd mem+8, reg
16827 void
16828 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16830 rtx op0, op1, orig_op0 = NULL_RTX, m;
16831 rtx (*load_unaligned) (rtx, rtx);
16832 rtx (*store_unaligned) (rtx, rtx);
16834 op0 = operands[0];
16835 op1 = operands[1];
16837 if (GET_MODE_SIZE (mode) == 64)
16839 switch (GET_MODE_CLASS (mode))
16841 case MODE_VECTOR_INT:
16842 case MODE_INT:
16843 if (GET_MODE (op0) != V16SImode)
16845 if (!MEM_P (op0))
16847 orig_op0 = op0;
16848 op0 = gen_reg_rtx (V16SImode);
16850 else
16851 op0 = gen_lowpart (V16SImode, op0);
16853 op1 = gen_lowpart (V16SImode, op1);
16854 /* FALLTHRU */
16856 case MODE_VECTOR_FLOAT:
16857 switch (GET_MODE (op0))
16859 default:
16860 gcc_unreachable ();
16861 case V16SImode:
16862 load_unaligned = gen_avx512f_loaddquv16si;
16863 store_unaligned = gen_avx512f_storedquv16si;
16864 break;
16865 case V16SFmode:
16866 load_unaligned = gen_avx512f_loadups512;
16867 store_unaligned = gen_avx512f_storeups512;
16868 break;
16869 case V8DFmode:
16870 load_unaligned = gen_avx512f_loadupd512;
16871 store_unaligned = gen_avx512f_storeupd512;
16872 break;
16875 if (MEM_P (op1))
16876 emit_insn (load_unaligned (op0, op1));
16877 else if (MEM_P (op0))
16878 emit_insn (store_unaligned (op0, op1));
16879 else
16880 gcc_unreachable ();
16881 if (orig_op0)
16882 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
16883 break;
16885 default:
16886 gcc_unreachable ();
16889 return;
16892 if (TARGET_AVX
16893 && GET_MODE_SIZE (mode) == 32)
16895 switch (GET_MODE_CLASS (mode))
16897 case MODE_VECTOR_INT:
16898 case MODE_INT:
16899 if (GET_MODE (op0) != V32QImode)
16901 if (!MEM_P (op0))
16903 orig_op0 = op0;
16904 op0 = gen_reg_rtx (V32QImode);
16906 else
16907 op0 = gen_lowpart (V32QImode, op0);
16909 op1 = gen_lowpart (V32QImode, op1);
16910 /* FALLTHRU */
16912 case MODE_VECTOR_FLOAT:
16913 ix86_avx256_split_vector_move_misalign (op0, op1);
16914 if (orig_op0)
16915 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
16916 break;
16918 default:
16919 gcc_unreachable ();
16922 return;
16925 if (MEM_P (op1))
16927 /* Normal *mov<mode>_internal pattern will handle
16928 unaligned loads just fine if misaligned_operand
16929 is true, and without the UNSPEC it can be combined
16930 with arithmetic instructions. */
16931 if (TARGET_AVX
16932 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
16933 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
16934 && misaligned_operand (op1, GET_MODE (op1)))
16935 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16936 /* ??? If we have typed data, then it would appear that using
16937 movdqu is the only way to get unaligned data loaded with
16938 integer type. */
16939 else if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16941 if (GET_MODE (op0) != V16QImode)
16943 orig_op0 = op0;
16944 op0 = gen_reg_rtx (V16QImode);
16946 op1 = gen_lowpart (V16QImode, op1);
16947 /* We will eventually emit movups based on insn attributes. */
16948 emit_insn (gen_sse2_loaddquv16qi (op0, op1));
16949 if (orig_op0)
16950 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
16952 else if (TARGET_SSE2 && mode == V2DFmode)
16954 rtx zero;
16956 if (TARGET_AVX
16957 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16958 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16959 || optimize_insn_for_size_p ())
16961 /* We will eventually emit movups based on insn attributes. */
16962 emit_insn (gen_sse2_loadupd (op0, op1));
16963 return;
16966 /* When SSE registers are split into halves, we can avoid
16967 writing to the top half twice. */
16968 if (TARGET_SSE_SPLIT_REGS)
16970 emit_clobber (op0);
16971 zero = op0;
16973 else
16975 /* ??? Not sure about the best option for the Intel chips.
16976 The following would seem to satisfy; the register is
16977 entirely cleared, breaking the dependency chain. We
16978 then store to the upper half, with a dependency depth
16979 of one. A rumor has it that Intel recommends two movsd
16980 followed by an unpacklpd, but this is unconfirmed. And
16981 given that the dependency depth of the unpacklpd would
16982 still be one, I'm not sure why this would be better. */
16983 zero = CONST0_RTX (V2DFmode);
16986 m = adjust_address (op1, DFmode, 0);
16987 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16988 m = adjust_address (op1, DFmode, 8);
16989 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16991 else
16993 rtx t;
16995 if (TARGET_AVX
16996 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16997 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16998 || optimize_insn_for_size_p ())
17000 if (GET_MODE (op0) != V4SFmode)
17002 orig_op0 = op0;
17003 op0 = gen_reg_rtx (V4SFmode);
17005 op1 = gen_lowpart (V4SFmode, op1);
17006 emit_insn (gen_sse_loadups (op0, op1));
17007 if (orig_op0)
17008 emit_move_insn (orig_op0,
17009 gen_lowpart (GET_MODE (orig_op0), op0));
17010 return;
17013 if (mode != V4SFmode)
17014 t = gen_reg_rtx (V4SFmode);
17015 else
17016 t = op0;
17018 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
17019 emit_move_insn (t, CONST0_RTX (V4SFmode));
17020 else
17021 emit_clobber (t);
17023 m = adjust_address (op1, V2SFmode, 0);
17024 emit_insn (gen_sse_loadlps (t, t, m));
17025 m = adjust_address (op1, V2SFmode, 8);
17026 emit_insn (gen_sse_loadhps (t, t, m));
17027 if (mode != V4SFmode)
17028 emit_move_insn (op0, gen_lowpart (mode, t));
17031 else if (MEM_P (op0))
17033 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
17035 op0 = gen_lowpart (V16QImode, op0);
17036 op1 = gen_lowpart (V16QImode, op1);
17037 /* We will eventually emit movups based on insn attributes. */
17038 emit_insn (gen_sse2_storedquv16qi (op0, op1));
17040 else if (TARGET_SSE2 && mode == V2DFmode)
17042 if (TARGET_AVX
17043 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
17044 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
17045 || optimize_insn_for_size_p ())
17046 /* We will eventually emit movups based on insn attributes. */
17047 emit_insn (gen_sse2_storeupd (op0, op1));
17048 else
17050 m = adjust_address (op0, DFmode, 0);
17051 emit_insn (gen_sse2_storelpd (m, op1));
17052 m = adjust_address (op0, DFmode, 8);
17053 emit_insn (gen_sse2_storehpd (m, op1));
17056 else
17058 if (mode != V4SFmode)
17059 op1 = gen_lowpart (V4SFmode, op1);
17061 if (TARGET_AVX
17062 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
17063 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
17064 || optimize_insn_for_size_p ())
17066 op0 = gen_lowpart (V4SFmode, op0);
17067 emit_insn (gen_sse_storeups (op0, op1));
17069 else
17071 m = adjust_address (op0, V2SFmode, 0);
17072 emit_insn (gen_sse_storelps (m, op1));
17073 m = adjust_address (op0, V2SFmode, 8);
17074 emit_insn (gen_sse_storehps (m, op1));
17078 else
17079 gcc_unreachable ();
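/* For illustration, when none of the unaligned-store optimizations above
   apply, a misaligned V2DFmode store is split roughly as
	movlpd	%xmm0, mem
	movhpd	%xmm0, mem+8
   and a misaligned V4SFmode store as
	movlps	%xmm0, mem
	movhps	%xmm0, mem+8  */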
17082 /* Expand a push in MODE. This is some mode for which we do not support
17083 proper push instructions, at least from the registers that we expect
17084 the value to live in. */
17086 void
17087 ix86_expand_push (enum machine_mode mode, rtx x)
17089 rtx tmp;
17091 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
17092 GEN_INT (-GET_MODE_SIZE (mode)),
17093 stack_pointer_rtx, 1, OPTAB_DIRECT);
17094 if (tmp != stack_pointer_rtx)
17095 emit_move_insn (stack_pointer_rtx, tmp);
17097 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
17099 /* When we push an operand onto the stack, it has to be aligned at least
17100 at the function argument boundary. However, since we don't have
17101 the argument type, we can't determine the actual argument
17102 boundary. */
17103 emit_move_insn (tmp, x);
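/* For example, pushing a 16-byte TFmode value X on x86-64 is expanded
   roughly as (illustrative only; the actual store depends on where X
   lives)
	subq	$16, %rsp
	mov	X, (%rsp)
   i.e. an explicit stack-pointer adjustment followed by a plain move,
   rather than a real push instruction.  */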
17106 /* Helper function of ix86_fixup_binary_operands to canonicalize
17107 operand order. Returns true if the operands should be swapped. */
17109 static bool
17110 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
17111 rtx operands[])
17113 rtx dst = operands[0];
17114 rtx src1 = operands[1];
17115 rtx src2 = operands[2];
17117 /* If the operation is not commutative, we can't do anything. */
17118 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
17119 return false;
17121 /* Highest priority is that src1 should match dst. */
17122 if (rtx_equal_p (dst, src1))
17123 return false;
17124 if (rtx_equal_p (dst, src2))
17125 return true;
17127 /* Next highest priority is that immediate constants come second. */
17128 if (immediate_operand (src2, mode))
17129 return false;
17130 if (immediate_operand (src1, mode))
17131 return true;
17133 /* Lowest priority is that memory references should come second. */
17134 if (MEM_P (src2))
17135 return false;
17136 if (MEM_P (src1))
17137 return true;
17139 return false;
17143 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
17144 destination to use for the operation. If different from the true
17145 destination in operands[0], a copy operation will be required. */
17148 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
17149 rtx operands[])
17151 rtx dst = operands[0];
17152 rtx src1 = operands[1];
17153 rtx src2 = operands[2];
17155 /* Canonicalize operand order. */
17156 if (ix86_swap_binary_operands_p (code, mode, operands))
17158 rtx temp;
17160 /* It is invalid to swap operands of different modes. */
17161 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
17163 temp = src1;
17164 src1 = src2;
17165 src2 = temp;
17168 /* Both source operands cannot be in memory. */
17169 if (MEM_P (src1) && MEM_P (src2))
17171 /* Optimization: Only read from memory once. */
17172 if (rtx_equal_p (src1, src2))
17174 src2 = force_reg (mode, src2);
17175 src1 = src2;
17177 else if (rtx_equal_p (dst, src1))
17178 src2 = force_reg (mode, src2);
17179 else
17180 src1 = force_reg (mode, src1);
17183 /* If the destination is memory, and we do not have matching source
17184 operands, do things in registers. */
17185 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
17186 dst = gen_reg_rtx (mode);
17188 /* Source 1 cannot be a constant. */
17189 if (CONSTANT_P (src1))
17190 src1 = force_reg (mode, src1);
17192 /* Source 1 cannot be a non-matching memory. */
17193 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
17194 src1 = force_reg (mode, src1);
17196 /* Improve address combine. */
17197 if (code == PLUS
17198 && GET_MODE_CLASS (mode) == MODE_INT
17199 && MEM_P (src2))
17200 src2 = force_reg (mode, src2);
17202 operands[1] = src1;
17203 operands[2] = src2;
17204 return dst;
17207 /* Similarly, but assume that the destination has already been
17208 set up properly. */
17210 void
17211 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
17212 enum machine_mode mode, rtx operands[])
17214 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
17215 gcc_assert (dst == operands[0]);
17218 /* Attempt to expand a binary operator. Make the expansion closer to the
17219 actual machine than just general_operand, which will allow 3 separate
17220 memory references (one output, two inputs) in a single insn.
17222 void
17223 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
17224 rtx operands[])
17226 rtx src1, src2, dst, op, clob;
17228 dst = ix86_fixup_binary_operands (code, mode, operands);
17229 src1 = operands[1];
17230 src2 = operands[2];
17232 /* Emit the instruction. */
17234 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
17235 if (reload_in_progress)
17237 /* Reload doesn't know about the flags register, and doesn't know that
17238 it doesn't want to clobber it. We can only do this with PLUS. */
17239 gcc_assert (code == PLUS);
17240 emit_insn (op);
17242 else if (reload_completed
17243 && code == PLUS
17244 && !rtx_equal_p (dst, src1))
17246 /* This is going to be an LEA; avoid splitting it later. */
17247 emit_insn (op);
17249 else
17251 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17252 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17255 /* Fix up the destination if needed. */
17256 if (dst != operands[0])
17257 emit_move_insn (operands[0], dst);
17260 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
17261 the given OPERANDS. */
17263 void
17264 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
17265 rtx operands[])
17267 rtx op1 = NULL_RTX, op2 = NULL_RTX;
17268 if (GET_CODE (operands[1]) == SUBREG)
17270 op1 = operands[1];
17271 op2 = operands[2];
17273 else if (GET_CODE (operands[2]) == SUBREG)
17275 op1 = operands[2];
17276 op2 = operands[1];
17278 /* Optimize (__m128i) d | (__m128i) e and similar code
17279 when d and e are float vectors into float vector logical
17280 insn. In C/C++ without using intrinsics there is no other way
17281 to express vector logical operation on float vectors than
17282 to cast them temporarily to integer vectors. */
17283 if (op1
17284 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
17285 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
17286 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
17287 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
17288 && SUBREG_BYTE (op1) == 0
17289 && (GET_CODE (op2) == CONST_VECTOR
17290 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
17291 && SUBREG_BYTE (op2) == 0))
17292 && can_create_pseudo_p ())
17294 rtx dst;
17295 switch (GET_MODE (SUBREG_REG (op1)))
17297 case V4SFmode:
17298 case V8SFmode:
17299 case V2DFmode:
17300 case V4DFmode:
17301 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
17302 if (GET_CODE (op2) == CONST_VECTOR)
17304 op2 = gen_lowpart (GET_MODE (dst), op2);
17305 op2 = force_reg (GET_MODE (dst), op2);
17307 else
17309 op1 = operands[1];
17310 op2 = SUBREG_REG (operands[2]);
17311 if (!nonimmediate_operand (op2, GET_MODE (dst)))
17312 op2 = force_reg (GET_MODE (dst), op2);
17314 op1 = SUBREG_REG (op1);
17315 if (!nonimmediate_operand (op1, GET_MODE (dst)))
17316 op1 = force_reg (GET_MODE (dst), op1);
17317 emit_insn (gen_rtx_SET (VOIDmode, dst,
17318 gen_rtx_fmt_ee (code, GET_MODE (dst),
17319 op1, op2)));
17320 emit_move_insn (operands[0], gen_lowpart (mode, dst));
17321 return;
17322 default:
17323 break;
17326 if (!nonimmediate_operand (operands[1], mode))
17327 operands[1] = force_reg (mode, operands[1]);
17328 if (!nonimmediate_operand (operands[2], mode))
17329 operands[2] = force_reg (mode, operands[2]);
17330 ix86_fixup_binary_operands_no_copy (code, mode, operands);
17331 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
17332 gen_rtx_fmt_ee (code, mode, operands[1],
17333 operands[2])));
17336 /* Return TRUE or FALSE depending on whether the binary operator meets the
17337 appropriate constraints. */
17339 bool
17340 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
17341 rtx operands[3])
17343 rtx dst = operands[0];
17344 rtx src1 = operands[1];
17345 rtx src2 = operands[2];
17347 /* Both source operands cannot be in memory. */
17348 if (MEM_P (src1) && MEM_P (src2))
17349 return false;
17351 /* Canonicalize operand order for commutative operators. */
17352 if (ix86_swap_binary_operands_p (code, mode, operands))
17354 rtx temp = src1;
17355 src1 = src2;
17356 src2 = temp;
17359 /* If the destination is memory, we must have a matching source operand. */
17360 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
17361 return false;
17363 /* Source 1 cannot be a constant. */
17364 if (CONSTANT_P (src1))
17365 return false;
17367 /* Source 1 cannot be a non-matching memory. */
17368 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
17369 /* Support "andhi/andsi/anddi" as a zero-extending move. */
17370 return (code == AND
17371 && (mode == HImode
17372 || mode == SImode
17373 || (TARGET_64BIT && mode == DImode))
17374 && satisfies_constraint_L (src2));
17376 return true;
17379 /* Attempt to expand a unary operator. Make the expansion closer to the
17380 actual machine than just general_operand, which will allow 2 separate
17381 memory references (one output, one input) in a single insn. */
17383 void
17384 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
17385 rtx operands[])
17387 int matching_memory;
17388 rtx src, dst, op, clob;
17390 dst = operands[0];
17391 src = operands[1];
17393 /* If the destination is memory, and we do not have matching source
17394 operands, do things in registers. */
17395 matching_memory = 0;
17396 if (MEM_P (dst))
17398 if (rtx_equal_p (dst, src))
17399 matching_memory = 1;
17400 else
17401 dst = gen_reg_rtx (mode);
17404 /* When source operand is memory, destination must match. */
17405 if (MEM_P (src) && !matching_memory)
17406 src = force_reg (mode, src);
17408 /* Emit the instruction. */
17410 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
17411 if (reload_in_progress || code == NOT)
17413 /* Reload doesn't know about the flags register, and doesn't know that
17414 it doesn't want to clobber it. */
17415 gcc_assert (code == NOT);
17416 emit_insn (op);
17418 else
17420 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17421 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17424 /* Fix up the destination if needed. */
17425 if (dst != operands[0])
17426 emit_move_insn (operands[0], dst);
17429 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
17430 divisor are within the range [0-255]. */
17432 void
17433 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
17434 bool signed_p)
17436 rtx end_label, qimode_label;
17437 rtx insn, div, mod;
17438 rtx scratch, tmp0, tmp1, tmp2;
17439 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
17440 rtx (*gen_zero_extend) (rtx, rtx);
17441 rtx (*gen_test_ccno_1) (rtx, rtx);
17443 switch (mode)
17445 case SImode:
17446 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
17447 gen_test_ccno_1 = gen_testsi_ccno_1;
17448 gen_zero_extend = gen_zero_extendqisi2;
17449 break;
17450 case DImode:
17451 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
17452 gen_test_ccno_1 = gen_testdi_ccno_1;
17453 gen_zero_extend = gen_zero_extendqidi2;
17454 break;
17455 default:
17456 gcc_unreachable ();
17459 end_label = gen_label_rtx ();
17460 qimode_label = gen_label_rtx ();
17462 scratch = gen_reg_rtx (mode);
17464 /* Use 8bit unsigned divmod if dividend and divisor are within
17465 the range [0-255]. */
17466 emit_move_insn (scratch, operands[2]);
17467 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
17468 scratch, 1, OPTAB_DIRECT);
17469 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
17470 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
17471 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
17472 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
17473 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
17474 pc_rtx);
17475 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
17476 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17477 JUMP_LABEL (insn) = qimode_label;
17479 /* Generate original signed/unsigned divmod. */
17480 div = gen_divmod4_1 (operands[0], operands[1],
17481 operands[2], operands[3]);
17482 emit_insn (div);
17484 /* Branch to the end. */
17485 emit_jump_insn (gen_jump (end_label));
17486 emit_barrier ();
17488 /* Generate 8bit unsigned divide. */
17489 emit_label (qimode_label);
17490 /* Don't use operands[0] for result of 8bit divide since not all
17491 registers support QImode ZERO_EXTRACT. */
17492 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
17493 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
17494 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
17495 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
17497 if (signed_p)
17499 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
17500 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
17502 else
17504 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
17505 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
17508 /* Extract remainder from AH. */
17509 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
17510 if (REG_P (operands[1]))
17511 insn = emit_move_insn (operands[1], tmp1);
17512 else
17514 /* Need a new scratch register since the old one has result
17515 of 8bit divide. */
17516 scratch = gen_reg_rtx (mode);
17517 emit_move_insn (scratch, tmp1);
17518 insn = emit_move_insn (operands[1], scratch);
17520 set_unique_reg_note (insn, REG_EQUAL, mod);
17522 /* Zero extend quotient from AL. */
17523 tmp1 = gen_lowpart (QImode, tmp0);
17524 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
17525 set_unique_reg_note (insn, REG_EQUAL, div);
17527 emit_label (end_label);
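/* The overall shape of the code emitted above for an unsigned SImode
   division is roughly (register names and labels are illustrative)
	movl	dividend, %tmp
	orl	divisor, %tmp
	testl	$-256, %tmp
	je	.Lqimode
	<full 32-bit divl sequence>
	jmp	.Lend
   .Lqimode:
	<8-bit divb; quotient in %al, remainder in %ah>
   .Lend:  */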
17530 /* Whether it is OK to emit CFI directives when emitting asm code. */
17532 bool
17533 ix86_emit_cfi ()
17535 return dwarf2out_do_cfi_asm ();
17538 #define LEA_MAX_STALL (3)
17539 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
17541 /* Increase given DISTANCE in half-cycles according to
17542 dependencies between PREV and NEXT instructions.
17543 Add 1 half-cycle if there is no dependency and
17544 go to the next cycle if there is some dependency. */
17546 static unsigned int
17547 increase_distance (rtx prev, rtx next, unsigned int distance)
17549 df_ref *use_rec;
17550 df_ref *def_rec;
17552 if (!prev || !next)
17553 return distance + (distance & 1) + 2;
17555 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
17556 return distance + 1;
17558 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
17559 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
17560 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
17561 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
17562 return distance + (distance & 1) + 2;
17564 return distance + 1;
17567 /* Function checks if instruction INSN defines register number
17568 REGNO1 or REGNO2. */
17570 static bool
17571 insn_defines_reg (unsigned int regno1, unsigned int regno2,
17572 rtx insn)
17574 df_ref *def_rec;
17576 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
17577 if (DF_REF_REG_DEF_P (*def_rec)
17578 && !DF_REF_IS_ARTIFICIAL (*def_rec)
17579 && (regno1 == DF_REF_REGNO (*def_rec)
17580 || regno2 == DF_REF_REGNO (*def_rec)))
17582 return true;
17585 return false;
17588 /* Function checks if instruction INSN uses register number
17589 REGNO as a part of address expression. */
17591 static bool
17592 insn_uses_reg_mem (unsigned int regno, rtx insn)
17594 df_ref *use_rec;
17596 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
17597 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
17598 return true;
17600 return false;
17603 /* Search backward for non-agu definition of register number REGNO1
17604 or register number REGNO2 in basic block starting from instruction
17605 START up to head of basic block or instruction INSN.
17607 Set *FOUND to true if a definition was found and to false
17608 otherwise.
17610 Distance in half-cycles between START and found instruction or head
17611 of BB is added to DISTANCE and returned. */
17613 static int
17614 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
17615 rtx insn, int distance,
17616 rtx start, bool *found)
17618 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17619 rtx prev = start;
17620 rtx next = NULL;
17622 *found = false;
17624 while (prev
17625 && prev != insn
17626 && distance < LEA_SEARCH_THRESHOLD)
17628 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
17630 distance = increase_distance (prev, next, distance);
17631 if (insn_defines_reg (regno1, regno2, prev))
17633 if (recog_memoized (prev) < 0
17634 || get_attr_type (prev) != TYPE_LEA)
17636 *found = true;
17637 return distance;
17641 next = prev;
17643 if (prev == BB_HEAD (bb))
17644 break;
17646 prev = PREV_INSN (prev);
17649 return distance;
17652 /* Search backward for non-agu definition of register number REGNO1
17653 or register number REGNO2 in INSN's basic block until
17654 1. Pass LEA_SEARCH_THRESHOLD instructions, or
17655 2. Reach a neighbour BB's boundary, or
17656 3. Reach agu definition.
17657 Returns the distance between the non-agu definition point and INSN.
17658 If no definition point, returns -1. */
17660 static int
17661 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
17662 rtx insn)
17664 basic_block bb = BLOCK_FOR_INSN (insn);
17665 int distance = 0;
17666 bool found = false;
17668 if (insn != BB_HEAD (bb))
17669 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
17670 distance, PREV_INSN (insn),
17671 &found);
17673 if (!found && distance < LEA_SEARCH_THRESHOLD)
17675 edge e;
17676 edge_iterator ei;
17677 bool simple_loop = false;
17679 FOR_EACH_EDGE (e, ei, bb->preds)
17680 if (e->src == bb)
17682 simple_loop = true;
17683 break;
17686 if (simple_loop)
17687 distance = distance_non_agu_define_in_bb (regno1, regno2,
17688 insn, distance,
17689 BB_END (bb), &found);
17690 else
17692 int shortest_dist = -1;
17693 bool found_in_bb = false;
17695 FOR_EACH_EDGE (e, ei, bb->preds)
17697 int bb_dist
17698 = distance_non_agu_define_in_bb (regno1, regno2,
17699 insn, distance,
17700 BB_END (e->src),
17701 &found_in_bb);
17702 if (found_in_bb)
17704 if (shortest_dist < 0)
17705 shortest_dist = bb_dist;
17706 else if (bb_dist > 0)
17707 shortest_dist = MIN (bb_dist, shortest_dist);
17709 found = true;
17713 distance = shortest_dist;
17717 /* get_attr_type may modify recog data. We want to make sure
17718 that recog data is valid for instruction INSN, on which
17719 distance_non_agu_define is called. INSN is unchanged here. */
17720 extract_insn_cached (insn);
17722 if (!found)
17723 return -1;
17725 return distance >> 1;
17728 /* Return the distance in half-cycles between INSN and the next
17729 insn that uses register number REGNO in a memory address, added
17730 to DISTANCE. Return -1 if REGNO is set.
17732 Put true value into *FOUND if register usage was found and
17733 false otherwise.
17734 Put true value into *REDEFINED if register redefinition was
17735 found and false otherwise. */
17737 static int
17738 distance_agu_use_in_bb (unsigned int regno,
17739 rtx insn, int distance, rtx start,
17740 bool *found, bool *redefined)
17742 basic_block bb = NULL;
17743 rtx next = start;
17744 rtx prev = NULL;
17746 *found = false;
17747 *redefined = false;
17749 if (start != NULL_RTX)
17751 bb = BLOCK_FOR_INSN (start);
17752 if (start != BB_HEAD (bb))
17753 /* If insn and start belong to the same bb, set prev to insn,
17754 so the call to increase_distance will increase the distance
17755 between insns by 1. */
17756 prev = insn;
17759 while (next
17760 && next != insn
17761 && distance < LEA_SEARCH_THRESHOLD)
17763 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
17765 distance = increase_distance(prev, next, distance);
17766 if (insn_uses_reg_mem (regno, next))
17768 /* Return DISTANCE if OP0 is used in memory
17769 address in NEXT. */
17770 *found = true;
17771 return distance;
17774 if (insn_defines_reg (regno, INVALID_REGNUM, next))
17776 /* Return -1 if OP0 is set in NEXT. */
17777 *redefined = true;
17778 return -1;
17781 prev = next;
17784 if (next == BB_END (bb))
17785 break;
17787 next = NEXT_INSN (next);
17790 return distance;
17793 /* Return the distance between INSN and the next insn that uses
17794 register number REGNO0 in a memory address. Return -1 if no such
17795 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set.
17797 static int
17798 distance_agu_use (unsigned int regno0, rtx insn)
17800 basic_block bb = BLOCK_FOR_INSN (insn);
17801 int distance = 0;
17802 bool found = false;
17803 bool redefined = false;
17805 if (insn != BB_END (bb))
17806 distance = distance_agu_use_in_bb (regno0, insn, distance,
17807 NEXT_INSN (insn),
17808 &found, &redefined);
17810 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
17812 edge e;
17813 edge_iterator ei;
17814 bool simple_loop = false;
17816 FOR_EACH_EDGE (e, ei, bb->succs)
17817 if (e->dest == bb)
17819 simple_loop = true;
17820 break;
17823 if (simple_loop)
17824 distance = distance_agu_use_in_bb (regno0, insn,
17825 distance, BB_HEAD (bb),
17826 &found, &redefined);
17827 else
17829 int shortest_dist = -1;
17830 bool found_in_bb = false;
17831 bool redefined_in_bb = false;
17833 FOR_EACH_EDGE (e, ei, bb->succs)
17835 int bb_dist
17836 = distance_agu_use_in_bb (regno0, insn,
17837 distance, BB_HEAD (e->dest),
17838 &found_in_bb, &redefined_in_bb);
17839 if (found_in_bb)
17841 if (shortest_dist < 0)
17842 shortest_dist = bb_dist;
17843 else if (bb_dist > 0)
17844 shortest_dist = MIN (bb_dist, shortest_dist);
17846 found = true;
17850 distance = shortest_dist;
17854 if (!found || redefined)
17855 return -1;
17857 return distance >> 1;
17860 /* Define this macro to tune LEA priority vs ADD; it takes effect when
17861 there is a dilemma between choosing LEA or ADD.
17862 Negative value: ADD is preferred over LEA
17863 Zero: Neutral
17864 Positive value: LEA is preferred over ADD */
17865 #define IX86_LEA_PRIORITY 0
17867 /* Return true if using the lea INSN has a performance advantage
17868 over a sequence of instructions. The instruction sequence has
17869 SPLIT_COST cycles higher latency than the lea latency. */
17871 static bool
17872 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
17873 unsigned int regno2, int split_cost, bool has_scale)
17875 int dist_define, dist_use;
17877 /* For Silvermont, if we are using a 2-source or 3-source LEA for a
17878 non-destructive destination, or because we want the ability to
17879 use SCALE, the use of LEA is justified. */
17880 if (ix86_tune == PROCESSOR_SLM)
17882 if (has_scale)
17883 return true;
17884 if (split_cost < 1)
17885 return false;
17886 if (regno0 == regno1 || regno0 == regno2)
17887 return false;
17888 return true;
17891 dist_define = distance_non_agu_define (regno1, regno2, insn);
17892 dist_use = distance_agu_use (regno0, insn);
17894 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
17896 /* If there is no non-AGU operand definition, no AGU
17897 operand usage and the split cost is 0, then both the lea
17898 and non-lea variants have the same priority. Currently
17899 we prefer lea for 64-bit code and non-lea for 32-bit
17900 code. */
17901 if (dist_use < 0 && split_cost == 0)
17902 return TARGET_64BIT || IX86_LEA_PRIORITY;
17903 else
17904 return true;
17907 /* With a longer definition distance, lea is preferable.
17908 Here we adjust it to take into account the splitting cost and
17909 lea priority. */
17910 dist_define += split_cost + IX86_LEA_PRIORITY;
17912 /* If there is no use in a memory address, then we just check
17913 that the split cost exceeds the AGU stall. */
17914 if (dist_use < 0)
17915 return dist_define > LEA_MAX_STALL;
17917 /* If this insn has both backward non-agu dependence and forward
17918 agu dependence, the one with the shorter distance takes effect. */
17919 return dist_define >= dist_use;
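/* A worked example of the heuristic above (numbers are illustrative):
   if the last non-AGU definition of an input register is 1 half-cycle
   before INSN (dist_define == 1), the result is next used in an address
   4 half-cycles later (dist_use == 4), split_cost is 1 and
   IX86_LEA_PRIORITY is 0, then we compare 1 + 1 + 0 == 2 against 4 and
   return false, i.e. the split ALU sequence is preferred over the lea.  */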
17922 /* Return true if it is legal to clobber flags by INSN and
17923 false otherwise. */
17925 static bool
17926 ix86_ok_to_clobber_flags (rtx insn)
17928 basic_block bb = BLOCK_FOR_INSN (insn);
17929 df_ref *use;
17930 bitmap live;
17932 while (insn)
17934 if (NONDEBUG_INSN_P (insn))
17936 for (use = DF_INSN_USES (insn); *use; use++)
17937 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
17938 return false;
17940 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
17941 return true;
17944 if (insn == BB_END (bb))
17945 break;
17947 insn = NEXT_INSN (insn);
17950 live = df_get_live_out(bb);
17951 return !REGNO_REG_SET_P (live, FLAGS_REG);
17954 /* Return true if we need to split op0 = op1 + op2 into a sequence of
17955 move and add to avoid AGU stalls. */
17957 bool
17958 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17960 unsigned int regno0, regno1, regno2;
17962 /* Check if we need to optimize. */
17963 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17964 return false;
17966 /* Check it is correct to split here. */
17967 if (!ix86_ok_to_clobber_flags(insn))
17968 return false;
17970 regno0 = true_regnum (operands[0]);
17971 regno1 = true_regnum (operands[1]);
17972 regno2 = true_regnum (operands[2]);
17974 /* We need to split only adds with a non-destructive
17975 destination operand. */
17976 if (regno0 == regno1 || regno0 == regno2)
17977 return false;
17978 else
17979 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
17982 /* Return true if we should emit lea instruction instead of mov
17983 instruction. */
17985 bool
17986 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17988 unsigned int regno0, regno1;
17990 /* Check if we need to optimize. */
17991 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17992 return false;
17994 /* Use lea for reg to reg moves only. */
17995 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17996 return false;
17998 regno0 = true_regnum (operands[0]);
17999 regno1 = true_regnum (operands[1]);
18001 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
18004 /* Return true if we need to split lea into a sequence of
18005 instructions to avoid AGU stalls. */
18007 bool
18008 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
18010 unsigned int regno0, regno1, regno2;
18011 int split_cost;
18012 struct ix86_address parts;
18013 int ok;
18015 /* Check we need to optimize. */
18016 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
18017 return false;
18019 /* Check it is correct to split here. */
18020 if (!ix86_ok_to_clobber_flags(insn))
18021 return false;
18023 ok = ix86_decompose_address (operands[1], &parts);
18024 gcc_assert (ok);
18026 /* There should be at least two components in the address. */
18027 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
18028 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
18029 return false;
18031 /* We should not split into add if a non-legitimate pic
18032 operand is used as the displacement. */
18033 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
18034 return false;
18036 regno0 = true_regnum (operands[0]) ;
18037 regno1 = INVALID_REGNUM;
18038 regno2 = INVALID_REGNUM;
18040 if (parts.base)
18041 regno1 = true_regnum (parts.base);
18042 if (parts.index)
18043 regno2 = true_regnum (parts.index);
18045 split_cost = 0;
18047 /* Compute how many cycles we will add to execution time
18048 if we split the lea into a sequence of instructions. */
18049 if (parts.base || parts.index)
18051 /* Have to use a mov instruction if the non-destructive
18052 destination form is used. */
18053 if (regno1 != regno0 && regno2 != regno0)
18054 split_cost += 1;
18056 /* Have to add index to base if both exist. */
18057 if (parts.base && parts.index)
18058 split_cost += 1;
18060 /* Have to use shift and adds if scale is 2 or greater. */
18061 if (parts.scale > 1)
18063 if (regno0 != regno1)
18064 split_cost += 1;
18065 else if (regno2 == regno0)
18066 split_cost += 4;
18067 else
18068 split_cost += parts.scale;
18071 /* Have to use add instruction with immediate if
18072 disp is nonzero. */
18073 if (parts.disp && parts.disp != const0_rtx)
18074 split_cost += 1;
18076 /* Subtract the price of lea. */
18077 split_cost -= 1;
18080 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
18081 parts.scale > 1);
18084 /* Emit the x86 binary operator CODE in mode MODE, where the first operand
18085 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
18087 static void
18088 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
18089 rtx dst, rtx src)
18091 rtx op, clob;
18093 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
18094 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18096 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
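/* For instance, ix86_emit_binop (PLUS, SImode, dst, src) emits
	(parallel [(set dst (plus:SI dst src))
		   (clobber (reg:CC flags))])
   i.e. the two-operand form with a flags clobber that the ordinary
   arithmetic patterns expect.  */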
18099 /* Return true if regno1 def is nearest to the insn. */
18101 static bool
18102 find_nearest_reg_def (rtx insn, int regno1, int regno2)
18104 rtx prev = insn;
18105 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
18107 if (insn == start)
18108 return false;
18109 while (prev && prev != start)
18111 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
18113 prev = PREV_INSN (prev);
18114 continue;
18116 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
18117 return true;
18118 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
18119 return false;
18120 prev = PREV_INSN (prev);
18123 /* None of the regs is defined in the bb. */
18124 return false;
18127 /* Split a lea instruction into a sequence of instructions
18128 which are executed on the ALU to avoid AGU stalls.
18129 It is assumed that clobbering the flags register
18130 at the lea position is allowed. */
18132 void
18133 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
18135 unsigned int regno0, regno1, regno2;
18136 struct ix86_address parts;
18137 rtx target, tmp;
18138 int ok, adds;
18140 ok = ix86_decompose_address (operands[1], &parts);
18141 gcc_assert (ok);
18143 target = gen_lowpart (mode, operands[0]);
18145 regno0 = true_regnum (target);
18146 regno1 = INVALID_REGNUM;
18147 regno2 = INVALID_REGNUM;
18149 if (parts.base)
18151 parts.base = gen_lowpart (mode, parts.base);
18152 regno1 = true_regnum (parts.base);
18155 if (parts.index)
18157 parts.index = gen_lowpart (mode, parts.index);
18158 regno2 = true_regnum (parts.index);
18161 if (parts.disp)
18162 parts.disp = gen_lowpart (mode, parts.disp);
18164 if (parts.scale > 1)
18166 /* Case r1 = r1 + ... */
18167 if (regno1 == regno0)
18169 /* If we have a case r1 = r1 + C * r1 then we
18170 would have to use multiplication, which is very
18171 expensive. Assume the cost model is wrong if we
18172 reach such a case here. */
18173 gcc_assert (regno2 != regno0);
18175 for (adds = parts.scale; adds > 0; adds--)
18176 ix86_emit_binop (PLUS, mode, target, parts.index);
18178 else
18180 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
18181 if (regno0 != regno2)
18182 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
18184 /* Use shift for scaling. */
18185 ix86_emit_binop (ASHIFT, mode, target,
18186 GEN_INT (exact_log2 (parts.scale)));
18188 if (parts.base)
18189 ix86_emit_binop (PLUS, mode, target, parts.base);
18191 if (parts.disp && parts.disp != const0_rtx)
18192 ix86_emit_binop (PLUS, mode, target, parts.disp);
18195 else if (!parts.base && !parts.index)
18197 gcc_assert(parts.disp);
18198 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
18200 else
18202 if (!parts.base)
18204 if (regno0 != regno2)
18205 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
18207 else if (!parts.index)
18209 if (regno0 != regno1)
18210 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
18212 else
18214 if (regno0 == regno1)
18215 tmp = parts.index;
18216 else if (regno0 == regno2)
18217 tmp = parts.base;
18218 else
18220 rtx tmp1;
18222 /* Find better operand for SET instruction, depending
18223 on which definition is farther from the insn. */
18224 if (find_nearest_reg_def (insn, regno1, regno2))
18225 tmp = parts.index, tmp1 = parts.base;
18226 else
18227 tmp = parts.base, tmp1 = parts.index;
18229 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
18231 if (parts.disp && parts.disp != const0_rtx)
18232 ix86_emit_binop (PLUS, mode, target, parts.disp);
18234 ix86_emit_binop (PLUS, mode, target, tmp1);
18235 return;
18238 ix86_emit_binop (PLUS, mode, target, tmp);
18241 if (parts.disp && parts.disp != const0_rtx)
18242 ix86_emit_binop (PLUS, mode, target, parts.disp);
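/* For illustration, a lea such as
	leal	4(%ebx,%ecx,4), %eax
   is split by the code above (assuming %eax is distinct from both input
   registers) roughly into
	movl	%ecx, %eax
	sall	$2, %eax
	addl	%ebx, %eax
	addl	$4, %eax
   trading the AGU operation for a short ALU sequence.  */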
18246 /* Return true if it is ok to optimize an ADD operation to LEA
18247 operation to avoid flag register consumption. For most processors,
18248 ADD is faster than LEA. For processors like ATOM, if the
18249 destination register of LEA holds an actual address which will be
18250 used soon, LEA is better and otherwise ADD is better. */
18252 bool
18253 ix86_lea_for_add_ok (rtx insn, rtx operands[])
18255 unsigned int regno0 = true_regnum (operands[0]);
18256 unsigned int regno1 = true_regnum (operands[1]);
18257 unsigned int regno2 = true_regnum (operands[2]);
18259 /* If a = b + c, (a!=b && a!=c), must use lea form. */
18260 if (regno0 != regno1 && regno0 != regno2)
18261 return true;
18263 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
18264 return false;
18266 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
18269 /* Return true if destination reg of SET_BODY is shift count of
18270 USE_BODY. */
18272 static bool
18273 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
18275 rtx set_dest;
18276 rtx shift_rtx;
18277 int i;
18279 /* Retrieve destination of SET_BODY. */
18280 switch (GET_CODE (set_body))
18282 case SET:
18283 set_dest = SET_DEST (set_body);
18284 if (!set_dest || !REG_P (set_dest))
18285 return false;
18286 break;
18287 case PARALLEL:
18288 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
18289 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
18290 use_body))
18291 return true;
18292 default:
18293 return false;
18294 break;
18297 /* Retrieve shift count of USE_BODY. */
18298 switch (GET_CODE (use_body))
18300 case SET:
18301 shift_rtx = XEXP (use_body, 1);
18302 break;
18303 case PARALLEL:
18304 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
18305 if (ix86_dep_by_shift_count_body (set_body,
18306 XVECEXP (use_body, 0, i)))
18307 return true;
18308 default:
18309 return false;
18310 break;
18313 if (shift_rtx
18314 && (GET_CODE (shift_rtx) == ASHIFT
18315 || GET_CODE (shift_rtx) == LSHIFTRT
18316 || GET_CODE (shift_rtx) == ASHIFTRT
18317 || GET_CODE (shift_rtx) == ROTATE
18318 || GET_CODE (shift_rtx) == ROTATERT))
18320 rtx shift_count = XEXP (shift_rtx, 1);
18322 /* Return true if shift count is dest of SET_BODY. */
18323 if (REG_P (shift_count))
18325 /* Add this check since it can be invoked before register
18326 allocation in the pre-reload scheduler. */
18327 if (reload_completed
18328 && true_regnum (set_dest) == true_regnum (shift_count))
18329 return true;
18330 else if (REGNO(set_dest) == REGNO(shift_count))
18331 return true;
18335 return false;
18338 /* Return true if destination reg of SET_INSN is shift count of
18339 USE_INSN. */
18341 bool
18342 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
18344 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
18345 PATTERN (use_insn));
18348 /* Return TRUE or FALSE depending on whether the unary operator meets the
18349 appropriate constraints. */
18351 bool
18352 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
18353 enum machine_mode mode ATTRIBUTE_UNUSED,
18354 rtx operands[2])
18356 /* If one of operands is memory, source and destination must match. */
18357 if ((MEM_P (operands[0])
18358 || MEM_P (operands[1]))
18359 && ! rtx_equal_p (operands[0], operands[1]))
18360 return false;
18361 return true;
18364 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
18365 are ok, keeping in mind the possible movddup alternative. */
18367 bool
18368 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
18370 if (MEM_P (operands[0]))
18371 return rtx_equal_p (operands[0], operands[1 + high]);
18372 if (MEM_P (operands[1]) && MEM_P (operands[2]))
18373 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
18374 return true;
18377 /* Post-reload splitter for converting an SF or DFmode value in an
18378 SSE register into an unsigned SImode. */
18380 void
18381 ix86_split_convert_uns_si_sse (rtx operands[])
18383 enum machine_mode vecmode;
18384 rtx value, large, zero_or_two31, input, two31, x;
18386 large = operands[1];
18387 zero_or_two31 = operands[2];
18388 input = operands[3];
18389 two31 = operands[4];
18390 vecmode = GET_MODE (large);
18391 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
18393 /* Load up the value into the low element. We must ensure that the other
18394 elements are valid floats -- zero is the easiest such value. */
18395 if (MEM_P (input))
18397 if (vecmode == V4SFmode)
18398 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
18399 else
18400 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
18402 else
18404 input = gen_rtx_REG (vecmode, REGNO (input));
18405 emit_move_insn (value, CONST0_RTX (vecmode));
18406 if (vecmode == V4SFmode)
18407 emit_insn (gen_sse_movss (value, value, input));
18408 else
18409 emit_insn (gen_sse2_movsd (value, value, input));
18412 emit_move_insn (large, two31);
18413 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
18415 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
18416 emit_insn (gen_rtx_SET (VOIDmode, large, x));
18418 x = gen_rtx_AND (vecmode, zero_or_two31, large);
18419 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
18421 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
18422 emit_insn (gen_rtx_SET (VOIDmode, value, x));
18424 large = gen_rtx_REG (V4SImode, REGNO (large));
18425 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
18427 x = gen_rtx_REG (V4SImode, REGNO (value));
18428 if (vecmode == V4SFmode)
18429 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
18430 else
18431 emit_insn (gen_sse2_cvttpd2dq (x, value));
18432 value = x;
18434 emit_insn (gen_xorv4si3 (value, value, large));
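/* Numerically, the sequence above implements
	(unsigned) x = x < 2**31 ? (int) x : ((int) (x - 2**31)) ^ 0x80000000
   For example, an input of 3500000000.0 is converted as
   (int) (3500000000.0 - 2147483648.0) = 1352516352, and xoring in
   0x80000000 yields the expected 3500000000.  */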
18437 /* Convert an unsigned DImode value into a DFmode, using only SSE.
18438 Expects the 64-bit DImode to be supplied in a pair of integral
18439 registers. Requires SSE2; will use SSE3 if available. For x86_32,
18440 -mfpmath=sse, !optimize_size only. */
18442 void
18443 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
18445 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
18446 rtx int_xmm, fp_xmm;
18447 rtx biases, exponents;
18448 rtx x;
18450 int_xmm = gen_reg_rtx (V4SImode);
18451 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
18452 emit_insn (gen_movdi_to_sse (int_xmm, input));
18453 else if (TARGET_SSE_SPLIT_REGS)
18455 emit_clobber (int_xmm);
18456 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
18458 else
18460 x = gen_reg_rtx (V2DImode);
18461 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
18462 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
18465 x = gen_rtx_CONST_VECTOR (V4SImode,
18466 gen_rtvec (4, GEN_INT (0x43300000UL),
18467 GEN_INT (0x45300000UL),
18468 const0_rtx, const0_rtx));
18469 exponents = validize_mem (force_const_mem (V4SImode, x));
18471 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
18472 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
18474 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
18475 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
18476 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
18477 (0x1.0p84 + double(fp_value_hi_xmm)).
18478 Note these exponents differ by 32. */
18480 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
18482 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
18483 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
18484 real_ldexp (&bias_lo_rvt, &dconst1, 52);
18485 real_ldexp (&bias_hi_rvt, &dconst1, 84);
18486 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
18487 x = const_double_from_real_value (bias_hi_rvt, DFmode);
18488 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
18489 biases = validize_mem (force_const_mem (V2DFmode, biases));
18490 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
18492 /* Add the upper and lower DFmode values together. */
18493 if (TARGET_SSE3)
18494 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
18495 else
18497 x = copy_to_mode_reg (V2DFmode, fp_xmm);
18498 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
18499 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
18502 ix86_expand_vector_extract (false, target, fp_xmm, 0);
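/* Putting the pieces together, the computation above amounts to
	result = ((2**52 + lo) - 2**52) + ((2**84 + hi * 2**32) - 2**84)
	       = lo + hi * 2**32
   e.g. an input of 0x0000000500000001 yields 1.0 + 5 * 2**32
   = 21474836481.0.  */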
18505 /* Not used, but eases macroization of patterns. */
18506 void
18507 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
18508 rtx input ATTRIBUTE_UNUSED)
18510 gcc_unreachable ();
18513 /* Convert an unsigned SImode value into a DFmode. Only currently used
18514 for SSE, but applicable anywhere. */
18516 void
18517 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
18519 REAL_VALUE_TYPE TWO31r;
18520 rtx x, fp;
18522 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
18523 NULL, 1, OPTAB_DIRECT);
18525 fp = gen_reg_rtx (DFmode);
18526 emit_insn (gen_floatsidf2 (fp, x));
18528 real_ldexp (&TWO31r, &dconst1, 31);
18529 x = const_double_from_real_value (TWO31r, DFmode);
18531 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
18532 if (x != target)
18533 emit_move_insn (target, x);
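/* In other words, the code above relies on the identity
	(double) u = (double) (int) (u - 0x80000000) + 0x1p31
   e.g. u = 0xffffffff gives (double) 0x7fffffff + 2147483648.0
   = 4294967295.0.  */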
18536 /* Convert a signed DImode value into a DFmode. Only used for SSE in
18537 32-bit mode; otherwise we have a direct convert instruction. */
18539 void
18540 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
18542 REAL_VALUE_TYPE TWO32r;
18543 rtx fp_lo, fp_hi, x;
18545 fp_lo = gen_reg_rtx (DFmode);
18546 fp_hi = gen_reg_rtx (DFmode);
18548 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
18550 real_ldexp (&TWO32r, &dconst1, 32);
18551 x = const_double_from_real_value (TWO32r, DFmode);
18552 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
18554 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
18556 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
18557 0, OPTAB_DIRECT);
18558 if (x != target)
18559 emit_move_insn (target, x);
18562 /* Convert an unsigned SImode value into a SFmode, using only SSE.
18563 For x86_32, -mfpmath=sse, !optimize_size only. */
18564 void
18565 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
18567 REAL_VALUE_TYPE ONE16r;
18568 rtx fp_hi, fp_lo, int_hi, int_lo, x;
18570 real_ldexp (&ONE16r, &dconst1, 16);
18571 x = const_double_from_real_value (ONE16r, SFmode);
18572 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
18573 NULL, 0, OPTAB_DIRECT);
18574 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
18575 NULL, 0, OPTAB_DIRECT);
18576 fp_hi = gen_reg_rtx (SFmode);
18577 fp_lo = gen_reg_rtx (SFmode);
18578 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
18579 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
18580 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
18581 0, OPTAB_DIRECT);
18582 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
18583 0, OPTAB_DIRECT);
18584 if (!rtx_equal_p (target, fp_hi))
18585 emit_move_insn (target, fp_hi);
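/* I.e. (float) u is computed as
	(float) (u >> 16) * 0x1p16 + (float) (u & 0xffff)
   where both partial conversions are exact (their inputs are below
   2**16) and the only rounding happens in the final addition.  */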
18588 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
18589 a vector of unsigned ints VAL to vector of floats TARGET. */
18591 void
18592 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
18594 rtx tmp[8];
18595 REAL_VALUE_TYPE TWO16r;
18596 enum machine_mode intmode = GET_MODE (val);
18597 enum machine_mode fltmode = GET_MODE (target);
18598 rtx (*cvt) (rtx, rtx);
18600 if (intmode == V4SImode)
18601 cvt = gen_floatv4siv4sf2;
18602 else
18603 cvt = gen_floatv8siv8sf2;
18604 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
18605 tmp[0] = force_reg (intmode, tmp[0]);
18606 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
18607 OPTAB_DIRECT);
18608 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
18609 NULL_RTX, 1, OPTAB_DIRECT);
18610 tmp[3] = gen_reg_rtx (fltmode);
18611 emit_insn (cvt (tmp[3], tmp[1]));
18612 tmp[4] = gen_reg_rtx (fltmode);
18613 emit_insn (cvt (tmp[4], tmp[2]));
18614 real_ldexp (&TWO16r, &dconst1, 16);
18615 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
18616 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
18617 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
18618 OPTAB_DIRECT);
18619 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
18620 OPTAB_DIRECT);
18621 if (tmp[7] != target)
18622 emit_move_insn (target, tmp[7]);
18625 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
18626 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
18627 This is done by doing just signed conversion if < 0x1p31, and otherwise by
18628 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
18631 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
18633 REAL_VALUE_TYPE TWO31r;
18634 rtx two31r, tmp[4];
18635 enum machine_mode mode = GET_MODE (val);
18636 enum machine_mode scalarmode = GET_MODE_INNER (mode);
18637 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
18638 rtx (*cmp) (rtx, rtx, rtx, rtx);
18639 int i;
18641 for (i = 0; i < 3; i++)
18642 tmp[i] = gen_reg_rtx (mode);
18643 real_ldexp (&TWO31r, &dconst1, 31);
18644 two31r = const_double_from_real_value (TWO31r, scalarmode);
18645 two31r = ix86_build_const_vector (mode, 1, two31r);
18646 two31r = force_reg (mode, two31r);
18647 switch (mode)
18649 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
18650 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
18651 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
18652 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
18653 default: gcc_unreachable ();
18655 tmp[3] = gen_rtx_LE (mode, two31r, val);
18656 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
18657 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
18658 0, OPTAB_DIRECT);
18659 if (intmode == V4SImode || TARGET_AVX2)
18660 *xorp = expand_simple_binop (intmode, ASHIFT,
18661 gen_lowpart (intmode, tmp[0]),
18662 GEN_INT (31), NULL_RTX, 0,
18663 OPTAB_DIRECT);
18664 else
18666 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
18667 two31 = ix86_build_const_vector (intmode, 1, two31);
18668 *xorp = expand_simple_binop (intmode, AND,
18669 gen_lowpart (intmode, tmp[0]),
18670 two31, NULL_RTX, 0,
18671 OPTAB_DIRECT);
18673 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
18674 0, OPTAB_DIRECT);
18677 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
18678 then replicate the value for all elements of the vector
18679 register. */
18682 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
18684 int i, n_elt;
18685 rtvec v;
18686 enum machine_mode scalar_mode;
18688 switch (mode)
18690 case V32QImode:
18691 case V16QImode:
18692 case V16HImode:
18693 case V8HImode:
18694 case V8SImode:
18695 case V4SImode:
18696 case V4DImode:
18697 case V2DImode:
18698 gcc_assert (vect);
18699 case V8SFmode:
18700 case V4SFmode:
18701 case V4DFmode:
18702 case V2DFmode:
18703 n_elt = GET_MODE_NUNITS (mode);
18704 v = rtvec_alloc (n_elt);
18705 scalar_mode = GET_MODE_INNER (mode);
18707 RTVEC_ELT (v, 0) = value;
18709 for (i = 1; i < n_elt; ++i)
18710 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
18712 return gen_rtx_CONST_VECTOR (mode, v);
18714 default:
18715 gcc_unreachable ();
18719 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
18720 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
18721 for an SSE register. If VECT is true, then replicate the mask for
18722 all elements of the vector register. If INVERT is true, then create
18723 a mask excluding the sign bit. */
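/* Per element this yields 0x80000000 for SFmode and 0x8000000000000000
   for DFmode, or their complements (0x7fffffff resp.
   0x7fffffffffffffff) when INVERT is true.  */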
18726 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
18728 enum machine_mode vec_mode, imode;
18729 HOST_WIDE_INT hi, lo;
18730 int shift = 63;
18731 rtx v;
18732 rtx mask;
18734 /* Find the sign bit, sign extended to 2*HWI. */
18735 switch (mode)
18737 case V8SImode:
18738 case V4SImode:
18739 case V8SFmode:
18740 case V4SFmode:
18741 vec_mode = mode;
18742 mode = GET_MODE_INNER (mode);
18743 imode = SImode;
18744 lo = 0x80000000, hi = lo < 0;
18745 break;
18747 case V4DImode:
18748 case V2DImode:
18749 case V4DFmode:
18750 case V2DFmode:
18751 vec_mode = mode;
18752 mode = GET_MODE_INNER (mode);
18753 imode = DImode;
18754 if (HOST_BITS_PER_WIDE_INT >= 64)
18755 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
18756 else
18757 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18758 break;
18760 case TImode:
18761 case TFmode:
18762 vec_mode = VOIDmode;
18763 if (HOST_BITS_PER_WIDE_INT >= 64)
18765 imode = TImode;
18766 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
18768 else
18770 rtvec vec;
18772 imode = DImode;
18773 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18775 if (invert)
18777 lo = ~lo, hi = ~hi;
18778 v = constm1_rtx;
18780 else
18781 v = const0_rtx;
18783 mask = immed_double_const (lo, hi, imode);
18785 vec = gen_rtvec (2, v, mask);
18786 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
18787 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
18789 return v;
18791 break;
18793 default:
18794 gcc_unreachable ();
18797 if (invert)
18798 lo = ~lo, hi = ~hi;
18800 /* Force this value into the low part of a fp vector constant. */
18801 mask = immed_double_const (lo, hi, imode);
18802 mask = gen_lowpart (mode, mask);
18804 if (vec_mode == VOIDmode)
18805 return force_reg (mode, mask);
18807 v = ix86_build_const_vector (vec_mode, vect, mask);
18808 return force_reg (vec_mode, v);
18811 /* Generate code for floating point ABS or NEG. */
18813 void
18814 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
18815 rtx operands[])
18817 rtx mask, set, dst, src;
18818 bool use_sse = false;
18819 bool vector_mode = VECTOR_MODE_P (mode);
18820 enum machine_mode vmode = mode;
18822 if (vector_mode)
18823 use_sse = true;
18824 else if (mode == TFmode)
18825 use_sse = true;
18826 else if (TARGET_SSE_MATH)
18828 use_sse = SSE_FLOAT_MODE_P (mode);
18829 if (mode == SFmode)
18830 vmode = V4SFmode;
18831 else if (mode == DFmode)
18832 vmode = V2DFmode;
18835 /* NEG and ABS performed with SSE use bitwise mask operations.
18836 Create the appropriate mask now. */
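/* Roughly: NEG is emitted as an XOR with the sign-bit mask, while ABS
   is an AND with the inverted mask, so only the sign bit of each
   element is touched.  */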
18837 if (use_sse)
18838 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
18839 else
18840 mask = NULL_RTX;
18842 dst = operands[0];
18843 src = operands[1];
18845 set = gen_rtx_fmt_e (code, mode, src);
18846 set = gen_rtx_SET (VOIDmode, dst, set);
18848 if (mask)
18850 rtx use, clob;
18851 rtvec par;
18853 use = gen_rtx_USE (VOIDmode, mask);
18854 if (vector_mode)
18855 par = gen_rtvec (2, set, use);
18856 else
18858 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18859 par = gen_rtvec (3, set, use, clob);
18861 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
18863 else
18864 emit_insn (set);
18867 /* Expand a copysign operation. Special case operand 0 being a constant. */
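/* In essence, copysign (op0, op1) is computed bitwise as
   (op0 & ~sign_mask) | (op1 & sign_mask), i.e. the magnitude comes
   from operand 1 and the sign from operand 2 of the expander.  */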
18869 void
18870 ix86_expand_copysign (rtx operands[])
18872 enum machine_mode mode, vmode;
18873 rtx dest, op0, op1, mask, nmask;
18875 dest = operands[0];
18876 op0 = operands[1];
18877 op1 = operands[2];
18879 mode = GET_MODE (dest);
18881 if (mode == SFmode)
18882 vmode = V4SFmode;
18883 else if (mode == DFmode)
18884 vmode = V2DFmode;
18885 else
18886 vmode = mode;
18888 if (GET_CODE (op0) == CONST_DOUBLE)
18890 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
18892 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
18893 op0 = simplify_unary_operation (ABS, mode, op0, mode);
18895 if (mode == SFmode || mode == DFmode)
18897 if (op0 == CONST0_RTX (mode))
18898 op0 = CONST0_RTX (vmode);
18899 else
18901 rtx v = ix86_build_const_vector (vmode, false, op0);
18903 op0 = force_reg (vmode, v);
18906 else if (op0 != CONST0_RTX (mode))
18907 op0 = force_reg (mode, op0);
18909 mask = ix86_build_signbit_mask (vmode, 0, 0);
18911 if (mode == SFmode)
18912 copysign_insn = gen_copysignsf3_const;
18913 else if (mode == DFmode)
18914 copysign_insn = gen_copysigndf3_const;
18915 else
18916 copysign_insn = gen_copysigntf3_const;
18918 emit_insn (copysign_insn (dest, op0, op1, mask));
18920 else
18922 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
18924 nmask = ix86_build_signbit_mask (vmode, 0, 1);
18925 mask = ix86_build_signbit_mask (vmode, 0, 0);
18927 if (mode == SFmode)
18928 copysign_insn = gen_copysignsf3_var;
18929 else if (mode == DFmode)
18930 copysign_insn = gen_copysigndf3_var;
18931 else
18932 copysign_insn = gen_copysigntf3_var;
18934 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
18938 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
18939 be a constant, and so has already been expanded into a vector constant. */
18941 void
18942 ix86_split_copysign_const (rtx operands[])
18944 enum machine_mode mode, vmode;
18945 rtx dest, op0, mask, x;
18947 dest = operands[0];
18948 op0 = operands[1];
18949 mask = operands[3];
18951 mode = GET_MODE (dest);
18952 vmode = GET_MODE (mask);
18954 dest = simplify_gen_subreg (vmode, dest, mode, 0);
18955 x = gen_rtx_AND (vmode, dest, mask);
18956 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18958 if (op0 != CONST0_RTX (vmode))
18960 x = gen_rtx_IOR (vmode, dest, op0);
18961 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18965 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18966 so we have to do two masks. */
18968 void
18969 ix86_split_copysign_var (rtx operands[])
18971 enum machine_mode mode, vmode;
18972 rtx dest, scratch, op0, op1, mask, nmask, x;
18974 dest = operands[0];
18975 scratch = operands[1];
18976 op0 = operands[2];
18977 op1 = operands[3];
18978 nmask = operands[4];
18979 mask = operands[5];
18981 mode = GET_MODE (dest);
18982 vmode = GET_MODE (mask);
18984 if (rtx_equal_p (op0, op1))
18986 /* Shouldn't happen often (it's useless, obviously), but when it does
18987 we'd generate incorrect code if we continue below. */
18988 emit_move_insn (dest, op0);
18989 return;
18992 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18994 gcc_assert (REGNO (op1) == REGNO (scratch));
18996 x = gen_rtx_AND (vmode, scratch, mask);
18997 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18999 dest = mask;
19000 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
19001 x = gen_rtx_NOT (vmode, dest);
19002 x = gen_rtx_AND (vmode, x, op0);
19003 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19005 else
19007 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
19009 x = gen_rtx_AND (vmode, scratch, mask);
19011 else /* alternative 2,4 */
19013 gcc_assert (REGNO (mask) == REGNO (scratch));
19014 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
19015 x = gen_rtx_AND (vmode, scratch, op1);
19017 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
19019 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
19021 dest = simplify_gen_subreg (vmode, op0, mode, 0);
19022 x = gen_rtx_AND (vmode, dest, nmask);
19024 else /* alternative 3,4 */
19026 gcc_assert (REGNO (nmask) == REGNO (dest));
19027 dest = nmask;
19028 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
19029 x = gen_rtx_AND (vmode, dest, op0);
19031 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19034 x = gen_rtx_IOR (vmode, dest, scratch);
19035 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19038 /* Return TRUE or FALSE depending on whether the first SET in INSN
19039 has source and destination with matching CC modes, and that the
19040 CC mode is at least as constrained as REQ_MODE. */
19042 bool
19043 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
19045 rtx set;
19046 enum machine_mode set_mode;
19048 set = PATTERN (insn);
19049 if (GET_CODE (set) == PARALLEL)
19050 set = XVECEXP (set, 0, 0);
19051 gcc_assert (GET_CODE (set) == SET);
19052 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
19054 set_mode = GET_MODE (SET_DEST (set));
19055 switch (set_mode)
19057 case CCNOmode:
19058 if (req_mode != CCNOmode
19059 && (req_mode != CCmode
19060 || XEXP (SET_SRC (set), 1) != const0_rtx))
19061 return false;
19062 break;
19063 case CCmode:
19064 if (req_mode == CCGCmode)
19065 return false;
19066 /* FALLTHRU */
19067 case CCGCmode:
19068 if (req_mode == CCGOCmode || req_mode == CCNOmode)
19069 return false;
19070 /* FALLTHRU */
19071 case CCGOCmode:
19072 if (req_mode == CCZmode)
19073 return false;
19074 /* FALLTHRU */
19075 case CCZmode:
19076 break;
19078 case CCAmode:
19079 case CCCmode:
19080 case CCOmode:
19081 case CCSmode:
19082 if (set_mode != req_mode)
19083 return false;
19084 break;
19086 default:
19087 gcc_unreachable ();
19090 return GET_MODE (SET_SRC (set)) == set_mode;
19093 /* Generate insn patterns to do an integer compare of OPERANDS. */
19095 static rtx
19096 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
19098 enum machine_mode cmpmode;
19099 rtx tmp, flags;
19101 cmpmode = SELECT_CC_MODE (code, op0, op1);
19102 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
19104 /* This is very simple, but making the interface the same as in the
19105 FP case makes the rest of the code easier. */
19106 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
19107 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
19109 /* Return the test that should be put into the flags user, i.e.
19110 the bcc, scc, or cmov instruction. */
19111 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
19114 /* Figure out whether to use ordered or unordered fp comparisons.
19115 Return the appropriate mode to use. */
19117 enum machine_mode
19118 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
19120 /* ??? In order to make all comparisons reversible, we do all comparisons
19121 non-trapping when compiling for IEEE. Once gcc is able to distinguish
19122 all forms of trapping and non-trapping comparisons, we can make inequality
19123 comparisons trapping again, since it results in better code when using
19124 FCOM based compares. */
19125 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
19128 enum machine_mode
19129 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
19131 enum machine_mode mode = GET_MODE (op0);
19133 if (SCALAR_FLOAT_MODE_P (mode))
19135 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19136 return ix86_fp_compare_mode (code);
19139 switch (code)
19141 /* Only zero flag is needed. */
19142 case EQ: /* ZF=0 */
19143 case NE: /* ZF!=0 */
19144 return CCZmode;
19145 /* Codes needing carry flag. */
19146 case GEU: /* CF=0 */
19147 case LTU: /* CF=1 */
19148 /* Detect overflow checks. They need just the carry flag. */
19149 if (GET_CODE (op0) == PLUS
19150 && rtx_equal_p (op1, XEXP (op0, 0)))
19151 return CCCmode;
19152 else
19153 return CCmode;
19154 case GTU: /* CF=0 & ZF=0 */
19155 case LEU: /* CF=1 | ZF=1 */
19156 return CCmode;
19157 /* Codes possibly doable only with sign flag when
19158 comparing against zero. */
19159 case GE: /* SF=OF or SF=0 */
19160 case LT: /* SF<>OF or SF=1 */
19161 if (op1 == const0_rtx)
19162 return CCGOCmode;
19163 else
19164 /* For other cases the carry flag is not required. */
19165 return CCGCmode;
19166 /* Codes doable only with sign flag when comparing
19167 against zero, but for which we lack a jump instruction,
19168 so we need to use relational tests against overflow,
19169 which thus needs to be zero. */
19170 case GT: /* ZF=0 & SF=OF */
19171 case LE: /* ZF=1 | SF<>OF */
19172 if (op1 == const0_rtx)
19173 return CCNOmode;
19174 else
19175 return CCGCmode;
19176 /* The strcmp pattern does (use flags), and combine may ask us for a proper
19177 mode. */
19178 case USE:
19179 return CCmode;
19180 default:
19181 gcc_unreachable ();
19185 /* Return the fixed registers used for condition codes. */
19187 static bool
19188 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
19190 *p1 = FLAGS_REG;
19191 *p2 = FPSR_REG;
19192 return true;
19195 /* If two condition code modes are compatible, return a condition code
19196 mode which is compatible with both. Otherwise, return
19197 VOIDmode. */
19199 static enum machine_mode
19200 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
19202 if (m1 == m2)
19203 return m1;
19205 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
19206 return VOIDmode;
19208 if ((m1 == CCGCmode && m2 == CCGOCmode)
19209 || (m1 == CCGOCmode && m2 == CCGCmode))
19210 return CCGCmode;
19212 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
19213 return m2;
19214 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
19215 return m1;
19217 switch (m1)
19219 default:
19220 gcc_unreachable ();
19222 case CCmode:
19223 case CCGCmode:
19224 case CCGOCmode:
19225 case CCNOmode:
19226 case CCAmode:
19227 case CCCmode:
19228 case CCOmode:
19229 case CCSmode:
19230 case CCZmode:
19231 switch (m2)
19233 default:
19234 return VOIDmode;
19236 case CCmode:
19237 case CCGCmode:
19238 case CCGOCmode:
19239 case CCNOmode:
19240 case CCAmode:
19241 case CCCmode:
19242 case CCOmode:
19243 case CCSmode:
19244 case CCZmode:
19245 return CCmode;
19248 case CCFPmode:
19249 case CCFPUmode:
19250 /* These are only compatible with themselves, which we already
19251 checked above. */
19252 return VOIDmode;
19257 /* Return a comparison we can do that is equivalent to
19258 swap_condition (code), apart possibly from orderedness.
19259 But never change orderedness if TARGET_IEEE_FP, returning
19260 UNKNOWN in that case if necessary. */
19262 static enum rtx_code
19263 ix86_fp_swap_condition (enum rtx_code code)
19265 switch (code)
19267 case GT: /* GTU - CF=0 & ZF=0 */
19268 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
19269 case GE: /* GEU - CF=0 */
19270 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
19271 case UNLT: /* LTU - CF=1 */
19272 return TARGET_IEEE_FP ? UNKNOWN : GT;
19273 case UNLE: /* LEU - CF=1 | ZF=1 */
19274 return TARGET_IEEE_FP ? UNKNOWN : GE;
19275 default:
19276 return swap_condition (code);
19280 /* Return the cost of comparison CODE using the best strategy for performance.
19281 All following functions use the number of instructions as a cost metric.
19282 In the future this should be tweaked to compute bytes for optimize_size and
19283 to take into account the performance of various instructions on various CPUs. */
19285 static int
19286 ix86_fp_comparison_cost (enum rtx_code code)
19288 int arith_cost;
19290 /* The cost of code using bit-twiddling on %ah. */
19291 switch (code)
19293 case UNLE:
19294 case UNLT:
19295 case LTGT:
19296 case GT:
19297 case GE:
19298 case UNORDERED:
19299 case ORDERED:
19300 case UNEQ:
19301 arith_cost = 4;
19302 break;
19303 case LT:
19304 case NE:
19305 case EQ:
19306 case UNGE:
19307 arith_cost = TARGET_IEEE_FP ? 5 : 4;
19308 break;
19309 case LE:
19310 case UNGT:
19311 arith_cost = TARGET_IEEE_FP ? 6 : 4;
19312 break;
19313 default:
19314 gcc_unreachable ();
19317 switch (ix86_fp_comparison_strategy (code))
19319 case IX86_FPCMP_COMI:
19320 return arith_cost > 4 ? 3 : 2;
19321 case IX86_FPCMP_SAHF:
19322 return arith_cost > 4 ? 4 : 3;
19323 default:
19324 return arith_cost;
19328 /* Return strategy to use for floating-point. We assume that fcomi is always
19329 preferable where available, since that is also true when looking at size
19330 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
19332 enum ix86_fpcmp_strategy
19333 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
19335 /* Do fcomi/sahf based test when profitable. */
19337 if (TARGET_CMOVE)
19338 return IX86_FPCMP_COMI;
19340 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
19341 return IX86_FPCMP_SAHF;
19343 return IX86_FPCMP_ARITH;
19346 /* Swap, force into registers, or otherwise massage the two operands
19347 to a fp comparison. The operands are updated in place; the new
19348 comparison code is returned. */
19350 static enum rtx_code
19351 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
19353 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
19354 rtx op0 = *pop0, op1 = *pop1;
19355 enum machine_mode op_mode = GET_MODE (op0);
19356 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
19358 /* All of the unordered compare instructions only work on registers.
19359 The same is true of the fcomi compare instructions. The XFmode
19360 compare instructions require registers except when comparing
19361 against zero or when converting operand 1 from fixed point to
19362 floating point. */
19364 if (!is_sse
19365 && (fpcmp_mode == CCFPUmode
19366 || (op_mode == XFmode
19367 && ! (standard_80387_constant_p (op0) == 1
19368 || standard_80387_constant_p (op1) == 1)
19369 && GET_CODE (op1) != FLOAT)
19370 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
19372 op0 = force_reg (op_mode, op0);
19373 op1 = force_reg (op_mode, op1);
19375 else
19377 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
19378 things around if they appear profitable, otherwise force op0
19379 into a register. */
19381 if (standard_80387_constant_p (op0) == 0
19382 || (MEM_P (op0)
19383 && ! (standard_80387_constant_p (op1) == 0
19384 || MEM_P (op1))))
19386 enum rtx_code new_code = ix86_fp_swap_condition (code);
19387 if (new_code != UNKNOWN)
19389 rtx tmp;
19390 tmp = op0, op0 = op1, op1 = tmp;
19391 code = new_code;
19395 if (!REG_P (op0))
19396 op0 = force_reg (op_mode, op0);
19398 if (CONSTANT_P (op1))
19400 int tmp = standard_80387_constant_p (op1);
19401 if (tmp == 0)
19402 op1 = validize_mem (force_const_mem (op_mode, op1));
19403 else if (tmp == 1)
19405 if (TARGET_CMOVE)
19406 op1 = force_reg (op_mode, op1);
19408 else
19409 op1 = force_reg (op_mode, op1);
19413 /* Try to rearrange the comparison to make it cheaper. */
19414 if (ix86_fp_comparison_cost (code)
19415 > ix86_fp_comparison_cost (swap_condition (code))
19416 && (REG_P (op1) || can_create_pseudo_p ()))
19418 rtx tmp;
19419 tmp = op0, op0 = op1, op1 = tmp;
19420 code = swap_condition (code);
19421 if (!REG_P (op0))
19422 op0 = force_reg (op_mode, op0);
19425 *pop0 = op0;
19426 *pop1 = op1;
19427 return code;
19430 /* Convert comparison codes we use to represent FP comparison to integer
19431 code that will result in proper branch. Return UNKNOWN if no such code
19432 is available. */
19434 enum rtx_code
19435 ix86_fp_compare_code_to_integer (enum rtx_code code)
19437 switch (code)
19439 case GT:
19440 return GTU;
19441 case GE:
19442 return GEU;
19443 case ORDERED:
19444 case UNORDERED:
19445 return code;
19446 break;
19447 case UNEQ:
19448 return EQ;
19449 break;
19450 case UNLT:
19451 return LTU;
19452 break;
19453 case UNLE:
19454 return LEU;
19455 break;
19456 case LTGT:
19457 return NE;
19458 break;
19459 default:
19460 return UNKNOWN;
19464 /* Generate insn patterns to do a floating point compare of OPERANDS. */
19466 static rtx
19467 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
19469 enum machine_mode fpcmp_mode, intcmp_mode;
19470 rtx tmp, tmp2;
19472 fpcmp_mode = ix86_fp_compare_mode (code);
19473 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
19475 /* Do fcomi/sahf based test when profitable. */
19476 switch (ix86_fp_comparison_strategy (code))
19478 case IX86_FPCMP_COMI:
19479 intcmp_mode = fpcmp_mode;
19480 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19481 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
19482 tmp);
19483 emit_insn (tmp);
19484 break;
19486 case IX86_FPCMP_SAHF:
19487 intcmp_mode = fpcmp_mode;
19488 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19489 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
19490 tmp);
19492 if (!scratch)
19493 scratch = gen_reg_rtx (HImode);
19494 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
19495 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
19496 break;
19498 case IX86_FPCMP_ARITH:
19499 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
19500 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19501 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
19502 if (!scratch)
19503 scratch = gen_reg_rtx (HImode);
19504 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
19506 /* In the unordered case, we have to check C2 for NaNs, which
19507 doesn't happen to work out to anything nice combination-wise.
19508 So do some bit twiddling on the value we've got in AH to come
19509 up with an appropriate set of condition codes. */
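/* After fnstsw, AH holds C0 in bit 0 (0x01), C2 in bit 2 (0x04) and C3
   in bit 6 (0x40).  fcom sets C3:C2:C0 to 000 (>), 001 (<), 100 (=)
   and 111 (unordered), which is where the masks 0x45, 0x44, 0x40,
   0x05 and 0x01 used below come from.  */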
19511 intcmp_mode = CCNOmode;
19512 switch (code)
19514 case GT:
19515 case UNGT:
19516 if (code == GT || !TARGET_IEEE_FP)
19518 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
19519 code = EQ;
19521 else
19523 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19524 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
19525 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
19526 intcmp_mode = CCmode;
19527 code = GEU;
19529 break;
19530 case LT:
19531 case UNLT:
19532 if (code == LT && TARGET_IEEE_FP)
19534 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19535 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
19536 intcmp_mode = CCmode;
19537 code = EQ;
19539 else
19541 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
19542 code = NE;
19544 break;
19545 case GE:
19546 case UNGE:
19547 if (code == GE || !TARGET_IEEE_FP)
19549 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
19550 code = EQ;
19552 else
19554 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19555 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
19556 code = NE;
19558 break;
19559 case LE:
19560 case UNLE:
19561 if (code == LE && TARGET_IEEE_FP)
19563 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19564 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
19565 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
19566 intcmp_mode = CCmode;
19567 code = LTU;
19569 else
19571 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
19572 code = NE;
19574 break;
19575 case EQ:
19576 case UNEQ:
19577 if (code == EQ && TARGET_IEEE_FP)
19579 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19580 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
19581 intcmp_mode = CCmode;
19582 code = EQ;
19584 else
19586 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19587 code = NE;
19589 break;
19590 case NE:
19591 case LTGT:
19592 if (code == NE && TARGET_IEEE_FP)
19594 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19595 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
19596 GEN_INT (0x40)));
19597 code = NE;
19599 else
19601 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19602 code = EQ;
19604 break;
19606 case UNORDERED:
19607 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19608 code = NE;
19609 break;
19610 case ORDERED:
19611 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19612 code = EQ;
19613 break;
19615 default:
19616 gcc_unreachable ();
19618 break;
19620 default:
19621 gcc_unreachable();
19624 /* Return the test that should be put into the flags user, i.e.
19625 the bcc, scc, or cmov instruction. */
19626 return gen_rtx_fmt_ee (code, VOIDmode,
19627 gen_rtx_REG (intcmp_mode, FLAGS_REG),
19628 const0_rtx);
19631 static rtx
19632 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
19634 rtx ret;
19636 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
19637 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
19639 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
19641 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
19642 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19644 else
19645 ret = ix86_expand_int_compare (code, op0, op1);
19647 return ret;
19650 void
19651 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
19653 enum machine_mode mode = GET_MODE (op0);
19654 rtx tmp;
19656 switch (mode)
19658 case SFmode:
19659 case DFmode:
19660 case XFmode:
19661 case QImode:
19662 case HImode:
19663 case SImode:
19664 simple:
19665 tmp = ix86_expand_compare (code, op0, op1);
19666 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
19667 gen_rtx_LABEL_REF (VOIDmode, label),
19668 pc_rtx);
19669 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
19670 return;
19672 case DImode:
19673 if (TARGET_64BIT)
19674 goto simple;
19675 case TImode:
19676 /* Expand DImode branch into multiple compare+branch. */
19678 rtx lo[2], hi[2], label2;
19679 enum rtx_code code1, code2, code3;
19680 enum machine_mode submode;
19682 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
19684 tmp = op0, op0 = op1, op1 = tmp;
19685 code = swap_condition (code);
19688 split_double_mode (mode, &op0, 1, lo+0, hi+0);
19689 split_double_mode (mode, &op1, 1, lo+1, hi+1);
19691 submode = mode == DImode ? SImode : DImode;
19693 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
19694 avoid two branches. This costs one extra insn, so disable when
19695 optimizing for size. */
19697 if ((code == EQ || code == NE)
19698 && (!optimize_insn_for_size_p ()
19699 || hi[1] == const0_rtx || lo[1] == const0_rtx))
19701 rtx xor0, xor1;
19703 xor1 = hi[0];
19704 if (hi[1] != const0_rtx)
19705 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
19706 NULL_RTX, 0, OPTAB_WIDEN);
19708 xor0 = lo[0];
19709 if (lo[1] != const0_rtx)
19710 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
19711 NULL_RTX, 0, OPTAB_WIDEN);
19713 tmp = expand_binop (submode, ior_optab, xor1, xor0,
19714 NULL_RTX, 0, OPTAB_WIDEN);
19716 ix86_expand_branch (code, tmp, const0_rtx, label);
19717 return;
19720 /* Otherwise, if we are doing a less-than or greater-than-or-equal comparison,
19721 op1 is a constant and its low word is zero, then we can just
19722 examine the high word. Similarly for a low word of -1 and
19723 less-than-or-equal or greater-than. */
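/* E.g. on a 32-bit target, a <u 0x500000000 reduces to hi(a) <u 5,
   because the constant's low word is zero.  */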
19725 if (CONST_INT_P (hi[1]))
19726 switch (code)
19728 case LT: case LTU: case GE: case GEU:
19729 if (lo[1] == const0_rtx)
19731 ix86_expand_branch (code, hi[0], hi[1], label);
19732 return;
19734 break;
19735 case LE: case LEU: case GT: case GTU:
19736 if (lo[1] == constm1_rtx)
19738 ix86_expand_branch (code, hi[0], hi[1], label);
19739 return;
19741 break;
19742 default:
19743 break;
19746 /* Otherwise, we need two or three jumps. */
19748 label2 = gen_label_rtx ();
19750 code1 = code;
19751 code2 = swap_condition (code);
19752 code3 = unsigned_condition (code);
19754 switch (code)
19756 case LT: case GT: case LTU: case GTU:
19757 break;
19759 case LE: code1 = LT; code2 = GT; break;
19760 case GE: code1 = GT; code2 = LT; break;
19761 case LEU: code1 = LTU; code2 = GTU; break;
19762 case GEU: code1 = GTU; code2 = LTU; break;
19764 case EQ: code1 = UNKNOWN; code2 = NE; break;
19765 case NE: code2 = UNKNOWN; break;
19767 default:
19768 gcc_unreachable ();
19772 * a < b =>
19773 * if (hi(a) < hi(b)) goto true;
19774 * if (hi(a) > hi(b)) goto false;
19775 * if (lo(a) < lo(b)) goto true;
19776 * false:
19779 if (code1 != UNKNOWN)
19780 ix86_expand_branch (code1, hi[0], hi[1], label);
19781 if (code2 != UNKNOWN)
19782 ix86_expand_branch (code2, hi[0], hi[1], label2);
19784 ix86_expand_branch (code3, lo[0], lo[1], label);
19786 if (code2 != UNKNOWN)
19787 emit_label (label2);
19788 return;
19791 default:
19792 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
19793 goto simple;
19797 /* Split branch based on floating point condition. */
19798 void
19799 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
19800 rtx target1, rtx target2, rtx tmp, rtx pushed)
19802 rtx condition;
19803 rtx i;
19805 if (target2 != pc_rtx)
19807 rtx tmp = target2;
19808 code = reverse_condition_maybe_unordered (code);
19809 target2 = target1;
19810 target1 = tmp;
19813 condition = ix86_expand_fp_compare (code, op1, op2,
19814 tmp);
19816 /* Remove pushed operand from stack. */
19817 if (pushed)
19818 ix86_free_from_memory (GET_MODE (pushed));
19820 i = emit_jump_insn (gen_rtx_SET
19821 (VOIDmode, pc_rtx,
19822 gen_rtx_IF_THEN_ELSE (VOIDmode,
19823 condition, target1, target2)));
19824 if (split_branch_probability >= 0)
19825 add_int_reg_note (i, REG_BR_PROB, split_branch_probability);
19828 void
19829 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
19831 rtx ret;
19833 gcc_assert (GET_MODE (dest) == QImode);
19835 ret = ix86_expand_compare (code, op0, op1);
19836 PUT_MODE (ret, QImode);
19837 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
19840 /* Expand comparison setting or clearing carry flag. Return true when
19841 successful and set pop for the operation. */
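/* Getting the result into the carry flag matters because a subsequent
   sbb of a register with itself then yields 0 or -1 without a branch,
   which the movcc expansion below builds on.  */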
19842 static bool
19843 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
19845 enum machine_mode mode =
19846 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
19848 /* Do not handle double-mode compares that go through the special path. */
19849 if (mode == (TARGET_64BIT ? TImode : DImode))
19850 return false;
19852 if (SCALAR_FLOAT_MODE_P (mode))
19854 rtx compare_op, compare_seq;
19856 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19858 /* Shortcut: the following common codes never translate
19859 into carry flag compares. */
19860 if (code == EQ || code == NE || code == UNEQ || code == LTGT
19861 || code == ORDERED || code == UNORDERED)
19862 return false;
19864 /* These comparisons require zero flag; swap operands so they won't. */
19865 if ((code == GT || code == UNLE || code == LE || code == UNGT)
19866 && !TARGET_IEEE_FP)
19868 rtx tmp = op0;
19869 op0 = op1;
19870 op1 = tmp;
19871 code = swap_condition (code);
19874 /* Try to expand the comparison and verify that we end up with a
19875 carry flag based comparison. This fails only when we decide to
19876 expand the comparison using arithmetic, which is not a common
19877 scenario. */
19878 start_sequence ();
19879 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19880 compare_seq = get_insns ();
19881 end_sequence ();
19883 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
19884 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
19885 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
19886 else
19887 code = GET_CODE (compare_op);
19889 if (code != LTU && code != GEU)
19890 return false;
19892 emit_insn (compare_seq);
19893 *pop = compare_op;
19894 return true;
19897 if (!INTEGRAL_MODE_P (mode))
19898 return false;
19900 switch (code)
19902 case LTU:
19903 case GEU:
19904 break;
19906 /* Convert a==0 into (unsigned)a<1. */
19907 case EQ:
19908 case NE:
19909 if (op1 != const0_rtx)
19910 return false;
19911 op1 = const1_rtx;
19912 code = (code == EQ ? LTU : GEU);
19913 break;
19915 /* Convert a>b into b<a or a>=b-1. */
19916 case GTU:
19917 case LEU:
19918 if (CONST_INT_P (op1))
19920 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
19921 /* Bail out on overflow. We can still swap operands, but that
19922 would force loading the constant into a register. */
19923 if (op1 == const0_rtx
19924 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
19925 return false;
19926 code = (code == GTU ? GEU : LTU);
19928 else
19930 rtx tmp = op1;
19931 op1 = op0;
19932 op0 = tmp;
19933 code = (code == GTU ? LTU : GEU);
19935 break;
19937 /* Convert a>=0 into (unsigned)a<0x80000000. */
19938 case LT:
19939 case GE:
19940 if (mode == DImode || op1 != const0_rtx)
19941 return false;
19942 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19943 code = (code == LT ? GEU : LTU);
19944 break;
19945 case LE:
19946 case GT:
19947 if (mode == DImode || op1 != constm1_rtx)
19948 return false;
19949 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19950 code = (code == LE ? GEU : LTU);
19951 break;
19953 default:
19954 return false;
19956 /* Swapping operands may cause a constant to appear as the first operand. */
19957 if (!nonimmediate_operand (op0, VOIDmode))
19959 if (!can_create_pseudo_p ())
19960 return false;
19961 op0 = force_reg (mode, op0);
19963 *pop = ix86_expand_compare (code, op0, op1);
19964 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19965 return true;
19968 bool
19969 ix86_expand_int_movcc (rtx operands[])
19971 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19972 rtx compare_seq, compare_op;
19973 enum machine_mode mode = GET_MODE (operands[0]);
19974 bool sign_bit_compare_p = false;
19975 rtx op0 = XEXP (operands[1], 0);
19976 rtx op1 = XEXP (operands[1], 1);
19978 if (GET_MODE (op0) == TImode
19979 || (GET_MODE (op0) == DImode
19980 && !TARGET_64BIT))
19981 return false;
19983 start_sequence ();
19984 compare_op = ix86_expand_compare (code, op0, op1);
19985 compare_seq = get_insns ();
19986 end_sequence ();
19988 compare_code = GET_CODE (compare_op);
19990 if ((op1 == const0_rtx && (code == GE || code == LT))
19991 || (op1 == constm1_rtx && (code == GT || code == LE)))
19992 sign_bit_compare_p = true;
19994 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19995 HImode insns, we'd be swallowed in word prefix ops. */
19997 if ((mode != HImode || TARGET_FAST_PREFIX)
19998 && (mode != (TARGET_64BIT ? TImode : DImode))
19999 && CONST_INT_P (operands[2])
20000 && CONST_INT_P (operands[3]))
20002 rtx out = operands[0];
20003 HOST_WIDE_INT ct = INTVAL (operands[2]);
20004 HOST_WIDE_INT cf = INTVAL (operands[3]);
20005 HOST_WIDE_INT diff;
20007 diff = ct - cf;
20008 /* Sign bit compares are better done using shifts than by using
20009 sbb. */
20010 if (sign_bit_compare_p
20011 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20013 /* Detect overlap between destination and compare sources. */
20014 rtx tmp = out;
20016 if (!sign_bit_compare_p)
20018 rtx flags;
20019 bool fpcmp = false;
20021 compare_code = GET_CODE (compare_op);
20023 flags = XEXP (compare_op, 0);
20025 if (GET_MODE (flags) == CCFPmode
20026 || GET_MODE (flags) == CCFPUmode)
20028 fpcmp = true;
20029 compare_code
20030 = ix86_fp_compare_code_to_integer (compare_code);
20033 /* To simplify rest of code, restrict to the GEU case. */
20034 if (compare_code == LTU)
20036 HOST_WIDE_INT tmp = ct;
20037 ct = cf;
20038 cf = tmp;
20039 compare_code = reverse_condition (compare_code);
20040 code = reverse_condition (code);
20042 else
20044 if (fpcmp)
20045 PUT_CODE (compare_op,
20046 reverse_condition_maybe_unordered
20047 (GET_CODE (compare_op)));
20048 else
20049 PUT_CODE (compare_op,
20050 reverse_condition (GET_CODE (compare_op)));
20052 diff = ct - cf;
20054 if (reg_overlap_mentioned_p (out, op0)
20055 || reg_overlap_mentioned_p (out, op1))
20056 tmp = gen_reg_rtx (mode);
20058 if (mode == DImode)
20059 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
20060 else
20061 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
20062 flags, compare_op));
20064 else
20066 if (code == GT || code == GE)
20067 code = reverse_condition (code);
20068 else
20070 HOST_WIDE_INT tmp = ct;
20071 ct = cf;
20072 cf = tmp;
20073 diff = ct - cf;
20075 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
20078 if (diff == 1)
20081 * cmpl op0,op1
20082 * sbbl dest,dest
20083 * [addl dest, ct]
20085 * Size 5 - 8.
20087 if (ct)
20088 tmp = expand_simple_binop (mode, PLUS,
20089 tmp, GEN_INT (ct),
20090 copy_rtx (tmp), 1, OPTAB_DIRECT);
20092 else if (cf == -1)
20095 * cmpl op0,op1
20096 * sbbl dest,dest
20097 * orl $ct, dest
20099 * Size 8.
20101 tmp = expand_simple_binop (mode, IOR,
20102 tmp, GEN_INT (ct),
20103 copy_rtx (tmp), 1, OPTAB_DIRECT);
20105 else if (diff == -1 && ct)
20108 * cmpl op0,op1
20109 * sbbl dest,dest
20110 * notl dest
20111 * [addl dest, cf]
20113 * Size 8 - 11.
20115 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
20116 if (cf)
20117 tmp = expand_simple_binop (mode, PLUS,
20118 copy_rtx (tmp), GEN_INT (cf),
20119 copy_rtx (tmp), 1, OPTAB_DIRECT);
20121 else
20124 * cmpl op0,op1
20125 * sbbl dest,dest
20126 * [notl dest]
20127 * andl cf - ct, dest
20128 * [addl dest, ct]
20130 * Size 8 - 11.
20133 if (cf == 0)
20135 cf = ct;
20136 ct = 0;
20137 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
20140 tmp = expand_simple_binop (mode, AND,
20141 copy_rtx (tmp),
20142 gen_int_mode (cf - ct, mode),
20143 copy_rtx (tmp), 1, OPTAB_DIRECT);
20144 if (ct)
20145 tmp = expand_simple_binop (mode, PLUS,
20146 copy_rtx (tmp), GEN_INT (ct),
20147 copy_rtx (tmp), 1, OPTAB_DIRECT);
20150 if (!rtx_equal_p (tmp, out))
20151 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
20153 return true;
20156 if (diff < 0)
20158 enum machine_mode cmp_mode = GET_MODE (op0);
20160 HOST_WIDE_INT tmp;
20161 tmp = ct, ct = cf, cf = tmp;
20162 diff = -diff;
20164 if (SCALAR_FLOAT_MODE_P (cmp_mode))
20166 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
20168 /* We may be reversing an unordered compare to a normal compare, which
20169 is not valid in general (we may convert a non-trapping condition
20170 to a trapping one); however, on i386 we currently emit all
20171 comparisons unordered. */
20172 compare_code = reverse_condition_maybe_unordered (compare_code);
20173 code = reverse_condition_maybe_unordered (code);
20175 else
20177 compare_code = reverse_condition (compare_code);
20178 code = reverse_condition (code);
20182 compare_code = UNKNOWN;
20183 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
20184 && CONST_INT_P (op1))
20186 if (op1 == const0_rtx
20187 && (code == LT || code == GE))
20188 compare_code = code;
20189 else if (op1 == constm1_rtx)
20191 if (code == LE)
20192 compare_code = LT;
20193 else if (code == GT)
20194 compare_code = GE;
20198 /* Optimize dest = (op0 < 0) ? -1 : cf. */
20199 if (compare_code != UNKNOWN
20200 && GET_MODE (op0) == GET_MODE (out)
20201 && (cf == -1 || ct == -1))
20203 /* If lea code below could be used, only optimize
20204 if it results in a 2 insn sequence. */
20206 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
20207 || diff == 3 || diff == 5 || diff == 9)
20208 || (compare_code == LT && ct == -1)
20209 || (compare_code == GE && cf == -1))
20212 * notl op1 (if necessary)
20213 * sarl $31, op1
20214 * orl cf, op1
20216 if (ct != -1)
20218 cf = ct;
20219 ct = -1;
20220 code = reverse_condition (code);
20223 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
20225 out = expand_simple_binop (mode, IOR,
20226 out, GEN_INT (cf),
20227 out, 1, OPTAB_DIRECT);
20228 if (out != operands[0])
20229 emit_move_insn (operands[0], out);
20231 return true;
20236 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
20237 || diff == 3 || diff == 5 || diff == 9)
20238 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
20239 && (mode != DImode
20240 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
20243 * xorl dest,dest
20244 * cmpl op1,op2
20245 * setcc dest
20246 * lea cf(dest*(ct-cf)),dest
20248 * Size 14.
20250 * This also catches the degenerate setcc-only case.
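/* For instance, dest = cond ? 7 : 3 has diff == 4 and becomes a setcc
   followed by lea 3(dest*4),dest: 3 + 1*4 = 7 when the condition
   holds, 3 + 0*4 = 3 otherwise.  */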
20253 rtx tmp;
20254 int nops;
20256 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
20258 nops = 0;
20259 /* On x86_64 the lea instruction operates on Pmode, so we need
20260 to get the arithmetic done in the proper mode to match. */
20261 if (diff == 1)
20262 tmp = copy_rtx (out);
20263 else
20265 rtx out1;
20266 out1 = copy_rtx (out);
20267 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
20268 nops++;
20269 if (diff & 1)
20271 tmp = gen_rtx_PLUS (mode, tmp, out1);
20272 nops++;
20275 if (cf != 0)
20277 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
20278 nops++;
20280 if (!rtx_equal_p (tmp, out))
20282 if (nops == 1)
20283 out = force_operand (tmp, copy_rtx (out));
20284 else
20285 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
20287 if (!rtx_equal_p (out, operands[0]))
20288 emit_move_insn (operands[0], copy_rtx (out));
20290 return true;
20294 * General case: Jumpful:
20295 * xorl dest,dest cmpl op1, op2
20296 * cmpl op1, op2 movl ct, dest
20297 * setcc dest jcc 1f
20298 * decl dest movl cf, dest
20299 * andl (cf-ct),dest 1:
20300 * addl ct,dest
20302 * Size 20. Size 14.
20304 * This is reasonably steep, but branch mispredict costs are
20305 * high on modern cpus, so consider failing only if optimizing
20306 * for space.
20309 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
20310 && BRANCH_COST (optimize_insn_for_speed_p (),
20311 false) >= 2)
20313 if (cf == 0)
20315 enum machine_mode cmp_mode = GET_MODE (op0);
20317 cf = ct;
20318 ct = 0;
20320 if (SCALAR_FLOAT_MODE_P (cmp_mode))
20322 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
20324 /* We may be reversing an unordered compare to a normal compare,
20325 which is not valid in general (we may convert a non-trapping
20326 condition to a trapping one); however, on i386 we currently
20327 emit all comparisons unordered. */
20328 code = reverse_condition_maybe_unordered (code);
20330 else
20332 code = reverse_condition (code);
20333 if (compare_code != UNKNOWN)
20334 compare_code = reverse_condition (compare_code);
20338 if (compare_code != UNKNOWN)
20340 /* notl op1 (if needed)
20341 sarl $31, op1
20342 andl (cf-ct), op1
20343 addl ct, op1
20345 For x < 0 (resp. x <= -1) there will be no notl,
20346 so if possible swap the constants to get rid of the
20347 complement.
20348 True/false will be -1/0 while code below (store flag
20349 followed by decrement) is 0/-1, so the constants need
20350 to be exchanged once more. */
20352 if (compare_code == GE || !cf)
20354 code = reverse_condition (code);
20355 compare_code = LT;
20357 else
20359 HOST_WIDE_INT tmp = cf;
20360 cf = ct;
20361 ct = tmp;
20364 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
20366 else
20368 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
20370 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
20371 constm1_rtx,
20372 copy_rtx (out), 1, OPTAB_DIRECT);
20375 out = expand_simple_binop (mode, AND, copy_rtx (out),
20376 gen_int_mode (cf - ct, mode),
20377 copy_rtx (out), 1, OPTAB_DIRECT);
20378 if (ct)
20379 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
20380 copy_rtx (out), 1, OPTAB_DIRECT);
20381 if (!rtx_equal_p (out, operands[0]))
20382 emit_move_insn (operands[0], copy_rtx (out));
20384 return true;
20388 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
20390 /* Try a few more things with specific constants and a variable. */
20392 optab op;
20393 rtx var, orig_out, out, tmp;
20395 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
20396 return false;
20398 /* If one of the two operands is an interesting constant, load a
20399 constant with the above and mask it in with a logical operation. */
20401 if (CONST_INT_P (operands[2]))
20403 var = operands[3];
20404 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
20405 operands[3] = constm1_rtx, op = and_optab;
20406 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
20407 operands[3] = const0_rtx, op = ior_optab;
20408 else
20409 return false;
20411 else if (CONST_INT_P (operands[3]))
20413 var = operands[2];
20414 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
20415 operands[2] = constm1_rtx, op = and_optab;
20416 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
20417 operands[2] = const0_rtx, op = ior_optab;
20418 else
20419 return false;
20421 else
20422 return false;
20424 orig_out = operands[0];
20425 tmp = gen_reg_rtx (mode);
20426 operands[0] = tmp;
20428 /* Recurse to get the constant loaded. */
20429 if (ix86_expand_int_movcc (operands) == 0)
20430 return false;
20432 /* Mask in the interesting variable. */
20433 out = expand_binop (mode, op, var, tmp, orig_out, 0,
20434 OPTAB_WIDEN);
20435 if (!rtx_equal_p (out, orig_out))
20436 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
20438 return true;
20442 * For comparison with above,
20444 * movl cf,dest
20445 * movl ct,tmp
20446 * cmpl op1,op2
20447 * cmovcc tmp,dest
20449 * Size 15.
20452 if (! nonimmediate_operand (operands[2], mode))
20453 operands[2] = force_reg (mode, operands[2]);
20454 if (! nonimmediate_operand (operands[3], mode))
20455 operands[3] = force_reg (mode, operands[3]);
20457 if (! register_operand (operands[2], VOIDmode)
20458 && (mode == QImode
20459 || ! register_operand (operands[3], VOIDmode)))
20460 operands[2] = force_reg (mode, operands[2]);
20462 if (mode == QImode
20463 && ! register_operand (operands[3], VOIDmode))
20464 operands[3] = force_reg (mode, operands[3]);
20466 emit_insn (compare_seq);
20467 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20468 gen_rtx_IF_THEN_ELSE (mode,
20469 compare_op, operands[2],
20470 operands[3])));
20471 return true;
20474 /* Swap, force into registers, or otherwise massage the two operands
20475 to an sse comparison with a mask result. Thus we differ a bit from
20476 ix86_prepare_fp_compare_args which expects to produce a flags result.
20478 The DEST operand exists to help determine whether to commute commutative
20479 operators. The POP0/POP1 operands are updated in place. The new
20480 comparison code is returned, or UNKNOWN if not implementable. */
20482 static enum rtx_code
20483 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
20484 rtx *pop0, rtx *pop1)
20486 rtx tmp;
20488 switch (code)
20490 case LTGT:
20491 case UNEQ:
20492 /* AVX supports all the needed comparisons. */
20493 if (TARGET_AVX)
20494 break;
20495 /* We have no LTGT as an operator. We could implement it with
20496 NE & ORDERED, but this requires an extra temporary. It's
20497 not clear that it's worth it. */
20498 return UNKNOWN;
20500 case LT:
20501 case LE:
20502 case UNGT:
20503 case UNGE:
20504 /* These are supported directly. */
20505 break;
20507 case EQ:
20508 case NE:
20509 case UNORDERED:
20510 case ORDERED:
20511 /* AVX has 3 operand comparisons, no need to swap anything. */
20512 if (TARGET_AVX)
20513 break;
20514 /* For commutative operators, try to canonicalize the destination
20515 operand to be first in the comparison - this helps reload to
20516 avoid extra moves. */
20517 if (!dest || !rtx_equal_p (dest, *pop1))
20518 break;
20519 /* FALLTHRU */
20521 case GE:
20522 case GT:
20523 case UNLE:
20524 case UNLT:
20525 /* These are not supported directly before AVX, and furthermore
20526 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
20527 comparison operands to transform into something that is
20528 supported. */
20529 tmp = *pop0;
20530 *pop0 = *pop1;
20531 *pop1 = tmp;
20532 code = swap_condition (code);
20533 break;
20535 default:
20536 gcc_unreachable ();
20539 return code;
20542 /* Detect conditional moves that exactly match min/max operational
20543 semantics. Note that this is IEEE safe, as long as we don't
20544 interchange the operands.
20546 Returns FALSE if this conditional move doesn't match a MIN/MAX,
20547 and TRUE if the operation is successful and instructions are emitted. */
20549 static bool
20550 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
20551 rtx cmp_op1, rtx if_true, rtx if_false)
20553 enum machine_mode mode;
20554 bool is_min;
20555 rtx tmp;
20557 if (code == LT)
20559 else if (code == UNGE)
20561 tmp = if_true;
20562 if_true = if_false;
20563 if_false = tmp;
20565 else
20566 return false;
20568 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
20569 is_min = true;
20570 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
20571 is_min = false;
20572 else
20573 return false;
20575 mode = GET_MODE (dest);
20577 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
20578 but MODE may be a vector mode and thus not appropriate. */
20579 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
20581 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
20582 rtvec v;
20584 if_true = force_reg (mode, if_true);
20585 v = gen_rtvec (2, if_true, if_false);
20586 tmp = gen_rtx_UNSPEC (mode, v, u);
20588 else
20590 code = is_min ? SMIN : SMAX;
20591 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
20594 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
20595 return true;
20598 /* Expand an sse vector comparison. Return the register with the result. */
20600 static rtx
20601 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
20602 rtx op_true, rtx op_false)
20604 enum machine_mode mode = GET_MODE (dest);
20605 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
20606 rtx x;
20608 cmp_op0 = force_reg (cmp_mode, cmp_op0);
20609 if (!nonimmediate_operand (cmp_op1, cmp_mode))
20610 cmp_op1 = force_reg (cmp_mode, cmp_op1);
20612 if (optimize
20613 || reg_overlap_mentioned_p (dest, op_true)
20614 || reg_overlap_mentioned_p (dest, op_false))
20615 dest = gen_reg_rtx (mode);
20617 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
20618 if (cmp_mode != mode)
20620 x = force_reg (cmp_mode, x);
20621 convert_move (dest, x, false);
20623 else
20624 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20626 return dest;
20629 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
20630 operations. This is used for both scalar and vector conditional moves. */
20632 static void
20633 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
20635 enum machine_mode mode = GET_MODE (dest);
20636 rtx t2, t3, x;
20638 if (vector_all_ones_operand (op_true, mode)
20639 && rtx_equal_p (op_false, CONST0_RTX (mode)))
20641 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
20643 else if (op_false == CONST0_RTX (mode))
20645 op_true = force_reg (mode, op_true);
20646 x = gen_rtx_AND (mode, cmp, op_true);
20647 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20649 else if (op_true == CONST0_RTX (mode))
20651 op_false = force_reg (mode, op_false);
20652 x = gen_rtx_NOT (mode, cmp);
20653 x = gen_rtx_AND (mode, x, op_false);
20654 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20656 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
20658 op_false = force_reg (mode, op_false);
20659 x = gen_rtx_IOR (mode, cmp, op_false);
20660 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20662 else if (TARGET_XOP)
20664 op_true = force_reg (mode, op_true);
20666 if (!nonimmediate_operand (op_false, mode))
20667 op_false = force_reg (mode, op_false);
20669 emit_insn (gen_rtx_SET (mode, dest,
20670 gen_rtx_IF_THEN_ELSE (mode, cmp,
20671 op_true,
20672 op_false)));
20674 else
20676 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
20677 rtx d = dest;
20679 if (!nonimmediate_operand (op_true, mode))
20680 op_true = force_reg (mode, op_true);
20682 op_false = force_reg (mode, op_false);
20684 switch (mode)
20686 case V4SFmode:
20687 if (TARGET_SSE4_1)
20688 gen = gen_sse4_1_blendvps;
20689 break;
20690 case V2DFmode:
20691 if (TARGET_SSE4_1)
20692 gen = gen_sse4_1_blendvpd;
20693 break;
20694 case V16QImode:
20695 case V8HImode:
20696 case V4SImode:
20697 case V2DImode:
20698 if (TARGET_SSE4_1)
20700 gen = gen_sse4_1_pblendvb;
20701 if (mode != V16QImode)
20702 d = gen_reg_rtx (V16QImode);
20703 op_false = gen_lowpart (V16QImode, op_false);
20704 op_true = gen_lowpart (V16QImode, op_true);
20705 cmp = gen_lowpart (V16QImode, cmp);
20707 break;
20708 case V8SFmode:
20709 if (TARGET_AVX)
20710 gen = gen_avx_blendvps256;
20711 break;
20712 case V4DFmode:
20713 if (TARGET_AVX)
20714 gen = gen_avx_blendvpd256;
20715 break;
20716 case V32QImode:
20717 case V16HImode:
20718 case V8SImode:
20719 case V4DImode:
20720 if (TARGET_AVX2)
20722 gen = gen_avx2_pblendvb;
20723 if (mode != V32QImode)
20724 d = gen_reg_rtx (V32QImode);
20725 op_false = gen_lowpart (V32QImode, op_false);
20726 op_true = gen_lowpart (V32QImode, op_true);
20727 cmp = gen_lowpart (V32QImode, cmp);
20729 break;
20730 default:
20731 break;
20734 if (gen != NULL)
20736 emit_insn (gen (d, op_false, op_true, cmp));
20737 if (d != dest)
20738 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
20740 else
20742 op_true = force_reg (mode, op_true);
20744 t2 = gen_reg_rtx (mode);
20745 if (optimize)
20746 t3 = gen_reg_rtx (mode);
20747 else
20748 t3 = dest;
20750 x = gen_rtx_AND (mode, op_true, cmp);
20751 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
20753 x = gen_rtx_NOT (mode, cmp);
20754 x = gen_rtx_AND (mode, x, op_false);
20755 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
20757 x = gen_rtx_IOR (mode, t3, t2);
20758 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20763 /* Expand a floating-point conditional move. Return true if successful. */
20765 bool
20766 ix86_expand_fp_movcc (rtx operands[])
20768 enum machine_mode mode = GET_MODE (operands[0]);
20769 enum rtx_code code = GET_CODE (operands[1]);
20770 rtx tmp, compare_op;
20771 rtx op0 = XEXP (operands[1], 0);
20772 rtx op1 = XEXP (operands[1], 1);
20774 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
20776 enum machine_mode cmode;
20778 /* Since we have no cmove for sse registers, don't force bad register
20779 allocation just to gain access to it. Deny movcc when the
20780 comparison mode doesn't match the move mode. */
20781 cmode = GET_MODE (op0);
20782 if (cmode == VOIDmode)
20783 cmode = GET_MODE (op1);
20784 if (cmode != mode)
20785 return false;
20787 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
20788 if (code == UNKNOWN)
20789 return false;
20791 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
20792 operands[2], operands[3]))
20793 return true;
20795 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
20796 operands[2], operands[3]);
20797 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
20798 return true;
20801 if (GET_MODE (op0) == TImode
20802 || (GET_MODE (op0) == DImode
20803 && !TARGET_64BIT))
20804 return false;
20806 /* The floating point conditional move instructions don't directly
20807 support conditions resulting from a signed integer comparison. */
20809 compare_op = ix86_expand_compare (code, op0, op1);
20810 if (!fcmov_comparison_operator (compare_op, VOIDmode))
20812 tmp = gen_reg_rtx (QImode);
20813 ix86_expand_setcc (tmp, code, op0, op1);
20815 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
20818 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20819 gen_rtx_IF_THEN_ELSE (mode, compare_op,
20820 operands[2], operands[3])));
20822 return true;
20825 /* Expand a floating-point vector conditional move; a vcond operation
20826 rather than a movcc operation. */
20828 bool
20829 ix86_expand_fp_vcond (rtx operands[])
20831 enum rtx_code code = GET_CODE (operands[3]);
20832 rtx cmp;
20834 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
20835 &operands[4], &operands[5]);
20836 if (code == UNKNOWN)
20838 rtx temp;
20839 switch (GET_CODE (operands[3]))
20841 case LTGT:
20842 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
20843 operands[5], operands[0], operands[0]);
20844 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
20845 operands[5], operands[1], operands[2]);
20846 code = AND;
20847 break;
20848 case UNEQ:
20849 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
20850 operands[5], operands[0], operands[0]);
20851 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
20852 operands[5], operands[1], operands[2]);
20853 code = IOR;
20854 break;
20855 default:
20856 gcc_unreachable ();
20858 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
20859 OPTAB_DIRECT);
20860 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20861 return true;
20864 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
20865 operands[5], operands[1], operands[2]))
20866 return true;
20868 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
20869 operands[1], operands[2]);
20870 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20871 return true;
20874 /* Expand a signed/unsigned integral vector conditional move. */
20876 bool
20877 ix86_expand_int_vcond (rtx operands[])
20879 enum machine_mode data_mode = GET_MODE (operands[0]);
20880 enum machine_mode mode = GET_MODE (operands[4]);
20881 enum rtx_code code = GET_CODE (operands[3]);
20882 bool negate = false;
20883 rtx x, cop0, cop1;
20885 cop0 = operands[4];
20886 cop1 = operands[5];
20888 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
20889 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
20890 if ((code == LT || code == GE)
20891 && data_mode == mode
20892 && cop1 == CONST0_RTX (mode)
20893 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
20894 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
20895 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
20896 && (GET_MODE_SIZE (data_mode) == 16
20897 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
20899 rtx negop = operands[2 - (code == LT)];
20900 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
20901 if (negop == CONST1_RTX (data_mode))
20903 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
20904 operands[0], 1, OPTAB_DIRECT);
20905 if (res != operands[0])
20906 emit_move_insn (operands[0], res);
20907 return true;
20909 else if (GET_MODE_INNER (data_mode) != DImode
20910 && vector_all_ones_operand (negop, data_mode))
20912 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
20913 operands[0], 0, OPTAB_DIRECT);
20914 if (res != operands[0])
20915 emit_move_insn (operands[0], res);
20916 return true;
20920 if (!nonimmediate_operand (cop1, mode))
20921 cop1 = force_reg (mode, cop1);
20922 if (!general_operand (operands[1], data_mode))
20923 operands[1] = force_reg (data_mode, operands[1]);
20924 if (!general_operand (operands[2], data_mode))
20925 operands[2] = force_reg (data_mode, operands[2]);
20927 /* XOP supports all of the comparisons on all 128-bit vector int types. */
20928 if (TARGET_XOP
20929 && (mode == V16QImode || mode == V8HImode
20930 || mode == V4SImode || mode == V2DImode))
20931 ;
20932 else
20934 /* Canonicalize the comparison to EQ, GT, GTU. */
20935 switch (code)
20937 case EQ:
20938 case GT:
20939 case GTU:
20940 break;
20942 case NE:
20943 case LE:
20944 case LEU:
20945 code = reverse_condition (code);
20946 negate = true;
20947 break;
20949 case GE:
20950 case GEU:
20951 code = reverse_condition (code);
20952 negate = true;
20953 /* FALLTHRU */
20955 case LT:
20956 case LTU:
20957 code = swap_condition (code);
20958 x = cop0, cop0 = cop1, cop1 = x;
20959 break;
20961 default:
20962 gcc_unreachable ();
20965 /* Only SSE4.1/SSE4.2 supports V2DImode. */
20966 if (mode == V2DImode)
20968 switch (code)
20970 case EQ:
20971 /* SSE4.1 supports EQ. */
20972 if (!TARGET_SSE4_1)
20973 return false;
20974 break;
20976 case GT:
20977 case GTU:
20978 /* SSE4.2 supports GT/GTU. */
20979 if (!TARGET_SSE4_2)
20980 return false;
20981 break;
20983 default:
20984 gcc_unreachable ();
20988 /* Unsigned parallel compare is not supported by the hardware.
20989 Play some tricks to turn this into a signed comparison
20990 against 0. */
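/* Illustrative note (not from the original source): for 32/64-bit elements
   the code below biases both operands by the sign-bit mask (e.g. 0x80000000
   for 32-bit elements), since
       a >u b   <==>   (a - 0x80000000) >s (b - 0x80000000)
   (subtracting the sign bit just flips the top bit), which lets the signed
   pcmpgt family implement the unsigned compare.  */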
20991 if (code == GTU)
20993 cop0 = force_reg (mode, cop0);
20995 switch (mode)
20997 case V8SImode:
20998 case V4DImode:
20999 case V4SImode:
21000 case V2DImode:
21002 rtx t1, t2, mask;
21003 rtx (*gen_sub3) (rtx, rtx, rtx);
21005 switch (mode)
21007 case V8SImode: gen_sub3 = gen_subv8si3; break;
21008 case V4DImode: gen_sub3 = gen_subv4di3; break;
21009 case V4SImode: gen_sub3 = gen_subv4si3; break;
21010 case V2DImode: gen_sub3 = gen_subv2di3; break;
21011 default:
21012 gcc_unreachable ();
21014 /* Subtract (-(INT MAX) - 1) from both operands to make
21015 them signed. */
21016 mask = ix86_build_signbit_mask (mode, true, false);
21017 t1 = gen_reg_rtx (mode);
21018 emit_insn (gen_sub3 (t1, cop0, mask));
21020 t2 = gen_reg_rtx (mode);
21021 emit_insn (gen_sub3 (t2, cop1, mask));
21023 cop0 = t1;
21024 cop1 = t2;
21025 code = GT;
21027 break;
21029 case V32QImode:
21030 case V16HImode:
21031 case V16QImode:
21032 case V8HImode:
21033 /* Perform a parallel unsigned saturating subtraction. */
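/* Illustrative note (not from the original source): with unsigned
   saturating subtraction,
       a >u b   <==>   (a -us b) != 0,
   so the code below rewrites the comparison as (a -us b) == 0 against a
   zero vector and flips NEGATE to obtain the desired result.  */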
21034 x = gen_reg_rtx (mode);
21035 emit_insn (gen_rtx_SET (VOIDmode, x,
21036 gen_rtx_US_MINUS (mode, cop0, cop1)));
21038 cop0 = x;
21039 cop1 = CONST0_RTX (mode);
21040 code = EQ;
21041 negate = !negate;
21042 break;
21044 default:
21045 gcc_unreachable ();
21050 /* Allow the comparison to be done in one mode, but the movcc to
21051 happen in another mode. */
21052 if (data_mode == mode)
21054 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
21055 operands[1+negate], operands[2-negate]);
21057 else
21059 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
21060 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
21061 operands[1+negate], operands[2-negate]);
21062 x = gen_lowpart (data_mode, x);
21065 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
21066 operands[2-negate]);
21067 return true;
21070 /* Expand a variable vector permutation. */
21072 void
21073 ix86_expand_vec_perm (rtx operands[])
21075 rtx target = operands[0];
21076 rtx op0 = operands[1];
21077 rtx op1 = operands[2];
21078 rtx mask = operands[3];
21079 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
21080 enum machine_mode mode = GET_MODE (op0);
21081 enum machine_mode maskmode = GET_MODE (mask);
21082 int w, e, i;
21083 bool one_operand_shuffle = rtx_equal_p (op0, op1);
21085 /* Number of elements in the vector. */
21086 w = GET_MODE_NUNITS (mode);
21087 e = GET_MODE_UNIT_SIZE (mode);
21088 gcc_assert (w <= 32);
21090 if (TARGET_AVX2)
21092 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
21094 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
21095 a constant shuffle operand. With a tiny bit of effort we can
21096 use VPERMD instead. A re-interpretation stall for V4DFmode is
21097 unfortunate but there's no avoiding it.
21098 Similarly for V16HImode we don't have instructions for variable
21099 shuffling, while for V32QImode we can, after preparing suitable
21100 masks, use vpshufb; vpshufb; vpermq; vpor. */
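/* Illustrative note (not from the original source): the steps below turn a
   V4DImode index vector into the V8SImode (dword) indices VPERMD expects,
   e.g. mask = { 3, 0, 2, 1 } is replicated to { 3,3, 0,0, 2,2, 1,1 },
   doubled to { 6,6, 0,0, 4,4, 2,2 }, and then 1 is added to the odd
   positions, giving { 6,7, 0,1, 4,5, 2,3 }, i.e. the pair of dwords that
   makes up each requested qword.  */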
21102 if (mode == V16HImode)
21104 maskmode = mode = V32QImode;
21105 w = 32;
21106 e = 1;
21108 else
21110 maskmode = mode = V8SImode;
21111 w = 8;
21112 e = 4;
21114 t1 = gen_reg_rtx (maskmode);
21116 /* Replicate the low bits of the V4DImode mask into V8SImode:
21117 mask = { A B C D }
21118 t1 = { A A B B C C D D }. */
21119 for (i = 0; i < w / 2; ++i)
21120 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
21121 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21122 vt = force_reg (maskmode, vt);
21123 mask = gen_lowpart (maskmode, mask);
21124 if (maskmode == V8SImode)
21125 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
21126 else
21127 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
21129 /* Multiply the shuffle indices by two. */
21130 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
21131 OPTAB_DIRECT);
21133 /* Add one to the odd shuffle indices:
21134 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
21135 for (i = 0; i < w / 2; ++i)
21137 vec[i * 2] = const0_rtx;
21138 vec[i * 2 + 1] = const1_rtx;
21140 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21141 vt = validize_mem (force_const_mem (maskmode, vt));
21142 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
21143 OPTAB_DIRECT);
21145 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
21146 operands[3] = mask = t1;
21147 target = gen_reg_rtx (mode);
21148 op0 = gen_lowpart (mode, op0);
21149 op1 = gen_lowpart (mode, op1);
21152 switch (mode)
21154 case V8SImode:
21155 /* The VPERMD and VPERMPS instructions already properly ignore
21156 the high bits of the shuffle elements. No need for us to
21157 perform an AND ourselves. */
21158 if (one_operand_shuffle)
21160 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
21161 if (target != operands[0])
21162 emit_move_insn (operands[0],
21163 gen_lowpart (GET_MODE (operands[0]), target));
21165 else
21167 t1 = gen_reg_rtx (V8SImode);
21168 t2 = gen_reg_rtx (V8SImode);
21169 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
21170 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
21171 goto merge_two;
21173 return;
21175 case V8SFmode:
21176 mask = gen_lowpart (V8SFmode, mask);
21177 if (one_operand_shuffle)
21178 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
21179 else
21181 t1 = gen_reg_rtx (V8SFmode);
21182 t2 = gen_reg_rtx (V8SFmode);
21183 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
21184 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
21185 goto merge_two;
21187 return;
21189 case V4SImode:
21190 /* By combining the two 128-bit input vectors into one 256-bit
21191 input vector, we can use VPERMD and VPERMPS for the full
21192 two-operand shuffle. */
21193 t1 = gen_reg_rtx (V8SImode);
21194 t2 = gen_reg_rtx (V8SImode);
21195 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
21196 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
21197 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
21198 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
21199 return;
21201 case V4SFmode:
21202 t1 = gen_reg_rtx (V8SFmode);
21203 t2 = gen_reg_rtx (V8SImode);
21204 mask = gen_lowpart (V4SImode, mask);
21205 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
21206 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
21207 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
21208 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
21209 return;
21211 case V32QImode:
21212 t1 = gen_reg_rtx (V32QImode);
21213 t2 = gen_reg_rtx (V32QImode);
21214 t3 = gen_reg_rtx (V32QImode);
21215 vt2 = GEN_INT (128);
21216 for (i = 0; i < 32; i++)
21217 vec[i] = vt2;
21218 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
21219 vt = force_reg (V32QImode, vt);
21220 for (i = 0; i < 32; i++)
21221 vec[i] = i < 16 ? vt2 : const0_rtx;
21222 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
21223 vt2 = force_reg (V32QImode, vt2);
21224 /* From mask create two adjusted masks, which contain the same
21225 bits as mask in the low 7 bits of each vector element.
21226 The first mask will have the most significant bit clear
21227 if it requests an element from the same 128-bit lane
21228 and MSB set if it requests an element from the other 128-bit lane.
21229 The second mask will have the opposite values of the MSB,
21230 and additionally will have its 128-bit lanes swapped.
21231 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
21232 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
21233 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
21234 stands for the other 12 bytes. */
21235 /* The bit that says whether an element is from the same lane or the other
21236 lane is bit 4, so shift it up by 3 to the MSB position. */
21237 t5 = gen_reg_rtx (V4DImode);
21238 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
21239 GEN_INT (3)));
21240 /* Clear MSB bits from the mask just in case it had them set. */
21241 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
21242 /* After this t1 will have MSB set for elements from other lane. */
21243 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
21244 /* Clear bits other than MSB. */
21245 emit_insn (gen_andv32qi3 (t1, t1, vt));
21246 /* Or in the lower bits from mask into t3. */
21247 emit_insn (gen_iorv32qi3 (t3, t1, t2));
21248 /* And invert MSB bits in t1, so MSB is set for elements from the same
21249 lane. */
21250 emit_insn (gen_xorv32qi3 (t1, t1, vt));
21251 /* Swap 128-bit lanes in t3. */
21252 t6 = gen_reg_rtx (V4DImode);
21253 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
21254 const2_rtx, GEN_INT (3),
21255 const0_rtx, const1_rtx));
21256 /* And or in the lower bits from mask into t1. */
21257 emit_insn (gen_iorv32qi3 (t1, t1, t2));
21258 if (one_operand_shuffle)
21260 /* Each of these shuffles will put 0s in places where an
21261 element from the other 128-bit lane is needed; otherwise it
21262 will shuffle in the requested value. */
21263 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
21264 gen_lowpart (V32QImode, t6)));
21265 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
21266 /* For t3 the 128-bit lanes are swapped again. */
21267 t7 = gen_reg_rtx (V4DImode);
21268 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
21269 const2_rtx, GEN_INT (3),
21270 const0_rtx, const1_rtx));
21271 /* And oring both together leads to the result. */
21272 emit_insn (gen_iorv32qi3 (target, t1,
21273 gen_lowpart (V32QImode, t7)));
21274 if (target != operands[0])
21275 emit_move_insn (operands[0],
21276 gen_lowpart (GET_MODE (operands[0]), target));
21277 return;
21280 t4 = gen_reg_rtx (V32QImode);
21281 /* Similar to the one_operand_shuffle code above, just
21282 repeated twice for each operand. The merge_two: code
21283 below will merge the two results together. */
21284 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
21285 gen_lowpart (V32QImode, t6)));
21286 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
21287 gen_lowpart (V32QImode, t6)));
21288 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
21289 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
21290 t7 = gen_reg_rtx (V4DImode);
21291 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
21292 const2_rtx, GEN_INT (3),
21293 const0_rtx, const1_rtx));
21294 t8 = gen_reg_rtx (V4DImode);
21295 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
21296 const2_rtx, GEN_INT (3),
21297 const0_rtx, const1_rtx));
21298 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
21299 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
21300 t1 = t4;
21301 t2 = t3;
21302 goto merge_two;
21304 default:
21305 gcc_assert (GET_MODE_SIZE (mode) <= 16);
21306 break;
21310 if (TARGET_XOP)
21312 /* The XOP VPPERM insn supports three inputs. By ignoring the
21313 one_operand_shuffle special case, we avoid creating another
21314 set of constant vectors in memory. */
21315 one_operand_shuffle = false;
21317 /* mask = mask & {2*w-1, ...} */
21318 vt = GEN_INT (2*w - 1);
21320 else
21322 /* mask = mask & {w-1, ...} */
21323 vt = GEN_INT (w - 1);
21326 for (i = 0; i < w; i++)
21327 vec[i] = vt;
21328 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21329 mask = expand_simple_binop (maskmode, AND, mask, vt,
21330 NULL_RTX, 0, OPTAB_DIRECT);
21332 /* For non-QImode operations, convert the word permutation control
21333 into a byte permutation control. */
21334 if (mode != V16QImode)
21336 mask = expand_simple_binop (maskmode, ASHIFT, mask,
21337 GEN_INT (exact_log2 (e)),
21338 NULL_RTX, 0, OPTAB_DIRECT);
21340 /* Convert mask to vector of chars. */
21341 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
21343 /* Replicate each of the input bytes into byte positions:
21344 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
21345 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
21346 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
21347 for (i = 0; i < 16; ++i)
21348 vec[i] = GEN_INT (i/e * e);
21349 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
21350 vt = validize_mem (force_const_mem (V16QImode, vt));
21351 if (TARGET_XOP)
21352 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
21353 else
21354 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
21356 /* Convert it into the byte positions by doing
21357 mask = mask + {0,1,..,(16/w)-1, 0,1,..,(16/w)-1, ...}. */
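/* Illustrative note (not from the original source): e.g. for V4SImode
   (element size 4) a word index of 2 was shifted to byte offset 8 above,
   gets replicated by pshufb to { 8,8,8,8 }, and the addition below turns
   it into the byte indices { 8,9,10,11 }.  */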
21358 for (i = 0; i < 16; ++i)
21359 vec[i] = GEN_INT (i % e);
21360 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
21361 vt = validize_mem (force_const_mem (V16QImode, vt));
21362 emit_insn (gen_addv16qi3 (mask, mask, vt));
21365 /* The actual shuffle operations all operate on V16QImode. */
21366 op0 = gen_lowpart (V16QImode, op0);
21367 op1 = gen_lowpart (V16QImode, op1);
21369 if (TARGET_XOP)
21371 if (GET_MODE (target) != V16QImode)
21372 target = gen_reg_rtx (V16QImode);
21373 emit_insn (gen_xop_pperm (target, op0, op1, mask));
21374 if (target != operands[0])
21375 emit_move_insn (operands[0],
21376 gen_lowpart (GET_MODE (operands[0]), target));
21378 else if (one_operand_shuffle)
21380 if (GET_MODE (target) != V16QImode)
21381 target = gen_reg_rtx (V16QImode);
21382 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
21383 if (target != operands[0])
21384 emit_move_insn (operands[0],
21385 gen_lowpart (GET_MODE (operands[0]), target));
21387 else
21389 rtx xops[6];
21390 bool ok;
21392 /* Shuffle the two input vectors independently. */
21393 t1 = gen_reg_rtx (V16QImode);
21394 t2 = gen_reg_rtx (V16QImode);
21395 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
21396 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
21398 merge_two:
21399 /* Then merge them together. The key is whether any given control
21400 element contained a bit set that indicates the second word. */
21401 mask = operands[3];
21402 vt = GEN_INT (w);
21403 if (maskmode == V2DImode && !TARGET_SSE4_1)
21405 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
21406 more shuffle to convert the V2DI input mask into a V4SI
21407 input mask. At that point the masking that expand_int_vcond
21408 performs will work as desired. */
21409 rtx t3 = gen_reg_rtx (V4SImode);
21410 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
21411 const0_rtx, const0_rtx,
21412 const2_rtx, const2_rtx));
21413 mask = t3;
21414 maskmode = V4SImode;
21415 e = w = 4;
21418 for (i = 0; i < w; i++)
21419 vec[i] = vt;
21420 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21421 vt = force_reg (maskmode, vt);
21422 mask = expand_simple_binop (maskmode, AND, mask, vt,
21423 NULL_RTX, 0, OPTAB_DIRECT);
21425 if (GET_MODE (target) != mode)
21426 target = gen_reg_rtx (mode);
21427 xops[0] = target;
21428 xops[1] = gen_lowpart (mode, t2);
21429 xops[2] = gen_lowpart (mode, t1);
21430 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
21431 xops[4] = mask;
21432 xops[5] = vt;
21433 ok = ix86_expand_int_vcond (xops);
21434 gcc_assert (ok);
21435 if (target != operands[0])
21436 emit_move_insn (operands[0],
21437 gen_lowpart (GET_MODE (operands[0]), target));
21441 /* Unpack SRC into DEST as the next wider integer vector type. UNSIGNED_P is
21442 true if we should do zero extension, else sign extension. HIGH_P is
21443 true if we want the N/2 high elements, else the low elements. */
21445 void
21446 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
21448 enum machine_mode imode = GET_MODE (src);
21449 rtx tmp;
21451 if (TARGET_SSE4_1)
21453 rtx (*unpack)(rtx, rtx);
21454 rtx (*extract)(rtx, rtx) = NULL;
21455 enum machine_mode halfmode = BLKmode;
21457 switch (imode)
21459 case V32QImode:
21460 if (unsigned_p)
21461 unpack = gen_avx2_zero_extendv16qiv16hi2;
21462 else
21463 unpack = gen_avx2_sign_extendv16qiv16hi2;
21464 halfmode = V16QImode;
21465 extract
21466 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
21467 break;
21468 case V16HImode:
21469 if (unsigned_p)
21470 unpack = gen_avx2_zero_extendv8hiv8si2;
21471 else
21472 unpack = gen_avx2_sign_extendv8hiv8si2;
21473 halfmode = V8HImode;
21474 extract
21475 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
21476 break;
21477 case V8SImode:
21478 if (unsigned_p)
21479 unpack = gen_avx2_zero_extendv4siv4di2;
21480 else
21481 unpack = gen_avx2_sign_extendv4siv4di2;
21482 halfmode = V4SImode;
21483 extract
21484 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
21485 break;
21486 case V16QImode:
21487 if (unsigned_p)
21488 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
21489 else
21490 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
21491 break;
21492 case V8HImode:
21493 if (unsigned_p)
21494 unpack = gen_sse4_1_zero_extendv4hiv4si2;
21495 else
21496 unpack = gen_sse4_1_sign_extendv4hiv4si2;
21497 break;
21498 case V4SImode:
21499 if (unsigned_p)
21500 unpack = gen_sse4_1_zero_extendv2siv2di2;
21501 else
21502 unpack = gen_sse4_1_sign_extendv2siv2di2;
21503 break;
21504 default:
21505 gcc_unreachable ();
21508 if (GET_MODE_SIZE (imode) == 32)
21510 tmp = gen_reg_rtx (halfmode);
21511 emit_insn (extract (tmp, src));
21513 else if (high_p)
21515 /* Shift higher 8 bytes to lower 8 bytes. */
21516 tmp = gen_reg_rtx (V1TImode);
21517 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
21518 GEN_INT (64)));
21519 tmp = gen_lowpart (imode, tmp);
21521 else
21522 tmp = src;
21524 emit_insn (unpack (dest, tmp));
21526 else
21528 rtx (*unpack)(rtx, rtx, rtx);
21530 switch (imode)
21532 case V16QImode:
21533 if (high_p)
21534 unpack = gen_vec_interleave_highv16qi;
21535 else
21536 unpack = gen_vec_interleave_lowv16qi;
21537 break;
21538 case V8HImode:
21539 if (high_p)
21540 unpack = gen_vec_interleave_highv8hi;
21541 else
21542 unpack = gen_vec_interleave_lowv8hi;
21543 break;
21544 case V4SImode:
21545 if (high_p)
21546 unpack = gen_vec_interleave_highv4si;
21547 else
21548 unpack = gen_vec_interleave_lowv4si;
21549 break;
21550 default:
21551 gcc_unreachable ();
21554 if (unsigned_p)
21555 tmp = force_reg (imode, CONST0_RTX (imode));
21556 else
21557 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
21558 src, pc_rtx, pc_rtx);
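/* Illustrative note (not from the original source): TMP now holds, per
   element, either zero (for zero extension) or a copy of the source
   element's sign (0 or -1, produced by the GT compare above).  Interleaving
   SRC with TMP therefore widens each element with the proper extension
   bits.  */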
21560 rtx tmp2 = gen_reg_rtx (imode);
21561 emit_insn (unpack (tmp2, src, tmp));
21562 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
21566 /* Expand conditional increment or decrement using adc/sbb instructions.
21567 The default case using setcc followed by the conditional move can be
21568 done by generic code. */
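/* Illustrative note (not from the original source): the general idea is to
   fold the comparison into the carry flag, e.g. "x = x + (a < b)" for
   unsigned a, b can become
       cmp  a, b          -- sets CF when a < b (unsigned)
       adc  x, 0          -- adds the carry into x
   while the decrement form uses sbb instead.  */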
21569 bool
21570 ix86_expand_int_addcc (rtx operands[])
21572 enum rtx_code code = GET_CODE (operands[1]);
21573 rtx flags;
21574 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
21575 rtx compare_op;
21576 rtx val = const0_rtx;
21577 bool fpcmp = false;
21578 enum machine_mode mode;
21579 rtx op0 = XEXP (operands[1], 0);
21580 rtx op1 = XEXP (operands[1], 1);
21582 if (operands[3] != const1_rtx
21583 && operands[3] != constm1_rtx)
21584 return false;
21585 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
21586 return false;
21587 code = GET_CODE (compare_op);
21589 flags = XEXP (compare_op, 0);
21591 if (GET_MODE (flags) == CCFPmode
21592 || GET_MODE (flags) == CCFPUmode)
21594 fpcmp = true;
21595 code = ix86_fp_compare_code_to_integer (code);
21598 if (code != LTU)
21600 val = constm1_rtx;
21601 if (fpcmp)
21602 PUT_CODE (compare_op,
21603 reverse_condition_maybe_unordered
21604 (GET_CODE (compare_op)));
21605 else
21606 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
21609 mode = GET_MODE (operands[0]);
21611 /* Construct either adc or sbb insn. */
21612 if ((code == LTU) == (operands[3] == constm1_rtx))
21614 switch (mode)
21616 case QImode:
21617 insn = gen_subqi3_carry;
21618 break;
21619 case HImode:
21620 insn = gen_subhi3_carry;
21621 break;
21622 case SImode:
21623 insn = gen_subsi3_carry;
21624 break;
21625 case DImode:
21626 insn = gen_subdi3_carry;
21627 break;
21628 default:
21629 gcc_unreachable ();
21632 else
21634 switch (mode)
21636 case QImode:
21637 insn = gen_addqi3_carry;
21638 break;
21639 case HImode:
21640 insn = gen_addhi3_carry;
21641 break;
21642 case SImode:
21643 insn = gen_addsi3_carry;
21644 break;
21645 case DImode:
21646 insn = gen_adddi3_carry;
21647 break;
21648 default:
21649 gcc_unreachable ();
21652 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
21654 return true;
21658 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
21659 but works for floating point parameters and non-offsettable memories.
21660 For pushes, it returns just stack offsets; the values will be saved
21661 in the right order. At most four parts are generated. */
21663 static int
21664 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
21666 int size;
21668 if (!TARGET_64BIT)
21669 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
21670 else
21671 size = (GET_MODE_SIZE (mode) + 4) / 8;
21673 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
21674 gcc_assert (size >= 2 && size <= 4);
21676 /* Optimize constant pool reference to immediates. This is used by fp
21677 moves, which force all constants to memory to allow combining. */
21678 if (MEM_P (operand) && MEM_READONLY_P (operand))
21680 rtx tmp = maybe_get_pool_constant (operand);
21681 if (tmp)
21682 operand = tmp;
21685 if (MEM_P (operand) && !offsettable_memref_p (operand))
21687 /* The only non-offsettable memories we handle are pushes. */
21688 int ok = push_operand (operand, VOIDmode);
21690 gcc_assert (ok);
21692 operand = copy_rtx (operand);
21693 PUT_MODE (operand, word_mode);
21694 parts[0] = parts[1] = parts[2] = parts[3] = operand;
21695 return size;
21698 if (GET_CODE (operand) == CONST_VECTOR)
21700 enum machine_mode imode = int_mode_for_mode (mode);
21701 /* Caution: if we looked through a constant pool memory above,
21702 the operand may actually have a different mode now. That's
21703 ok, since we want to pun this all the way back to an integer. */
21704 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
21705 gcc_assert (operand != NULL);
21706 mode = imode;
21709 if (!TARGET_64BIT)
21711 if (mode == DImode)
21712 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21713 else
21715 int i;
21717 if (REG_P (operand))
21719 gcc_assert (reload_completed);
21720 for (i = 0; i < size; i++)
21721 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
21723 else if (offsettable_memref_p (operand))
21725 operand = adjust_address (operand, SImode, 0);
21726 parts[0] = operand;
21727 for (i = 1; i < size; i++)
21728 parts[i] = adjust_address (operand, SImode, 4 * i);
21730 else if (GET_CODE (operand) == CONST_DOUBLE)
21732 REAL_VALUE_TYPE r;
21733 long l[4];
21735 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21736 switch (mode)
21738 case TFmode:
21739 real_to_target (l, &r, mode);
21740 parts[3] = gen_int_mode (l[3], SImode);
21741 parts[2] = gen_int_mode (l[2], SImode);
21742 break;
21743 case XFmode:
21744 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
21745 long double may not be 80-bit. */
21746 real_to_target (l, &r, mode);
21747 parts[2] = gen_int_mode (l[2], SImode);
21748 break;
21749 case DFmode:
21750 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
21751 break;
21752 default:
21753 gcc_unreachable ();
21755 parts[1] = gen_int_mode (l[1], SImode);
21756 parts[0] = gen_int_mode (l[0], SImode);
21758 else
21759 gcc_unreachable ();
21762 else
21764 if (mode == TImode)
21765 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21766 if (mode == XFmode || mode == TFmode)
21768 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
21769 if (REG_P (operand))
21771 gcc_assert (reload_completed);
21772 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
21773 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
21775 else if (offsettable_memref_p (operand))
21777 operand = adjust_address (operand, DImode, 0);
21778 parts[0] = operand;
21779 parts[1] = adjust_address (operand, upper_mode, 8);
21781 else if (GET_CODE (operand) == CONST_DOUBLE)
21783 REAL_VALUE_TYPE r;
21784 long l[4];
21786 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21787 real_to_target (l, &r, mode);
21789 /* Do not use shift by 32 to avoid warning on 32bit systems. */
21790 if (HOST_BITS_PER_WIDE_INT >= 64)
21791 parts[0]
21792 = gen_int_mode
21793 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
21794 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
21795 DImode);
21796 else
21797 parts[0] = immed_double_const (l[0], l[1], DImode);
21799 if (upper_mode == SImode)
21800 parts[1] = gen_int_mode (l[2], SImode);
21801 else if (HOST_BITS_PER_WIDE_INT >= 64)
21802 parts[1]
21803 = gen_int_mode
21804 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
21805 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
21806 DImode);
21807 else
21808 parts[1] = immed_double_const (l[2], l[3], DImode);
21810 else
21811 gcc_unreachable ();
21815 return size;
21818 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
21819 The value is split into word-sized parts: operands 2-5 are used to hold
21820 the destination parts and operands 6-9 the corresponding source parts,
21821 in the correct copy order. */
21823 void
21824 ix86_split_long_move (rtx operands[])
21826 rtx part[2][4];
21827 int nparts, i, j;
21828 int push = 0;
21829 int collisions = 0;
21830 enum machine_mode mode = GET_MODE (operands[0]);
21831 bool collisionparts[4];
21833 /* The DFmode expanders may ask us to move a double.
21834 For a 64-bit target this is a single move. By hiding that fact
21835 here we simplify the i386.md splitters. */
21836 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
21838 /* Optimize constant pool reference to immediates. This is used by
21839 fp moves, which force all constants to memory to allow combining. */
21841 if (MEM_P (operands[1])
21842 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
21843 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
21844 operands[1] = get_pool_constant (XEXP (operands[1], 0));
21845 if (push_operand (operands[0], VOIDmode))
21847 operands[0] = copy_rtx (operands[0]);
21848 PUT_MODE (operands[0], word_mode);
21850 else
21851 operands[0] = gen_lowpart (DImode, operands[0]);
21852 operands[1] = gen_lowpart (DImode, operands[1]);
21853 emit_move_insn (operands[0], operands[1]);
21854 return;
21857 /* The only non-offsettable memory we handle is push. */
21858 if (push_operand (operands[0], VOIDmode))
21859 push = 1;
21860 else
21861 gcc_assert (!MEM_P (operands[0])
21862 || offsettable_memref_p (operands[0]));
21864 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
21865 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
21867 /* When emitting push, take care for source operands on the stack. */
21868 if (push && MEM_P (operands[1])
21869 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
21871 rtx src_base = XEXP (part[1][nparts - 1], 0);
21873 /* Compensate for the stack decrement by 4. */
21874 if (!TARGET_64BIT && nparts == 3
21875 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
21876 src_base = plus_constant (Pmode, src_base, 4);
21878 /* src_base refers to the stack pointer and is
21879 automatically decreased by emitted push. */
21880 for (i = 0; i < nparts; i++)
21881 part[1][i] = change_address (part[1][i],
21882 GET_MODE (part[1][i]), src_base);
21885 /* We need to do the copy in the right order in case an address register
21886 of the source overlaps the destination. */
21887 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
21889 rtx tmp;
21891 for (i = 0; i < nparts; i++)
21893 collisionparts[i]
21894 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
21895 if (collisionparts[i])
21896 collisions++;
21899 /* Collision in the middle part can be handled by reordering. */
21900 if (collisions == 1 && nparts == 3 && collisionparts [1])
21902 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21903 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21905 else if (collisions == 1
21906 && nparts == 4
21907 && (collisionparts [1] || collisionparts [2]))
21909 if (collisionparts [1])
21911 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21912 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21914 else
21916 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
21917 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
21921 /* If there are more collisions, we can't handle it by reordering.
21922 Do an lea to the last part and use only one colliding move. */
21923 else if (collisions > 1)
21925 rtx base;
21927 collisions = 1;
21929 base = part[0][nparts - 1];
21931 /* Handle the case when the last part isn't valid for lea.
21932 Happens in 64-bit mode storing the 12-byte XFmode. */
21933 if (GET_MODE (base) != Pmode)
21934 base = gen_rtx_REG (Pmode, REGNO (base));
21936 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
21937 part[1][0] = replace_equiv_address (part[1][0], base);
21938 for (i = 1; i < nparts; i++)
21940 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
21941 part[1][i] = replace_equiv_address (part[1][i], tmp);
21946 if (push)
21948 if (!TARGET_64BIT)
21950 if (nparts == 3)
21952 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
21953 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
21954 stack_pointer_rtx, GEN_INT (-4)));
21955 emit_move_insn (part[0][2], part[1][2]);
21957 else if (nparts == 4)
21959 emit_move_insn (part[0][3], part[1][3]);
21960 emit_move_insn (part[0][2], part[1][2]);
21963 else
21965 /* In 64-bit mode we don't have a 32-bit push available. If this is a
21966 register, that is OK - we will just use the larger counterpart. We also
21967 retype memory operands - these come from an attempt to avoid the REX
21968 prefix on moving the second half of a TFmode value. */
21969 if (GET_MODE (part[1][1]) == SImode)
21971 switch (GET_CODE (part[1][1]))
21973 case MEM:
21974 part[1][1] = adjust_address (part[1][1], DImode, 0);
21975 break;
21977 case REG:
21978 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
21979 break;
21981 default:
21982 gcc_unreachable ();
21985 if (GET_MODE (part[1][0]) == SImode)
21986 part[1][0] = part[1][1];
21989 emit_move_insn (part[0][1], part[1][1]);
21990 emit_move_insn (part[0][0], part[1][0]);
21991 return;
21994 /* Choose the correct order so as not to overwrite the source before it is copied. */
21995 if ((REG_P (part[0][0])
21996 && REG_P (part[1][1])
21997 && (REGNO (part[0][0]) == REGNO (part[1][1])
21998 || (nparts == 3
21999 && REGNO (part[0][0]) == REGNO (part[1][2]))
22000 || (nparts == 4
22001 && REGNO (part[0][0]) == REGNO (part[1][3]))))
22002 || (collisions > 0
22003 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
22005 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
22007 operands[2 + i] = part[0][j];
22008 operands[6 + i] = part[1][j];
22011 else
22013 for (i = 0; i < nparts; i++)
22015 operands[2 + i] = part[0][i];
22016 operands[6 + i] = part[1][i];
22020 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
22021 if (optimize_insn_for_size_p ())
22023 for (j = 0; j < nparts - 1; j++)
22024 if (CONST_INT_P (operands[6 + j])
22025 && operands[6 + j] != const0_rtx
22026 && REG_P (operands[2 + j]))
22027 for (i = j; i < nparts - 1; i++)
22028 if (CONST_INT_P (operands[7 + i])
22029 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
22030 operands[7 + i] = operands[2 + j];
22033 for (i = 0; i < nparts; i++)
22034 emit_move_insn (operands[2 + i], operands[6 + i]);
22036 return;
22039 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
22040 left shift by a constant, either using a single shift or
22041 a sequence of add instructions. */
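/* Illustrative note (not from the original source): e.g. when the add is
   cheap enough, "x <<= 2" may be emitted as
       add x, x
       add x, x
   instead of a shift-by-immediate.  */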
22043 static void
22044 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
22046 rtx (*insn)(rtx, rtx, rtx);
22048 if (count == 1
22049 || (count * ix86_cost->add <= ix86_cost->shift_const
22050 && !optimize_insn_for_size_p ()))
22052 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
22053 while (count-- > 0)
22054 emit_insn (insn (operand, operand, operand));
22056 else
22058 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
22059 emit_insn (insn (operand, operand, GEN_INT (count)));
22063 void
22064 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
22066 rtx (*gen_ashl3)(rtx, rtx, rtx);
22067 rtx (*gen_shld)(rtx, rtx, rtx);
22068 int half_width = GET_MODE_BITSIZE (mode) >> 1;
22070 rtx low[2], high[2];
22071 int count;
22073 if (CONST_INT_P (operands[2]))
22075 split_double_mode (mode, operands, 2, low, high);
22076 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
22078 if (count >= half_width)
22080 emit_move_insn (high[0], low[1]);
22081 emit_move_insn (low[0], const0_rtx);
22083 if (count > half_width)
22084 ix86_expand_ashl_const (high[0], count - half_width, mode);
22086 else
22088 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
22090 if (!rtx_equal_p (operands[0], operands[1]))
22091 emit_move_insn (operands[0], operands[1]);
22093 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
22094 ix86_expand_ashl_const (low[0], count, mode);
22096 return;
22099 split_double_mode (mode, operands, 1, low, high);
22101 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
22103 if (operands[1] == const1_rtx)
22105 /* Assuming we've chosen QImode-capable registers, 1 << N
22106 can be done with two 32/64-bit shifts, no branches, no cmoves. */
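/* Illustrative note (not from the original source): both halves are
   cleared, then "test count, half_width" decides which half receives the 1
   (the low half when that bit of the count is clear, the high half
   otherwise); shifting both halves by the count then moves the 1 into
   place, because the hardware shift only uses the count modulo the half
   width.  */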
22107 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
22109 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
22111 ix86_expand_clear (low[0]);
22112 ix86_expand_clear (high[0]);
22113 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
22115 d = gen_lowpart (QImode, low[0]);
22116 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
22117 s = gen_rtx_EQ (QImode, flags, const0_rtx);
22118 emit_insn (gen_rtx_SET (VOIDmode, d, s));
22120 d = gen_lowpart (QImode, high[0]);
22121 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
22122 s = gen_rtx_NE (QImode, flags, const0_rtx);
22123 emit_insn (gen_rtx_SET (VOIDmode, d, s));
22126 /* Otherwise, we can get the same results by manually performing
22127 a bit extract operation on bit 5/6, and then performing the two
22128 shifts. The two methods of getting 0/1 into low/high are exactly
22129 the same size. Avoiding the shift in the bit extract case helps
22130 pentium4 a bit; no one else seems to care much either way. */
22131 else
22133 enum machine_mode half_mode;
22134 rtx (*gen_lshr3)(rtx, rtx, rtx);
22135 rtx (*gen_and3)(rtx, rtx, rtx);
22136 rtx (*gen_xor3)(rtx, rtx, rtx);
22137 HOST_WIDE_INT bits;
22138 rtx x;
22140 if (mode == DImode)
22142 half_mode = SImode;
22143 gen_lshr3 = gen_lshrsi3;
22144 gen_and3 = gen_andsi3;
22145 gen_xor3 = gen_xorsi3;
22146 bits = 5;
22148 else
22150 half_mode = DImode;
22151 gen_lshr3 = gen_lshrdi3;
22152 gen_and3 = gen_anddi3;
22153 gen_xor3 = gen_xordi3;
22154 bits = 6;
22157 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
22158 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
22159 else
22160 x = gen_lowpart (half_mode, operands[2]);
22161 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
22163 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
22164 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
22165 emit_move_insn (low[0], high[0]);
22166 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
22169 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
22170 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
22171 return;
22174 if (operands[1] == constm1_rtx)
22176 /* For -1 << N, we can avoid the shld instruction, because we
22177 know that we're shifting 0...31/63 ones into a -1. */
22178 emit_move_insn (low[0], constm1_rtx);
22179 if (optimize_insn_for_size_p ())
22180 emit_move_insn (high[0], low[0]);
22181 else
22182 emit_move_insn (high[0], constm1_rtx);
22184 else
22186 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
22188 if (!rtx_equal_p (operands[0], operands[1]))
22189 emit_move_insn (operands[0], operands[1]);
22191 split_double_mode (mode, operands, 1, low, high);
22192 emit_insn (gen_shld (high[0], low[0], operands[2]));
22195 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
22197 if (TARGET_CMOVE && scratch)
22199 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
22200 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
22202 ix86_expand_clear (scratch);
22203 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
22205 else
22207 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
22208 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
22210 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
22214 void
22215 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
22217 rtx (*gen_ashr3)(rtx, rtx, rtx)
22218 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
22219 rtx (*gen_shrd)(rtx, rtx, rtx);
22220 int half_width = GET_MODE_BITSIZE (mode) >> 1;
22222 rtx low[2], high[2];
22223 int count;
22225 if (CONST_INT_P (operands[2]))
22227 split_double_mode (mode, operands, 2, low, high);
22228 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
22230 if (count == GET_MODE_BITSIZE (mode) - 1)
22232 emit_move_insn (high[0], high[1]);
22233 emit_insn (gen_ashr3 (high[0], high[0],
22234 GEN_INT (half_width - 1)));
22235 emit_move_insn (low[0], high[0]);
22238 else if (count >= half_width)
22240 emit_move_insn (low[0], high[1]);
22241 emit_move_insn (high[0], low[0]);
22242 emit_insn (gen_ashr3 (high[0], high[0],
22243 GEN_INT (half_width - 1)));
22245 if (count > half_width)
22246 emit_insn (gen_ashr3 (low[0], low[0],
22247 GEN_INT (count - half_width)));
22249 else
22251 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22253 if (!rtx_equal_p (operands[0], operands[1]))
22254 emit_move_insn (operands[0], operands[1]);
22256 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
22257 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
22260 else
22262 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22264 if (!rtx_equal_p (operands[0], operands[1]))
22265 emit_move_insn (operands[0], operands[1]);
22267 split_double_mode (mode, operands, 1, low, high);
22269 emit_insn (gen_shrd (low[0], high[0], operands[2]));
22270 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
22272 if (TARGET_CMOVE && scratch)
22274 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
22275 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
22277 emit_move_insn (scratch, high[0]);
22278 emit_insn (gen_ashr3 (scratch, scratch,
22279 GEN_INT (half_width - 1)));
22280 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
22281 scratch));
22283 else
22285 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
22286 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
22288 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
22293 void
22294 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
22296 rtx (*gen_lshr3)(rtx, rtx, rtx)
22297 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
22298 rtx (*gen_shrd)(rtx, rtx, rtx);
22299 int half_width = GET_MODE_BITSIZE (mode) >> 1;
22301 rtx low[2], high[2];
22302 int count;
22304 if (CONST_INT_P (operands[2]))
22306 split_double_mode (mode, operands, 2, low, high);
22307 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
22309 if (count >= half_width)
22311 emit_move_insn (low[0], high[1]);
22312 ix86_expand_clear (high[0]);
22314 if (count > half_width)
22315 emit_insn (gen_lshr3 (low[0], low[0],
22316 GEN_INT (count - half_width)));
22318 else
22320 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22322 if (!rtx_equal_p (operands[0], operands[1]))
22323 emit_move_insn (operands[0], operands[1]);
22325 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
22326 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
22329 else
22331 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22333 if (!rtx_equal_p (operands[0], operands[1]))
22334 emit_move_insn (operands[0], operands[1]);
22336 split_double_mode (mode, operands, 1, low, high);
22338 emit_insn (gen_shrd (low[0], high[0], operands[2]));
22339 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
22341 if (TARGET_CMOVE && scratch)
22343 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
22344 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
22346 ix86_expand_clear (scratch);
22347 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
22348 scratch));
22350 else
22352 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
22353 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
22355 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
22360 /* Predict just emitted jump instruction to be taken with probability PROB. */
22361 static void
22362 predict_jump (int prob)
22364 rtx insn = get_last_insn ();
22365 gcc_assert (JUMP_P (insn));
22366 add_int_reg_note (insn, REG_BR_PROB, prob);
22369 /* Helper function for the string operations below. Test whether VARIABLE
22370 is aligned to VALUE bytes (i.e. VARIABLE & VALUE is zero); if so, jump to the returned label. */
22371 static rtx
22372 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
22374 rtx label = gen_label_rtx ();
22375 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
22376 if (GET_MODE (variable) == DImode)
22377 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
22378 else
22379 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
22380 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
22381 1, label);
22382 if (epilogue)
22383 predict_jump (REG_BR_PROB_BASE * 50 / 100);
22384 else
22385 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22386 return label;
22389 /* Decrease COUNTREG by VALUE. */
22390 static void
22391 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
22393 rtx (*gen_add)(rtx, rtx, rtx)
22394 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
22396 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
22399 /* Zero-extend EXP (which may be SImode) to a Pmode register. */
22400 rtx
22401 ix86_zero_extend_to_Pmode (rtx exp)
22403 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
22406 /* Divide COUNTREG by SCALE. */
22407 static rtx
22408 scale_counter (rtx countreg, int scale)
22410 rtx sc;
22412 if (scale == 1)
22413 return countreg;
22414 if (CONST_INT_P (countreg))
22415 return GEN_INT (INTVAL (countreg) / scale);
22416 gcc_assert (REG_P (countreg));
22418 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
22419 GEN_INT (exact_log2 (scale)),
22420 NULL, 1, OPTAB_DIRECT);
22421 return sc;
22424 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
22425 DImode for constant loop counts. */
22427 static enum machine_mode
22428 counter_mode (rtx count_exp)
22430 if (GET_MODE (count_exp) != VOIDmode)
22431 return GET_MODE (count_exp);
22432 if (!CONST_INT_P (count_exp))
22433 return Pmode;
22434 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
22435 return DImode;
22436 return SImode;
22439 /* Copy the address to a Pmode register. This is used for x32 to
22440 truncate DImode TLS address to a SImode register. */
22442 static rtx
22443 ix86_copy_addr_to_reg (rtx addr)
22445 if (GET_MODE (addr) == Pmode)
22446 return copy_addr_to_reg (addr);
22447 else
22449 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
22450 return gen_rtx_SUBREG (SImode, copy_to_mode_reg (DImode, addr), 0);
22454 /* When ISSETMEM is FALSE, output a simple loop to move memory pointed to by
22455 SRCPTR to DESTPTR via chunks of MODE, unrolled UNROLL times; the overall
22456 size is COUNT, specified in bytes. When ISSETMEM is TRUE, output the
22457 equivalent loop to set memory with VALUE (supposed to be in MODE).
22459 The size is rounded down to a whole number of chunks moved at once.
22460 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info. */
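/* Illustrative note (not from the original source): the generated code is
   roughly
       size = count & -(MODE_SIZE * UNROLL);
       iter = 0;
       do {
         copy (or store) UNROLL chunks of MODE at dest+iter (and src+iter);
         iter += MODE_SIZE * UNROLL;
       } while (iter < size);
       destptr += iter;  srcptr += iter;    -- srcptr only when !ISSETMEM  */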
22463 static void
22464 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
22465 rtx destptr, rtx srcptr, rtx value,
22466 rtx count, enum machine_mode mode, int unroll,
22467 int expected_size, bool issetmem)
22469 rtx out_label, top_label, iter, tmp;
22470 enum machine_mode iter_mode = counter_mode (count);
22471 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
22472 rtx piece_size = GEN_INT (piece_size_n);
22473 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
22474 rtx size;
22475 int i;
22477 top_label = gen_label_rtx ();
22478 out_label = gen_label_rtx ();
22479 iter = gen_reg_rtx (iter_mode);
22481 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
22482 NULL, 1, OPTAB_DIRECT);
22483 /* Those two should combine. */
22484 if (piece_size == const1_rtx)
22486 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
22487 true, out_label);
22488 predict_jump (REG_BR_PROB_BASE * 10 / 100);
22490 emit_move_insn (iter, const0_rtx);
22492 emit_label (top_label);
22494 tmp = convert_modes (Pmode, iter_mode, iter, true);
22496 /* This assert could be relaxed - in that case we'll need to compute
22497 the smallest power of two containing PIECE_SIZE_N and pass it to
22498 offset_address. */
22499 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
22500 destmem = offset_address (destmem, tmp, piece_size_n);
22501 destmem = adjust_address (destmem, mode, 0);
22503 if (!issetmem)
22505 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
22506 srcmem = adjust_address (srcmem, mode, 0);
22508 /* When unrolling for chips that reorder memory reads and writes,
22509 we can save registers by using a single temporary.
22510 Also, using 4 temporaries is overkill in 32-bit mode. */
22511 if (!TARGET_64BIT && 0)
22513 for (i = 0; i < unroll; i++)
22515 if (i)
22517 destmem =
22518 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22519 srcmem =
22520 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
22522 emit_move_insn (destmem, srcmem);
22525 else
22527 rtx tmpreg[4];
22528 gcc_assert (unroll <= 4);
22529 for (i = 0; i < unroll; i++)
22531 tmpreg[i] = gen_reg_rtx (mode);
22532 if (i)
22534 srcmem =
22535 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
22537 emit_move_insn (tmpreg[i], srcmem);
22539 for (i = 0; i < unroll; i++)
22541 if (i)
22543 destmem =
22544 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22546 emit_move_insn (destmem, tmpreg[i]);
22550 else
22551 for (i = 0; i < unroll; i++)
22553 if (i)
22554 destmem =
22555 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22556 emit_move_insn (destmem, value);
22559 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
22560 true, OPTAB_LIB_WIDEN);
22561 if (tmp != iter)
22562 emit_move_insn (iter, tmp);
22564 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
22565 true, top_label);
22566 if (expected_size != -1)
22568 expected_size /= GET_MODE_SIZE (mode) * unroll;
22569 if (expected_size == 0)
22570 predict_jump (0);
22571 else if (expected_size > REG_BR_PROB_BASE)
22572 predict_jump (REG_BR_PROB_BASE - 1);
22573 else
22574 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
22576 else
22577 predict_jump (REG_BR_PROB_BASE * 80 / 100);
22578 iter = ix86_zero_extend_to_Pmode (iter);
22579 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
22580 true, OPTAB_LIB_WIDEN);
22581 if (tmp != destptr)
22582 emit_move_insn (destptr, tmp);
22583 if (!issetmem)
22585 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
22586 true, OPTAB_LIB_WIDEN);
22587 if (tmp != srcptr)
22588 emit_move_insn (srcptr, tmp);
22590 emit_label (out_label);
22593 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
22594 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
22595 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
22596 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
22597 ORIG_VALUE is the original value passed to memset to fill the memory with.
22598 Other arguments have the same meaning as for the previous function. */
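/* Illustrative note (not from the original source): e.g. copying COUNT
   bytes with MODE == SImode boils down to
       mov  ecx, COUNT / 4
       rep  movsd
   with any leftover bytes handled by the epilogue code elsewhere.  */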
22600 static void
22601 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
22602 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
22603 rtx count,
22604 enum machine_mode mode, bool issetmem)
22606 rtx destexp;
22607 rtx srcexp;
22608 rtx countreg;
22609 HOST_WIDE_INT rounded_count;
22611 /* If possible, it is shorter to use rep movs.
22612 TODO: Maybe it is better to move this logic to decide_alg. */
22613 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
22614 && (!issetmem || orig_value == const0_rtx))
22615 mode = SImode;
22617 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
22618 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
22620 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
22621 GET_MODE_SIZE (mode)));
22622 if (mode != QImode)
22624 destexp = gen_rtx_ASHIFT (Pmode, countreg,
22625 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22626 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
22628 else
22629 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
22630 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
22632 rounded_count = (INTVAL (count)
22633 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22634 destmem = shallow_copy_rtx (destmem);
22635 set_mem_size (destmem, rounded_count);
22637 else if (MEM_SIZE_KNOWN_P (destmem))
22638 clear_mem_size (destmem);
22640 if (issetmem)
22642 value = force_reg (mode, gen_lowpart (mode, value));
22643 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
22645 else
22647 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
22648 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
22649 if (mode != QImode)
22651 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
22652 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22653 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
22655 else
22656 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
22657 if (CONST_INT_P (count))
22659 rounded_count = (INTVAL (count)
22660 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22661 srcmem = shallow_copy_rtx (srcmem);
22662 set_mem_size (srcmem, rounded_count);
22664 else
22666 if (MEM_SIZE_KNOWN_P (srcmem))
22667 clear_mem_size (srcmem);
22669 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
22670 destexp, srcexp));
22674 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
22675 DESTMEM.
22676 SRCMEM is passed by pointer and is updated on return.
22677 The return value is the updated DESTMEM. */
22678 static rtx
22679 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
22680 HOST_WIDE_INT size_to_move)
22682 rtx dst = destmem, src = *srcmem, adjust, tempreg;
22683 enum insn_code code;
22684 enum machine_mode move_mode;
22685 int piece_size, i;
22687 /* Find the widest mode in which we could perform moves.
22688 Start with the biggest power of 2 not larger than SIZE_TO_MOVE and halve
22689 it until a move of that size is supported. */
22690 piece_size = 1 << floor_log2 (size_to_move);
22691 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
22692 code = optab_handler (mov_optab, move_mode);
22693 while (code == CODE_FOR_nothing && piece_size > 1)
22695 piece_size >>= 1;
22696 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
22697 code = optab_handler (mov_optab, move_mode);
22700 /* Find the corresponding vector mode with the same size as MOVE_MODE.
22701 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
22702 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
22704 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
22705 move_mode = mode_for_vector (word_mode, nunits);
22706 code = optab_handler (mov_optab, move_mode);
22707 if (code == CODE_FOR_nothing)
22709 move_mode = word_mode;
22710 piece_size = GET_MODE_SIZE (move_mode);
22711 code = optab_handler (mov_optab, move_mode);
22714 gcc_assert (code != CODE_FOR_nothing);
22716 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
22717 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
22719 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
22720 gcc_assert (size_to_move % piece_size == 0);
22721 adjust = GEN_INT (piece_size);
22722 for (i = 0; i < size_to_move; i += piece_size)
22724 /* We move from memory to memory, so we'll need to do it via
22725 a temporary register. */
22726 tempreg = gen_reg_rtx (move_mode);
22727 emit_insn (GEN_FCN (code) (tempreg, src));
22728 emit_insn (GEN_FCN (code) (dst, tempreg));
22730 emit_move_insn (destptr,
22731 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
22732 emit_move_insn (srcptr,
22733 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
22735 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
22736 piece_size);
22737 src = adjust_automodify_address_nv (src, move_mode, srcptr,
22738 piece_size);
22741 /* Update DST and SRC rtx. */
22742 *srcmem = src;
22743 return dst;
22746 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
22747 static void
22748 expand_movmem_epilogue (rtx destmem, rtx srcmem,
22749 rtx destptr, rtx srcptr, rtx count, int max_size)
22751 rtx src, dest;
22752 if (CONST_INT_P (count))
22754 HOST_WIDE_INT countval = INTVAL (count);
22755 HOST_WIDE_INT epilogue_size = countval % max_size;
22756 int i;
22758 /* For now MAX_SIZE should be a power of 2. This assert could be
22759 relaxed, but it would require a somewhat more complicated epilogue
22760 expansion. */
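/* Illustrative note (not from the original source): e.g. with
   countval == 23 and max_size == 16 the epilogue size is 7, so the loop
   below emits a 4-byte, a 2-byte and a 1-byte move.  */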
22761 gcc_assert ((max_size & (max_size - 1)) == 0);
22762 for (i = max_size; i >= 1; i >>= 1)
22764 if (epilogue_size & i)
22765 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
22767 return;
22769 if (max_size > 8)
22771 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
22772 count, 1, OPTAB_DIRECT);
22773 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
22774 count, QImode, 1, 4, false);
22775 return;
22778 /* When single stringop insns are available (TARGET_SINGLE_STRINGOP), we can
22779 cheaply advance the dest and src pointers. Otherwise we save code size by
22780 maintaining an offset (zero is readily available from the preceding rep operation) and using x86 addressing modes. */
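/* Illustrative note (not from the original source): for max_size == 8 the
   emitted epilogue is roughly
       if (count & 4) copy 4 bytes;
       if (count & 2) copy 2 bytes;
       if (count & 1) copy 1 byte;
   each test being a conditional jump produced by ix86_expand_aligntest.  */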
22782 if (TARGET_SINGLE_STRINGOP)
22784 if (max_size > 4)
22786 rtx label = ix86_expand_aligntest (count, 4, true);
22787 src = change_address (srcmem, SImode, srcptr);
22788 dest = change_address (destmem, SImode, destptr);
22789 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22790 emit_label (label);
22791 LABEL_NUSES (label) = 1;
22793 if (max_size > 2)
22795 rtx label = ix86_expand_aligntest (count, 2, true);
22796 src = change_address (srcmem, HImode, srcptr);
22797 dest = change_address (destmem, HImode, destptr);
22798 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22799 emit_label (label);
22800 LABEL_NUSES (label) = 1;
22802 if (max_size > 1)
22804 rtx label = ix86_expand_aligntest (count, 1, true);
22805 src = change_address (srcmem, QImode, srcptr);
22806 dest = change_address (destmem, QImode, destptr);
22807 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22808 emit_label (label);
22809 LABEL_NUSES (label) = 1;
22812 else
22814 rtx offset = force_reg (Pmode, const0_rtx);
22815 rtx tmp;
22817 if (max_size > 4)
22819 rtx label = ix86_expand_aligntest (count, 4, true);
22820 src = change_address (srcmem, SImode, srcptr);
22821 dest = change_address (destmem, SImode, destptr);
22822 emit_move_insn (dest, src);
22823 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
22824 true, OPTAB_LIB_WIDEN);
22825 if (tmp != offset)
22826 emit_move_insn (offset, tmp);
22827 emit_label (label);
22828 LABEL_NUSES (label) = 1;
22830 if (max_size > 2)
22832 rtx label = ix86_expand_aligntest (count, 2, true);
22833 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22834 src = change_address (srcmem, HImode, tmp);
22835 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22836 dest = change_address (destmem, HImode, tmp);
22837 emit_move_insn (dest, src);
22838 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
22839 true, OPTAB_LIB_WIDEN);
22840 if (tmp != offset)
22841 emit_move_insn (offset, tmp);
22842 emit_label (label);
22843 LABEL_NUSES (label) = 1;
22845 if (max_size > 1)
22847 rtx label = ix86_expand_aligntest (count, 1, true);
22848 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22849 src = change_address (srcmem, QImode, tmp);
22850 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22851 dest = change_address (destmem, QImode, tmp);
22852 emit_move_insn (dest, src);
22853 emit_label (label);
22854 LABEL_NUSES (label) = 1;
22859 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
22860 with value PROMOTED_VAL.
22862 Return value is the updated DST. */
22863 static rtx
22864 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
22865 HOST_WIDE_INT size_to_move)
22867 rtx dst = destmem, adjust;
22868 enum insn_code code;
22869 enum machine_mode move_mode;
22870 int piece_size, i;
22872 /* Find the widest mode in which we could perform the stores.
22873 Start with the mode of PROMOTED_VAL and narrow it when SIZE_TO_MOVE
22874 is smaller than that mode's size. */
22875 move_mode = GET_MODE (promoted_val);
22876 if (move_mode == VOIDmode)
22877 move_mode = QImode;
22878 if (size_to_move < GET_MODE_SIZE (move_mode))
22880 move_mode = mode_for_size (size_to_move * BITS_PER_UNIT, MODE_INT, 0);
22881 promoted_val = gen_lowpart (move_mode, promoted_val);
22883 piece_size = GET_MODE_SIZE (move_mode);
22884 code = optab_handler (mov_optab, move_mode);
22885 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
22887 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
22889 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
22890 gcc_assert (size_to_move % piece_size == 0);
22891 adjust = GEN_INT (piece_size);
22892 for (i = 0; i < size_to_move; i += piece_size)
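/* Word-size or smaller pieces can use the strset pattern, which advances
   DESTPTR as a side effect; wider (vector) pieces fall through to a plain
   store followed by an explicit pointer update.  */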
22894 if (piece_size <= GET_MODE_SIZE (word_mode))
22896 emit_insn (gen_strset (destptr, dst, promoted_val));
22897 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
22898 piece_size);
22899 continue;
22902 emit_insn (GEN_FCN (code) (dst, promoted_val));
22904 emit_move_insn (destptr,
22905 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
22907 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
22908 piece_size);
22911 /* Update DST rtx. */
22912 return dst;
22914 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22915 static void
22916 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
22917 rtx count, int max_size)
22919 count =
22920 expand_simple_binop (counter_mode (count), AND, count,
22921 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
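/* At most COUNT & (MAX_SIZE - 1) bytes remain; let the generic loop expander
   store them one byte at a time (QImode, no unrolling), with MAX_SIZE / 2
   used as the expected-size hint.  */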
22922 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
22923 gen_lowpart (QImode, value), count, QImode,
22924 1, max_size / 2, true);
22927 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22928 static void
22929 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
22930 rtx count, int max_size)
22932 rtx dest;
22934 if (CONST_INT_P (count))
22936 HOST_WIDE_INT countval = INTVAL (count);
22937 HOST_WIDE_INT epilogue_size = countval % max_size;
22938 int i;
22940 /* For now MAX_SIZE should be a power of 2. This assert could be
22941 relaxed, but it would require a somewhat more complicated epilogue
22942 expansion. */
22943 gcc_assert ((max_size & (max_size - 1)) == 0);
22944 for (i = max_size; i >= 1; i >>= 1)
22946 if (epilogue_size & i)
22948 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
22949 destmem = emit_memset (destmem, destptr, vec_value, i);
22950 else
22951 destmem = emit_memset (destmem, destptr, value, i);
22954 return;
22956 if (max_size > 32)
22958 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
22959 return;
22961 if (max_size > 16)
22963 rtx label = ix86_expand_aligntest (count, 16, true);
22964 if (TARGET_64BIT)
22966 dest = change_address (destmem, DImode, destptr);
22967 emit_insn (gen_strset (destptr, dest, value));
22968 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
22969 emit_insn (gen_strset (destptr, dest, value));
22971 else
22973 dest = change_address (destmem, SImode, destptr);
22974 emit_insn (gen_strset (destptr, dest, value));
22975 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
22976 emit_insn (gen_strset (destptr, dest, value));
22977 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
22978 emit_insn (gen_strset (destptr, dest, value));
22979 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
22980 emit_insn (gen_strset (destptr, dest, value));
22982 emit_label (label);
22983 LABEL_NUSES (label) = 1;
22985 if (max_size > 8)
22987 rtx label = ix86_expand_aligntest (count, 8, true);
22988 if (TARGET_64BIT)
22990 dest = change_address (destmem, DImode, destptr);
22991 emit_insn (gen_strset (destptr, dest, value));
22993 else
22995 dest = change_address (destmem, SImode, destptr);
22996 emit_insn (gen_strset (destptr, dest, value));
22997 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
22998 emit_insn (gen_strset (destptr, dest, value));
23000 emit_label (label);
23001 LABEL_NUSES (label) = 1;
23003 if (max_size > 4)
23005 rtx label = ix86_expand_aligntest (count, 4, true);
23006 dest = change_address (destmem, SImode, destptr);
23007 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
23008 emit_label (label);
23009 LABEL_NUSES (label) = 1;
23011 if (max_size > 2)
23013 rtx label = ix86_expand_aligntest (count, 2, true);
23014 dest = change_address (destmem, HImode, destptr);
23015 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
23016 emit_label (label);
23017 LABEL_NUSES (label) = 1;
23019 if (max_size > 1)
23021 rtx label = ix86_expand_aligntest (count, 1, true);
23022 dest = change_address (destmem, QImode, destptr);
23023 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
23024 emit_label (label);
23025 LABEL_NUSES (label) = 1;
23029 /* Depending on ISSETMEM, either copy enough bytes from SRCMEM to DESTMEM, or
23030 store enough bytes into DESTMEM, to align DESTMEM to DESIRED_ALIGNMENT.
23031 The original alignment is ALIGN. Depending on ISSETMEM, either the
23032 SRCMEM/SRCPTR or the VALUE/VEC_VALUE arguments are ignored.
23033 Return value is the updated DESTMEM. */
23034 static rtx
23035 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
23036 rtx destptr, rtx srcptr, rtx value,
23037 rtx vec_value, rtx count, int align,
23038 int desired_alignment, bool issetmem)
23040 int i;
23041 for (i = 1; i < desired_alignment; i <<= 1)
23043 if (align <= i)
23045 rtx label = ix86_expand_aligntest (destptr, i, false);
23046 if (issetmem)
23048 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
23049 destmem = emit_memset (destmem, destptr, vec_value, i);
23050 else
23051 destmem = emit_memset (destmem, destptr, value, i);
23053 else
23054 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
23055 ix86_adjust_counter (count, i);
23056 emit_label (label);
23057 LABEL_NUSES (label) = 1;
23058 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
23061 return destmem;
23064 /* Test if COUNT & SIZE is nonzero and if so, expand a movmem
23065 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
23066 and jump to DONE_LABEL. */
23067 static void
23068 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
23069 rtx destptr, rtx srcptr,
23070 rtx value, rtx vec_value,
23071 rtx count, int size,
23072 rtx done_label, bool issetmem)
23074 rtx label = ix86_expand_aligntest (count, size, false);
23075 enum machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
23076 rtx modesize;
23077 int n;
23079 /* If we do not have a vector value to use, we must reduce the size. */
23080 if (issetmem)
23082 if (!vec_value)
23084 if (GET_MODE (value) == VOIDmode && size > 8)
23085 mode = Pmode;
23086 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
23087 mode = GET_MODE (value);
23089 else
23090 mode = GET_MODE (vec_value), value = vec_value;
23092 else
23094 /* Choose appropriate vector mode. */
23095 if (size >= 32)
23096 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
23097 else if (size >= 16)
23098 mode = TARGET_SSE ? V16QImode : DImode;
23099 srcmem = change_address (srcmem, mode, srcptr);
23101 destmem = change_address (destmem, mode, destptr);
23102 modesize = GEN_INT (GET_MODE_SIZE (mode));
23103 gcc_assert (GET_MODE_SIZE (mode) <= size);
23104 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
23106 if (issetmem)
23107 emit_move_insn (destmem, gen_lowpart (mode, value));
23108 else
23110 emit_move_insn (destmem, srcmem);
23111 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
23113 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
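/* Finally store/copy one more MODE-sized chunk ending exactly at
   DESTPTR + COUNT, possibly overlapping the bytes handled above.  */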
23116 destmem = offset_address (destmem, count, 1);
23117 destmem = offset_address (destmem, GEN_INT (-size - GET_MODE_SIZE (mode)),
23118 GET_MODE_SIZE (mode));
23119 if (issetmem)
23120 emit_move_insn (destmem, gen_lowpart (mode, value));
23121 else
23123 srcmem = offset_address (srcmem, count, 1);
23124 srcmem = offset_address (srcmem, GEN_INT (-size - GET_MODE_SIZE (mode)),
23125 GET_MODE_SIZE (mode));
23126 emit_move_insn (destmem, srcmem);
23128 emit_jump_insn (gen_jump (done_label));
23129 emit_barrier ();
23131 emit_label (label);
23132 LABEL_NUSES (label) = 1;
23135 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
23136 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
23137 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT so that we can
23138 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
23139 DONE_LABEL is a label after the whole copying sequence. The label is created
23140 on demand if *DONE_LABEL is NULL.
23141 MIN_SIZE is the minimal size of the copied block. This value gets adjusted for
23142 the new bounds after the initial copies.
23144 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
23145 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
23146 we will dispatch to a library call for large blocks.
23148 In pseudocode we do:
23150 if (COUNT < SIZE)
23152 Assume that SIZE is 4. Bigger sizes are handled analogously
23153 if (COUNT & 4)
23155 copy 4 bytes from SRCPTR to DESTPTR
23156 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
23157 goto done_label
23159 if (!COUNT)
23160 goto done_label;
23161 copy 1 byte from SRCPTR to DESTPTR
23162 if (COUNT & 2)
23164 copy 2 bytes from SRCPTR to DESTPTR
23165 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
23168 else
23170 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
23171 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT - SIZE
23173 OLD_DESTPTR = DESTPTR;
23174 Align DESTPTR up to DESIRED_ALIGN
23175 SRCPTR += DESTPTR - OLD_DESTPTR
23176 COUNT -= DESTPTR - OLD_DESTPTR
23177 if (DYNAMIC_CHECK)
23178 Round COUNT down to multiple of SIZE
23179 << optional caller supplied zero size guard is here >>
23180 << optional caller supplied dynamic check is here >>
23181 << caller supplied main copy loop is here >>
23183 done_label:
23185 static void
23186 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
23187 rtx *destptr, rtx *srcptr,
23188 enum machine_mode mode,
23189 rtx value, rtx vec_value,
23190 rtx *count,
23191 rtx *done_label,
23192 int size,
23193 int desired_align,
23194 int align,
23195 unsigned HOST_WIDE_INT *min_size,
23196 bool dynamic_check,
23197 bool issetmem)
23199 rtx loop_label = NULL, label;
23200 int n;
23201 rtx modesize;
23202 int prolog_size = 0;
23203 rtx mode_value;
23205 /* Choose the proper value to use. */
23206 if (issetmem && VECTOR_MODE_P (mode))
23207 mode_value = vec_value;
23208 else
23209 mode_value = value;
23210 gcc_assert (GET_MODE_SIZE (mode) <= size);
23212 /* See if block is big or small, handle small blocks. */
23213 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
23215 int size2 = size;
23216 loop_label = gen_label_rtx ();
23218 if (!*done_label)
23219 *done_label = gen_label_rtx ();
23221 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
23222 1, loop_label);
23223 size2 >>= 1;
23225 /* Handle sizes > 3. */
23226 for (;size2 > 2; size2 >>= 1)
23227 expand_small_movmem_or_setmem (destmem, srcmem,
23228 *destptr, *srcptr,
23229 value, vec_value,
23230 *count,
23231 size2, *done_label, issetmem);
23232 /* Nothing to copy? Jump to DONE_LABEL if so. */
23233 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
23234 1, *done_label);
23236 /* Do a byte copy. */
23237 destmem = change_address (destmem, QImode, *destptr);
23238 if (issetmem)
23239 emit_move_insn (destmem, gen_lowpart (QImode, value));
23240 else
23242 srcmem = change_address (srcmem, QImode, *srcptr);
23243 emit_move_insn (destmem, srcmem);
23246 /* Handle sizes 2 and 3. */
23247 label = ix86_expand_aligntest (*count, 2, false);
23248 destmem = change_address (destmem, HImode, *destptr);
23249 destmem = offset_address (destmem, *count, 1);
23250 destmem = offset_address (destmem, GEN_INT (-2), 2);
23251 if (issetmem)
23252 emit_move_insn (destmem, gen_lowpart (HImode, value));
23253 else
23255 srcmem = change_address (srcmem, HImode, *srcptr);
23256 srcmem = offset_address (srcmem, *count, 1);
23257 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
23258 emit_move_insn (destmem, srcmem);
23261 emit_label (label);
23262 LABEL_NUSES (label) = 1;
23263 emit_jump_insn (gen_jump (*done_label));
23264 emit_barrier ();
23266 else
23267 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
23268 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
23270 /* Start memcpy for COUNT >= SIZE. */
23271 if (loop_label)
23273 emit_label (loop_label);
23274 LABEL_NUSES (loop_label) = 1;
23277 /* Copy first desired_align bytes. */
23278 if (!issetmem)
23279 srcmem = change_address (srcmem, mode, *srcptr);
23280 destmem = change_address (destmem, mode, *destptr);
23281 modesize = GEN_INT (GET_MODE_SIZE (mode));
23282 for (n = 0; prolog_size < desired_align - align; n++)
23284 if (issetmem)
23285 emit_move_insn (destmem, mode_value);
23286 else
23288 emit_move_insn (destmem, srcmem);
23289 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
23291 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
23292 prolog_size += GET_MODE_SIZE (mode);
23296 /* Copy last SIZE bytes. */
23297 destmem = offset_address (destmem, *count, 1);
23298 destmem = offset_address (destmem,
23299 GEN_INT (-size - prolog_size),
23301 if (issetmem)
23302 emit_move_insn (destmem, mode_value);
23303 else
23305 srcmem = offset_address (srcmem, *count, 1);
23306 srcmem = offset_address (srcmem,
23307 GEN_INT (-size - prolog_size),
23309 emit_move_insn (destmem, srcmem);
23311 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
23313 destmem = offset_address (destmem, modesize, 1);
23314 if (issetmem)
23315 emit_move_insn (destmem, mode_value);
23316 else
23318 srcmem = offset_address (srcmem, modesize, 1);
23319 emit_move_insn (destmem, srcmem);
23323 /* Align destination. */
23324 if (desired_align > 1 && desired_align > align)
23326 rtx saveddest = *destptr;
23328 gcc_assert (desired_align <= size);
23329 /* Align destptr up, placing it in a new register. */
23330 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
23331 GEN_INT (prolog_size),
23332 NULL_RTX, 1, OPTAB_DIRECT);
23333 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
23334 GEN_INT (-desired_align),
23335 *destptr, 1, OPTAB_DIRECT);
23336 /* See how many bytes we skipped. */
23337 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
23338 *destptr,
23339 saveddest, 1, OPTAB_DIRECT);
23340 /* Adjust srcptr and count. */
23341 if (!issetmem)
23342 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr, saveddest,
23343 *srcptr, 1, OPTAB_DIRECT);
23344 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
23345 saveddest, *count, 1, OPTAB_DIRECT);
23346 /* We copied at most size + prolog_size. */
23347 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
23348 *min_size = (*min_size - size) & ~(unsigned HOST_WIDE_INT)(size - 1);
23349 else
23350 *min_size = 0;
23352 /* Our loops always round down the block size, but for dispatch to the
23353 library call we need the precise value. */
23354 if (dynamic_check)
23355 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
23356 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
23358 else
23360 gcc_assert (prolog_size == 0);
23361 /* Decrease count, so we won't end up copying last word twice. */
23362 if (!CONST_INT_P (*count))
23363 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
23364 constm1_rtx, *count, 1, OPTAB_DIRECT);
23365 else
23366 *count = GEN_INT ((UINTVAL (*count) - 1) & ~(unsigned HOST_WIDE_INT)(size - 1));
23367 if (*min_size)
23368 *min_size = (*min_size - 1) & ~(unsigned HOST_WIDE_INT)(size - 1);
23373 /* This function is like the previous one, except here we know how many bytes
23374 need to be copied. That allows us to update alignment not only of DST, which
23375 is returned, but also of SRC, which is passed as a pointer for that
23376 reason. */
23377 static rtx
23378 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
23379 rtx srcreg, rtx value, rtx vec_value,
23380 int desired_align, int align_bytes,
23381 bool issetmem)
23383 rtx src = NULL;
23384 rtx orig_dst = dst;
23385 rtx orig_src = NULL;
23386 int piece_size = 1;
23387 int copied_bytes = 0;
23389 if (!issetmem)
23391 gcc_assert (srcp != NULL);
23392 src = *srcp;
23393 orig_src = src;
23396 for (piece_size = 1;
23397 piece_size <= desired_align && copied_bytes < align_bytes;
23398 piece_size <<= 1)
23400 if (align_bytes & piece_size)
23402 if (issetmem)
23404 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
23405 dst = emit_memset (dst, destreg, vec_value, piece_size);
23406 else
23407 dst = emit_memset (dst, destreg, value, piece_size);
23409 else
23410 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
23411 copied_bytes += piece_size;
23414 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
23415 set_mem_align (dst, desired_align * BITS_PER_UNIT);
23416 if (MEM_SIZE_KNOWN_P (orig_dst))
23417 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
23419 if (!issetmem)
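/* If the source's misalignment matches the destination's, then after copying
   ALIGN_BYTES bytes the source is aligned as well, so record the higher
   alignment on SRC too.  */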
23421 int src_align_bytes = get_mem_align_offset (src, desired_align
23422 * BITS_PER_UNIT);
23423 if (src_align_bytes >= 0)
23424 src_align_bytes = desired_align - src_align_bytes;
23425 if (src_align_bytes >= 0)
23427 unsigned int src_align;
23428 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
23430 if ((src_align_bytes & (src_align - 1))
23431 == (align_bytes & (src_align - 1)))
23432 break;
23434 if (src_align > (unsigned int) desired_align)
23435 src_align = desired_align;
23436 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
23437 set_mem_align (src, src_align * BITS_PER_UNIT);
23439 if (MEM_SIZE_KNOWN_P (orig_src))
23440 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
23441 *srcp = src;
23444 return dst;
23447 /* Return true if ALG can be used in current context.
23448 Assume we expand memset if MEMSET is true. */
23449 static bool
23450 alg_usable_p (enum stringop_alg alg, bool memset)
23452 if (alg == no_stringop)
23453 return false;
23454 if (alg == vector_loop)
23455 return TARGET_SSE || TARGET_AVX;
23456 /* Algorithms using the rep prefix want at least edi and ecx;
23457 additionally, memset wants eax and memcpy wants esi. Don't
23458 consider such algorithms if the user has appropriated those
23459 registers for their own purposes. */
23460 if (alg == rep_prefix_1_byte
23461 || alg == rep_prefix_4_byte
23462 || alg == rep_prefix_8_byte)
23463 return !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
23464 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
23465 return true;
23468 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
23469 static enum stringop_alg
23470 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
23471 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
23472 bool memset, bool zero_memset, int *dynamic_check, bool *noalign)
23474 const struct stringop_algs * algs;
23475 bool optimize_for_speed;
23476 int max = -1;
23477 const struct processor_costs *cost;
23478 int i;
23479 bool any_alg_usable_p = false;
23481 *noalign = false;
23482 *dynamic_check = -1;
23484 /* Even if the string operation call is cold, we still might spend a lot
23485 of time processing large blocks. */
23486 if (optimize_function_for_size_p (cfun)
23487 || (optimize_insn_for_size_p ()
23488 && (max_size < 256
23489 || (expected_size != -1 && expected_size < 256))))
23490 optimize_for_speed = false;
23491 else
23492 optimize_for_speed = true;
23494 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
23495 if (memset)
23496 algs = &cost->memset[TARGET_64BIT != 0];
23497 else
23498 algs = &cost->memcpy[TARGET_64BIT != 0];
23500 /* See maximal size for user defined algorithm. */
23501 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
23503 enum stringop_alg candidate = algs->size[i].alg;
23504 bool usable = alg_usable_p (candidate, memset);
23505 any_alg_usable_p |= usable;
23507 if (candidate != libcall && candidate && usable)
23508 max = algs->size[i].max;
23511 /* If the expected size is not known but the max size is small enough
23512 that the inline version is a win, set the expected size into
23513 the range. */
23514 if (max > 1 && (unsigned HOST_WIDE_INT) max >= max_size
23515 && expected_size == -1)
23516 expected_size = min_size / 2 + max_size / 2;
23518 /* If the user specified the algorithm, honor it if possible. */
23519 if (ix86_stringop_alg != no_stringop
23520 && alg_usable_p (ix86_stringop_alg, memset))
23521 return ix86_stringop_alg;
23522 /* rep; movq or rep; movl is the smallest variant. */
23523 else if (!optimize_for_speed)
23525 *noalign = true;
23526 if (!count || (count & 3) || (memset && !zero_memset))
23527 return alg_usable_p (rep_prefix_1_byte, memset)
23528 ? rep_prefix_1_byte : loop_1_byte;
23529 else
23530 return alg_usable_p (rep_prefix_4_byte, memset)
23531 ? rep_prefix_4_byte : loop;
23533 /* Very tiny blocks are best handled via the loop; REP is expensive to
23534 set up. */
23535 else if (expected_size != -1 && expected_size < 4)
23536 return loop_1_byte;
23537 else if (expected_size != -1)
23539 enum stringop_alg alg = libcall;
23540 bool alg_noalign = false;
23541 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
23543 /* We get here if the algorithms that were not libcall-based
23544 were rep-prefix based and we are unable to use rep prefixes
23545 based on global register usage. Break out of the loop and
23546 use the heuristic below. */
23547 if (algs->size[i].max == 0)
23548 break;
23549 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
23551 enum stringop_alg candidate = algs->size[i].alg;
23553 if (candidate != libcall && alg_usable_p (candidate, memset))
23555 alg = candidate;
23556 alg_noalign = algs->size[i].noalign;
23558 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
23559 last non-libcall inline algorithm. */
23560 if (TARGET_INLINE_ALL_STRINGOPS)
23562 /* When the current size is best to be copied by a libcall,
23563 but we are still forced to inline, run the heuristic below
23564 that will pick code for medium sized blocks. */
23565 if (alg != libcall)
23567 *noalign = alg_noalign;
23568 return alg;
23570 break;
23572 else if (alg_usable_p (candidate, memset))
23574 *noalign = algs->size[i].noalign;
23575 return candidate;
23580 /* When asked to inline the call anyway, try to pick a meaningful choice.
23581 We look for the maximal size of block that is faster to copy by hand and
23582 take blocks of at most that size, guessing that the average size will
23583 be roughly half of the maximum.
23585 If this turns out to be bad, we might simply specify the preferred
23586 choice in ix86_costs. */
23587 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
23588 && (algs->unknown_size == libcall
23589 || !alg_usable_p (algs->unknown_size, memset)))
23591 enum stringop_alg alg;
23593 /* If there aren't any usable algorithms, then recursing on
23594 smaller sizes isn't going to find anything. Just return the
23595 simple byte-at-a-time copy loop. */
23596 if (!any_alg_usable_p)
23598 /* Pick something reasonable. */
23599 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
23600 *dynamic_check = 128;
23601 return loop_1_byte;
23603 if (max == -1)
23604 max = 4096;
23605 alg = decide_alg (count, max / 2, min_size, max_size, memset,
23606 zero_memset, dynamic_check, noalign);
23607 gcc_assert (*dynamic_check == -1);
23608 gcc_assert (alg != libcall);
23609 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
23610 *dynamic_check = max;
23611 return alg;
23613 return (alg_usable_p (algs->unknown_size, memset)
23614 ? algs->unknown_size : libcall);
23617 /* Decide on alignment. We know that the operand is already aligned to ALIGN
23618 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
23619 static int
23620 decide_alignment (int align,
23621 enum stringop_alg alg,
23622 int expected_size,
23623 enum machine_mode move_mode)
23625 int desired_align = 0;
23627 gcc_assert (alg != no_stringop);
23629 if (alg == libcall)
23630 return 0;
23631 if (move_mode == VOIDmode)
23632 return 0;
23634 desired_align = GET_MODE_SIZE (move_mode);
23635 /* PentiumPro has special logic that triggers for 8-byte-aligned blocks,
23636 copying a whole cache line at once. */
23637 if (TARGET_PENTIUMPRO
23638 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
23639 desired_align = 8;
23641 if (optimize_size)
23642 desired_align = 1;
23643 if (desired_align < align)
23644 desired_align = align;
23645 if (expected_size != -1 && expected_size < 4)
23646 desired_align = align;
23648 return desired_align;
23652 /* Helper function for memset. For a QImode value 0xXY produce
23653 0xXYXYXYXY of the width specified by MODE. This is essentially
23654 a * 0x01010101, but we can do slightly better than
23655 synth_mult by unwinding the sequence by hand on CPUs with
23656 slow multiply. */
23657 static rtx
23658 promote_duplicated_reg (enum machine_mode mode, rtx val)
23660 enum machine_mode valmode = GET_MODE (val);
23661 rtx tmp;
23662 int nops = mode == DImode ? 3 : 2;
23664 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
23665 if (val == const0_rtx)
23666 return copy_to_mode_reg (mode, CONST0_RTX (mode));
23667 if (CONST_INT_P (val))
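/* For a compile-time constant just replicate the low byte directly:
   e.g. 0xAB becomes 0xABABABAB (SImode) or 0xABABABABABABABAB (DImode).  */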
23669 HOST_WIDE_INT v = INTVAL (val) & 255;
23671 v |= v << 8;
23672 v |= v << 16;
23673 if (mode == DImode)
23674 v |= (v << 16) << 16;
23675 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
23678 if (valmode == VOIDmode)
23679 valmode = QImode;
23680 if (valmode != QImode)
23681 val = gen_lowpart (QImode, val);
23682 if (mode == QImode)
23683 return val;
23684 if (!TARGET_PARTIAL_REG_STALL)
23685 nops--;
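/* Compare the cost of multiplying by the replicating constant against the
   cost of the shift-and-or sequence below and pick whichever is cheaper.  */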
23686 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
23687 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
23688 <= (ix86_cost->shift_const + ix86_cost->add) * nops
23689 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
23691 rtx reg = convert_modes (mode, QImode, val, true);
23692 tmp = promote_duplicated_reg (mode, const1_rtx);
23693 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
23694 OPTAB_DIRECT);
23696 else
23698 rtx reg = convert_modes (mode, QImode, val, true);
23700 if (!TARGET_PARTIAL_REG_STALL)
23701 if (mode == SImode)
23702 emit_insn (gen_movsi_insv_1 (reg, reg));
23703 else
23704 emit_insn (gen_movdi_insv_1 (reg, reg));
23705 else
23707 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
23708 NULL, 1, OPTAB_DIRECT);
23709 reg =
23710 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23712 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
23713 NULL, 1, OPTAB_DIRECT);
23714 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23715 if (mode == SImode)
23716 return reg;
23717 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
23718 NULL, 1, OPTAB_DIRECT);
23719 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23720 return reg;
23724 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
23725 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
23726 raising alignment from ALIGN to DESIRED_ALIGN. */
23727 static rtx
23728 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
23729 int align)
23731 rtx promoted_val;
23733 if (TARGET_64BIT
23734 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
23735 promoted_val = promote_duplicated_reg (DImode, val);
23736 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
23737 promoted_val = promote_duplicated_reg (SImode, val);
23738 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
23739 promoted_val = promote_duplicated_reg (HImode, val);
23740 else
23741 promoted_val = val;
23743 return promoted_val;
23746 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
23747 operations when profitable. The code depends upon architecture, block size
23748 and alignment, but always has one of the following overall structures:
23750 Aligned move sequence:
23752 1) Prologue guard: Conditional that jumps to the epilogue for small
23753 blocks that can be handled by the epilogue alone. This is faster,
23754 but also needed for correctness, since the prologue assumes the block
23755 is larger than the desired alignment.
23757 Optional dynamic check for size and libcall for large
23758 blocks is emitted here too, with -minline-stringops-dynamically.
23760 2) Prologue: copy first few bytes in order to get destination
23761 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
23762 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
23763 copied. We emit either a jump tree on power of two sized
23764 blocks, or a byte loop.
23766 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
23767 with specified algorithm.
23769 4) Epilogue: code copying tail of the block that is too small to be
23770 handled by main body (or up to size guarded by prologue guard).
23772 Misaligned move sequence:
23774 1) Misaligned move prologue/epilogue containing:
23775 a) Prologue handling small memory blocks and jumping to done_label
23776 (skipped if blocks are known to be large enough)
23777 b) Single possibly misaligned move copying the first DESIRED_ALIGN-ALIGN
23778 bytes if alignment is needed
23779 (skipped if alignment is not needed)
23780 c) Copy of the last SIZE_NEEDED bytes by possibly misaligned moves
23782 2) Zero size guard dispatching to done_label, if needed
23784 3) Dispatch to a library call, if needed
23786 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
23787 with the specified algorithm. */
23788 bool
23789 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
23790 rtx align_exp, rtx expected_align_exp,
23791 rtx expected_size_exp, rtx min_size_exp,
23792 rtx max_size_exp, rtx probable_max_size_exp,
23793 bool issetmem)
23795 rtx destreg;
23796 rtx srcreg = NULL;
23797 rtx label = NULL;
23798 rtx tmp;
23799 rtx jump_around_label = NULL;
23800 HOST_WIDE_INT align = 1;
23801 unsigned HOST_WIDE_INT count = 0;
23802 HOST_WIDE_INT expected_size = -1;
23803 int size_needed = 0, epilogue_size_needed;
23804 int desired_align = 0, align_bytes = 0;
23805 enum stringop_alg alg;
23806 rtx promoted_val = NULL;
23807 rtx vec_promoted_val = NULL;
23808 bool force_loopy_epilogue = false;
23809 int dynamic_check;
23810 bool need_zero_guard = false;
23811 bool noalign;
23812 enum machine_mode move_mode = VOIDmode;
23813 int unroll_factor = 1;
23814 /* TODO: Once value ranges are available, fill in proper data. */
23815 unsigned HOST_WIDE_INT min_size = 0;
23816 unsigned HOST_WIDE_INT max_size = -1;
23817 unsigned HOST_WIDE_INT probable_max_size = -1;
23818 bool misaligned_prologue_used = false;
23820 if (CONST_INT_P (align_exp))
23821 align = INTVAL (align_exp);
23822 /* i386 can do misaligned access at a reasonable extra cost. */
23823 if (CONST_INT_P (expected_align_exp)
23824 && INTVAL (expected_align_exp) > align)
23825 align = INTVAL (expected_align_exp);
23826 /* ALIGN is the minimum of destination and source alignment, but we care here
23827 just about destination alignment. */
23828 else if (!issetmem
23829 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
23830 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
23832 if (CONST_INT_P (count_exp))
23833 min_size = max_size = probable_max_size = count = expected_size
23834 = INTVAL (count_exp);
23835 else
23837 if (min_size_exp)
23838 min_size = INTVAL (min_size_exp);
23839 if (max_size_exp)
23840 max_size = INTVAL (max_size_exp);
23841 if (probable_max_size_exp)
23842 probable_max_size = INTVAL (probable_max_size_exp);
23843 if (CONST_INT_P (expected_size_exp) && count == 0)
23844 expected_size = INTVAL (expected_size_exp);
23847 /* Make sure we don't need to care about overflow later on. */
23848 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23849 return false;
23851 /* Step 0: Decide on preferred algorithm, desired alignment and
23852 size of chunks to be copied by main loop. */
23853 alg = decide_alg (count, expected_size, min_size, probable_max_size,
23854 issetmem,
23855 issetmem && val_exp == const0_rtx,
23856 &dynamic_check, &noalign);
23857 if (alg == libcall)
23858 return false;
23859 gcc_assert (alg != no_stringop);
23861 /* For now the vector version of memset is generated only for memory zeroing, as
23862 creating the promoted vector value is very cheap in this case. */
23863 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
23864 alg = unrolled_loop;
23866 if (!count)
23867 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
23868 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
23869 if (!issetmem)
23870 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
23872 unroll_factor = 1;
23873 move_mode = word_mode;
23874 switch (alg)
23876 case libcall:
23877 case no_stringop:
23878 case last_alg:
23879 gcc_unreachable ();
23880 case loop_1_byte:
23881 need_zero_guard = true;
23882 move_mode = QImode;
23883 break;
23884 case loop:
23885 need_zero_guard = true;
23886 break;
23887 case unrolled_loop:
23888 need_zero_guard = true;
23889 unroll_factor = (TARGET_64BIT ? 4 : 2);
23890 break;
23891 case vector_loop:
23892 need_zero_guard = true;
23893 unroll_factor = 4;
23894 /* Find the widest supported mode. */
23895 move_mode = word_mode;
23896 while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
23897 != CODE_FOR_nothing)
23898 move_mode = GET_MODE_WIDER_MODE (move_mode);
23900 /* Find the corresponding vector mode with the same size as MOVE_MODE.
23901 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
23902 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
23904 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
23905 move_mode = mode_for_vector (word_mode, nunits);
23906 if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
23907 move_mode = word_mode;
23909 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
23910 break;
23911 case rep_prefix_8_byte:
23912 move_mode = DImode;
23913 break;
23914 case rep_prefix_4_byte:
23915 move_mode = SImode;
23916 break;
23917 case rep_prefix_1_byte:
23918 move_mode = QImode;
23919 break;
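/* SIZE_NEEDED is the number of bytes the main loop handles per iteration;
   the epilogue must be able to cope with anything smaller than that.  */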
23921 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
23922 epilogue_size_needed = size_needed;
23924 desired_align = decide_alignment (align, alg, expected_size, move_mode);
23925 if (!TARGET_ALIGN_STRINGOPS || noalign)
23926 align = desired_align;
23928 /* Step 1: Prologue guard. */
23930 /* Alignment code needs count to be in register. */
23931 if (CONST_INT_P (count_exp) && desired_align > align)
23933 if (INTVAL (count_exp) > desired_align
23934 && INTVAL (count_exp) > size_needed)
23936 align_bytes
23937 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23938 if (align_bytes <= 0)
23939 align_bytes = 0;
23940 else
23941 align_bytes = desired_align - align_bytes;
23943 if (align_bytes == 0)
23944 count_exp = force_reg (counter_mode (count_exp), count_exp);
23946 gcc_assert (desired_align >= 1 && align >= 1);
23948 /* Misaligned move sequences handle both prologue and epilogue at once.
23949 Default code generation results in smaller code for large alignments
23950 and also avoids redundant work when sizes are known precisely. */
23951 misaligned_prologue_used
23952 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
23953 && MAX (desired_align, epilogue_size_needed) <= 32
23954 && desired_align <= epilogue_size_needed
23955 && ((desired_align > align && !align_bytes)
23956 || (!count && epilogue_size_needed > 1)));
23958 /* Do the cheap promotion to allow better CSE across the
23959 main loop and epilogue (i.e. one load of the big constant in
23960 front of all the code).
23961 For now the misaligned move sequences do not have a fast path
23962 without broadcasting. */
23963 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
23965 if (alg == vector_loop)
23967 gcc_assert (val_exp == const0_rtx);
23968 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
23969 promoted_val = promote_duplicated_reg_to_size (val_exp,
23970 GET_MODE_SIZE (word_mode),
23971 desired_align, align);
23973 else
23975 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23976 desired_align, align);
23979 /* Misaligned move sequences handle both prologues and epilogues at once.
23980 Default code generation results in smaller code for large alignments and
23981 also avoids redundant work when sizes are known precisely. */
23982 if (misaligned_prologue_used)
23984 /* The misaligned move prologue handles small blocks by itself. */
23985 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
23986 (dst, src, &destreg, &srcreg,
23987 move_mode, promoted_val, vec_promoted_val,
23988 &count_exp,
23989 &jump_around_label,
23990 desired_align < align
23991 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
23992 desired_align, align, &min_size, dynamic_check, issetmem);
23993 if (!issetmem)
23994 src = change_address (src, BLKmode, srcreg);
23995 dst = change_address (dst, BLKmode, destreg);
23996 set_mem_align (dst, desired_align * BITS_PER_UNIT);
23997 epilogue_size_needed = 0;
23998 if (need_zero_guard && !min_size)
24000 /* It is possible that we copied enough so the main loop will not
24001 execute. */
24002 gcc_assert (size_needed > 1);
24003 if (jump_around_label == NULL_RTX)
24004 jump_around_label = gen_label_rtx ();
24005 emit_cmp_and_jump_insns (count_exp,
24006 GEN_INT (size_needed),
24007 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
24008 if (expected_size == -1
24009 || expected_size < (desired_align - align) / 2 + size_needed)
24010 predict_jump (REG_BR_PROB_BASE * 20 / 100);
24011 else
24012 predict_jump (REG_BR_PROB_BASE * 60 / 100);
24015 /* Ensure that alignment prologue won't copy past end of block. */
24016 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
24018 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
24019 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
24020 Make sure it is a power of 2. */
24021 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
24023 /* To improve performance of small blocks, we jump around the VAL
24024 promoting code. This means that if the promoted VAL is not constant,
24025 we might not use it in the epilogue and have to use the byte
24026 loop variant. */
24027 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
24028 force_loopy_epilogue = true;
24029 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
24030 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
24032 /* If main algorithm works on QImode, no epilogue is needed.
24033 For small sizes just don't align anything. */
24034 if (size_needed == 1)
24035 desired_align = align;
24036 else
24037 goto epilogue;
24039 else if (!count
24040 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
24042 label = gen_label_rtx ();
24043 emit_cmp_and_jump_insns (count_exp,
24044 GEN_INT (epilogue_size_needed),
24045 LTU, 0, counter_mode (count_exp), 1, label);
24046 if (expected_size == -1 || expected_size < epilogue_size_needed)
24047 predict_jump (REG_BR_PROB_BASE * 60 / 100);
24048 else
24049 predict_jump (REG_BR_PROB_BASE * 20 / 100);
24053 /* Emit code to decide on runtime whether library call or inline should be
24054 used. */
24055 if (dynamic_check != -1)
24057 if (!issetmem && CONST_INT_P (count_exp))
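/* With a compile-time constant count, the choice between the inline
   expansion and the library call can be made right here.  */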
24059 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
24061 emit_block_move_via_libcall (dst, src, count_exp, false);
24062 count_exp = const0_rtx;
24063 goto epilogue;
24066 else
24068 rtx hot_label = gen_label_rtx ();
24069 jump_around_label = gen_label_rtx ();
24070 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
24071 LEU, 0, GET_MODE (count_exp), 1, hot_label);
24072 predict_jump (REG_BR_PROB_BASE * 90 / 100);
24073 if (issetmem)
24074 set_storage_via_libcall (dst, count_exp, val_exp, false);
24075 else
24076 emit_block_move_via_libcall (dst, src, count_exp, false);
24077 emit_jump (jump_around_label);
24078 emit_label (hot_label);
24082 /* Step 2: Alignment prologue. */
24083 /* Do the expensive promotion once we branched off the small blocks. */
24084 if (issetmem && !promoted_val)
24085 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
24086 desired_align, align);
24088 if (desired_align > align && !misaligned_prologue_used)
24090 if (align_bytes == 0)
24092 /* Except for the first move in the prologue, we no longer know
24093 the constant offset in aliasing info. It doesn't seem worth
24094 the pain to maintain it for the first move, so throw away
24095 the info early. */
24096 dst = change_address (dst, BLKmode, destreg);
24097 if (!issetmem)
24098 src = change_address (src, BLKmode, srcreg);
24099 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
24100 promoted_val, vec_promoted_val,
24101 count_exp, align, desired_align,
24102 issetmem);
24103 /* At most desired_align - align bytes are copied. */
24104 if (min_size < (unsigned)(desired_align - align))
24105 min_size = 0;
24106 else
24107 min_size -= desired_align - align;
24109 else
24111 /* If we know how many bytes need to be stored before dst is
24112 sufficiently aligned, maintain aliasing info accurately. */
24113 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
24114 srcreg,
24115 promoted_val,
24116 vec_promoted_val,
24117 desired_align,
24118 align_bytes,
24119 issetmem);
24121 count_exp = plus_constant (counter_mode (count_exp),
24122 count_exp, -align_bytes);
24123 count -= align_bytes;
24124 min_size -= align_bytes;
24125 max_size -= align_bytes;
24127 if (need_zero_guard
24128 && !min_size
24129 && (count < (unsigned HOST_WIDE_INT) size_needed
24130 || (align_bytes == 0
24131 && count < ((unsigned HOST_WIDE_INT) size_needed
24132 + desired_align - align))))
24134 /* It is possible that we copied enough so the main loop will not
24135 execute. */
24136 gcc_assert (size_needed > 1);
24137 if (label == NULL_RTX)
24138 label = gen_label_rtx ();
24139 emit_cmp_and_jump_insns (count_exp,
24140 GEN_INT (size_needed),
24141 LTU, 0, counter_mode (count_exp), 1, label);
24142 if (expected_size == -1
24143 || expected_size < (desired_align - align) / 2 + size_needed)
24144 predict_jump (REG_BR_PROB_BASE * 20 / 100);
24145 else
24146 predict_jump (REG_BR_PROB_BASE * 60 / 100);
24149 if (label && size_needed == 1)
24151 emit_label (label);
24152 LABEL_NUSES (label) = 1;
24153 label = NULL;
24154 epilogue_size_needed = 1;
24155 if (issetmem)
24156 promoted_val = val_exp;
24158 else if (label == NULL_RTX && !misaligned_prologue_used)
24159 epilogue_size_needed = size_needed;
24161 /* Step 3: Main loop. */
24163 switch (alg)
24165 case libcall:
24166 case no_stringop:
24167 case last_alg:
24168 gcc_unreachable ();
24169 case loop_1_byte:
24170 case loop:
24171 case unrolled_loop:
24172 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
24173 count_exp, move_mode, unroll_factor,
24174 expected_size, issetmem);
24175 break;
24176 case vector_loop:
24177 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
24178 vec_promoted_val, count_exp, move_mode,
24179 unroll_factor, expected_size, issetmem);
24180 break;
24181 case rep_prefix_8_byte:
24182 case rep_prefix_4_byte:
24183 case rep_prefix_1_byte:
24184 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
24185 val_exp, count_exp, move_mode, issetmem);
24186 break;
24188 /* Adjust properly the offset of src and dest memory for aliasing. */
24189 if (CONST_INT_P (count_exp))
24191 if (!issetmem)
24192 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
24193 (count / size_needed) * size_needed);
24194 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
24195 (count / size_needed) * size_needed);
24197 else
24199 if (!issetmem)
24200 src = change_address (src, BLKmode, srcreg);
24201 dst = change_address (dst, BLKmode, destreg);
24204 /* Step 4: Epilogue to copy the remaining bytes. */
24205 epilogue:
24206 if (label)
24208 /* When the main loop is done, COUNT_EXP might hold the original count,
24209 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
24210 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
24211 bytes. Compensate if needed. */
24213 if (size_needed < epilogue_size_needed)
24215 tmp =
24216 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
24217 GEN_INT (size_needed - 1), count_exp, 1,
24218 OPTAB_DIRECT);
24219 if (tmp != count_exp)
24220 emit_move_insn (count_exp, tmp);
24222 emit_label (label);
24223 LABEL_NUSES (label) = 1;
24226 if (count_exp != const0_rtx && epilogue_size_needed > 1)
24228 if (force_loopy_epilogue)
24229 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
24230 epilogue_size_needed);
24231 else
24233 if (issetmem)
24234 expand_setmem_epilogue (dst, destreg, promoted_val,
24235 vec_promoted_val, count_exp,
24236 epilogue_size_needed);
24237 else
24238 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
24239 epilogue_size_needed);
24242 if (jump_around_label)
24243 emit_label (jump_around_label);
24244 return true;
24248 /* Expand the appropriate insns for doing strlen if not just doing
24249 repnz; scasb
24251 out = result, initialized with the start address
24252 align_rtx = alignment of the address.
24253 scratch = scratch register, initialized with the start address when
24254 not aligned, otherwise undefined
24256 This is just the body. It needs the initializations mentioned above and
24257 some address computing at the end. These things are done in i386.md. */
24259 static void
24260 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
24262 int align;
24263 rtx tmp;
24264 rtx align_2_label = NULL_RTX;
24265 rtx align_3_label = NULL_RTX;
24266 rtx align_4_label = gen_label_rtx ();
24267 rtx end_0_label = gen_label_rtx ();
24268 rtx mem;
24269 rtx tmpreg = gen_reg_rtx (SImode);
24270 rtx scratch = gen_reg_rtx (SImode);
24271 rtx cmp;
24273 align = 0;
24274 if (CONST_INT_P (align_rtx))
24275 align = INTVAL (align_rtx);
24277 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
24279 /* Is there a known alignment and is it less than 4? */
24280 if (align < 4)
24282 rtx scratch1 = gen_reg_rtx (Pmode);
24283 emit_move_insn (scratch1, out);
24284 /* Is there a known alignment and is it not 2? */
24285 if (align != 2)
24287 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
24288 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
24290 /* Leave just the 3 lower bits. */
24291 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
24292 NULL_RTX, 0, OPTAB_WIDEN);
24294 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
24295 Pmode, 1, align_4_label);
24296 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
24297 Pmode, 1, align_2_label);
24298 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
24299 Pmode, 1, align_3_label);
24301 else
24303 /* Since the alignment is 2, we have to check 2 or 0 bytes;
24304 check whether it is aligned to 4 bytes. */
24306 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
24307 NULL_RTX, 0, OPTAB_WIDEN);
24309 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
24310 Pmode, 1, align_4_label);
24313 mem = change_address (src, QImode, out);
24315 /* Now compare the bytes. */
24317 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
24318 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
24319 QImode, 1, end_0_label);
24321 /* Increment the address. */
24322 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
24324 /* Not needed with an alignment of 2 */
24325 if (align != 2)
24327 emit_label (align_2_label);
24329 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
24330 end_0_label);
24332 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
24334 emit_label (align_3_label);
24337 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
24338 end_0_label);
24340 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
24343 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
24344 align this loop; doing so only enlarges the program without making it
24345 any faster. */
24346 emit_label (align_4_label);
24348 mem = change_address (src, SImode, out);
24349 emit_move_insn (scratch, mem);
24350 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
24352 /* This formula yields a nonzero result iff one of the bytes is zero.
24353 This saves three branches inside the loop and many cycles. */
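/* Concretely: tmpreg = (scratch - 0x01010101) & ~scratch & 0x80808080.
   The result is nonzero iff SCRATCH contains a zero byte, and its least
   significant set 0x80 bit marks the first such byte (higher bits may be
   set spuriously by borrows, but only the first zero byte matters here).  */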
24355 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
24356 emit_insn (gen_one_cmplsi2 (scratch, scratch));
24357 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
24358 emit_insn (gen_andsi3 (tmpreg, tmpreg,
24359 gen_int_mode (0x80808080, SImode)));
24360 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
24361 align_4_label);
24363 if (TARGET_CMOVE)
24365 rtx reg = gen_reg_rtx (SImode);
24366 rtx reg2 = gen_reg_rtx (Pmode);
24367 emit_move_insn (reg, tmpreg);
24368 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
24370 /* If zero is not in the first two bytes, move two bytes forward. */
24371 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
24372 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
24373 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
24374 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
24375 gen_rtx_IF_THEN_ELSE (SImode, tmp,
24376 reg,
24377 tmpreg)));
24378 /* Emit lea manually to avoid clobbering of flags. */
24379 emit_insn (gen_rtx_SET (SImode, reg2,
24380 gen_rtx_PLUS (Pmode, out, const2_rtx)));
24382 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
24383 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
24384 emit_insn (gen_rtx_SET (VOIDmode, out,
24385 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
24386 reg2,
24387 out)));
24389 else
24391 rtx end_2_label = gen_label_rtx ();
24392 /* Is zero in the first two bytes? */
24394 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
24395 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
24396 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
24397 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
24398 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
24399 pc_rtx);
24400 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
24401 JUMP_LABEL (tmp) = end_2_label;
24403 /* Not in the first two. Move two bytes forward. */
24404 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
24405 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
24407 emit_label (end_2_label);
24411 /* Avoid branch in fixing the byte. */
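/* At this point OUT is 4 past the start of the 2-byte pair containing the
   zero, and bit 7 of TMPREG's low byte is set iff the zero is the first
   byte of that pair.  Adding the byte to itself moves that bit into the
   carry flag, so OUT - 3 - carry is exactly the address of the zero byte.  */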
24412 tmpreg = gen_lowpart (QImode, tmpreg);
24413 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
24414 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
24415 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
24416 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
24418 emit_label (end_0_label);
24421 /* Expand strlen. */
24423 bool
24424 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
24426 rtx addr, scratch1, scratch2, scratch3, scratch4;
24428 /* The generic case of the strlen expander is long. Avoid expanding it
24429 unless TARGET_INLINE_ALL_STRINGOPS. */
24431 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
24432 && !TARGET_INLINE_ALL_STRINGOPS
24433 && !optimize_insn_for_size_p ()
24434 && (!CONST_INT_P (align) || INTVAL (align) < 4))
24435 return false;
24437 addr = force_reg (Pmode, XEXP (src, 0));
24438 scratch1 = gen_reg_rtx (Pmode);
24440 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
24441 && !optimize_insn_for_size_p ())
24443 /* Well it seems that some optimizer does not combine a call like
24444 foo(strlen(bar), strlen(bar));
24445 when the move and the subtraction are done here. It does calculate
24446 the length just once when these instructions are done inside
24447 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
24448 often used and I use one fewer register for the lifetime of
24449 output_strlen_unroll() this is better. */
24451 emit_move_insn (out, addr);
24453 ix86_expand_strlensi_unroll_1 (out, src, align);
24455 /* strlensi_unroll_1 returns the address of the zero at the end of
24456 the string, like memchr(), so compute the length by subtracting
24457 the start address. */
24458 emit_insn (ix86_gen_sub3 (out, out, addr));
24460 else
24462 rtx unspec;
24464 /* Can't use this if the user has appropriated eax, ecx, or edi. */
24465 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
24466 return false;
24468 scratch2 = gen_reg_rtx (Pmode);
24469 scratch3 = gen_reg_rtx (Pmode);
24470 scratch4 = force_reg (Pmode, constm1_rtx);
24472 emit_move_insn (scratch3, addr);
24473 eoschar = force_reg (QImode, eoschar);
24475 src = replace_equiv_address_nv (src, scratch3);
24477 /* If .md starts supporting :P, this can be done in .md. */
24478 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
24479 scratch4), UNSPEC_SCAS);
24480 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
24481 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
24482 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
24484 return true;
24487 /* For a given symbol (function) construct code to compute the address of its PLT
24488 entry in the large x86-64 PIC model. */
24489 static rtx
24490 construct_plt_address (rtx symbol)
24492 rtx tmp, unspec;
24494 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
24495 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
24496 gcc_assert (Pmode == DImode);
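/* @PLTOFF gives the offset of the symbol's PLT entry from the GOT base;
   adding the PIC register, which holds that base in the large PIC model,
   yields the absolute address.  */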
24498 tmp = gen_reg_rtx (Pmode);
24499 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
24501 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
24502 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
24503 return tmp;
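/* Expand a call and return the emitted call insn.  A rough summary, inferred
   from the code below: RETVAL, if nonnull, receives the call's value; FNADDR
   is a MEM giving the callee address; CALLARG1 is attached to the CALL rtx as
   its argument-bytes operand; CALLARG2 carries ABI details (a nonnegative
   value loads %al for 64-bit varargs, and -2 appears to suppress the extra
   MS-to-SysV clobbers); POP is the number of argument bytes the callee pops;
   SIBCALL selects a sibling-call sequence.  */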
24507 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
24508 rtx callarg2,
24509 rtx pop, bool sibcall)
24511 unsigned int const cregs_size
24512 = ARRAY_SIZE (x86_64_ms_sysv_extra_clobbered_registers);
24513 rtx vec[3 + cregs_size];
24514 rtx use = NULL, call;
24515 unsigned int vec_len = 0;
24517 if (pop == const0_rtx)
24518 pop = NULL;
24519 gcc_assert (!TARGET_64BIT || !pop);
24521 if (TARGET_MACHO && !TARGET_64BIT)
24523 #if TARGET_MACHO
24524 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
24525 fnaddr = machopic_indirect_call_target (fnaddr);
24526 #endif
24528 else
24530 /* Static functions and indirect calls don't need the pic register. */
24531 if (flag_pic
24532 && (!TARGET_64BIT
24533 || (ix86_cmodel == CM_LARGE_PIC
24534 && DEFAULT_ABI != MS_ABI))
24535 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
24536 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
24537 use_reg (&use, pic_offset_table_rtx);
24540 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
24542 rtx al = gen_rtx_REG (QImode, AX_REG);
24543 emit_move_insn (al, callarg2);
24544 use_reg (&use, al);
24547 if (ix86_cmodel == CM_LARGE_PIC
24548 && !TARGET_PECOFF
24549 && MEM_P (fnaddr)
24550 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
24551 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
24552 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
24553 else if (sibcall
24554 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
24555 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
24557 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
24558 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
24561 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
24562 if (retval)
24563 call = gen_rtx_SET (VOIDmode, retval, call);
24564 vec[vec_len++] = call;
24566 if (pop)
24568 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
24569 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
24570 vec[vec_len++] = pop;
24573 if (TARGET_64BIT_MS_ABI
24574 && (!callarg2 || INTVAL (callarg2) != -2))
24576 unsigned i;
24578 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
24579 UNSPEC_MS_TO_SYSV_CALL);
24581 for (i = 0; i < cregs_size; i++)
24583 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
24584 enum machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
24586 vec[vec_len++]
24587 = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (mode, regno));
24591 if (vec_len > 1)
24592 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
24593 call = emit_call_insn (call);
24594 if (use)
24595 CALL_INSN_FUNCTION_USAGE (call) = use;
24597 return call;
24600 /* Output the assembly for a call instruction. */
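/* For illustration, from the templates below: a direct call becomes
   "call\tfoo" and a direct sibcall "jmp\tfoo", an indirect call becomes
   "call\t*%reg", and an indirect sibcall on SEH targets is emitted with an
   explicit REX.W prefix, "rex.W jmp *%reg".  */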
24602 const char *
24603 ix86_output_call_insn (rtx insn, rtx call_op)
24605 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
24606 bool seh_nop_p = false;
24607 const char *xasm;
24609 if (SIBLING_CALL_P (insn))
24611 if (direct_p)
24612 xasm = "jmp\t%P0";
24613 /* SEH epilogue detection requires the indirect branch case
24614 to include REX.W. */
24615 else if (TARGET_SEH)
24616 xasm = "rex.W jmp %A0";
24617 else
24618 xasm = "jmp\t%A0";
24620 output_asm_insn (xasm, &call_op);
24621 return "";
24624 /* SEH unwinding can require an extra nop to be emitted in several
24625 circumstances. Determine if we have one of those. */
24626 if (TARGET_SEH)
24628 rtx i;
24630 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
24632 /* If we get to another real insn, we don't need the nop. */
24633 if (INSN_P (i))
24634 break;
24636 /* If we get to the epilogue note, prevent a catch region from
24637 being adjacent to the standard epilogue sequence. If non-
24638 call-exceptions, we'll have done this during epilogue emission. */
24639 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
24640 && !flag_non_call_exceptions
24641 && !can_throw_internal (insn))
24643 seh_nop_p = true;
24644 break;
24648 /* If we didn't find a real insn following the call, prevent the
24649 unwinder from looking into the next function. */
24650 if (i == NULL)
24651 seh_nop_p = true;
24654 if (direct_p)
24655 xasm = "call\t%P0";
24656 else
24657 xasm = "call\t%A0";
24659 output_asm_insn (xasm, &call_op);
24661 if (seh_nop_p)
24662 return "nop";
24664 return "";
24667 /* Clear stack slot assignments remembered from previous functions.
24668 This is called from INIT_EXPANDERS once before RTL is emitted for each
24669 function. */
24671 static struct machine_function *
24672 ix86_init_machine_status (void)
24674 struct machine_function *f;
24676 f = ggc_alloc_cleared_machine_function ();
24677 f->use_fast_prologue_epilogue_nregs = -1;
24678 f->call_abi = ix86_abi;
24680 return f;
24683 /* Return a MEM corresponding to a stack slot with mode MODE.
24684 Allocate a new slot if necessary.
24686 The RTL for a function can have several slots available: N is
24687 which slot to use. */
24690 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
24692 struct stack_local_entry *s;
24694 gcc_assert (n < MAX_386_STACK_LOCALS);
24696 for (s = ix86_stack_locals; s; s = s->next)
24697 if (s->mode == mode && s->n == n)
24698 return validize_mem (copy_rtx (s->rtl));
24700 s = ggc_alloc_stack_local_entry ();
24701 s->n = n;
24702 s->mode = mode;
24703 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
24705 s->next = ix86_stack_locals;
24706 ix86_stack_locals = s;
24707 return validize_mem (s->rtl);
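/* Instantiate the virtual registers remembered in the stack-slot RTL above;
   presumably installed as the TARGET_INSTANTIATE_DECLS hook.  */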
24710 static void
24711 ix86_instantiate_decls (void)
24713 struct stack_local_entry *s;
24715 for (s = ix86_stack_locals; s; s = s->next)
24716 if (s->rtl != NULL_RTX)
24717 instantiate_decl_rtl (s->rtl);
24720 /* Check whether x86 address PARTS is a pc-relative address. */
24722 static bool
24723 rip_relative_addr_p (struct ix86_address *parts)
24725 rtx base, index, disp;
24727 base = parts->base;
24728 index = parts->index;
24729 disp = parts->disp;
24731 if (disp && !base && !index)
24733 if (TARGET_64BIT)
24735 rtx symbol = disp;
24737 if (GET_CODE (disp) == CONST)
24738 symbol = XEXP (disp, 0);
24739 if (GET_CODE (symbol) == PLUS
24740 && CONST_INT_P (XEXP (symbol, 1)))
24741 symbol = XEXP (symbol, 0);
24743 if (GET_CODE (symbol) == LABEL_REF
24744 || (GET_CODE (symbol) == SYMBOL_REF
24745 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
24746 || (GET_CODE (symbol) == UNSPEC
24747 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
24748 || XINT (symbol, 1) == UNSPEC_PCREL
24749 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
24750 return true;
24753 return false;
24756 /* Calculate the length of the memory address in the instruction encoding.
24757 Includes addr32 prefix, does not include the one-byte modrm, opcode,
24758 or other prefixes. We never generate addr32 prefix for LEA insn. */
24761 memory_address_length (rtx addr, bool lea)
24763 struct ix86_address parts;
24764 rtx base, index, disp;
24765 int len;
24766 int ok;
24768 if (GET_CODE (addr) == PRE_DEC
24769 || GET_CODE (addr) == POST_INC
24770 || GET_CODE (addr) == PRE_MODIFY
24771 || GET_CODE (addr) == POST_MODIFY)
24772 return 0;
24774 ok = ix86_decompose_address (addr, &parts);
24775 gcc_assert (ok);
24777 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
24779 /* If this is not LEA instruction, add the length of addr32 prefix. */
24780 if (TARGET_64BIT && !lea
24781 && (SImode_address_operand (addr, VOIDmode)
24782 || (parts.base && GET_MODE (parts.base) == SImode)
24783 || (parts.index && GET_MODE (parts.index) == SImode)))
24784 len++;
24786 base = parts.base;
24787 index = parts.index;
24788 disp = parts.disp;
24790 if (base && GET_CODE (base) == SUBREG)
24791 base = SUBREG_REG (base);
24792 if (index && GET_CODE (index) == SUBREG)
24793 index = SUBREG_REG (index);
24795 gcc_assert (base == NULL_RTX || REG_P (base));
24796 gcc_assert (index == NULL_RTX || REG_P (index));
24798 /* Rule of thumb:
24799 - esp as the base always wants an index,
24800 - ebp as the base always wants a displacement,
24801 - r12 as the base always wants an index,
24802 - r13 as the base always wants a displacement. */
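/* For example: (%esp) and (%r12) can only be encoded with a SIB byte, so
   their register-indirect form costs an extra byte, while (%ebp) and (%r13)
   have no displacement-free encoding and are emitted as 0(%ebp)/0(%r13),
   costing an extra displacement byte.  */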
24804 /* Register Indirect. */
24805 if (base && !index && !disp)
24807 /* esp (for its index) and ebp (for its displacement) need
24808 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
24809 code. */
24810 if (base == arg_pointer_rtx
24811 || base == frame_pointer_rtx
24812 || REGNO (base) == SP_REG
24813 || REGNO (base) == BP_REG
24814 || REGNO (base) == R12_REG
24815 || REGNO (base) == R13_REG)
24816 len++;
24819 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
24820 is not disp32, but disp32(%rip), so for disp32
24821 SIB byte is needed, unless print_operand_address
24822 optimizes it into disp32(%rip) or (%rip) is implied
24823 by UNSPEC. */
24824 else if (disp && !base && !index)
24826 len += 4;
24827 if (rip_relative_addr_p (&parts))
24828 len++;
24830 else
24832 /* Find the length of the displacement constant. */
24833 if (disp)
24835 if (base && satisfies_constraint_K (disp))
24836 len += 1;
24837 else
24838 len += 4;
24840 /* ebp always wants a displacement. Similarly r13. */
24841 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
24842 len++;
24844 /* An index requires the two-byte modrm form.... */
24845 if (index
24846 /* ...like esp (or r12), which always wants an index. */
24847 || base == arg_pointer_rtx
24848 || base == frame_pointer_rtx
24849 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
24850 len++;
24853 return len;
24856 /* Compute the default value for the "length_immediate" attribute.  When
24857 SHORTFORM is set, expect that the insn has an 8-bit immediate alternative. */
24859 ix86_attr_length_immediate_default (rtx insn, bool shortform)
24861 int len = 0;
24862 int i;
24863 extract_insn_cached (insn);
24864 for (i = recog_data.n_operands - 1; i >= 0; --i)
24865 if (CONSTANT_P (recog_data.operand[i]))
24867 enum attr_mode mode = get_attr_mode (insn);
24869 gcc_assert (!len);
24870 if (shortform && CONST_INT_P (recog_data.operand[i]))
24872 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
24873 switch (mode)
24875 case MODE_QI:
24876 len = 1;
24877 continue;
24878 case MODE_HI:
24879 ival = trunc_int_for_mode (ival, HImode);
24880 break;
24881 case MODE_SI:
24882 ival = trunc_int_for_mode (ival, SImode);
24883 break;
24884 default:
24885 break;
24887 if (IN_RANGE (ival, -128, 127))
24889 len = 1;
24890 continue;
24893 switch (mode)
24895 case MODE_QI:
24896 len = 1;
24897 break;
24898 case MODE_HI:
24899 len = 2;
24900 break;
24901 case MODE_SI:
24902 len = 4;
24903 break;
24904 /* Immediates for DImode instructions are encoded
24905 as 32bit sign extended values. */
24906 case MODE_DI:
24907 len = 4;
24908 break;
24909 default:
24910 fatal_insn ("unknown insn mode", insn);
24913 return len;
24916 /* Compute default value for "length_address" attribute. */
24918 ix86_attr_length_address_default (rtx insn)
24920 int i;
24922 if (get_attr_type (insn) == TYPE_LEA)
24924 rtx set = PATTERN (insn), addr;
24926 if (GET_CODE (set) == PARALLEL)
24927 set = XVECEXP (set, 0, 0);
24929 gcc_assert (GET_CODE (set) == SET);
24931 addr = SET_SRC (set);
24933 return memory_address_length (addr, true);
24936 extract_insn_cached (insn);
24937 for (i = recog_data.n_operands - 1; i >= 0; --i)
24938 if (MEM_P (recog_data.operand[i]))
24940 constrain_operands_cached (reload_completed);
24941 if (which_alternative != -1)
24943 const char *constraints = recog_data.constraints[i];
24944 int alt = which_alternative;
24946 while (*constraints == '=' || *constraints == '+')
24947 constraints++;
24948 while (alt-- > 0)
24949 while (*constraints++ != ',')
24951 /* Skip ignored operands. */
24952 if (*constraints == 'X')
24953 continue;
24955 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
24957 return 0;
24960 /* Compute default value for "length_vex" attribute. It includes
24961 2 or 3 byte VEX prefix and 1 opcode byte. */
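/* For example (assuming the standard VEX encoding rules): an AVX insn in the
   0F opcode map whose operands avoid VEX.W and the extended index/base
   registers can use the two-byte C5 prefix, while VEX.W, the 0F38/0F3A maps,
   or extended registers in a memory operand require the three-byte C4 form.  */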
24964 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
24966 int i;
24968 /* Only 0f opcode can use 2 byte VEX prefix and VEX W bit uses 3
24969 byte VEX prefix. */
24970 if (!has_0f_opcode || has_vex_w)
24971 return 3 + 1;
24973 /* We can always use 2 byte VEX prefix in 32bit. */
24974 if (!TARGET_64BIT)
24975 return 2 + 1;
24977 extract_insn_cached (insn);
24979 for (i = recog_data.n_operands - 1; i >= 0; --i)
24980 if (REG_P (recog_data.operand[i]))
24982 /* REX.W bit uses 3 byte VEX prefix. */
24983 if (GET_MODE (recog_data.operand[i]) == DImode
24984 && GENERAL_REG_P (recog_data.operand[i]))
24985 return 3 + 1;
24987 else
24989 /* REX.X or REX.B bits use 3 byte VEX prefix. */
24990 if (MEM_P (recog_data.operand[i])
24991 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
24992 return 3 + 1;
24995 return 2 + 1;
24998 /* Return the maximum number of instructions a cpu can issue. */
25000 static int
25001 ix86_issue_rate (void)
25003 switch (ix86_tune)
25005 case PROCESSOR_PENTIUM:
25006 case PROCESSOR_ATOM:
25007 case PROCESSOR_SLM:
25008 case PROCESSOR_K6:
25009 case PROCESSOR_BTVER2:
25010 case PROCESSOR_PENTIUM4:
25011 case PROCESSOR_NOCONA:
25012 return 2;
25014 case PROCESSOR_PENTIUMPRO:
25015 case PROCESSOR_ATHLON:
25016 case PROCESSOR_K8:
25017 case PROCESSOR_AMDFAM10:
25018 case PROCESSOR_GENERIC:
25019 case PROCESSOR_BTVER1:
25020 return 3;
25022 case PROCESSOR_BDVER1:
25023 case PROCESSOR_BDVER2:
25024 case PROCESSOR_BDVER3:
25025 case PROCESSOR_BDVER4:
25026 case PROCESSOR_CORE2:
25027 case PROCESSOR_COREI7:
25028 case PROCESSOR_COREI7_AVX:
25029 case PROCESSOR_HASWELL:
25030 return 4;
25032 default:
25033 return 1;
25037 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags
25038 set by DEP_INSN and nothing else that DEP_INSN sets. */
25040 static bool
25041 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
25043 rtx set, set2;
25045 /* Simplify the test for uninteresting insns. */
25046 if (insn_type != TYPE_SETCC
25047 && insn_type != TYPE_ICMOV
25048 && insn_type != TYPE_FCMOV
25049 && insn_type != TYPE_IBR)
25050 return false;
25052 if ((set = single_set (dep_insn)) != 0)
25054 set = SET_DEST (set);
25055 set2 = NULL_RTX;
25057 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
25058 && XVECLEN (PATTERN (dep_insn), 0) == 2
25059 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
25060 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
25062 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
25063 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
25065 else
25066 return false;
25068 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
25069 return false;
25071 /* This test is true if the dependent insn reads the flags but
25072 not any other potentially set register. */
25073 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
25074 return false;
25076 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
25077 return false;
25079 return true;
25082 /* Return true iff USE_INSN has a memory address with operands set by
25083 SET_INSN. */
25085 bool
25086 ix86_agi_dependent (rtx set_insn, rtx use_insn)
25088 int i;
25089 extract_insn_cached (use_insn);
25090 for (i = recog_data.n_operands - 1; i >= 0; --i)
25091 if (MEM_P (recog_data.operand[i]))
25093 rtx addr = XEXP (recog_data.operand[i], 0);
25094 return modified_in_p (addr, set_insn) != 0;
25096 return false;
25099 /* Helper function for exact_store_load_dependency.
25100 Return true if addr is found in insn. */
25101 static bool
25102 exact_dependency_1 (rtx addr, rtx insn)
25104 enum rtx_code code;
25105 const char *format_ptr;
25106 int i, j;
25108 code = GET_CODE (insn);
25109 switch (code)
25111 case MEM:
25112 if (rtx_equal_p (addr, insn))
25113 return true;
25114 break;
25115 case REG:
25116 CASE_CONST_ANY:
25117 case SYMBOL_REF:
25118 case CODE_LABEL:
25119 case PC:
25120 case CC0:
25121 case EXPR_LIST:
25122 return false;
25123 default:
25124 break;
25127 format_ptr = GET_RTX_FORMAT (code);
25128 for (i = 0; i < GET_RTX_LENGTH (code); i++)
25130 switch (*format_ptr++)
25132 case 'e':
25133 if (exact_dependency_1 (addr, XEXP (insn, i)))
25134 return true;
25135 break;
25136 case 'E':
25137 for (j = 0; j < XVECLEN (insn, i); j++)
25138 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
25139 return true;
25140 break;
25143 return false;
25146 /* Return true if there exists exact dependency for store & load, i.e.
25147 the same memory address is used in them. */
25148 static bool
25149 exact_store_load_dependency (rtx store, rtx load)
25151 rtx set1, set2;
25153 set1 = single_set (store);
25154 if (!set1)
25155 return false;
25156 if (!MEM_P (SET_DEST (set1)))
25157 return false;
25158 set2 = single_set (load);
25159 if (!set2)
25160 return false;
25161 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
25162 return true;
25163 return false;
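/* Adjust COST, the scheduler's latency estimate for the dependence LINK from
   DEP_INSN to INSN, according to the tuning target; presumably this is the
   TARGET_SCHED_ADJUST_COST hook.  */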
25166 static int
25167 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
25169 enum attr_type insn_type, dep_insn_type;
25170 enum attr_memory memory;
25171 rtx set, set2;
25172 int dep_insn_code_number;
25174 /* Anti and output dependencies have zero cost on all CPUs. */
25175 if (REG_NOTE_KIND (link) != 0)
25176 return 0;
25178 dep_insn_code_number = recog_memoized (dep_insn);
25180 /* If we can't recognize the insns, we can't really do anything. */
25181 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
25182 return cost;
25184 insn_type = get_attr_type (insn);
25185 dep_insn_type = get_attr_type (dep_insn);
25187 switch (ix86_tune)
25189 case PROCESSOR_PENTIUM:
25190 /* Address Generation Interlock adds a cycle of latency. */
25191 if (insn_type == TYPE_LEA)
25193 rtx addr = PATTERN (insn);
25195 if (GET_CODE (addr) == PARALLEL)
25196 addr = XVECEXP (addr, 0, 0);
25198 gcc_assert (GET_CODE (addr) == SET);
25200 addr = SET_SRC (addr);
25201 if (modified_in_p (addr, dep_insn))
25202 cost += 1;
25204 else if (ix86_agi_dependent (dep_insn, insn))
25205 cost += 1;
25207 /* ??? Compares pair with jump/setcc. */
25208 if (ix86_flags_dependent (insn, dep_insn, insn_type))
25209 cost = 0;
25211 /* Floating point stores require value to be ready one cycle earlier. */
25212 if (insn_type == TYPE_FMOV
25213 && get_attr_memory (insn) == MEMORY_STORE
25214 && !ix86_agi_dependent (dep_insn, insn))
25215 cost += 1;
25216 break;
25218 case PROCESSOR_PENTIUMPRO:
25219 memory = get_attr_memory (insn);
25221 /* INT->FP conversion is expensive. */
25222 if (get_attr_fp_int_src (dep_insn))
25223 cost += 5;
25225 /* There is one cycle extra latency between an FP op and a store. */
25226 if (insn_type == TYPE_FMOV
25227 && (set = single_set (dep_insn)) != NULL_RTX
25228 && (set2 = single_set (insn)) != NULL_RTX
25229 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
25230 && MEM_P (SET_DEST (set2)))
25231 cost += 1;
25233 /* Show the ability of the reorder buffer to hide the latency of a load by
25234 executing it in parallel with the previous instruction when the
25235 previous instruction is not needed to compute the address. */
25236 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25237 && !ix86_agi_dependent (dep_insn, insn))
25239 /* Claim moves to take one cycle, as the core can issue one load
25240 at a time and the next load can start a cycle later. */
25241 if (dep_insn_type == TYPE_IMOV
25242 || dep_insn_type == TYPE_FMOV)
25243 cost = 1;
25244 else if (cost > 1)
25245 cost--;
25247 break;
25249 case PROCESSOR_K6:
25250 memory = get_attr_memory (insn);
25252 /* The esp dependency is resolved before the instruction is really
25253 finished. */
25254 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
25255 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
25256 return 1;
25258 /* INT->FP conversion is expensive. */
25259 if (get_attr_fp_int_src (dep_insn))
25260 cost += 5;
25262 /* Show the ability of the reorder buffer to hide the latency of a load by
25263 executing it in parallel with the previous instruction when the
25264 previous instruction is not needed to compute the address. */
25265 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25266 && !ix86_agi_dependent (dep_insn, insn))
25268 /* Claim moves to take one cycle, as the core can issue one load
25269 at a time and the next load can start a cycle later. */
25270 if (dep_insn_type == TYPE_IMOV
25271 || dep_insn_type == TYPE_FMOV)
25272 cost = 1;
25273 else if (cost > 2)
25274 cost -= 2;
25275 else
25276 cost = 1;
25278 break;
25280 case PROCESSOR_ATHLON:
25281 case PROCESSOR_K8:
25282 case PROCESSOR_AMDFAM10:
25283 case PROCESSOR_BDVER1:
25284 case PROCESSOR_BDVER2:
25285 case PROCESSOR_BDVER3:
25286 case PROCESSOR_BDVER4:
25287 case PROCESSOR_BTVER1:
25288 case PROCESSOR_BTVER2:
25289 case PROCESSOR_GENERIC:
25290 memory = get_attr_memory (insn);
25292 /* The stack engine allows push and pop instructions to execute in parallel. */
25293 if (((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
25294 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
25295 && (ix86_tune != PROCESSOR_ATHLON && ix86_tune != PROCESSOR_K8))
25296 return 0;
25298 /* Show the ability of the reorder buffer to hide the latency of a load by
25299 executing it in parallel with the previous instruction when the
25300 previous instruction is not needed to compute the address. */
25301 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25302 && !ix86_agi_dependent (dep_insn, insn))
25304 enum attr_unit unit = get_attr_unit (insn);
25305 int loadcost = 3;
25307 /* Because of the difference between the length of integer and
25308 floating unit pipeline preparation stages, the memory operands
25309 for floating point are cheaper.
25311 ??? For Athlon the difference is most probably 2. */
25312 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
25313 loadcost = 3;
25314 else
25315 loadcost = TARGET_ATHLON ? 2 : 0;
25317 if (cost >= loadcost)
25318 cost -= loadcost;
25319 else
25320 cost = 0;
25322 break;
25324 case PROCESSOR_CORE2:
25325 case PROCESSOR_COREI7:
25326 case PROCESSOR_COREI7_AVX:
25327 case PROCESSOR_HASWELL:
25328 memory = get_attr_memory (insn);
25330 /* The stack engine allows push and pop instructions to execute in parallel. */
25331 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
25332 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
25333 return 0;
25335 /* Show the ability of the reorder buffer to hide the latency of a load by
25336 executing it in parallel with the previous instruction when the
25337 previous instruction is not needed to compute the address. */
25338 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25339 && !ix86_agi_dependent (dep_insn, insn))
25341 if (cost >= 4)
25342 cost -= 4;
25343 else
25344 cost = 0;
25346 break;
25348 case PROCESSOR_SLM:
25349 if (!reload_completed)
25350 return cost;
25352 /* Increase cost of integer loads. */
25353 memory = get_attr_memory (dep_insn);
25354 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25356 enum attr_unit unit = get_attr_unit (dep_insn);
25357 if (unit == UNIT_INTEGER && cost == 1)
25359 if (memory == MEMORY_LOAD)
25360 cost = 3;
25361 else
25363 /* Increase cost of ld/st for short int types only
25364 because of store forwarding issue. */
25365 rtx set = single_set (dep_insn);
25366 if (set && (GET_MODE (SET_DEST (set)) == QImode
25367 || GET_MODE (SET_DEST (set)) == HImode))
25369 /* Increase cost of store/load insn if exact
25370 dependence exists and it is load insn. */
25371 enum attr_memory insn_memory = get_attr_memory (insn);
25372 if (insn_memory == MEMORY_LOAD
25373 && exact_store_load_dependency (dep_insn, insn))
25374 cost = 3;
25380 default:
25381 break;
25384 return cost;
25387 /* How many alternative schedules to try. This should be as wide as the
25388 scheduling freedom in the DFA, but no wider. Making this value too
25389 large results in extra work for the scheduler. */
25391 static int
25392 ia32_multipass_dfa_lookahead (void)
25394 switch (ix86_tune)
25396 case PROCESSOR_PENTIUM:
25397 return 2;
25399 case PROCESSOR_PENTIUMPRO:
25400 case PROCESSOR_K6:
25401 return 1;
25403 case PROCESSOR_BDVER1:
25404 case PROCESSOR_BDVER2:
25405 case PROCESSOR_BDVER3:
25406 case PROCESSOR_BDVER4:
25407 /* We use lookahead value 4 for BD both before and after reload
25408 schedules. Plan is to have value 8 included for O3. */
25409 return 4;
25411 case PROCESSOR_CORE2:
25412 case PROCESSOR_COREI7:
25413 case PROCESSOR_COREI7_AVX:
25414 case PROCESSOR_HASWELL:
25415 case PROCESSOR_ATOM:
25416 case PROCESSOR_SLM:
25417 /* Generally, we want haifa-sched:max_issue() to look ahead as far
25418 as many instructions can be executed on a cycle, i.e.,
25419 issue_rate. I wonder why tuning for many CPUs does not do this. */
25420 if (reload_completed)
25421 return ix86_issue_rate ();
25422 /* Don't use lookahead for pre-reload schedule to save compile time. */
25423 return 0;
25425 default:
25426 return 0;
25430 /* Return true if target platform supports macro-fusion. */
25432 static bool
25433 ix86_macro_fusion_p ()
25435 return TARGET_FUSE_CMP_AND_BRANCH;
25438 /* Check whether the current microarchitecture supports macro fusion
25439 for the insn pair "CONDGEN + CONDJMP". Refer to
25440 "Intel Architectures Optimization Reference Manual". */
25442 static bool
25443 ix86_macro_fusion_pair_p (rtx condgen, rtx condjmp)
25445 rtx src, dest;
25446 rtx single_set = single_set (condgen);
25447 enum rtx_code ccode;
25448 rtx compare_set = NULL_RTX, test_if, cond;
25449 rtx alu_set = NULL_RTX, addr = NULL_RTX;
25451 if (get_attr_type (condgen) != TYPE_TEST
25452 && get_attr_type (condgen) != TYPE_ICMP
25453 && get_attr_type (condgen) != TYPE_INCDEC
25454 && get_attr_type (condgen) != TYPE_ALU)
25455 return false;
25457 if (single_set == NULL_RTX
25458 && !TARGET_FUSE_ALU_AND_BRANCH)
25459 return false;
25461 if (single_set != NULL_RTX)
25462 compare_set = single_set;
25463 else
25465 int i;
25466 rtx pat = PATTERN (condgen);
25467 for (i = 0; i < XVECLEN (pat, 0); i++)
25468 if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
25470 rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
25471 if (GET_CODE (set_src) == COMPARE)
25472 compare_set = XVECEXP (pat, 0, i);
25473 else
25474 alu_set = XVECEXP (pat, 0, i);
25477 if (compare_set == NULL_RTX)
25478 return false;
25479 src = SET_SRC (compare_set);
25480 if (GET_CODE (src) != COMPARE)
25481 return false;
25483 /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
25484 supported. */
25485 if ((MEM_P (XEXP (src, 0))
25486 && CONST_INT_P (XEXP (src, 1)))
25487 || (MEM_P (XEXP (src, 1))
25488 && CONST_INT_P (XEXP (src, 0))))
25489 return false;
25491 /* No fusion for RIP-relative address. */
25492 if (MEM_P (XEXP (src, 0)))
25493 addr = XEXP (XEXP (src, 0), 0);
25494 else if (MEM_P (XEXP (src, 1)))
25495 addr = XEXP (XEXP (src, 1), 0);
25497 if (addr) {
25498 ix86_address parts;
25499 int ok = ix86_decompose_address (addr, &parts);
25500 gcc_assert (ok);
25502 if (rip_relative_addr_p (&parts))
25503 return false;
25506 test_if = SET_SRC (pc_set (condjmp));
25507 cond = XEXP (test_if, 0);
25508 ccode = GET_CODE (cond);
25509 /* Check whether the conditional jump uses the Sign or Overflow flags. */
25510 if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
25511 && (ccode == GE
25512 || ccode == GT
25513 || ccode == LE
25514 || ccode == LT))
25515 return false;
25517 /* Return true for TYPE_TEST and TYPE_ICMP. */
25518 if (get_attr_type (condgen) == TYPE_TEST
25519 || get_attr_type (condgen) == TYPE_ICMP)
25520 return true;
25522 /* The following handles the case of macro-fusion for alu + jmp. */
25523 if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
25524 return false;
25526 /* No fusion for alu op with memory destination operand. */
25527 dest = SET_DEST (alu_set);
25528 if (MEM_P (dest))
25529 return false;
25531 /* Macro-fusion for inc/dec + unsigned conditional jump is not
25532 supported. */
25533 if (get_attr_type (condgen) == TYPE_INCDEC
25534 && (ccode == GEU
25535 || ccode == GTU
25536 || ccode == LEU
25537 || ccode == LTU))
25538 return false;
25540 return true;
25543 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
25544 execution. It is applied if
25545 (1) an IMUL instruction is at the top of the list;
25546 (2) there is exactly one producer of an independent IMUL instruction in
25547 the ready list.
25548 Return the index of the IMUL producer if it was found and -1 otherwise. */
25549 static int
25550 do_reorder_for_imul (rtx *ready, int n_ready)
25552 rtx insn, set, insn1, insn2;
25553 sd_iterator_def sd_it;
25554 dep_t dep;
25555 int index = -1;
25556 int i;
25558 if (ix86_tune != PROCESSOR_ATOM)
25559 return index;
25561 /* Check that IMUL instruction is on the top of ready list. */
25562 insn = ready[n_ready - 1];
25563 set = single_set (insn);
25564 if (!set)
25565 return index;
25566 if (!(GET_CODE (SET_SRC (set)) == MULT
25567 && GET_MODE (SET_SRC (set)) == SImode))
25568 return index;
25570 /* Search for producer of independent IMUL instruction. */
25571 for (i = n_ready - 2; i >= 0; i--)
25573 insn = ready[i];
25574 if (!NONDEBUG_INSN_P (insn))
25575 continue;
25576 /* Skip IMUL instruction. */
25577 insn2 = PATTERN (insn);
25578 if (GET_CODE (insn2) == PARALLEL)
25579 insn2 = XVECEXP (insn2, 0, 0);
25580 if (GET_CODE (insn2) == SET
25581 && GET_CODE (SET_SRC (insn2)) == MULT
25582 && GET_MODE (SET_SRC (insn2)) == SImode)
25583 continue;
25585 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
25587 rtx con;
25588 con = DEP_CON (dep);
25589 if (!NONDEBUG_INSN_P (con))
25590 continue;
25591 insn1 = PATTERN (con);
25592 if (GET_CODE (insn1) == PARALLEL)
25593 insn1 = XVECEXP (insn1, 0, 0);
25595 if (GET_CODE (insn1) == SET
25596 && GET_CODE (SET_SRC (insn1)) == MULT
25597 && GET_MODE (SET_SRC (insn1)) == SImode)
25599 sd_iterator_def sd_it1;
25600 dep_t dep1;
25601 /* Check if there is no other dependee for IMUL. */
25602 index = i;
25603 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
25605 rtx pro;
25606 pro = DEP_PRO (dep1);
25607 if (!NONDEBUG_INSN_P (pro))
25608 continue;
25609 if (pro != insn)
25610 index = -1;
25612 if (index >= 0)
25613 break;
25616 if (index >= 0)
25617 break;
25619 return index;
25622 /* Try to find the best candidate at the top of the ready list if two insns
25623 have the same priority - a candidate is best if its dependees were
25624 scheduled earlier. Applied to Silvermont only.
25625 Return true if the top two insns must be interchanged. */
25626 static bool
25627 swap_top_of_ready_list (rtx *ready, int n_ready)
25629 rtx top = ready[n_ready - 1];
25630 rtx next = ready[n_ready - 2];
25631 rtx set;
25632 sd_iterator_def sd_it;
25633 dep_t dep;
25634 int clock1 = -1;
25635 int clock2 = -1;
25636 #define INSN_TICK(INSN) (HID (INSN)->tick)
25638 if (ix86_tune != PROCESSOR_SLM)
25639 return false;
25641 if (!NONDEBUG_INSN_P (top))
25642 return false;
25643 if (!NONJUMP_INSN_P (top))
25644 return false;
25645 if (!NONDEBUG_INSN_P (next))
25646 return false;
25647 if (!NONJUMP_INSN_P (next))
25648 return false;
25649 set = single_set (top);
25650 if (!set)
25651 return false;
25652 set = single_set (next);
25653 if (!set)
25654 return false;
25656 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
25658 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
25659 return false;
25660 /* Determine the winner more precisely. */
25661 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
25663 rtx pro;
25664 pro = DEP_PRO (dep);
25665 if (!NONDEBUG_INSN_P (pro))
25666 continue;
25667 if (INSN_TICK (pro) > clock1)
25668 clock1 = INSN_TICK (pro);
25670 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
25672 rtx pro;
25673 pro = DEP_PRO (dep);
25674 if (!NONDEBUG_INSN_P (pro))
25675 continue;
25676 if (INSN_TICK (pro) > clock2)
25677 clock2 = INSN_TICK (pro);
25680 if (clock1 == clock2)
25682 /* Determine winner - load must win. */
25683 enum attr_memory memory1, memory2;
25684 memory1 = get_attr_memory (top);
25685 memory2 = get_attr_memory (next);
25686 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
25687 return true;
25689 return (bool) (clock2 < clock1);
25691 return false;
25692 #undef INSN_TICK
25695 /* Perform possible reordering of the ready list, for Atom/Silvermont only.
25696 Return issue rate. */
25697 static int
25698 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
25699 int clock_var)
25701 int issue_rate = -1;
25702 int n_ready = *pn_ready;
25703 int i;
25704 rtx insn;
25705 int index = -1;
25707 /* Set up issue rate. */
25708 issue_rate = ix86_issue_rate ();
25710 /* Do reordering for Atom/SLM only. */
25711 if (ix86_tune != PROCESSOR_ATOM && ix86_tune != PROCESSOR_SLM)
25712 return issue_rate;
25714 /* Nothing to do if ready list contains only 1 instruction. */
25715 if (n_ready <= 1)
25716 return issue_rate;
25718 /* Do reordering for the post-reload scheduler only. */
25719 if (!reload_completed)
25720 return issue_rate;
25722 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
25724 if (sched_verbose > 1)
25725 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
25726 INSN_UID (ready[index]));
25728 /* Put IMUL producer (ready[index]) at the top of ready list. */
25729 insn = ready[index];
25730 for (i = index; i < n_ready - 1; i++)
25731 ready[i] = ready[i + 1];
25732 ready[n_ready - 1] = insn;
25733 return issue_rate;
25735 if (clock_var != 0 && swap_top_of_ready_list (ready, n_ready))
25737 if (sched_verbose > 1)
25738 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
25739 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
25740 /* Swap 2 top elements of ready list. */
25741 insn = ready[n_ready - 1];
25742 ready[n_ready - 1] = ready[n_ready - 2];
25743 ready[n_ready - 2] = insn;
25745 return issue_rate;
25748 static bool
25749 ix86_class_likely_spilled_p (reg_class_t);
25751 /* Return true if the lhs of INSN is a HW function argument register; set
25752 *IS_SPILLED to true if it is a likely-spilled HW register. */
25753 static bool
25754 insn_is_function_arg (rtx insn, bool* is_spilled)
25756 rtx dst;
25758 if (!NONDEBUG_INSN_P (insn))
25759 return false;
25760 /* Call instructions are not movable; ignore them. */
25761 if (CALL_P (insn))
25762 return false;
25763 insn = PATTERN (insn);
25764 if (GET_CODE (insn) == PARALLEL)
25765 insn = XVECEXP (insn, 0, 0);
25766 if (GET_CODE (insn) != SET)
25767 return false;
25768 dst = SET_DEST (insn);
25769 if (REG_P (dst) && HARD_REGISTER_P (dst)
25770 && ix86_function_arg_regno_p (REGNO (dst)))
25772 /* Is it a likely-spilled HW register? */
25773 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
25774 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
25775 *is_spilled = true;
25776 return true;
25778 return false;
25781 /* Add output dependencies for a chain of adjacent function arguments, but
25782 only if there is a move to a likely-spilled HW register. Return the first
25783 argument if at least one dependence was added, or NULL otherwise. */
25784 static rtx
25785 add_parameter_dependencies (rtx call, rtx head)
25787 rtx insn;
25788 rtx last = call;
25789 rtx first_arg = NULL;
25790 bool is_spilled = false;
25792 head = PREV_INSN (head);
25794 /* Find the argument-passing instruction nearest to the call. */
25795 while (true)
25797 last = PREV_INSN (last);
25798 if (last == head)
25799 return NULL;
25800 if (!NONDEBUG_INSN_P (last))
25801 continue;
25802 if (insn_is_function_arg (last, &is_spilled))
25803 break;
25804 return NULL;
25807 first_arg = last;
25808 while (true)
25810 insn = PREV_INSN (last);
25811 if (!INSN_P (insn))
25812 break;
25813 if (insn == head)
25814 break;
25815 if (!NONDEBUG_INSN_P (insn))
25817 last = insn;
25818 continue;
25820 if (insn_is_function_arg (insn, &is_spilled))
25822 /* Add an output dependence between two function arguments if the chain
25823 of output arguments contains likely-spilled HW registers. */
25824 if (is_spilled)
25825 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
25826 first_arg = last = insn;
25828 else
25829 break;
25831 if (!is_spilled)
25832 return NULL;
25833 return first_arg;
25836 /* Add output or anti dependency from insn to first_arg to restrict its code
25837 motion. */
25838 static void
25839 avoid_func_arg_motion (rtx first_arg, rtx insn)
25841 rtx set;
25842 rtx tmp;
25844 set = single_set (insn);
25845 if (!set)
25846 return;
25847 tmp = SET_DEST (set);
25848 if (REG_P (tmp))
25850 /* Add output dependency to the first function argument. */
25851 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
25852 return;
25854 /* Add anti dependency. */
25855 add_dependence (first_arg, insn, REG_DEP_ANTI);
25858 /* Avoid cross block motion of function argument through adding dependency
25859 from the first non-jump instruction in bb. */
25860 static void
25861 add_dependee_for_func_arg (rtx arg, basic_block bb)
25863 rtx insn = BB_END (bb);
25865 while (insn)
25867 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
25869 rtx set = single_set (insn);
25870 if (set)
25872 avoid_func_arg_motion (arg, insn);
25873 return;
25876 if (insn == BB_HEAD (bb))
25877 return;
25878 insn = PREV_INSN (insn);
25882 /* Hook for pre-reload schedule - avoid motion of function arguments
25883 passed in likely spilled HW registers. */
25884 static void
25885 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
25887 rtx insn;
25888 rtx first_arg = NULL;
25889 if (reload_completed)
25890 return;
25891 while (head != tail && DEBUG_INSN_P (head))
25892 head = NEXT_INSN (head);
25893 for (insn = tail; insn != head; insn = PREV_INSN (insn))
25894 if (INSN_P (insn) && CALL_P (insn))
25896 first_arg = add_parameter_dependencies (insn, head);
25897 if (first_arg)
25899 /* Add a dependee for the first argument to predecessors, but only if the
25900 region contains more than one block. */
25901 basic_block bb = BLOCK_FOR_INSN (insn);
25902 int rgn = CONTAINING_RGN (bb->index);
25903 int nr_blks = RGN_NR_BLOCKS (rgn);
25904 /* Skip trivial regions and region head blocks that can have
25905 predecessors outside of region. */
25906 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
25908 edge e;
25909 edge_iterator ei;
25910 /* Assume that region is SCC, i.e. all immediate predecessors
25911 of non-head block are in the same region. */
25912 FOR_EACH_EDGE (e, ei, bb->preds)
25914 /* Avoid creating loop-carried dependencies by using the
25915 topological ordering in the region. */
25916 if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
25917 add_dependee_for_func_arg (first_arg, e->src);
25920 insn = first_arg;
25921 if (insn == head)
25922 break;
25925 else if (first_arg)
25926 avoid_func_arg_motion (first_arg, insn);
25929 /* Hook for pre-reload schedule - set priority of moves from likely spilled
25930 HW registers to the maximum, to schedule them as soon as possible. These are
25931 moves from function argument registers at the top of the function entry
25932 and moves from function return value registers after call. */
25933 static int
25934 ix86_adjust_priority (rtx insn, int priority)
25936 rtx set;
25938 if (reload_completed)
25939 return priority;
25941 if (!NONDEBUG_INSN_P (insn))
25942 return priority;
25944 set = single_set (insn);
25945 if (set)
25947 rtx tmp = SET_SRC (set);
25948 if (REG_P (tmp)
25949 && HARD_REGISTER_P (tmp)
25950 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
25951 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
25952 return current_sched_info->sched_max_insns_priority;
25955 return priority;
25958 /* Model decoder of Core 2/i7.
25959 Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
25960 track the instruction fetch block boundaries and make sure that long
25961 (9+ bytes) instructions are assigned to D0. */
25963 /* Maximum length of an insn that can be handled by
25964 a secondary decoder unit. '8' for Core 2/i7. */
25965 static int core2i7_secondary_decoder_max_insn_size;
25967 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
25968 '16' for Core 2/i7. */
25969 static int core2i7_ifetch_block_size;
25971 /* Maximum number of instructions decoder can handle per cycle.
25972 '6' for Core 2/i7. */
25973 static int core2i7_ifetch_block_max_insns;
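/* Taken together with the values set in ix86_sched_init_global below, the
   model accepts at most core2i7_ifetch_block_max_insns insns totalling at
   most core2i7_ifetch_block_size bytes per cycle, and only the first insn
   issued in a cycle may exceed core2i7_secondary_decoder_max_insn_size
   bytes.  */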
25975 typedef struct ix86_first_cycle_multipass_data_ *
25976 ix86_first_cycle_multipass_data_t;
25977 typedef const struct ix86_first_cycle_multipass_data_ *
25978 const_ix86_first_cycle_multipass_data_t;
25980 /* A variable to store target state across calls to max_issue within
25981 one cycle. */
25982 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
25983 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
25985 /* Initialize DATA. */
25986 static void
25987 core2i7_first_cycle_multipass_init (void *_data)
25989 ix86_first_cycle_multipass_data_t data
25990 = (ix86_first_cycle_multipass_data_t) _data;
25992 data->ifetch_block_len = 0;
25993 data->ifetch_block_n_insns = 0;
25994 data->ready_try_change = NULL;
25995 data->ready_try_change_size = 0;
25998 /* Advancing the cycle; reset ifetch block counts. */
25999 static void
26000 core2i7_dfa_post_advance_cycle (void)
26002 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
26004 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
26006 data->ifetch_block_len = 0;
26007 data->ifetch_block_n_insns = 0;
26010 static int min_insn_size (rtx);
26012 /* Filter out insns from ready_try that the core will not be able to issue
26013 on current cycle due to decoder. */
26014 static void
26015 core2i7_first_cycle_multipass_filter_ready_try
26016 (const_ix86_first_cycle_multipass_data_t data,
26017 char *ready_try, int n_ready, bool first_cycle_insn_p)
26019 while (n_ready--)
26021 rtx insn;
26022 int insn_size;
26024 if (ready_try[n_ready])
26025 continue;
26027 insn = get_ready_element (n_ready);
26028 insn_size = min_insn_size (insn);
26030 if (/* If this insn is too long for a secondary decoder ... */
26031 (!first_cycle_insn_p
26032 && insn_size > core2i7_secondary_decoder_max_insn_size)
26033 /* ... or it would not fit into the ifetch block ... */
26034 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
26035 /* ... or the decoder is full already ... */
26036 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
26037 /* ... mask the insn out. */
26039 ready_try[n_ready] = 1;
26041 if (data->ready_try_change)
26042 bitmap_set_bit (data->ready_try_change, n_ready);
26047 /* Prepare for a new round of multipass lookahead scheduling. */
26048 static void
26049 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
26050 bool first_cycle_insn_p)
26052 ix86_first_cycle_multipass_data_t data
26053 = (ix86_first_cycle_multipass_data_t) _data;
26054 const_ix86_first_cycle_multipass_data_t prev_data
26055 = ix86_first_cycle_multipass_data;
26057 /* Restore the state from the end of the previous round. */
26058 data->ifetch_block_len = prev_data->ifetch_block_len;
26059 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
26061 /* Filter instructions that cannot be issued on current cycle due to
26062 decoder restrictions. */
26063 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
26064 first_cycle_insn_p);
26067 /* INSN is being issued in current solution. Account for its impact on
26068 the decoder model. */
26069 static void
26070 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
26071 rtx insn, const void *_prev_data)
26073 ix86_first_cycle_multipass_data_t data
26074 = (ix86_first_cycle_multipass_data_t) _data;
26075 const_ix86_first_cycle_multipass_data_t prev_data
26076 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
26078 int insn_size = min_insn_size (insn);
26080 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
26081 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
26082 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
26083 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
26085 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
26086 if (!data->ready_try_change)
26088 data->ready_try_change = sbitmap_alloc (n_ready);
26089 data->ready_try_change_size = n_ready;
26091 else if (data->ready_try_change_size < n_ready)
26093 data->ready_try_change = sbitmap_resize (data->ready_try_change,
26094 n_ready, 0);
26095 data->ready_try_change_size = n_ready;
26097 bitmap_clear (data->ready_try_change);
26099 /* Filter out insns from ready_try that the core will not be able to issue
26100 on current cycle due to decoder. */
26101 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
26102 false);
26105 /* Revert the effect on ready_try. */
26106 static void
26107 core2i7_first_cycle_multipass_backtrack (const void *_data,
26108 char *ready_try,
26109 int n_ready ATTRIBUTE_UNUSED)
26111 const_ix86_first_cycle_multipass_data_t data
26112 = (const_ix86_first_cycle_multipass_data_t) _data;
26113 unsigned int i = 0;
26114 sbitmap_iterator sbi;
26116 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
26117 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
26119 ready_try[i] = 0;
26123 /* Save the result of multipass lookahead scheduling for the next round. */
26124 static void
26125 core2i7_first_cycle_multipass_end (const void *_data)
26127 const_ix86_first_cycle_multipass_data_t data
26128 = (const_ix86_first_cycle_multipass_data_t) _data;
26129 ix86_first_cycle_multipass_data_t next_data
26130 = ix86_first_cycle_multipass_data;
26132 if (data != NULL)
26134 next_data->ifetch_block_len = data->ifetch_block_len;
26135 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
26139 /* Deallocate target data. */
26140 static void
26141 core2i7_first_cycle_multipass_fini (void *_data)
26143 ix86_first_cycle_multipass_data_t data
26144 = (ix86_first_cycle_multipass_data_t) _data;
26146 if (data->ready_try_change)
26148 sbitmap_free (data->ready_try_change);
26149 data->ready_try_change = NULL;
26150 data->ready_try_change_size = 0;
26154 /* Prepare for scheduling pass. */
26155 static void
26156 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
26157 int verbose ATTRIBUTE_UNUSED,
26158 int max_uid ATTRIBUTE_UNUSED)
26160 /* Install scheduling hooks for current CPU. Some of these hooks are used
26161 in time-critical parts of the scheduler, so we only set them up when
26162 they are actually used. */
26163 switch (ix86_tune)
26165 case PROCESSOR_CORE2:
26166 case PROCESSOR_COREI7:
26167 case PROCESSOR_COREI7_AVX:
26168 case PROCESSOR_HASWELL:
26169 /* Do not perform multipass scheduling for pre-reload schedule
26170 to save compile time. */
26171 if (reload_completed)
26173 targetm.sched.dfa_post_advance_cycle
26174 = core2i7_dfa_post_advance_cycle;
26175 targetm.sched.first_cycle_multipass_init
26176 = core2i7_first_cycle_multipass_init;
26177 targetm.sched.first_cycle_multipass_begin
26178 = core2i7_first_cycle_multipass_begin;
26179 targetm.sched.first_cycle_multipass_issue
26180 = core2i7_first_cycle_multipass_issue;
26181 targetm.sched.first_cycle_multipass_backtrack
26182 = core2i7_first_cycle_multipass_backtrack;
26183 targetm.sched.first_cycle_multipass_end
26184 = core2i7_first_cycle_multipass_end;
26185 targetm.sched.first_cycle_multipass_fini
26186 = core2i7_first_cycle_multipass_fini;
26188 /* Set decoder parameters. */
26189 core2i7_secondary_decoder_max_insn_size = 8;
26190 core2i7_ifetch_block_size = 16;
26191 core2i7_ifetch_block_max_insns = 6;
26192 break;
26194 /* ... Fall through ... */
26195 default:
26196 targetm.sched.dfa_post_advance_cycle = NULL;
26197 targetm.sched.first_cycle_multipass_init = NULL;
26198 targetm.sched.first_cycle_multipass_begin = NULL;
26199 targetm.sched.first_cycle_multipass_issue = NULL;
26200 targetm.sched.first_cycle_multipass_backtrack = NULL;
26201 targetm.sched.first_cycle_multipass_end = NULL;
26202 targetm.sched.first_cycle_multipass_fini = NULL;
26203 break;
26208 /* Compute the alignment given to a constant that is being placed in memory.
26209 EXP is the constant and ALIGN is the alignment that the object would
26210 ordinarily have.
26211 The value of this function is used instead of that alignment to align
26212 the object. */
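/* For example, under the rules below a DFmode constant gets 64-bit alignment,
   a 128-bit vector constant gets 128-bit alignment, and a long string
   constant is word-aligned unless optimizing for size.  */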
26215 ix86_constant_alignment (tree exp, int align)
26217 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
26218 || TREE_CODE (exp) == INTEGER_CST)
26220 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
26221 return 64;
26222 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
26223 return 128;
26225 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
26226 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
26227 return BITS_PER_WORD;
26229 return align;
26232 /* Compute the alignment for a static variable.
26233 TYPE is the data type, and ALIGN is the alignment that
26234 the object would ordinarily have. The value of this function is used
26235 instead of that alignment to align the object. */
26238 ix86_data_alignment (tree type, int align, bool opt)
26240 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
26242 if (opt
26243 && AGGREGATE_TYPE_P (type)
26244 && TYPE_SIZE (type)
26245 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
26246 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
26247 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
26248 && align < max_align)
26249 align = max_align;
26251 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
26252 to a 16-byte boundary. */
26253 if (TARGET_64BIT)
26255 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
26256 && TYPE_SIZE (type)
26257 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
26258 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
26259 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
26260 return 128;
26263 if (!opt)
26264 return align;
26266 if (TREE_CODE (type) == ARRAY_TYPE)
26268 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
26269 return 64;
26270 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
26271 return 128;
26273 else if (TREE_CODE (type) == COMPLEX_TYPE)
26276 if (TYPE_MODE (type) == DCmode && align < 64)
26277 return 64;
26278 if ((TYPE_MODE (type) == XCmode
26279 || TYPE_MODE (type) == TCmode) && align < 128)
26280 return 128;
26282 else if ((TREE_CODE (type) == RECORD_TYPE
26283 || TREE_CODE (type) == UNION_TYPE
26284 || TREE_CODE (type) == QUAL_UNION_TYPE)
26285 && TYPE_FIELDS (type))
26287 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
26288 return 64;
26289 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
26290 return 128;
26292 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
26293 || TREE_CODE (type) == INTEGER_TYPE)
26295 if (TYPE_MODE (type) == DFmode && align < 64)
26296 return 64;
26297 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
26298 return 128;
26301 return align;
26304 /* Compute the alignment for a local variable or a stack slot. EXP is
26305 the data type or decl itself, MODE is the widest mode available and
26306 ALIGN is the alignment that the object would ordinarily have. The
26307 value of this macro is used instead of that alignment to align the
26308 object. */
26310 unsigned int
26311 ix86_local_alignment (tree exp, enum machine_mode mode,
26312 unsigned int align)
26314 tree type, decl;
26316 if (exp && DECL_P (exp))
26318 type = TREE_TYPE (exp);
26319 decl = exp;
26321 else
26323 type = exp;
26324 decl = NULL;
26327 /* Don't do dynamic stack realignment for long long objects with
26328 -mpreferred-stack-boundary=2. */
26329 if (!TARGET_64BIT
26330 && align == 64
26331 && ix86_preferred_stack_boundary < 64
26332 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
26333 && (!type || !TYPE_USER_ALIGN (type))
26334 && (!decl || !DECL_USER_ALIGN (decl)))
26335 align = 32;
26337 /* If TYPE is NULL, we are allocating a stack slot for caller-save
26338 register in MODE. We will return the largest alignment of XF
26339 and DF. */
26340 if (!type)
26342 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
26343 align = GET_MODE_ALIGNMENT (DFmode);
26344 return align;
26347 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
26348 to a 16-byte boundary. The exact wording is:
26350 An array uses the same alignment as its elements, except that a local or
26351 global array variable of length at least 16 bytes or
26352 a C99 variable-length array variable always has alignment of at least 16 bytes.
26354 This was added to allow use of aligned SSE instructions on arrays. The
26355 rule is meant for static storage (where the compiler cannot do the analysis
26356 by itself). We follow it for automatic variables only when convenient:
26357 we fully control everything in the function being compiled, and functions
26358 from other units cannot rely on the alignment.
26360 Exclude the va_list type. It is the common case of a local array where
26361 we cannot benefit from the alignment.
26363 TODO: Probably one should optimize for size only when the variable does not escape. */
26364 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
26365 && TARGET_SSE)
26367 if (AGGREGATE_TYPE_P (type)
26368 && (va_list_type_node == NULL_TREE
26369 || (TYPE_MAIN_VARIANT (type)
26370 != TYPE_MAIN_VARIANT (va_list_type_node)))
26371 && TYPE_SIZE (type)
26372 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
26373 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
26374 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
26375 return 128;
26377 if (TREE_CODE (type) == ARRAY_TYPE)
26379 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
26380 return 64;
26381 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
26382 return 128;
26384 else if (TREE_CODE (type) == COMPLEX_TYPE)
26386 if (TYPE_MODE (type) == DCmode && align < 64)
26387 return 64;
26388 if ((TYPE_MODE (type) == XCmode
26389 || TYPE_MODE (type) == TCmode) && align < 128)
26390 return 128;
26392 else if ((TREE_CODE (type) == RECORD_TYPE
26393 || TREE_CODE (type) == UNION_TYPE
26394 || TREE_CODE (type) == QUAL_UNION_TYPE)
26395 && TYPE_FIELDS (type))
26397 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
26398 return 64;
26399 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
26400 return 128;
26402 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
26403 || TREE_CODE (type) == INTEGER_TYPE)
26406 if (TYPE_MODE (type) == DFmode && align < 64)
26407 return 64;
26408 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
26409 return 128;
26411 return align;
26414 /* Compute the minimum required alignment for dynamic stack realignment
26415 purposes for a local variable, parameter or a stack slot. EXP is
26416 the data type or decl itself, MODE is its mode and ALIGN is the
26417 alignment that the object would ordinarily have. */
26419 unsigned int
26420 ix86_minimum_alignment (tree exp, enum machine_mode mode,
26421 unsigned int align)
26423 tree type, decl;
26425 if (exp && DECL_P (exp))
26427 type = TREE_TYPE (exp);
26428 decl = exp;
26430 else
26432 type = exp;
26433 decl = NULL;
26436 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
26437 return align;
26439 /* Don't do dynamic stack realignment for long long objects with
26440 -mpreferred-stack-boundary=2. */
26441 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
26442 && (!type || !TYPE_USER_ALIGN (type))
26443 && (!decl || !DECL_USER_ALIGN (decl)))
26444 return 32;
26446 return align;
26449 /* Find a location for the static chain incoming to a nested function.
26450 This is a register, unless all free registers are used by arguments. */
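/* In brief, per the cases below: R10 in 64-bit mode; ECX by default in 32-bit
   mode; EAX for fastcall and thiscall functions; and for regparm(3) functions
   the chain goes on the stack, reached through an alternate entry point that
   pushes ESI.  */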
26452 static rtx
26453 ix86_static_chain (const_tree fndecl, bool incoming_p)
26455 unsigned regno;
26457 if (!DECL_STATIC_CHAIN (fndecl))
26458 return NULL;
26460 if (TARGET_64BIT)
26462 /* We always use R10 in 64-bit mode. */
26463 regno = R10_REG;
26465 else
26467 tree fntype;
26468 unsigned int ccvt;
26470 /* By default in 32-bit mode we use ECX to pass the static chain. */
26471 regno = CX_REG;
26473 fntype = TREE_TYPE (fndecl);
26474 ccvt = ix86_get_callcvt (fntype);
26475 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
26477 /* Fastcall functions use ecx/edx for arguments, which leaves
26478 us with EAX for the static chain.
26479 Thiscall functions use ecx for arguments, which also
26480 leaves us with EAX for the static chain. */
26481 regno = AX_REG;
26483 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
26485 /* Thiscall functions use ecx for arguments, which leaves
26486 us with EAX and EDX for the static chain.
26487 For ABI compatibility we use EAX. */
26488 regno = AX_REG;
26490 else if (ix86_function_regparm (fntype, fndecl) == 3)
26492 /* For regparm 3, we have no free call-clobbered registers in
26493 which to store the static chain. In order to implement this,
26494 we have the trampoline push the static chain to the stack.
26495 However, we can't push a value below the return address when
26496 we call the nested function directly, so we have to use an
26497 alternate entry point. For this we use ESI, and have the
26498 alternate entry point push ESI, so that things appear the
26499 same once we're executing the nested function. */
26500 if (incoming_p)
26502 if (fndecl == current_function_decl)
26503 ix86_static_chain_on_stack = true;
26504 return gen_frame_mem (SImode,
26505 plus_constant (Pmode,
26506 arg_pointer_rtx, -8));
26508 regno = SI_REG;
26512 return gen_rtx_REG (Pmode, regno);
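/* Example (sketch; OUTER and INNER are made-up names): a GNU C nested
   function needs the static chain to reach its enclosing frame,

       void outer (void)
       {
         int x = 0;
         void inner (void) { x++; }   /* reaches X through the chain */
         inner ();
       }

   and with the choices above the chain arrives in %ecx for plain 32-bit
   code, in %eax for fastcall/thiscall callees, on the stack for
   regparm(3) callees (pushed by the trampoline), and always in %r10 in
   64-bit mode.  */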
26515 /* Emit RTL insns to initialize the variable parts of a trampoline.
26516 FNDECL is the decl of the target address; M_TRAMP is a MEM for
26517 the trampoline, and CHAIN_VALUE is an RTX for the static chain
26518 to be passed to the target function. */
26520 static void
26521 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
26523 rtx mem, fnaddr;
26524 int opcode;
26525 int offset = 0;
26527 fnaddr = XEXP (DECL_RTL (fndecl), 0);
26529 if (TARGET_64BIT)
26531 int size;
26533 /* Load the function address into r11. Try to load the address
26534 using the shorter movl instead of movabs. We may want to support
26535 movq for kernel mode, but the kernel does not use trampolines at
26536 the moment. FNADDR is a 32-bit address and may not be in
26537 DImode when ptr_mode == SImode. Always use movl in this
26538 case. */
26539 if (ptr_mode == SImode
26540 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
26542 fnaddr = copy_addr_to_reg (fnaddr);
26544 mem = adjust_address (m_tramp, HImode, offset);
26545 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
26547 mem = adjust_address (m_tramp, SImode, offset + 2);
26548 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
26549 offset += 6;
26551 else
26553 mem = adjust_address (m_tramp, HImode, offset);
26554 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
26556 mem = adjust_address (m_tramp, DImode, offset + 2);
26557 emit_move_insn (mem, fnaddr);
26558 offset += 10;
26561 /* Load the static chain into r10 using movabs. Use the shorter movl
26562 instead of movabs when ptr_mode == SImode. */
26563 if (ptr_mode == SImode)
26565 opcode = 0xba41;
26566 size = 6;
26568 else
26570 opcode = 0xba49;
26571 size = 10;
26574 mem = adjust_address (m_tramp, HImode, offset);
26575 emit_move_insn (mem, gen_int_mode (opcode, HImode));
26577 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
26578 emit_move_insn (mem, chain_value);
26579 offset += size;
26581 /* Jump to r11; the last (unused) byte is a nop, only there to
26582 pad the write out to a single 32-bit store. */
26583 mem = adjust_address (m_tramp, SImode, offset);
26584 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
26585 offset += 4;
26587 else
26589 rtx disp, chain;
26591 /* Depending on the static chain location, either load a register
26592 with a constant, or push the constant to the stack. All of the
26593 instructions are the same size. */
26594 chain = ix86_static_chain (fndecl, true);
26595 if (REG_P (chain))
26597 switch (REGNO (chain))
26599 case AX_REG:
26600 opcode = 0xb8; break;
26601 case CX_REG:
26602 opcode = 0xb9; break;
26603 default:
26604 gcc_unreachable ();
26607 else
26608 opcode = 0x68;
26610 mem = adjust_address (m_tramp, QImode, offset);
26611 emit_move_insn (mem, gen_int_mode (opcode, QImode));
26613 mem = adjust_address (m_tramp, SImode, offset + 1);
26614 emit_move_insn (mem, chain_value);
26615 offset += 5;
26617 mem = adjust_address (m_tramp, QImode, offset);
26618 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
26620 mem = adjust_address (m_tramp, SImode, offset + 1);
26622 /* Compute offset from the end of the jmp to the target function.
26623 When the trampoline stores the static chain on the stack, we
26624 need to skip the first insn which pushes the (call-saved)
26625 register static chain; this push is 1 byte. */
26626 offset += 5;
26627 disp = expand_binop (SImode, sub_optab, fnaddr,
26628 plus_constant (Pmode, XEXP (m_tramp, 0),
26629 offset - (MEM_P (chain) ? 1 : 0)),
26630 NULL_RTX, 1, OPTAB_DIRECT);
26631 emit_move_insn (mem, disp);
26634 gcc_assert (offset <= TRAMPOLINE_SIZE);
26636 #ifdef HAVE_ENABLE_EXECUTE_STACK
26637 #ifdef CHECK_EXECUTE_STACK_ENABLED
26638 if (CHECK_EXECUTE_STACK_ENABLED)
26639 #endif
26640 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
26641 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
26642 #endif
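/* For reference, a rough sketch of the bytes emitted above for the
   64-bit trampoline (immediates shown symbolically):

       41 bb <fnaddr32>   movl   $fnaddr, %r11d   (or 49 bb <fnaddr64>,
                                                   movabs $fnaddr, %r11)
       49 ba <chain64>    movabs $chain, %r10     (or 41 ba <chain32>
                                                   when ptr_mode == SImode)
       49 ff e3           jmp    *%r11
       90                 nop                     (pads the last 32-bit store)

   The 32-bit variant is "movl $chain, %eax/%ecx" (0xb8/0xb9) or
   "pushl $chain" (0x68), followed by "jmp rel32" (0xe9).  */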
26645 /* The following file contains several enumerations and data structures
26646 built from the definitions in i386-builtin-types.def. */
26648 #include "i386-builtin-types.inc"
26650 /* Table for the ix86 builtin non-function types. */
26651 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
26653 /* Retrieve an element from the above table, building some of
26654 the types lazily. */
26656 static tree
26657 ix86_get_builtin_type (enum ix86_builtin_type tcode)
26659 unsigned int index;
26660 tree type, itype;
26662 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
26664 type = ix86_builtin_type_tab[(int) tcode];
26665 if (type != NULL)
26666 return type;
26668 gcc_assert (tcode > IX86_BT_LAST_PRIM);
26669 if (tcode <= IX86_BT_LAST_VECT)
26671 enum machine_mode mode;
26673 index = tcode - IX86_BT_LAST_PRIM - 1;
26674 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
26675 mode = ix86_builtin_type_vect_mode[index];
26677 type = build_vector_type_for_mode (itype, mode);
26679 else
26681 int quals;
26683 index = tcode - IX86_BT_LAST_VECT - 1;
26684 if (tcode <= IX86_BT_LAST_PTR)
26685 quals = TYPE_UNQUALIFIED;
26686 else
26687 quals = TYPE_QUAL_CONST;
26689 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
26690 if (quals != TYPE_UNQUALIFIED)
26691 itype = build_qualified_type (itype, quals);
26693 type = build_pointer_type (itype);
26696 ix86_builtin_type_tab[(int) tcode] = type;
26697 return type;
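/* For instance (a sketch; the enumerator names come from the generated
   i386-builtin-types.inc): the first request for the V4SF vector code
   builds

       build_vector_type_for_mode (float_type_node, V4SFmode)

   from its FLOAT base, while a "pointer to const" code such as PCFLOAT
   wraps its base as

       build_pointer_type (build_qualified_type (float_type_node,
                                                 TYPE_QUAL_CONST))

   Later requests simply return the cached tree from the table above.  */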
26700 /* Table for the ix86 builtin function types. */
26701 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
26703 /* Retrieve an element from the above table, building some of
26704 the types lazily. */
26706 static tree
26707 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
26709 tree type;
26711 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
26713 type = ix86_builtin_func_type_tab[(int) tcode];
26714 if (type != NULL)
26715 return type;
26717 if (tcode <= IX86_BT_LAST_FUNC)
26719 unsigned start = ix86_builtin_func_start[(int) tcode];
26720 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
26721 tree rtype, atype, args = void_list_node;
26722 unsigned i;
26724 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
26725 for (i = after - 1; i > start; --i)
26727 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
26728 args = tree_cons (NULL, atype, args);
26731 type = build_function_type (rtype, args);
26733 else
26735 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
26736 enum ix86_builtin_func_type icode;
26738 icode = ix86_builtin_func_alias_base[index];
26739 type = ix86_get_builtin_func_type (icode);
26742 ix86_builtin_func_type_tab[(int) tcode] = type;
26743 return type;
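/* Concrete example (sketch): for V4SF_FTYPE_PCFLOAT, used by the load
   builtins in the tables below, the first ix86_builtin_func_args entry
   is the return type (V4SF) and the rest are the argument types, consed
   in reverse onto void_list_node, giving roughly

       V4SF (*) (const float *)

   Alias codes beyond IX86_BT_LAST_FUNC reuse the type of the function
   code they alias.  */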
26747 /* Codes for all the SSE/MMX builtins. */
26748 enum ix86_builtins
26750 IX86_BUILTIN_ADDPS,
26751 IX86_BUILTIN_ADDSS,
26752 IX86_BUILTIN_DIVPS,
26753 IX86_BUILTIN_DIVSS,
26754 IX86_BUILTIN_MULPS,
26755 IX86_BUILTIN_MULSS,
26756 IX86_BUILTIN_SUBPS,
26757 IX86_BUILTIN_SUBSS,
26759 IX86_BUILTIN_CMPEQPS,
26760 IX86_BUILTIN_CMPLTPS,
26761 IX86_BUILTIN_CMPLEPS,
26762 IX86_BUILTIN_CMPGTPS,
26763 IX86_BUILTIN_CMPGEPS,
26764 IX86_BUILTIN_CMPNEQPS,
26765 IX86_BUILTIN_CMPNLTPS,
26766 IX86_BUILTIN_CMPNLEPS,
26767 IX86_BUILTIN_CMPNGTPS,
26768 IX86_BUILTIN_CMPNGEPS,
26769 IX86_BUILTIN_CMPORDPS,
26770 IX86_BUILTIN_CMPUNORDPS,
26771 IX86_BUILTIN_CMPEQSS,
26772 IX86_BUILTIN_CMPLTSS,
26773 IX86_BUILTIN_CMPLESS,
26774 IX86_BUILTIN_CMPNEQSS,
26775 IX86_BUILTIN_CMPNLTSS,
26776 IX86_BUILTIN_CMPNLESS,
26777 IX86_BUILTIN_CMPORDSS,
26778 IX86_BUILTIN_CMPUNORDSS,
26780 IX86_BUILTIN_COMIEQSS,
26781 IX86_BUILTIN_COMILTSS,
26782 IX86_BUILTIN_COMILESS,
26783 IX86_BUILTIN_COMIGTSS,
26784 IX86_BUILTIN_COMIGESS,
26785 IX86_BUILTIN_COMINEQSS,
26786 IX86_BUILTIN_UCOMIEQSS,
26787 IX86_BUILTIN_UCOMILTSS,
26788 IX86_BUILTIN_UCOMILESS,
26789 IX86_BUILTIN_UCOMIGTSS,
26790 IX86_BUILTIN_UCOMIGESS,
26791 IX86_BUILTIN_UCOMINEQSS,
26793 IX86_BUILTIN_CVTPI2PS,
26794 IX86_BUILTIN_CVTPS2PI,
26795 IX86_BUILTIN_CVTSI2SS,
26796 IX86_BUILTIN_CVTSI642SS,
26797 IX86_BUILTIN_CVTSS2SI,
26798 IX86_BUILTIN_CVTSS2SI64,
26799 IX86_BUILTIN_CVTTPS2PI,
26800 IX86_BUILTIN_CVTTSS2SI,
26801 IX86_BUILTIN_CVTTSS2SI64,
26803 IX86_BUILTIN_MAXPS,
26804 IX86_BUILTIN_MAXSS,
26805 IX86_BUILTIN_MINPS,
26806 IX86_BUILTIN_MINSS,
26808 IX86_BUILTIN_LOADUPS,
26809 IX86_BUILTIN_STOREUPS,
26810 IX86_BUILTIN_MOVSS,
26812 IX86_BUILTIN_MOVHLPS,
26813 IX86_BUILTIN_MOVLHPS,
26814 IX86_BUILTIN_LOADHPS,
26815 IX86_BUILTIN_LOADLPS,
26816 IX86_BUILTIN_STOREHPS,
26817 IX86_BUILTIN_STORELPS,
26819 IX86_BUILTIN_MASKMOVQ,
26820 IX86_BUILTIN_MOVMSKPS,
26821 IX86_BUILTIN_PMOVMSKB,
26823 IX86_BUILTIN_MOVNTPS,
26824 IX86_BUILTIN_MOVNTQ,
26826 IX86_BUILTIN_LOADDQU,
26827 IX86_BUILTIN_STOREDQU,
26829 IX86_BUILTIN_PACKSSWB,
26830 IX86_BUILTIN_PACKSSDW,
26831 IX86_BUILTIN_PACKUSWB,
26833 IX86_BUILTIN_PADDB,
26834 IX86_BUILTIN_PADDW,
26835 IX86_BUILTIN_PADDD,
26836 IX86_BUILTIN_PADDQ,
26837 IX86_BUILTIN_PADDSB,
26838 IX86_BUILTIN_PADDSW,
26839 IX86_BUILTIN_PADDUSB,
26840 IX86_BUILTIN_PADDUSW,
26841 IX86_BUILTIN_PSUBB,
26842 IX86_BUILTIN_PSUBW,
26843 IX86_BUILTIN_PSUBD,
26844 IX86_BUILTIN_PSUBQ,
26845 IX86_BUILTIN_PSUBSB,
26846 IX86_BUILTIN_PSUBSW,
26847 IX86_BUILTIN_PSUBUSB,
26848 IX86_BUILTIN_PSUBUSW,
26850 IX86_BUILTIN_PAND,
26851 IX86_BUILTIN_PANDN,
26852 IX86_BUILTIN_POR,
26853 IX86_BUILTIN_PXOR,
26855 IX86_BUILTIN_PAVGB,
26856 IX86_BUILTIN_PAVGW,
26858 IX86_BUILTIN_PCMPEQB,
26859 IX86_BUILTIN_PCMPEQW,
26860 IX86_BUILTIN_PCMPEQD,
26861 IX86_BUILTIN_PCMPGTB,
26862 IX86_BUILTIN_PCMPGTW,
26863 IX86_BUILTIN_PCMPGTD,
26865 IX86_BUILTIN_PMADDWD,
26867 IX86_BUILTIN_PMAXSW,
26868 IX86_BUILTIN_PMAXUB,
26869 IX86_BUILTIN_PMINSW,
26870 IX86_BUILTIN_PMINUB,
26872 IX86_BUILTIN_PMULHUW,
26873 IX86_BUILTIN_PMULHW,
26874 IX86_BUILTIN_PMULLW,
26876 IX86_BUILTIN_PSADBW,
26877 IX86_BUILTIN_PSHUFW,
26879 IX86_BUILTIN_PSLLW,
26880 IX86_BUILTIN_PSLLD,
26881 IX86_BUILTIN_PSLLQ,
26882 IX86_BUILTIN_PSRAW,
26883 IX86_BUILTIN_PSRAD,
26884 IX86_BUILTIN_PSRLW,
26885 IX86_BUILTIN_PSRLD,
26886 IX86_BUILTIN_PSRLQ,
26887 IX86_BUILTIN_PSLLWI,
26888 IX86_BUILTIN_PSLLDI,
26889 IX86_BUILTIN_PSLLQI,
26890 IX86_BUILTIN_PSRAWI,
26891 IX86_BUILTIN_PSRADI,
26892 IX86_BUILTIN_PSRLWI,
26893 IX86_BUILTIN_PSRLDI,
26894 IX86_BUILTIN_PSRLQI,
26896 IX86_BUILTIN_PUNPCKHBW,
26897 IX86_BUILTIN_PUNPCKHWD,
26898 IX86_BUILTIN_PUNPCKHDQ,
26899 IX86_BUILTIN_PUNPCKLBW,
26900 IX86_BUILTIN_PUNPCKLWD,
26901 IX86_BUILTIN_PUNPCKLDQ,
26903 IX86_BUILTIN_SHUFPS,
26905 IX86_BUILTIN_RCPPS,
26906 IX86_BUILTIN_RCPSS,
26907 IX86_BUILTIN_RSQRTPS,
26908 IX86_BUILTIN_RSQRTPS_NR,
26909 IX86_BUILTIN_RSQRTSS,
26910 IX86_BUILTIN_RSQRTF,
26911 IX86_BUILTIN_SQRTPS,
26912 IX86_BUILTIN_SQRTPS_NR,
26913 IX86_BUILTIN_SQRTSS,
26915 IX86_BUILTIN_UNPCKHPS,
26916 IX86_BUILTIN_UNPCKLPS,
26918 IX86_BUILTIN_ANDPS,
26919 IX86_BUILTIN_ANDNPS,
26920 IX86_BUILTIN_ORPS,
26921 IX86_BUILTIN_XORPS,
26923 IX86_BUILTIN_EMMS,
26924 IX86_BUILTIN_LDMXCSR,
26925 IX86_BUILTIN_STMXCSR,
26926 IX86_BUILTIN_SFENCE,
26928 IX86_BUILTIN_FXSAVE,
26929 IX86_BUILTIN_FXRSTOR,
26930 IX86_BUILTIN_FXSAVE64,
26931 IX86_BUILTIN_FXRSTOR64,
26933 IX86_BUILTIN_XSAVE,
26934 IX86_BUILTIN_XRSTOR,
26935 IX86_BUILTIN_XSAVE64,
26936 IX86_BUILTIN_XRSTOR64,
26938 IX86_BUILTIN_XSAVEOPT,
26939 IX86_BUILTIN_XSAVEOPT64,
26941 /* 3DNow! Original */
26942 IX86_BUILTIN_FEMMS,
26943 IX86_BUILTIN_PAVGUSB,
26944 IX86_BUILTIN_PF2ID,
26945 IX86_BUILTIN_PFACC,
26946 IX86_BUILTIN_PFADD,
26947 IX86_BUILTIN_PFCMPEQ,
26948 IX86_BUILTIN_PFCMPGE,
26949 IX86_BUILTIN_PFCMPGT,
26950 IX86_BUILTIN_PFMAX,
26951 IX86_BUILTIN_PFMIN,
26952 IX86_BUILTIN_PFMUL,
26953 IX86_BUILTIN_PFRCP,
26954 IX86_BUILTIN_PFRCPIT1,
26955 IX86_BUILTIN_PFRCPIT2,
26956 IX86_BUILTIN_PFRSQIT1,
26957 IX86_BUILTIN_PFRSQRT,
26958 IX86_BUILTIN_PFSUB,
26959 IX86_BUILTIN_PFSUBR,
26960 IX86_BUILTIN_PI2FD,
26961 IX86_BUILTIN_PMULHRW,
26963 /* 3DNow! Athlon Extensions */
26964 IX86_BUILTIN_PF2IW,
26965 IX86_BUILTIN_PFNACC,
26966 IX86_BUILTIN_PFPNACC,
26967 IX86_BUILTIN_PI2FW,
26968 IX86_BUILTIN_PSWAPDSI,
26969 IX86_BUILTIN_PSWAPDSF,
26971 /* SSE2 */
26972 IX86_BUILTIN_ADDPD,
26973 IX86_BUILTIN_ADDSD,
26974 IX86_BUILTIN_DIVPD,
26975 IX86_BUILTIN_DIVSD,
26976 IX86_BUILTIN_MULPD,
26977 IX86_BUILTIN_MULSD,
26978 IX86_BUILTIN_SUBPD,
26979 IX86_BUILTIN_SUBSD,
26981 IX86_BUILTIN_CMPEQPD,
26982 IX86_BUILTIN_CMPLTPD,
26983 IX86_BUILTIN_CMPLEPD,
26984 IX86_BUILTIN_CMPGTPD,
26985 IX86_BUILTIN_CMPGEPD,
26986 IX86_BUILTIN_CMPNEQPD,
26987 IX86_BUILTIN_CMPNLTPD,
26988 IX86_BUILTIN_CMPNLEPD,
26989 IX86_BUILTIN_CMPNGTPD,
26990 IX86_BUILTIN_CMPNGEPD,
26991 IX86_BUILTIN_CMPORDPD,
26992 IX86_BUILTIN_CMPUNORDPD,
26993 IX86_BUILTIN_CMPEQSD,
26994 IX86_BUILTIN_CMPLTSD,
26995 IX86_BUILTIN_CMPLESD,
26996 IX86_BUILTIN_CMPNEQSD,
26997 IX86_BUILTIN_CMPNLTSD,
26998 IX86_BUILTIN_CMPNLESD,
26999 IX86_BUILTIN_CMPORDSD,
27000 IX86_BUILTIN_CMPUNORDSD,
27002 IX86_BUILTIN_COMIEQSD,
27003 IX86_BUILTIN_COMILTSD,
27004 IX86_BUILTIN_COMILESD,
27005 IX86_BUILTIN_COMIGTSD,
27006 IX86_BUILTIN_COMIGESD,
27007 IX86_BUILTIN_COMINEQSD,
27008 IX86_BUILTIN_UCOMIEQSD,
27009 IX86_BUILTIN_UCOMILTSD,
27010 IX86_BUILTIN_UCOMILESD,
27011 IX86_BUILTIN_UCOMIGTSD,
27012 IX86_BUILTIN_UCOMIGESD,
27013 IX86_BUILTIN_UCOMINEQSD,
27015 IX86_BUILTIN_MAXPD,
27016 IX86_BUILTIN_MAXSD,
27017 IX86_BUILTIN_MINPD,
27018 IX86_BUILTIN_MINSD,
27020 IX86_BUILTIN_ANDPD,
27021 IX86_BUILTIN_ANDNPD,
27022 IX86_BUILTIN_ORPD,
27023 IX86_BUILTIN_XORPD,
27025 IX86_BUILTIN_SQRTPD,
27026 IX86_BUILTIN_SQRTSD,
27028 IX86_BUILTIN_UNPCKHPD,
27029 IX86_BUILTIN_UNPCKLPD,
27031 IX86_BUILTIN_SHUFPD,
27033 IX86_BUILTIN_LOADUPD,
27034 IX86_BUILTIN_STOREUPD,
27035 IX86_BUILTIN_MOVSD,
27037 IX86_BUILTIN_LOADHPD,
27038 IX86_BUILTIN_LOADLPD,
27040 IX86_BUILTIN_CVTDQ2PD,
27041 IX86_BUILTIN_CVTDQ2PS,
27043 IX86_BUILTIN_CVTPD2DQ,
27044 IX86_BUILTIN_CVTPD2PI,
27045 IX86_BUILTIN_CVTPD2PS,
27046 IX86_BUILTIN_CVTTPD2DQ,
27047 IX86_BUILTIN_CVTTPD2PI,
27049 IX86_BUILTIN_CVTPI2PD,
27050 IX86_BUILTIN_CVTSI2SD,
27051 IX86_BUILTIN_CVTSI642SD,
27053 IX86_BUILTIN_CVTSD2SI,
27054 IX86_BUILTIN_CVTSD2SI64,
27055 IX86_BUILTIN_CVTSD2SS,
27056 IX86_BUILTIN_CVTSS2SD,
27057 IX86_BUILTIN_CVTTSD2SI,
27058 IX86_BUILTIN_CVTTSD2SI64,
27060 IX86_BUILTIN_CVTPS2DQ,
27061 IX86_BUILTIN_CVTPS2PD,
27062 IX86_BUILTIN_CVTTPS2DQ,
27064 IX86_BUILTIN_MOVNTI,
27065 IX86_BUILTIN_MOVNTI64,
27066 IX86_BUILTIN_MOVNTPD,
27067 IX86_BUILTIN_MOVNTDQ,
27069 IX86_BUILTIN_MOVQ128,
27071 /* SSE2 MMX */
27072 IX86_BUILTIN_MASKMOVDQU,
27073 IX86_BUILTIN_MOVMSKPD,
27074 IX86_BUILTIN_PMOVMSKB128,
27076 IX86_BUILTIN_PACKSSWB128,
27077 IX86_BUILTIN_PACKSSDW128,
27078 IX86_BUILTIN_PACKUSWB128,
27080 IX86_BUILTIN_PADDB128,
27081 IX86_BUILTIN_PADDW128,
27082 IX86_BUILTIN_PADDD128,
27083 IX86_BUILTIN_PADDQ128,
27084 IX86_BUILTIN_PADDSB128,
27085 IX86_BUILTIN_PADDSW128,
27086 IX86_BUILTIN_PADDUSB128,
27087 IX86_BUILTIN_PADDUSW128,
27088 IX86_BUILTIN_PSUBB128,
27089 IX86_BUILTIN_PSUBW128,
27090 IX86_BUILTIN_PSUBD128,
27091 IX86_BUILTIN_PSUBQ128,
27092 IX86_BUILTIN_PSUBSB128,
27093 IX86_BUILTIN_PSUBSW128,
27094 IX86_BUILTIN_PSUBUSB128,
27095 IX86_BUILTIN_PSUBUSW128,
27097 IX86_BUILTIN_PAND128,
27098 IX86_BUILTIN_PANDN128,
27099 IX86_BUILTIN_POR128,
27100 IX86_BUILTIN_PXOR128,
27102 IX86_BUILTIN_PAVGB128,
27103 IX86_BUILTIN_PAVGW128,
27105 IX86_BUILTIN_PCMPEQB128,
27106 IX86_BUILTIN_PCMPEQW128,
27107 IX86_BUILTIN_PCMPEQD128,
27108 IX86_BUILTIN_PCMPGTB128,
27109 IX86_BUILTIN_PCMPGTW128,
27110 IX86_BUILTIN_PCMPGTD128,
27112 IX86_BUILTIN_PMADDWD128,
27114 IX86_BUILTIN_PMAXSW128,
27115 IX86_BUILTIN_PMAXUB128,
27116 IX86_BUILTIN_PMINSW128,
27117 IX86_BUILTIN_PMINUB128,
27119 IX86_BUILTIN_PMULUDQ,
27120 IX86_BUILTIN_PMULUDQ128,
27121 IX86_BUILTIN_PMULHUW128,
27122 IX86_BUILTIN_PMULHW128,
27123 IX86_BUILTIN_PMULLW128,
27125 IX86_BUILTIN_PSADBW128,
27126 IX86_BUILTIN_PSHUFHW,
27127 IX86_BUILTIN_PSHUFLW,
27128 IX86_BUILTIN_PSHUFD,
27130 IX86_BUILTIN_PSLLDQI128,
27131 IX86_BUILTIN_PSLLWI128,
27132 IX86_BUILTIN_PSLLDI128,
27133 IX86_BUILTIN_PSLLQI128,
27134 IX86_BUILTIN_PSRAWI128,
27135 IX86_BUILTIN_PSRADI128,
27136 IX86_BUILTIN_PSRLDQI128,
27137 IX86_BUILTIN_PSRLWI128,
27138 IX86_BUILTIN_PSRLDI128,
27139 IX86_BUILTIN_PSRLQI128,
27141 IX86_BUILTIN_PSLLDQ128,
27142 IX86_BUILTIN_PSLLW128,
27143 IX86_BUILTIN_PSLLD128,
27144 IX86_BUILTIN_PSLLQ128,
27145 IX86_BUILTIN_PSRAW128,
27146 IX86_BUILTIN_PSRAD128,
27147 IX86_BUILTIN_PSRLW128,
27148 IX86_BUILTIN_PSRLD128,
27149 IX86_BUILTIN_PSRLQ128,
27151 IX86_BUILTIN_PUNPCKHBW128,
27152 IX86_BUILTIN_PUNPCKHWD128,
27153 IX86_BUILTIN_PUNPCKHDQ128,
27154 IX86_BUILTIN_PUNPCKHQDQ128,
27155 IX86_BUILTIN_PUNPCKLBW128,
27156 IX86_BUILTIN_PUNPCKLWD128,
27157 IX86_BUILTIN_PUNPCKLDQ128,
27158 IX86_BUILTIN_PUNPCKLQDQ128,
27160 IX86_BUILTIN_CLFLUSH,
27161 IX86_BUILTIN_MFENCE,
27162 IX86_BUILTIN_LFENCE,
27163 IX86_BUILTIN_PAUSE,
27165 IX86_BUILTIN_FNSTENV,
27166 IX86_BUILTIN_FLDENV,
27167 IX86_BUILTIN_FNSTSW,
27168 IX86_BUILTIN_FNCLEX,
27170 IX86_BUILTIN_BSRSI,
27171 IX86_BUILTIN_BSRDI,
27172 IX86_BUILTIN_RDPMC,
27173 IX86_BUILTIN_RDTSC,
27174 IX86_BUILTIN_RDTSCP,
27175 IX86_BUILTIN_ROLQI,
27176 IX86_BUILTIN_ROLHI,
27177 IX86_BUILTIN_RORQI,
27178 IX86_BUILTIN_RORHI,
27180 /* SSE3. */
27181 IX86_BUILTIN_ADDSUBPS,
27182 IX86_BUILTIN_HADDPS,
27183 IX86_BUILTIN_HSUBPS,
27184 IX86_BUILTIN_MOVSHDUP,
27185 IX86_BUILTIN_MOVSLDUP,
27186 IX86_BUILTIN_ADDSUBPD,
27187 IX86_BUILTIN_HADDPD,
27188 IX86_BUILTIN_HSUBPD,
27189 IX86_BUILTIN_LDDQU,
27191 IX86_BUILTIN_MONITOR,
27192 IX86_BUILTIN_MWAIT,
27194 /* SSSE3. */
27195 IX86_BUILTIN_PHADDW,
27196 IX86_BUILTIN_PHADDD,
27197 IX86_BUILTIN_PHADDSW,
27198 IX86_BUILTIN_PHSUBW,
27199 IX86_BUILTIN_PHSUBD,
27200 IX86_BUILTIN_PHSUBSW,
27201 IX86_BUILTIN_PMADDUBSW,
27202 IX86_BUILTIN_PMULHRSW,
27203 IX86_BUILTIN_PSHUFB,
27204 IX86_BUILTIN_PSIGNB,
27205 IX86_BUILTIN_PSIGNW,
27206 IX86_BUILTIN_PSIGND,
27207 IX86_BUILTIN_PALIGNR,
27208 IX86_BUILTIN_PABSB,
27209 IX86_BUILTIN_PABSW,
27210 IX86_BUILTIN_PABSD,
27212 IX86_BUILTIN_PHADDW128,
27213 IX86_BUILTIN_PHADDD128,
27214 IX86_BUILTIN_PHADDSW128,
27215 IX86_BUILTIN_PHSUBW128,
27216 IX86_BUILTIN_PHSUBD128,
27217 IX86_BUILTIN_PHSUBSW128,
27218 IX86_BUILTIN_PMADDUBSW128,
27219 IX86_BUILTIN_PMULHRSW128,
27220 IX86_BUILTIN_PSHUFB128,
27221 IX86_BUILTIN_PSIGNB128,
27222 IX86_BUILTIN_PSIGNW128,
27223 IX86_BUILTIN_PSIGND128,
27224 IX86_BUILTIN_PALIGNR128,
27225 IX86_BUILTIN_PABSB128,
27226 IX86_BUILTIN_PABSW128,
27227 IX86_BUILTIN_PABSD128,
27229 /* AMDFAM10 - SSE4A New Instructions. */
27230 IX86_BUILTIN_MOVNTSD,
27231 IX86_BUILTIN_MOVNTSS,
27232 IX86_BUILTIN_EXTRQI,
27233 IX86_BUILTIN_EXTRQ,
27234 IX86_BUILTIN_INSERTQI,
27235 IX86_BUILTIN_INSERTQ,
27237 /* SSE4.1. */
27238 IX86_BUILTIN_BLENDPD,
27239 IX86_BUILTIN_BLENDPS,
27240 IX86_BUILTIN_BLENDVPD,
27241 IX86_BUILTIN_BLENDVPS,
27242 IX86_BUILTIN_PBLENDVB128,
27243 IX86_BUILTIN_PBLENDW128,
27245 IX86_BUILTIN_DPPD,
27246 IX86_BUILTIN_DPPS,
27248 IX86_BUILTIN_INSERTPS128,
27250 IX86_BUILTIN_MOVNTDQA,
27251 IX86_BUILTIN_MPSADBW128,
27252 IX86_BUILTIN_PACKUSDW128,
27253 IX86_BUILTIN_PCMPEQQ,
27254 IX86_BUILTIN_PHMINPOSUW128,
27256 IX86_BUILTIN_PMAXSB128,
27257 IX86_BUILTIN_PMAXSD128,
27258 IX86_BUILTIN_PMAXUD128,
27259 IX86_BUILTIN_PMAXUW128,
27261 IX86_BUILTIN_PMINSB128,
27262 IX86_BUILTIN_PMINSD128,
27263 IX86_BUILTIN_PMINUD128,
27264 IX86_BUILTIN_PMINUW128,
27266 IX86_BUILTIN_PMOVSXBW128,
27267 IX86_BUILTIN_PMOVSXBD128,
27268 IX86_BUILTIN_PMOVSXBQ128,
27269 IX86_BUILTIN_PMOVSXWD128,
27270 IX86_BUILTIN_PMOVSXWQ128,
27271 IX86_BUILTIN_PMOVSXDQ128,
27273 IX86_BUILTIN_PMOVZXBW128,
27274 IX86_BUILTIN_PMOVZXBD128,
27275 IX86_BUILTIN_PMOVZXBQ128,
27276 IX86_BUILTIN_PMOVZXWD128,
27277 IX86_BUILTIN_PMOVZXWQ128,
27278 IX86_BUILTIN_PMOVZXDQ128,
27280 IX86_BUILTIN_PMULDQ128,
27281 IX86_BUILTIN_PMULLD128,
27283 IX86_BUILTIN_ROUNDSD,
27284 IX86_BUILTIN_ROUNDSS,
27286 IX86_BUILTIN_ROUNDPD,
27287 IX86_BUILTIN_ROUNDPS,
27289 IX86_BUILTIN_FLOORPD,
27290 IX86_BUILTIN_CEILPD,
27291 IX86_BUILTIN_TRUNCPD,
27292 IX86_BUILTIN_RINTPD,
27293 IX86_BUILTIN_ROUNDPD_AZ,
27295 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
27296 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
27297 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
27299 IX86_BUILTIN_FLOORPS,
27300 IX86_BUILTIN_CEILPS,
27301 IX86_BUILTIN_TRUNCPS,
27302 IX86_BUILTIN_RINTPS,
27303 IX86_BUILTIN_ROUNDPS_AZ,
27305 IX86_BUILTIN_FLOORPS_SFIX,
27306 IX86_BUILTIN_CEILPS_SFIX,
27307 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
27309 IX86_BUILTIN_PTESTZ,
27310 IX86_BUILTIN_PTESTC,
27311 IX86_BUILTIN_PTESTNZC,
27313 IX86_BUILTIN_VEC_INIT_V2SI,
27314 IX86_BUILTIN_VEC_INIT_V4HI,
27315 IX86_BUILTIN_VEC_INIT_V8QI,
27316 IX86_BUILTIN_VEC_EXT_V2DF,
27317 IX86_BUILTIN_VEC_EXT_V2DI,
27318 IX86_BUILTIN_VEC_EXT_V4SF,
27319 IX86_BUILTIN_VEC_EXT_V4SI,
27320 IX86_BUILTIN_VEC_EXT_V8HI,
27321 IX86_BUILTIN_VEC_EXT_V2SI,
27322 IX86_BUILTIN_VEC_EXT_V4HI,
27323 IX86_BUILTIN_VEC_EXT_V16QI,
27324 IX86_BUILTIN_VEC_SET_V2DI,
27325 IX86_BUILTIN_VEC_SET_V4SF,
27326 IX86_BUILTIN_VEC_SET_V4SI,
27327 IX86_BUILTIN_VEC_SET_V8HI,
27328 IX86_BUILTIN_VEC_SET_V4HI,
27329 IX86_BUILTIN_VEC_SET_V16QI,
27331 IX86_BUILTIN_VEC_PACK_SFIX,
27332 IX86_BUILTIN_VEC_PACK_SFIX256,
27334 /* SSE4.2. */
27335 IX86_BUILTIN_CRC32QI,
27336 IX86_BUILTIN_CRC32HI,
27337 IX86_BUILTIN_CRC32SI,
27338 IX86_BUILTIN_CRC32DI,
27340 IX86_BUILTIN_PCMPESTRI128,
27341 IX86_BUILTIN_PCMPESTRM128,
27342 IX86_BUILTIN_PCMPESTRA128,
27343 IX86_BUILTIN_PCMPESTRC128,
27344 IX86_BUILTIN_PCMPESTRO128,
27345 IX86_BUILTIN_PCMPESTRS128,
27346 IX86_BUILTIN_PCMPESTRZ128,
27347 IX86_BUILTIN_PCMPISTRI128,
27348 IX86_BUILTIN_PCMPISTRM128,
27349 IX86_BUILTIN_PCMPISTRA128,
27350 IX86_BUILTIN_PCMPISTRC128,
27351 IX86_BUILTIN_PCMPISTRO128,
27352 IX86_BUILTIN_PCMPISTRS128,
27353 IX86_BUILTIN_PCMPISTRZ128,
27355 IX86_BUILTIN_PCMPGTQ,
27357 /* AES instructions */
27358 IX86_BUILTIN_AESENC128,
27359 IX86_BUILTIN_AESENCLAST128,
27360 IX86_BUILTIN_AESDEC128,
27361 IX86_BUILTIN_AESDECLAST128,
27362 IX86_BUILTIN_AESIMC128,
27363 IX86_BUILTIN_AESKEYGENASSIST128,
27365 /* PCLMUL instruction */
27366 IX86_BUILTIN_PCLMULQDQ128,
27368 /* AVX */
27369 IX86_BUILTIN_ADDPD256,
27370 IX86_BUILTIN_ADDPS256,
27371 IX86_BUILTIN_ADDSUBPD256,
27372 IX86_BUILTIN_ADDSUBPS256,
27373 IX86_BUILTIN_ANDPD256,
27374 IX86_BUILTIN_ANDPS256,
27375 IX86_BUILTIN_ANDNPD256,
27376 IX86_BUILTIN_ANDNPS256,
27377 IX86_BUILTIN_BLENDPD256,
27378 IX86_BUILTIN_BLENDPS256,
27379 IX86_BUILTIN_BLENDVPD256,
27380 IX86_BUILTIN_BLENDVPS256,
27381 IX86_BUILTIN_DIVPD256,
27382 IX86_BUILTIN_DIVPS256,
27383 IX86_BUILTIN_DPPS256,
27384 IX86_BUILTIN_HADDPD256,
27385 IX86_BUILTIN_HADDPS256,
27386 IX86_BUILTIN_HSUBPD256,
27387 IX86_BUILTIN_HSUBPS256,
27388 IX86_BUILTIN_MAXPD256,
27389 IX86_BUILTIN_MAXPS256,
27390 IX86_BUILTIN_MINPD256,
27391 IX86_BUILTIN_MINPS256,
27392 IX86_BUILTIN_MULPD256,
27393 IX86_BUILTIN_MULPS256,
27394 IX86_BUILTIN_ORPD256,
27395 IX86_BUILTIN_ORPS256,
27396 IX86_BUILTIN_SHUFPD256,
27397 IX86_BUILTIN_SHUFPS256,
27398 IX86_BUILTIN_SUBPD256,
27399 IX86_BUILTIN_SUBPS256,
27400 IX86_BUILTIN_XORPD256,
27401 IX86_BUILTIN_XORPS256,
27402 IX86_BUILTIN_CMPSD,
27403 IX86_BUILTIN_CMPSS,
27404 IX86_BUILTIN_CMPPD,
27405 IX86_BUILTIN_CMPPS,
27406 IX86_BUILTIN_CMPPD256,
27407 IX86_BUILTIN_CMPPS256,
27408 IX86_BUILTIN_CVTDQ2PD256,
27409 IX86_BUILTIN_CVTDQ2PS256,
27410 IX86_BUILTIN_CVTPD2PS256,
27411 IX86_BUILTIN_CVTPS2DQ256,
27412 IX86_BUILTIN_CVTPS2PD256,
27413 IX86_BUILTIN_CVTTPD2DQ256,
27414 IX86_BUILTIN_CVTPD2DQ256,
27415 IX86_BUILTIN_CVTTPS2DQ256,
27416 IX86_BUILTIN_EXTRACTF128PD256,
27417 IX86_BUILTIN_EXTRACTF128PS256,
27418 IX86_BUILTIN_EXTRACTF128SI256,
27419 IX86_BUILTIN_VZEROALL,
27420 IX86_BUILTIN_VZEROUPPER,
27421 IX86_BUILTIN_VPERMILVARPD,
27422 IX86_BUILTIN_VPERMILVARPS,
27423 IX86_BUILTIN_VPERMILVARPD256,
27424 IX86_BUILTIN_VPERMILVARPS256,
27425 IX86_BUILTIN_VPERMILPD,
27426 IX86_BUILTIN_VPERMILPS,
27427 IX86_BUILTIN_VPERMILPD256,
27428 IX86_BUILTIN_VPERMILPS256,
27429 IX86_BUILTIN_VPERMIL2PD,
27430 IX86_BUILTIN_VPERMIL2PS,
27431 IX86_BUILTIN_VPERMIL2PD256,
27432 IX86_BUILTIN_VPERMIL2PS256,
27433 IX86_BUILTIN_VPERM2F128PD256,
27434 IX86_BUILTIN_VPERM2F128PS256,
27435 IX86_BUILTIN_VPERM2F128SI256,
27436 IX86_BUILTIN_VBROADCASTSS,
27437 IX86_BUILTIN_VBROADCASTSD256,
27438 IX86_BUILTIN_VBROADCASTSS256,
27439 IX86_BUILTIN_VBROADCASTPD256,
27440 IX86_BUILTIN_VBROADCASTPS256,
27441 IX86_BUILTIN_VINSERTF128PD256,
27442 IX86_BUILTIN_VINSERTF128PS256,
27443 IX86_BUILTIN_VINSERTF128SI256,
27444 IX86_BUILTIN_LOADUPD256,
27445 IX86_BUILTIN_LOADUPS256,
27446 IX86_BUILTIN_STOREUPD256,
27447 IX86_BUILTIN_STOREUPS256,
27448 IX86_BUILTIN_LDDQU256,
27449 IX86_BUILTIN_MOVNTDQ256,
27450 IX86_BUILTIN_MOVNTPD256,
27451 IX86_BUILTIN_MOVNTPS256,
27452 IX86_BUILTIN_LOADDQU256,
27453 IX86_BUILTIN_STOREDQU256,
27454 IX86_BUILTIN_MASKLOADPD,
27455 IX86_BUILTIN_MASKLOADPS,
27456 IX86_BUILTIN_MASKSTOREPD,
27457 IX86_BUILTIN_MASKSTOREPS,
27458 IX86_BUILTIN_MASKLOADPD256,
27459 IX86_BUILTIN_MASKLOADPS256,
27460 IX86_BUILTIN_MASKSTOREPD256,
27461 IX86_BUILTIN_MASKSTOREPS256,
27462 IX86_BUILTIN_MOVSHDUP256,
27463 IX86_BUILTIN_MOVSLDUP256,
27464 IX86_BUILTIN_MOVDDUP256,
27466 IX86_BUILTIN_SQRTPD256,
27467 IX86_BUILTIN_SQRTPS256,
27468 IX86_BUILTIN_SQRTPS_NR256,
27469 IX86_BUILTIN_RSQRTPS256,
27470 IX86_BUILTIN_RSQRTPS_NR256,
27472 IX86_BUILTIN_RCPPS256,
27474 IX86_BUILTIN_ROUNDPD256,
27475 IX86_BUILTIN_ROUNDPS256,
27477 IX86_BUILTIN_FLOORPD256,
27478 IX86_BUILTIN_CEILPD256,
27479 IX86_BUILTIN_TRUNCPD256,
27480 IX86_BUILTIN_RINTPD256,
27481 IX86_BUILTIN_ROUNDPD_AZ256,
27483 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
27484 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
27485 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
27487 IX86_BUILTIN_FLOORPS256,
27488 IX86_BUILTIN_CEILPS256,
27489 IX86_BUILTIN_TRUNCPS256,
27490 IX86_BUILTIN_RINTPS256,
27491 IX86_BUILTIN_ROUNDPS_AZ256,
27493 IX86_BUILTIN_FLOORPS_SFIX256,
27494 IX86_BUILTIN_CEILPS_SFIX256,
27495 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
27497 IX86_BUILTIN_UNPCKHPD256,
27498 IX86_BUILTIN_UNPCKLPD256,
27499 IX86_BUILTIN_UNPCKHPS256,
27500 IX86_BUILTIN_UNPCKLPS256,
27502 IX86_BUILTIN_SI256_SI,
27503 IX86_BUILTIN_PS256_PS,
27504 IX86_BUILTIN_PD256_PD,
27505 IX86_BUILTIN_SI_SI256,
27506 IX86_BUILTIN_PS_PS256,
27507 IX86_BUILTIN_PD_PD256,
27509 IX86_BUILTIN_VTESTZPD,
27510 IX86_BUILTIN_VTESTCPD,
27511 IX86_BUILTIN_VTESTNZCPD,
27512 IX86_BUILTIN_VTESTZPS,
27513 IX86_BUILTIN_VTESTCPS,
27514 IX86_BUILTIN_VTESTNZCPS,
27515 IX86_BUILTIN_VTESTZPD256,
27516 IX86_BUILTIN_VTESTCPD256,
27517 IX86_BUILTIN_VTESTNZCPD256,
27518 IX86_BUILTIN_VTESTZPS256,
27519 IX86_BUILTIN_VTESTCPS256,
27520 IX86_BUILTIN_VTESTNZCPS256,
27521 IX86_BUILTIN_PTESTZ256,
27522 IX86_BUILTIN_PTESTC256,
27523 IX86_BUILTIN_PTESTNZC256,
27525 IX86_BUILTIN_MOVMSKPD256,
27526 IX86_BUILTIN_MOVMSKPS256,
27528 /* AVX2 */
27529 IX86_BUILTIN_MPSADBW256,
27530 IX86_BUILTIN_PABSB256,
27531 IX86_BUILTIN_PABSW256,
27532 IX86_BUILTIN_PABSD256,
27533 IX86_BUILTIN_PACKSSDW256,
27534 IX86_BUILTIN_PACKSSWB256,
27535 IX86_BUILTIN_PACKUSDW256,
27536 IX86_BUILTIN_PACKUSWB256,
27537 IX86_BUILTIN_PADDB256,
27538 IX86_BUILTIN_PADDW256,
27539 IX86_BUILTIN_PADDD256,
27540 IX86_BUILTIN_PADDQ256,
27541 IX86_BUILTIN_PADDSB256,
27542 IX86_BUILTIN_PADDSW256,
27543 IX86_BUILTIN_PADDUSB256,
27544 IX86_BUILTIN_PADDUSW256,
27545 IX86_BUILTIN_PALIGNR256,
27546 IX86_BUILTIN_AND256I,
27547 IX86_BUILTIN_ANDNOT256I,
27548 IX86_BUILTIN_PAVGB256,
27549 IX86_BUILTIN_PAVGW256,
27550 IX86_BUILTIN_PBLENDVB256,
27551 IX86_BUILTIN_PBLENDVW256,
27552 IX86_BUILTIN_PCMPEQB256,
27553 IX86_BUILTIN_PCMPEQW256,
27554 IX86_BUILTIN_PCMPEQD256,
27555 IX86_BUILTIN_PCMPEQQ256,
27556 IX86_BUILTIN_PCMPGTB256,
27557 IX86_BUILTIN_PCMPGTW256,
27558 IX86_BUILTIN_PCMPGTD256,
27559 IX86_BUILTIN_PCMPGTQ256,
27560 IX86_BUILTIN_PHADDW256,
27561 IX86_BUILTIN_PHADDD256,
27562 IX86_BUILTIN_PHADDSW256,
27563 IX86_BUILTIN_PHSUBW256,
27564 IX86_BUILTIN_PHSUBD256,
27565 IX86_BUILTIN_PHSUBSW256,
27566 IX86_BUILTIN_PMADDUBSW256,
27567 IX86_BUILTIN_PMADDWD256,
27568 IX86_BUILTIN_PMAXSB256,
27569 IX86_BUILTIN_PMAXSW256,
27570 IX86_BUILTIN_PMAXSD256,
27571 IX86_BUILTIN_PMAXUB256,
27572 IX86_BUILTIN_PMAXUW256,
27573 IX86_BUILTIN_PMAXUD256,
27574 IX86_BUILTIN_PMINSB256,
27575 IX86_BUILTIN_PMINSW256,
27576 IX86_BUILTIN_PMINSD256,
27577 IX86_BUILTIN_PMINUB256,
27578 IX86_BUILTIN_PMINUW256,
27579 IX86_BUILTIN_PMINUD256,
27580 IX86_BUILTIN_PMOVMSKB256,
27581 IX86_BUILTIN_PMOVSXBW256,
27582 IX86_BUILTIN_PMOVSXBD256,
27583 IX86_BUILTIN_PMOVSXBQ256,
27584 IX86_BUILTIN_PMOVSXWD256,
27585 IX86_BUILTIN_PMOVSXWQ256,
27586 IX86_BUILTIN_PMOVSXDQ256,
27587 IX86_BUILTIN_PMOVZXBW256,
27588 IX86_BUILTIN_PMOVZXBD256,
27589 IX86_BUILTIN_PMOVZXBQ256,
27590 IX86_BUILTIN_PMOVZXWD256,
27591 IX86_BUILTIN_PMOVZXWQ256,
27592 IX86_BUILTIN_PMOVZXDQ256,
27593 IX86_BUILTIN_PMULDQ256,
27594 IX86_BUILTIN_PMULHRSW256,
27595 IX86_BUILTIN_PMULHUW256,
27596 IX86_BUILTIN_PMULHW256,
27597 IX86_BUILTIN_PMULLW256,
27598 IX86_BUILTIN_PMULLD256,
27599 IX86_BUILTIN_PMULUDQ256,
27600 IX86_BUILTIN_POR256,
27601 IX86_BUILTIN_PSADBW256,
27602 IX86_BUILTIN_PSHUFB256,
27603 IX86_BUILTIN_PSHUFD256,
27604 IX86_BUILTIN_PSHUFHW256,
27605 IX86_BUILTIN_PSHUFLW256,
27606 IX86_BUILTIN_PSIGNB256,
27607 IX86_BUILTIN_PSIGNW256,
27608 IX86_BUILTIN_PSIGND256,
27609 IX86_BUILTIN_PSLLDQI256,
27610 IX86_BUILTIN_PSLLWI256,
27611 IX86_BUILTIN_PSLLW256,
27612 IX86_BUILTIN_PSLLDI256,
27613 IX86_BUILTIN_PSLLD256,
27614 IX86_BUILTIN_PSLLQI256,
27615 IX86_BUILTIN_PSLLQ256,
27616 IX86_BUILTIN_PSRAWI256,
27617 IX86_BUILTIN_PSRAW256,
27618 IX86_BUILTIN_PSRADI256,
27619 IX86_BUILTIN_PSRAD256,
27620 IX86_BUILTIN_PSRLDQI256,
27621 IX86_BUILTIN_PSRLWI256,
27622 IX86_BUILTIN_PSRLW256,
27623 IX86_BUILTIN_PSRLDI256,
27624 IX86_BUILTIN_PSRLD256,
27625 IX86_BUILTIN_PSRLQI256,
27626 IX86_BUILTIN_PSRLQ256,
27627 IX86_BUILTIN_PSUBB256,
27628 IX86_BUILTIN_PSUBW256,
27629 IX86_BUILTIN_PSUBD256,
27630 IX86_BUILTIN_PSUBQ256,
27631 IX86_BUILTIN_PSUBSB256,
27632 IX86_BUILTIN_PSUBSW256,
27633 IX86_BUILTIN_PSUBUSB256,
27634 IX86_BUILTIN_PSUBUSW256,
27635 IX86_BUILTIN_PUNPCKHBW256,
27636 IX86_BUILTIN_PUNPCKHWD256,
27637 IX86_BUILTIN_PUNPCKHDQ256,
27638 IX86_BUILTIN_PUNPCKHQDQ256,
27639 IX86_BUILTIN_PUNPCKLBW256,
27640 IX86_BUILTIN_PUNPCKLWD256,
27641 IX86_BUILTIN_PUNPCKLDQ256,
27642 IX86_BUILTIN_PUNPCKLQDQ256,
27643 IX86_BUILTIN_PXOR256,
27644 IX86_BUILTIN_MOVNTDQA256,
27645 IX86_BUILTIN_VBROADCASTSS_PS,
27646 IX86_BUILTIN_VBROADCASTSS_PS256,
27647 IX86_BUILTIN_VBROADCASTSD_PD256,
27648 IX86_BUILTIN_VBROADCASTSI256,
27649 IX86_BUILTIN_PBLENDD256,
27650 IX86_BUILTIN_PBLENDD128,
27651 IX86_BUILTIN_PBROADCASTB256,
27652 IX86_BUILTIN_PBROADCASTW256,
27653 IX86_BUILTIN_PBROADCASTD256,
27654 IX86_BUILTIN_PBROADCASTQ256,
27655 IX86_BUILTIN_PBROADCASTB128,
27656 IX86_BUILTIN_PBROADCASTW128,
27657 IX86_BUILTIN_PBROADCASTD128,
27658 IX86_BUILTIN_PBROADCASTQ128,
27659 IX86_BUILTIN_VPERMVARSI256,
27660 IX86_BUILTIN_VPERMDF256,
27661 IX86_BUILTIN_VPERMVARSF256,
27662 IX86_BUILTIN_VPERMDI256,
27663 IX86_BUILTIN_VPERMTI256,
27664 IX86_BUILTIN_VEXTRACT128I256,
27665 IX86_BUILTIN_VINSERT128I256,
27666 IX86_BUILTIN_MASKLOADD,
27667 IX86_BUILTIN_MASKLOADQ,
27668 IX86_BUILTIN_MASKLOADD256,
27669 IX86_BUILTIN_MASKLOADQ256,
27670 IX86_BUILTIN_MASKSTORED,
27671 IX86_BUILTIN_MASKSTOREQ,
27672 IX86_BUILTIN_MASKSTORED256,
27673 IX86_BUILTIN_MASKSTOREQ256,
27674 IX86_BUILTIN_PSLLVV4DI,
27675 IX86_BUILTIN_PSLLVV2DI,
27676 IX86_BUILTIN_PSLLVV8SI,
27677 IX86_BUILTIN_PSLLVV4SI,
27678 IX86_BUILTIN_PSRAVV8SI,
27679 IX86_BUILTIN_PSRAVV4SI,
27680 IX86_BUILTIN_PSRLVV4DI,
27681 IX86_BUILTIN_PSRLVV2DI,
27682 IX86_BUILTIN_PSRLVV8SI,
27683 IX86_BUILTIN_PSRLVV4SI,
27685 IX86_BUILTIN_GATHERSIV2DF,
27686 IX86_BUILTIN_GATHERSIV4DF,
27687 IX86_BUILTIN_GATHERDIV2DF,
27688 IX86_BUILTIN_GATHERDIV4DF,
27689 IX86_BUILTIN_GATHERSIV4SF,
27690 IX86_BUILTIN_GATHERSIV8SF,
27691 IX86_BUILTIN_GATHERDIV4SF,
27692 IX86_BUILTIN_GATHERDIV8SF,
27693 IX86_BUILTIN_GATHERSIV2DI,
27694 IX86_BUILTIN_GATHERSIV4DI,
27695 IX86_BUILTIN_GATHERDIV2DI,
27696 IX86_BUILTIN_GATHERDIV4DI,
27697 IX86_BUILTIN_GATHERSIV4SI,
27698 IX86_BUILTIN_GATHERSIV8SI,
27699 IX86_BUILTIN_GATHERDIV4SI,
27700 IX86_BUILTIN_GATHERDIV8SI,
27702 /* Alternate 4-element gather for the vectorizer where
27703 all operands are 32 bytes wide. */
27704 IX86_BUILTIN_GATHERALTSIV4DF,
27705 IX86_BUILTIN_GATHERALTDIV8SF,
27706 IX86_BUILTIN_GATHERALTSIV4DI,
27707 IX86_BUILTIN_GATHERALTDIV8SI,
27709 /* TFmode support builtins. */
27710 IX86_BUILTIN_INFQ,
27711 IX86_BUILTIN_HUGE_VALQ,
27712 IX86_BUILTIN_FABSQ,
27713 IX86_BUILTIN_COPYSIGNQ,
27715 /* Vectorizer support builtins. */
27716 IX86_BUILTIN_CPYSGNPS,
27717 IX86_BUILTIN_CPYSGNPD,
27718 IX86_BUILTIN_CPYSGNPS256,
27719 IX86_BUILTIN_CPYSGNPD256,
27721 /* FMA4 instructions. */
27722 IX86_BUILTIN_VFMADDSS,
27723 IX86_BUILTIN_VFMADDSD,
27724 IX86_BUILTIN_VFMADDPS,
27725 IX86_BUILTIN_VFMADDPD,
27726 IX86_BUILTIN_VFMADDPS256,
27727 IX86_BUILTIN_VFMADDPD256,
27728 IX86_BUILTIN_VFMADDSUBPS,
27729 IX86_BUILTIN_VFMADDSUBPD,
27730 IX86_BUILTIN_VFMADDSUBPS256,
27731 IX86_BUILTIN_VFMADDSUBPD256,
27733 /* FMA3 instructions. */
27734 IX86_BUILTIN_VFMADDSS3,
27735 IX86_BUILTIN_VFMADDSD3,
27737 /* XOP instructions. */
27738 IX86_BUILTIN_VPCMOV,
27739 IX86_BUILTIN_VPCMOV_V2DI,
27740 IX86_BUILTIN_VPCMOV_V4SI,
27741 IX86_BUILTIN_VPCMOV_V8HI,
27742 IX86_BUILTIN_VPCMOV_V16QI,
27743 IX86_BUILTIN_VPCMOV_V4SF,
27744 IX86_BUILTIN_VPCMOV_V2DF,
27745 IX86_BUILTIN_VPCMOV256,
27746 IX86_BUILTIN_VPCMOV_V4DI256,
27747 IX86_BUILTIN_VPCMOV_V8SI256,
27748 IX86_BUILTIN_VPCMOV_V16HI256,
27749 IX86_BUILTIN_VPCMOV_V32QI256,
27750 IX86_BUILTIN_VPCMOV_V8SF256,
27751 IX86_BUILTIN_VPCMOV_V4DF256,
27753 IX86_BUILTIN_VPPERM,
27755 IX86_BUILTIN_VPMACSSWW,
27756 IX86_BUILTIN_VPMACSWW,
27757 IX86_BUILTIN_VPMACSSWD,
27758 IX86_BUILTIN_VPMACSWD,
27759 IX86_BUILTIN_VPMACSSDD,
27760 IX86_BUILTIN_VPMACSDD,
27761 IX86_BUILTIN_VPMACSSDQL,
27762 IX86_BUILTIN_VPMACSSDQH,
27763 IX86_BUILTIN_VPMACSDQL,
27764 IX86_BUILTIN_VPMACSDQH,
27765 IX86_BUILTIN_VPMADCSSWD,
27766 IX86_BUILTIN_VPMADCSWD,
27768 IX86_BUILTIN_VPHADDBW,
27769 IX86_BUILTIN_VPHADDBD,
27770 IX86_BUILTIN_VPHADDBQ,
27771 IX86_BUILTIN_VPHADDWD,
27772 IX86_BUILTIN_VPHADDWQ,
27773 IX86_BUILTIN_VPHADDDQ,
27774 IX86_BUILTIN_VPHADDUBW,
27775 IX86_BUILTIN_VPHADDUBD,
27776 IX86_BUILTIN_VPHADDUBQ,
27777 IX86_BUILTIN_VPHADDUWD,
27778 IX86_BUILTIN_VPHADDUWQ,
27779 IX86_BUILTIN_VPHADDUDQ,
27780 IX86_BUILTIN_VPHSUBBW,
27781 IX86_BUILTIN_VPHSUBWD,
27782 IX86_BUILTIN_VPHSUBDQ,
27784 IX86_BUILTIN_VPROTB,
27785 IX86_BUILTIN_VPROTW,
27786 IX86_BUILTIN_VPROTD,
27787 IX86_BUILTIN_VPROTQ,
27788 IX86_BUILTIN_VPROTB_IMM,
27789 IX86_BUILTIN_VPROTW_IMM,
27790 IX86_BUILTIN_VPROTD_IMM,
27791 IX86_BUILTIN_VPROTQ_IMM,
27793 IX86_BUILTIN_VPSHLB,
27794 IX86_BUILTIN_VPSHLW,
27795 IX86_BUILTIN_VPSHLD,
27796 IX86_BUILTIN_VPSHLQ,
27797 IX86_BUILTIN_VPSHAB,
27798 IX86_BUILTIN_VPSHAW,
27799 IX86_BUILTIN_VPSHAD,
27800 IX86_BUILTIN_VPSHAQ,
27802 IX86_BUILTIN_VFRCZSS,
27803 IX86_BUILTIN_VFRCZSD,
27804 IX86_BUILTIN_VFRCZPS,
27805 IX86_BUILTIN_VFRCZPD,
27806 IX86_BUILTIN_VFRCZPS256,
27807 IX86_BUILTIN_VFRCZPD256,
27809 IX86_BUILTIN_VPCOMEQUB,
27810 IX86_BUILTIN_VPCOMNEUB,
27811 IX86_BUILTIN_VPCOMLTUB,
27812 IX86_BUILTIN_VPCOMLEUB,
27813 IX86_BUILTIN_VPCOMGTUB,
27814 IX86_BUILTIN_VPCOMGEUB,
27815 IX86_BUILTIN_VPCOMFALSEUB,
27816 IX86_BUILTIN_VPCOMTRUEUB,
27818 IX86_BUILTIN_VPCOMEQUW,
27819 IX86_BUILTIN_VPCOMNEUW,
27820 IX86_BUILTIN_VPCOMLTUW,
27821 IX86_BUILTIN_VPCOMLEUW,
27822 IX86_BUILTIN_VPCOMGTUW,
27823 IX86_BUILTIN_VPCOMGEUW,
27824 IX86_BUILTIN_VPCOMFALSEUW,
27825 IX86_BUILTIN_VPCOMTRUEUW,
27827 IX86_BUILTIN_VPCOMEQUD,
27828 IX86_BUILTIN_VPCOMNEUD,
27829 IX86_BUILTIN_VPCOMLTUD,
27830 IX86_BUILTIN_VPCOMLEUD,
27831 IX86_BUILTIN_VPCOMGTUD,
27832 IX86_BUILTIN_VPCOMGEUD,
27833 IX86_BUILTIN_VPCOMFALSEUD,
27834 IX86_BUILTIN_VPCOMTRUEUD,
27836 IX86_BUILTIN_VPCOMEQUQ,
27837 IX86_BUILTIN_VPCOMNEUQ,
27838 IX86_BUILTIN_VPCOMLTUQ,
27839 IX86_BUILTIN_VPCOMLEUQ,
27840 IX86_BUILTIN_VPCOMGTUQ,
27841 IX86_BUILTIN_VPCOMGEUQ,
27842 IX86_BUILTIN_VPCOMFALSEUQ,
27843 IX86_BUILTIN_VPCOMTRUEUQ,
27845 IX86_BUILTIN_VPCOMEQB,
27846 IX86_BUILTIN_VPCOMNEB,
27847 IX86_BUILTIN_VPCOMLTB,
27848 IX86_BUILTIN_VPCOMLEB,
27849 IX86_BUILTIN_VPCOMGTB,
27850 IX86_BUILTIN_VPCOMGEB,
27851 IX86_BUILTIN_VPCOMFALSEB,
27852 IX86_BUILTIN_VPCOMTRUEB,
27854 IX86_BUILTIN_VPCOMEQW,
27855 IX86_BUILTIN_VPCOMNEW,
27856 IX86_BUILTIN_VPCOMLTW,
27857 IX86_BUILTIN_VPCOMLEW,
27858 IX86_BUILTIN_VPCOMGTW,
27859 IX86_BUILTIN_VPCOMGEW,
27860 IX86_BUILTIN_VPCOMFALSEW,
27861 IX86_BUILTIN_VPCOMTRUEW,
27863 IX86_BUILTIN_VPCOMEQD,
27864 IX86_BUILTIN_VPCOMNED,
27865 IX86_BUILTIN_VPCOMLTD,
27866 IX86_BUILTIN_VPCOMLED,
27867 IX86_BUILTIN_VPCOMGTD,
27868 IX86_BUILTIN_VPCOMGED,
27869 IX86_BUILTIN_VPCOMFALSED,
27870 IX86_BUILTIN_VPCOMTRUED,
27872 IX86_BUILTIN_VPCOMEQQ,
27873 IX86_BUILTIN_VPCOMNEQ,
27874 IX86_BUILTIN_VPCOMLTQ,
27875 IX86_BUILTIN_VPCOMLEQ,
27876 IX86_BUILTIN_VPCOMGTQ,
27877 IX86_BUILTIN_VPCOMGEQ,
27878 IX86_BUILTIN_VPCOMFALSEQ,
27879 IX86_BUILTIN_VPCOMTRUEQ,
27881 /* LWP instructions. */
27882 IX86_BUILTIN_LLWPCB,
27883 IX86_BUILTIN_SLWPCB,
27884 IX86_BUILTIN_LWPVAL32,
27885 IX86_BUILTIN_LWPVAL64,
27886 IX86_BUILTIN_LWPINS32,
27887 IX86_BUILTIN_LWPINS64,
27889 IX86_BUILTIN_CLZS,
27891 /* RTM */
27892 IX86_BUILTIN_XBEGIN,
27893 IX86_BUILTIN_XEND,
27894 IX86_BUILTIN_XABORT,
27895 IX86_BUILTIN_XTEST,
27897 /* BMI instructions. */
27898 IX86_BUILTIN_BEXTR32,
27899 IX86_BUILTIN_BEXTR64,
27900 IX86_BUILTIN_CTZS,
27902 /* TBM instructions. */
27903 IX86_BUILTIN_BEXTRI32,
27904 IX86_BUILTIN_BEXTRI64,
27906 /* BMI2 instructions. */
27907 IX86_BUILTIN_BZHI32,
27908 IX86_BUILTIN_BZHI64,
27909 IX86_BUILTIN_PDEP32,
27910 IX86_BUILTIN_PDEP64,
27911 IX86_BUILTIN_PEXT32,
27912 IX86_BUILTIN_PEXT64,
27914 /* ADX instructions. */
27915 IX86_BUILTIN_ADDCARRYX32,
27916 IX86_BUILTIN_ADDCARRYX64,
27918 /* FSGSBASE instructions. */
27919 IX86_BUILTIN_RDFSBASE32,
27920 IX86_BUILTIN_RDFSBASE64,
27921 IX86_BUILTIN_RDGSBASE32,
27922 IX86_BUILTIN_RDGSBASE64,
27923 IX86_BUILTIN_WRFSBASE32,
27924 IX86_BUILTIN_WRFSBASE64,
27925 IX86_BUILTIN_WRGSBASE32,
27926 IX86_BUILTIN_WRGSBASE64,
27928 /* RDRND instructions. */
27929 IX86_BUILTIN_RDRAND16_STEP,
27930 IX86_BUILTIN_RDRAND32_STEP,
27931 IX86_BUILTIN_RDRAND64_STEP,
27933 /* RDSEED instructions. */
27934 IX86_BUILTIN_RDSEED16_STEP,
27935 IX86_BUILTIN_RDSEED32_STEP,
27936 IX86_BUILTIN_RDSEED64_STEP,
27938 /* F16C instructions. */
27939 IX86_BUILTIN_CVTPH2PS,
27940 IX86_BUILTIN_CVTPH2PS256,
27941 IX86_BUILTIN_CVTPS2PH,
27942 IX86_BUILTIN_CVTPS2PH256,
27944 /* CFString built-in for darwin */
27945 IX86_BUILTIN_CFSTRING,
27947 /* Builtins to get CPU type and supported features. */
27948 IX86_BUILTIN_CPU_INIT,
27949 IX86_BUILTIN_CPU_IS,
27950 IX86_BUILTIN_CPU_SUPPORTS,
27952 /* Read/write FLAGS register built-ins. */
27953 IX86_BUILTIN_READ_FLAGS,
27954 IX86_BUILTIN_WRITE_FLAGS,
27956 IX86_BUILTIN_MAX
27959 /* Table for the ix86 builtin decls. */
27960 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
27962 /* Table of all of the builtin functions that are possible with different ISAs
27963 but are waiting to be built until a function is declared to use that
27964 ISA. */
27965 struct builtin_isa {
27966 const char *name; /* function name */
27967 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
27968 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
27969 bool const_p; /* true if the declaration is constant */
27970 bool set_and_not_built_p;
27973 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
27976 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
27977 of which isa_flags to use in the ix86_builtins_isa array. Store the
27978 function decl in the ix86_builtins array. Return the function decl or
27979 NULL_TREE if the builtin was not added.
27981 If the front end has a special hook for builtin functions, delay adding
27982 builtin functions that aren't in the current ISA until the ISA is changed
27983 with function-specific optimization. Doing so can save about 300K for the
27984 default compiler. When the builtin is expanded, check at that time whether
27985 it is valid.
27987 If the front end doesn't have a special hook, record all builtins, even
27988 those not in the current ISA, in case the user uses function-specific
27989 options for a different ISA, so that we don't get scope errors if a
27990 builtin is added in the middle of a function scope. */
27992 static inline tree
27993 def_builtin (HOST_WIDE_INT mask, const char *name,
27994 enum ix86_builtin_func_type tcode,
27995 enum ix86_builtins code)
27997 tree decl = NULL_TREE;
27999 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
28001 ix86_builtins_isa[(int) code].isa = mask;
28003 mask &= ~OPTION_MASK_ISA_64BIT;
28004 if (mask == 0
28005 || (mask & ix86_isa_flags) != 0
28006 || (lang_hooks.builtin_function
28007 == lang_hooks.builtin_function_ext_scope))
28010 tree type = ix86_get_builtin_func_type (tcode);
28011 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
28012 NULL, NULL_TREE);
28013 ix86_builtins[(int) code] = decl;
28014 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
28016 else
28018 ix86_builtins[(int) code] = NULL_TREE;
28019 ix86_builtins_isa[(int) code].tcode = tcode;
28020 ix86_builtins_isa[(int) code].name = name;
28021 ix86_builtins_isa[(int) code].const_p = false;
28022 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
28026 return decl;
28029 /* Like def_builtin, but also marks the function decl "const". */
28031 static inline tree
28032 def_builtin_const (HOST_WIDE_INT mask, const char *name,
28033 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
28035 tree decl = def_builtin (mask, name, tcode, code);
28036 if (decl)
28037 TREE_READONLY (decl) = 1;
28038 else
28039 ix86_builtins_isa[(int) code].const_p = true;
28041 return decl;
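/* Registration sketch (this particular call is only an example; the real
   call sites appear later in this file):

       def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtf",
                          FLOAT_FTYPE_FLOAT, IX86_BUILTIN_RSQRTF);

   If SSE is not in ix86_isa_flags and the front end defers out-of-ISA
   builtins, the decl is not created here; ix86_add_new_builtins below
   builds it on demand once the ISA is enabled for some function.  */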
28044 /* Add any new builtin functions for a given ISA that may not have been
28045 declared. This saves a bit of space compared to adding all of the
28046 declarations to the tree, including those we never use. */
28048 static void
28049 ix86_add_new_builtins (HOST_WIDE_INT isa)
28051 int i;
28053 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
28055 if ((ix86_builtins_isa[i].isa & isa) != 0
28056 && ix86_builtins_isa[i].set_and_not_built_p)
28058 tree decl, type;
28060 /* Don't define the builtin again. */
28061 ix86_builtins_isa[i].set_and_not_built_p = false;
28063 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
28064 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
28065 type, i, BUILT_IN_MD, NULL,
28066 NULL_TREE);
28068 ix86_builtins[i] = decl;
28069 if (ix86_builtins_isa[i].const_p)
28070 TREE_READONLY (decl) = 1;
28075 /* Bits for builtin_description.flag. */
28077 /* Set when we don't support the comparison natively, and should
28078 swap the comparison operands in order to support it. */
28079 #define BUILTIN_DESC_SWAP_OPERANDS 1
28081 struct builtin_description
28083 const HOST_WIDE_INT mask;
28084 const enum insn_code icode;
28085 const char *const name;
28086 const enum ix86_builtins code;
28087 const enum rtx_code comparison;
28088 const int flag;
28091 static const struct builtin_description bdesc_comi[] =
28093 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
28094 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
28095 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
28096 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
28097 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
28098 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
28099 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
28100 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
28101 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
28102 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
28103 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
28104 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
28105 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
28106 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
28107 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
28108 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
28109 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
28110 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
28111 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
28112 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
28113 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
28114 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
28115 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
28116 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
28119 static const struct builtin_description bdesc_pcmpestr[] =
28121 /* SSE4.2 */
28122 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
28123 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
28124 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
28125 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
28126 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
28127 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
28128 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
28131 static const struct builtin_description bdesc_pcmpistr[] =
28133 /* SSE4.2 */
28134 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
28135 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
28136 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
28137 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
28138 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
28139 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
28140 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
28143 /* Special builtins with variable number of arguments. */
28144 static const struct builtin_description bdesc_special_args[] =
28146 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
28147 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
28148 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
28150 /* 80387 (used internally for atomic compound assignment). */
28151 { 0, CODE_FOR_fnstenv, "__builtin_ia32_fnstenv", IX86_BUILTIN_FNSTENV, UNKNOWN, (int) VOID_FTYPE_PVOID },
28152 { 0, CODE_FOR_fldenv, "__builtin_ia32_fldenv", IX86_BUILTIN_FLDENV, UNKNOWN, (int) VOID_FTYPE_PCVOID },
28153 { 0, CODE_FOR_fnstsw, "__builtin_ia32_fnstsw", IX86_BUILTIN_FNSTSW, UNKNOWN, (int) VOID_FTYPE_PUSHORT },
28154 { 0, CODE_FOR_fnclex, "__builtin_ia32_fnclex", IX86_BUILTIN_FNCLEX, UNKNOWN, (int) VOID_FTYPE_VOID },
28156 /* MMX */
28157 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
28159 /* 3DNow! */
28160 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
28162 /* FXSR, XSAVE and XSAVEOPT */
28163 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
28164 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
28165 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28166 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28167 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28169 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
28170 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
28171 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28172 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28173 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
28175 /* SSE */
28176 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
28177 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
28178 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
28180 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
28181 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
28182 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
28183 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
28185 /* SSE or 3DNow!A */
28186 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
28187 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
28189 /* SSE2 */
28190 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
28191 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
28192 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
28193 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedquv16qi, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
28194 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
28195 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
28196 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
28197 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
28198 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
28199 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddquv16qi, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
28201 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
28202 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
28204 /* SSE3 */
28205 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
28207 /* SSE4.1 */
28208 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
28210 /* SSE4A */
28211 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
28212 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
28214 /* AVX */
28215 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
28216 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
28218 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
28219 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
28220 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
28221 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
28222 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
28224 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
28225 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
28226 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
28227 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
28228 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddquv32qi, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
28229 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedquv32qi, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
28230 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
28232 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
28233 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
28234 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
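/* AVX masked loads and stores. The mask operand is an integer vector of the same width; each element is selected by its sign bit. Masked-off elements read as zero on a load and leave memory untouched on a store. */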
28236 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
28237 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
28238 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
28239 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
28240 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
28241 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
28242 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
28243 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
28245 /* AVX2 */
28246 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
28247 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
28248 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
28249 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
28250 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
28251 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
28252 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
28253 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
28254 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
28256 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
28257 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
28258 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
28259 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
28260 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
28261 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
28263 /* FSGSBASE */
28264 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
28265 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
28266 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
28267 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
28268 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
28269 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
28270 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
28271 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
28273 /* RTM */
28274 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
28275 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
28276 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
28277 };
28279 /* Builtins with variable number of arguments. */
28280 static const struct builtin_description bdesc_args[] =
28281 {
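/* Each entry gives the ISA option mask required for the builtin, the insn code used to expand it, the __builtin_ia32_* name, the IX86_BUILTIN_* code, an rtx comparison code (UNKNOWN where none applies), and the prototype descriptor cast to int. */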
28282 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
28283 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
28284 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
28285 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
28286 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
28287 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
28288 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
28290 /* MMX */
28291 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28292 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28293 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28294 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28295 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28296 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28298 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28299 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28300 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28301 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28302 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28303 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28304 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28305 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28307 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28308 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28310 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28311 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28312 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28313 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28315 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28316 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28317 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28318 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28319 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28320 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28322 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28323 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28324 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28325 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28326 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28327 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28329 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
28330 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
28331 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
28333 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
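/* MMX shifts. The _COUNT suffix on the prototype marks the last operand as the shift count, taken either from an SImode scalar (the ..._SI_COUNT forms) or from a vector register (the forms that repeat the vector type). */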
28335 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
28336 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
28337 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
28338 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
28339 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
28340 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
28342 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
28343 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
28344 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
28345 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
28346 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
28347 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
28349 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
28350 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
28351 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
28352 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
28354 /* 3DNow! */
28355 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
28356 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
28357 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
28358 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
28360 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28361 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28362 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28363 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
28364 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
28365 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
28366 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28367 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28368 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28369 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28370 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28371 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28372 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28373 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28374 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28376 /* 3DNow!A */
28377 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
28378 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
28379 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
28380 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
28381 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28382 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28384 /* SSE */
28385 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
28386 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28387 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28388 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28389 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28390 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28391 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
28392 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
28393 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
28394 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
28395 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
28396 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
28398 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28400 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28401 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28402 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28403 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28404 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28405 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28406 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28407 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
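/* Packed and scalar compares. The rtx code in the fifth field selects the condition; the _SWAP suffix means the two operands are exchanged before expansion, so e.g. cmpgtps is implemented as LT with swapped operands. */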
28409 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
28410 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
28411 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
28412 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28413 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28414 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28415 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
28416 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
28417 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
28418 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28419 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28420 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28421 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
28422 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
28423 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
28424 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28425 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
28426 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
28427 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
28428 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28430 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28431 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28432 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28433 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28435 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28436 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28437 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28438 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28440 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28442 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28443 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28444 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28445 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28446 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28448 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
28449 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
28450 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
28452 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
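/* The _VEC_MERGE suffix marks the scalar 'ss'/'sd' forms: the operation is applied to element 0 and the remaining elements are copied unchanged from the source operand. */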
28454 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
28455 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
28456 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
28458 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
28459 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
28461 /* SSE MMX or 3DNow!A */
28462 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28463 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28464 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28466 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28467 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28468 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28469 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28471 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
28472 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
28474 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
28476 /* SSE2 */
28477 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28479 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
28480 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
28481 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
28482 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
28483 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
28485 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
28486 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
28487 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
28488 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
28489 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
28491 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
28493 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
28494 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
28495 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
28496 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
28498 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_fix_notruncv4sfv4si, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
28499 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
28500 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
28502 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28503 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28504 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28505 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28506 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28507 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28508 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28509 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28511 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
28512 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
28513 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
28514 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28515 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28516 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28517 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
28518 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
28519 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
28520 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28521 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28522 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28523 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
28524 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
28525 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
28526 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28527 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
28528 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
28529 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
28530 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28532 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28533 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28534 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28535 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28537 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28538 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28539 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28540 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28542 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28544 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28545 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28546 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28548 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
28550 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28551 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28552 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28553 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28554 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28555 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28556 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28557 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28559 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28560 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28561 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28562 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28563 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28564 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28565 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28566 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28568 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28569 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28571 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28572 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28573 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28574 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28576 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28577 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28579 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28580 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28581 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28582 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28583 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28584 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28586 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28587 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28588 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28589 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28591 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28592 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28593 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28594 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28595 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28596 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28597 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28598 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28600 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
28601 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
28602 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
28604 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28605 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
28607 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
28608 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
28610 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
28612 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
28613 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
28614 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
28615 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
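/* Whole-register byte shifts. The insn patterns operate on the 128-bit value as a single V1TImode quantity, hence the _INT_CONVERT suffix on the V2DI prototypes. */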
28617 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
28618 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
28619 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
28620 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
28621 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
28622 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
28623 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
28625 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
28626 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
28627 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
28628 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
28629 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
28630 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
28631 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
28633 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
28634 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
28635 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
28636 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
28638 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
28639 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
28640 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
28642 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
28644 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28646 /* SSE2 MMX */
28647 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
28648 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
28650 /* SSE3 */
28651 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28652 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28654 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28655 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28656 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28657 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28658 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28659 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28661 /* SSSE3 */
28662 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
28663 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
28664 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
28665 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
28666 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
28667 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
28669 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28670 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28671 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28672 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28673 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28674 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28675 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28676 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28677 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28678 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28679 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28680 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28681 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
28682 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
28683 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28684 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28685 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28686 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28687 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28688 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28689 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28690 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28691 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28692 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28694 /* SSSE3. */
28695 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
28696 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
28698 /* SSE4.1 */
28699 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28700 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28701 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
28702 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
28703 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28704 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28705 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28706 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
28707 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
28708 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
28710 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
28711 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
28712 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
28713 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
28714 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
28715 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
28716 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
28717 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
28718 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
28719 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
28720 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
28721 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
28722 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
28724 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
28725 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28726 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28727 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28728 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28729 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28730 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28731 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28732 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28733 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28734 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
28735 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28737 /* SSE4.1 */
28738 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
28739 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
28740 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28741 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
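/* The floor/ceil/trunc/rint entries that follow encode the rounding mode
   (ROUND_FLOOR, ROUND_CEIL, ROUND_TRUNC, ROUND_MXCSR) in the slot that
   normally holds an rtx comparison code, hence the (enum rtx_code) casts;
   the *_ROUND function types mark entries taking this implicit rounding
   immediate.  */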
28743 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
28744 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
28745 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
28746 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
28748 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
28749 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
28751 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
28752 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
28754 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
28755 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
28756 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
28757 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
28759 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
28760 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
28762 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28763 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
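/* For ptestz/ptestc/ptestnzc (and likewise the vtest*/ptest256 entries in
   the AVX block below) the comparison code selects which flag of the
   [V]PTEST result is returned: EQ reads ZF, LTU reads CF, and GTU tests
   CF == 0 && ZF == 0.  */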
28765 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
28766 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
28767 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
28769 /* SSE4.2 */
28770 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28771 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
28772 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
28773 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28774 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28776 /* SSE4A */
28777 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
28778 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
28779 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
28780 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28782 /* AES */
28783 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
28784 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28786 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28787 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28788 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28789 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28791 /* PCLMUL */
28792 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
28794 /* AVX */
28795 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28796 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28797 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28798 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28799 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28800 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28801 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28802 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28803 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28804 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28805 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28806 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28807 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28808 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28809 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28810 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28811 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28812 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28813 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28814 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28815 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28816 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28817 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28818 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28819 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28820 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28822 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
28823 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
28824 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
28825 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
28827 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28828 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28829 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
28830 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
28831 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28832 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28833 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28834 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28835 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28836 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28837 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28838 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28839 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28840 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
28841 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
28842 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
28843 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
28844 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
28845 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
28846 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_fix_notruncv8sfv8si, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28847 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
28848 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
28849 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
28850 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28851 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28852 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28853 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
28854 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
28855 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
28856 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28857 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
28858 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
28859 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
28860 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
28862 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28863 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28864 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28866 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28867 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28868 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28869 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28870 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28872 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28874 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28875 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
28877 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
28878 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
28879 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
28880 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
28882 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28883 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
28885 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
28886 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
28888 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
28889 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
28890 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
28891 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
28893 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
28894 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
28896 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28897 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28899 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28900 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28901 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28902 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28904 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
28905 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
28906 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
28907 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
28908 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
28909 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
28911 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28912 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28913 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28914 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28915 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28916 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28917 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28918 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28919 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28920 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28921 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28922 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28923 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28924 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28925 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28927 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
28928 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
28930 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28931 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28933 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256 ", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
28935 /* AVX2 */
28936 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
28937 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
28938 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
28939 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
28940 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
28941 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
28942 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
28943 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
28944 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28945 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28946 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28947 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28948 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28949 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28950 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28951 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28952 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
28953 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28954 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28955 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28956 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28957 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
28958 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
28959 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28960 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28961 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28962 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28963 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28964 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28965 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28966 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28967 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28968 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28969 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28970 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28971 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28972 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28973 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
28974 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
28975 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28976 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28977 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28978 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28979 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28980 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28981 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28982 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28983 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28984 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28985 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28986 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28987 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
28988 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
28989 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
28990 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
28991 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
28992 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
28993 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
28994 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
28995 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
28996 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
28997 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
28998 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
28999 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
29000 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
29001 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
29002 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
29003 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
29004 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
29005 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
29006 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
29007 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
29008 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
29009 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
29010 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
29011 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
29012 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
29013 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
29014 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
29015 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
29016 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
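/* Each scalar-count shift below (psllwi256, pslldi256, ...) shares its insn
   code with the matching vector-count form (psllw256, pslld256, ...); the
   ..._SI_COUNT and ..._INT_COUNT function types mark counts passed as a
   scalar integer, while the ..._V8HI_COUNT / ..._V4SI_COUNT / ..._V2DI_COUNT
   types mark counts taken from a vector operand.  */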
29017 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
29018 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
29019 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
29020 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
29021 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
29022 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
29023 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
29024 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
29025 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
29026 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
29027 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
29028 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
29029 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
29030 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
29031 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
29032 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
29033 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
29034 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
29035 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
29036 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
29037 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
29038 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
29039 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
29040 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
29041 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
29042 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
29043 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
29044 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
29045 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
29046 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
29047 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
29048 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
29049 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
29050 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
29051 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
29052 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
29053 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
29054 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
29055 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
29056 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
29057 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
29058 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
29059 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
29060 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
29061 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
29062 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
29063 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
29064 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
29065 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
29066 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
29067 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
29068 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
29069 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
29070 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
29071 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
29072 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
29073 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
29074 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
29075 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
29076 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
29077 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
29078 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
29079 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
29080 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
29081 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
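/* LZCNT */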
29083 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
29085 /* BMI */
29086 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
29087 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
29088 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
29090 /* TBM */
29091 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
29092 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
29094 /* F16C */
29095 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
29096 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
29097 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
29098 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
29100 /* BMI2 */
29101 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
29102 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
29103 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
29104 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
29105 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
29106 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
29109 /* FMA4 and XOP. */
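/* The MULTI_ARG_* macros below are local shorthands for the V*_FTYPE_*
   function-type codes; they keep the bdesc_multi_arg initializer that
   follows readable.  */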
29110 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
29111 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
29112 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
29113 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
29114 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
29115 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
29116 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
29117 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
29118 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
29119 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
29120 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
29121 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
29122 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
29123 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
29124 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
29125 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
29126 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
29127 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
29128 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
29129 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
29130 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
29131 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
29132 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
29133 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
29134 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
29135 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
29136 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
29137 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
29138 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
29139 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
29140 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
29141 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
29142 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
29143 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
29144 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
29145 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
29146 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
29147 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
29148 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
29149 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
29150 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
29151 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
29152 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
29153 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
29154 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
29155 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
29156 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
29157 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
29158 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
29159 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
29160 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
29161 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
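/* Each entry below lists: the ISA option mask required to enable the
   builtin, the insn code used to expand it, its user-visible name, its
   IX86_BUILTIN_* enumerator, an rtx comparison code (UNKNOWN when not
   used), and the function-type code describing its signature.  */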
29163 static const struct builtin_description bdesc_multi_arg[] =
29164 {
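/* The vfmaddss/vfmaddsd entries are the FMA4 forms; the vfmaddss3/vfmaddsd3
   entries just below are the corresponding FMA3 (OPTION_MASK_ISA_FMA)
   forms.  */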
29165 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
29166 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
29167 UNKNOWN, (int)MULTI_ARG_3_SF },
29168 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
29169 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
29170 UNKNOWN, (int)MULTI_ARG_3_DF },
29172 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
29173 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
29174 UNKNOWN, (int)MULTI_ARG_3_SF },
29175 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
29176 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
29177 UNKNOWN, (int)MULTI_ARG_3_DF },
29179 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
29180 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
29181 UNKNOWN, (int)MULTI_ARG_3_SF },
29182 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
29183 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
29184 UNKNOWN, (int)MULTI_ARG_3_DF },
29185 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
29186 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
29187 UNKNOWN, (int)MULTI_ARG_3_SF2 },
29188 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
29189 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
29190 UNKNOWN, (int)MULTI_ARG_3_DF2 },
29192 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
29193 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
29194 UNKNOWN, (int)MULTI_ARG_3_SF },
29195 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
29196 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
29197 UNKNOWN, (int)MULTI_ARG_3_DF },
29198 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
29199 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
29200 UNKNOWN, (int)MULTI_ARG_3_SF2 },
29201 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
29202 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
29203 UNKNOWN, (int)MULTI_ARG_3_DF2 },
29205 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
29206 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
29207 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
29208 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
29209 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi", IX86_BUILTIN_VPCMOV_V16QI, UNKNOWN, (int)MULTI_ARG_3_QI },
29210 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
29211 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
29213 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
29214 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
29215 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
29216 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
29217 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
29218 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
29219 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
29221 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
29223 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
29224 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
29225 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
29226 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
29227 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
29228 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
29229 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
29230 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
29231 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
29232 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
29233 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
29234 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
29236 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
29237 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
29238 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
29239 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
29240 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
29241 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
29242 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
29243 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
29244 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
29245 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
29246 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
29247 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
29248 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
29249 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
29250 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
29251 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
29253 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_1_SF },
29254 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_1_DF },
29255 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
29256 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
29257 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
29258 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
29260 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
29261 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
29262 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
29263 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
29264 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
29265 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
29266 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
29267 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
29268 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
29269 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
29270 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
29271 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
29272 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
29273 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
29274 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
29276 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
29277 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
29278 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
29279 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
29280 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
29281 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
29282 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
29284 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
29285 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
29286 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
29287 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
29288 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
29289 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
29290 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
29292 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
29293 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
29294 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
29295 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
29296 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
29297 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
29298 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
29300 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
29301 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
29302 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
29303 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
29304 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
29305 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
29306 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
29308 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
29309 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
29310 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
29311 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
29312 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
29313 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
29314 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
29316 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
29317 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
29318 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
29319 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
29320 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
29321 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
29322 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
29324 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
29325 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
29326 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
29327 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
29328 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
29329 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
29330 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
29332 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
29333 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
29334 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
29335 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
29336 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
29337 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
29338 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
29340 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
29341 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
29342 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
29343 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
29344 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
29345 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
29346 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
29347 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
29349 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
29350 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
29351 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
29352 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
29353 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
29354 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
29355 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
29356 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
29358 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
29359 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
29360 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
29361 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
29365 /* TM vector builtins. */
29367 /* Reuse the existing x86-specific `struct builtin_description' because
29368 we're lazy. Add casts to make them fit. */
29369 static const struct builtin_description bdesc_tm[] =
29371 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
29372 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
29373 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
29374 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29375 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29376 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29377 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29379 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
29380 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
29381 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
29382 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29383 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29384 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29385 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29387 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
29388 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
29389 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
29390 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29391 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29392 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29393 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29395 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
29396 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
29397 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
29400 /* TM callbacks. */
29402 /* Return the builtin decl needed to load a vector of TYPE. */
29404 static tree
29405 ix86_builtin_tm_load (tree type)
29407 if (TREE_CODE (type) == VECTOR_TYPE)
29409 switch (tree_to_uhwi (TYPE_SIZE (type)))
29411 case 64:
29412 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
29413 case 128:
29414 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
29415 case 256:
29416 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
29419 return NULL_TREE;
29422 /* Return the builtin decl needed to store a vector of TYPE. */
29424 static tree
29425 ix86_builtin_tm_store (tree type)
29427 if (TREE_CODE (type) == VECTOR_TYPE)
29429 switch (tree_to_uhwi (TYPE_SIZE (type)))
29431 case 64:
29432 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
29433 case 128:
29434 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
29435 case 256:
29436 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
29439 return NULL_TREE;
29442 /* Initialize the transactional memory vector load/store builtins. */
29444 static void
29445 ix86_init_tm_builtins (void)
29447 enum ix86_builtin_func_type ftype;
29448 const struct builtin_description *d;
29449 size_t i;
29450 tree decl;
29451 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
29452 tree attrs_log, attrs_type_log;
29454 if (!flag_tm)
29455 return;
29457 /* If there are no builtins defined, we must be compiling in a
29458 language without trans-mem support. */
29459 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
29460 return;
29462 /* Use whatever attributes a normal TM load has. */
29463 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
29464 attrs_load = DECL_ATTRIBUTES (decl);
29465 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
29466 /* Use whatever attributes a normal TM store has. */
29467 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
29468 attrs_store = DECL_ATTRIBUTES (decl);
29469 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
29470 /* Use whatever attributes a normal TM log has. */
29471 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
29472 attrs_log = DECL_ATTRIBUTES (decl);
29473 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
29475 for (i = 0, d = bdesc_tm;
29476 i < ARRAY_SIZE (bdesc_tm);
29477 i++, d++)
29479 if ((d->mask & ix86_isa_flags) != 0
29480 || (lang_hooks.builtin_function
29481 == lang_hooks.builtin_function_ext_scope))
29483 tree type, attrs, attrs_type;
29484 enum built_in_function code = (enum built_in_function) d->code;
29486 ftype = (enum ix86_builtin_func_type) d->flag;
29487 type = ix86_get_builtin_func_type (ftype);
29489 if (BUILTIN_TM_LOAD_P (code))
29491 attrs = attrs_load;
29492 attrs_type = attrs_type_load;
29494 else if (BUILTIN_TM_STORE_P (code))
29496 attrs = attrs_store;
29497 attrs_type = attrs_type_store;
29499 else
29501 attrs = attrs_log;
29502 attrs_type = attrs_type_log;
29504 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
29505 /* The builtin name without the __builtin_ prefix, for
29506 calling it directly. */
29507 d->name + strlen ("__builtin_"),
29508 attrs);
29509 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
29510 set the TYPE_ATTRIBUTES. */
29511 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
29513 set_builtin_decl (code, decl, false);
29518 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
29519 in the current target ISA, so that the user can compile particular modules
29520 with target-specific options that differ from the command-line
29521 options. */
29522 static void
29523 ix86_init_mmx_sse_builtins (void)
29525 const struct builtin_description * d;
29526 enum ix86_builtin_func_type ftype;
29527 size_t i;
29529 /* Add all special builtins with variable number of operands. */
29530 for (i = 0, d = bdesc_special_args;
29531 i < ARRAY_SIZE (bdesc_special_args);
29532 i++, d++)
29534 if (d->name == 0)
29535 continue;
29537 ftype = (enum ix86_builtin_func_type) d->flag;
29538 def_builtin (d->mask, d->name, ftype, d->code);
29541 /* Add all builtins with variable number of operands. */
29542 for (i = 0, d = bdesc_args;
29543 i < ARRAY_SIZE (bdesc_args);
29544 i++, d++)
29546 if (d->name == 0)
29547 continue;
29549 ftype = (enum ix86_builtin_func_type) d->flag;
29550 def_builtin_const (d->mask, d->name, ftype, d->code);
29553 /* pcmpestr[im] insns. */
29554 for (i = 0, d = bdesc_pcmpestr;
29555 i < ARRAY_SIZE (bdesc_pcmpestr);
29556 i++, d++)
29558 if (d->code == IX86_BUILTIN_PCMPESTRM128)
29559 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
29560 else
29561 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
29562 def_builtin_const (d->mask, d->name, ftype, d->code);
29565 /* pcmpistr[im] insns. */
29566 for (i = 0, d = bdesc_pcmpistr;
29567 i < ARRAY_SIZE (bdesc_pcmpistr);
29568 i++, d++)
29570 if (d->code == IX86_BUILTIN_PCMPISTRM128)
29571 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
29572 else
29573 ftype = INT_FTYPE_V16QI_V16QI_INT;
29574 def_builtin_const (d->mask, d->name, ftype, d->code);
29577 /* comi/ucomi insns. */
29578 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
29580 if (d->mask == OPTION_MASK_ISA_SSE2)
29581 ftype = INT_FTYPE_V2DF_V2DF;
29582 else
29583 ftype = INT_FTYPE_V4SF_V4SF;
29584 def_builtin_const (d->mask, d->name, ftype, d->code);
29587 /* SSE */
29588 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
29589 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
29590 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
29591 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
29593 /* SSE or 3DNow!A */
29594 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29595 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
29596 IX86_BUILTIN_MASKMOVQ);
29598 /* SSE2 */
29599 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
29600 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
29602 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
29603 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
29604 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
29605 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
29607 /* SSE3. */
29608 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
29609 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
29610 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
29611 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
29613 /* AES */
29614 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
29615 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
29616 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
29617 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
29618 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
29619 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
29620 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
29621 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
29622 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
29623 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
29624 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
29625 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
29627 /* PCLMUL */
29628 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
29629 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
29631 /* RDRND */
29632 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
29633 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
29634 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
29635 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
29636 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
29637 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
29638 IX86_BUILTIN_RDRAND64_STEP);
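/* Illustrative use of the RDRND step builtins defined above (a sketch, not
   part of the original source): each returns nonzero on success and stores
   the random value through its pointer argument, e.g.
     unsigned int r;
     if (__builtin_ia32_rdrand32_step (&r))
       { /* use r */ }
   The _rdrand32_step intrinsic in immintrin.h is a thin wrapper around this
   builtin. */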
29640 /* AVX2 */
29641 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
29642 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
29643 IX86_BUILTIN_GATHERSIV2DF);
29645 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
29646 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
29647 IX86_BUILTIN_GATHERSIV4DF);
29649 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
29650 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
29651 IX86_BUILTIN_GATHERDIV2DF);
29653 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
29654 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
29655 IX86_BUILTIN_GATHERDIV4DF);
29657 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
29658 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
29659 IX86_BUILTIN_GATHERSIV4SF);
29661 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
29662 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
29663 IX86_BUILTIN_GATHERSIV8SF);
29665 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
29666 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
29667 IX86_BUILTIN_GATHERDIV4SF);
29669 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
29670 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
29671 IX86_BUILTIN_GATHERDIV8SF);
29673 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
29674 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
29675 IX86_BUILTIN_GATHERSIV2DI);
29677 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
29678 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
29679 IX86_BUILTIN_GATHERSIV4DI);
29681 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
29682 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
29683 IX86_BUILTIN_GATHERDIV2DI);
29685 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
29686 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
29687 IX86_BUILTIN_GATHERDIV4DI);
29689 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
29690 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
29691 IX86_BUILTIN_GATHERSIV4SI);
29693 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
29694 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
29695 IX86_BUILTIN_GATHERSIV8SI);
29697 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
29698 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
29699 IX86_BUILTIN_GATHERDIV4SI);
29701 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
29702 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
29703 IX86_BUILTIN_GATHERDIV8SI);
29705 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
29706 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
29707 IX86_BUILTIN_GATHERALTSIV4DF);
29709 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
29710 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
29711 IX86_BUILTIN_GATHERALTDIV8SF);
29713 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
29714 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
29715 IX86_BUILTIN_GATHERALTSIV4DI);
29717 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
29718 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
29719 IX86_BUILTIN_GATHERALTDIV8SI);
29721 /* RTM. */
29722 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
29723 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
29725 /* MMX access to the vec_init patterns. */
29726 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
29727 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
29729 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
29730 V4HI_FTYPE_HI_HI_HI_HI,
29731 IX86_BUILTIN_VEC_INIT_V4HI);
29733 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
29734 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
29735 IX86_BUILTIN_VEC_INIT_V8QI);
29737 /* Access to the vec_extract patterns. */
29738 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
29739 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
29740 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
29741 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
29742 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
29743 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
29744 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
29745 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
29746 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
29747 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
29749 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29750 "__builtin_ia32_vec_ext_v4hi",
29751 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
29753 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
29754 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
29756 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
29757 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
29759 /* Access to the vec_set patterns. */
29760 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
29761 "__builtin_ia32_vec_set_v2di",
29762 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
29764 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
29765 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
29767 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
29768 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
29770 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
29771 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
29773 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29774 "__builtin_ia32_vec_set_v4hi",
29775 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
29777 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
29778 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
29780 /* RDSEED */
29781 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
29782 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
29783 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
29784 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
29785 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
29786 "__builtin_ia32_rdseed_di_step",
29787 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
29789 /* ADCX */
29790 def_builtin (0, "__builtin_ia32_addcarryx_u32",
29791 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
29792 def_builtin (OPTION_MASK_ISA_64BIT,
29793 "__builtin_ia32_addcarryx_u64",
29794 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
29795 IX86_BUILTIN_ADDCARRYX64);
29797 /* Read/write FLAGS. */
29798 def_builtin (~OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u32",
29799 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
29800 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
29801 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
29802 def_builtin (~OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u32",
29803 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
29804 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
29805 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
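/* Illustrative use of the EFLAGS builtins defined above (a sketch, not part
   of the original source): on a 32-bit target,
     unsigned int flags = __builtin_ia32_readeflags_u32 ();
     __builtin_ia32_writeeflags_u32 (flags);
   the 64-bit variants play the same role when OPTION_MASK_ISA_64BIT is
   active. */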
29808 /* Add FMA4 and XOP multi-arg builtin instructions. */
29809 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
29811 if (d->name == 0)
29812 continue;
29814 ftype = (enum ix86_builtin_func_type) d->flag;
29815 def_builtin_const (d->mask, d->name, ftype, d->code);
29819 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
29820 to return a pointer to VERSION_DECL if the outcome of the expression
29821 formed by PREDICATE_CHAIN is true. This function will be called during
29822 version dispatch to decide which function version to execute. It returns
29823 the basic block at the end, to which more conditions can be added. */
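/* Rough shape of the GIMPLE this function appends (illustrative
   pseudo-GIMPLE only, not literal compiler output):
     cond_N = predicate_decl (predicate_arg);   // one call per chain entry
     and_var = MIN_EXPR <cond_N, and_var>;      // all predicates must be nonzero
     if (and_var > 0)
       return (void *) &VERSION_DECL;
     // otherwise fall through to the returned block, where the caller can
     // append the check for the next, lower-priority version.  */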
29825 static basic_block
29826 add_condition_to_bb (tree function_decl, tree version_decl,
29827 tree predicate_chain, basic_block new_bb)
29829 gimple return_stmt;
29830 tree convert_expr, result_var;
29831 gimple convert_stmt;
29832 gimple call_cond_stmt;
29833 gimple if_else_stmt;
29835 basic_block bb1, bb2, bb3;
29836 edge e12, e23;
29838 tree cond_var, and_expr_var = NULL_TREE;
29839 gimple_seq gseq;
29841 tree predicate_decl, predicate_arg;
29843 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
29845 gcc_assert (new_bb != NULL);
29846 gseq = bb_seq (new_bb);
29849 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
29850 build_fold_addr_expr (version_decl));
29851 result_var = create_tmp_var (ptr_type_node, NULL);
29852 convert_stmt = gimple_build_assign (result_var, convert_expr);
29853 return_stmt = gimple_build_return (result_var);
29855 if (predicate_chain == NULL_TREE)
29857 gimple_seq_add_stmt (&gseq, convert_stmt);
29858 gimple_seq_add_stmt (&gseq, return_stmt);
29859 set_bb_seq (new_bb, gseq);
29860 gimple_set_bb (convert_stmt, new_bb);
29861 gimple_set_bb (return_stmt, new_bb);
29862 pop_cfun ();
29863 return new_bb;
29866 while (predicate_chain != NULL)
29868 cond_var = create_tmp_var (integer_type_node, NULL);
29869 predicate_decl = TREE_PURPOSE (predicate_chain);
29870 predicate_arg = TREE_VALUE (predicate_chain);
29871 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
29872 gimple_call_set_lhs (call_cond_stmt, cond_var);
29874 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
29875 gimple_set_bb (call_cond_stmt, new_bb);
29876 gimple_seq_add_stmt (&gseq, call_cond_stmt);
29878 predicate_chain = TREE_CHAIN (predicate_chain);
29880 if (and_expr_var == NULL)
29881 and_expr_var = cond_var;
29882 else
29884 gimple assign_stmt;
29885 /* Use MIN_EXPR to check whether any integer is zero:
29886 and_expr_var = min_expr <cond_var, and_expr_var>. */
29887 assign_stmt = gimple_build_assign (and_expr_var,
29888 build2 (MIN_EXPR, integer_type_node,
29889 cond_var, and_expr_var));
29891 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
29892 gimple_set_bb (assign_stmt, new_bb);
29893 gimple_seq_add_stmt (&gseq, assign_stmt);
29897 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
29898 integer_zero_node,
29899 NULL_TREE, NULL_TREE);
29900 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
29901 gimple_set_bb (if_else_stmt, new_bb);
29902 gimple_seq_add_stmt (&gseq, if_else_stmt);
29904 gimple_seq_add_stmt (&gseq, convert_stmt);
29905 gimple_seq_add_stmt (&gseq, return_stmt);
29906 set_bb_seq (new_bb, gseq);
29908 bb1 = new_bb;
29909 e12 = split_block (bb1, if_else_stmt);
29910 bb2 = e12->dest;
29911 e12->flags &= ~EDGE_FALLTHRU;
29912 e12->flags |= EDGE_TRUE_VALUE;
29914 e23 = split_block (bb2, return_stmt);
29916 gimple_set_bb (convert_stmt, bb2);
29917 gimple_set_bb (return_stmt, bb2);
29919 bb3 = e23->dest;
29920 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
29922 remove_edge (e23);
29923 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
29925 pop_cfun ();
29927 return bb3;
29930 /* This parses the target attribute arguments in DECL and determines
29931 the right builtin predicates to use to match the platform specification.
29932 It returns the priority value for this version decl. If PREDICATE_LIST
29933 is not NULL, it stores the list of cpu features that need to be checked
29934 before dispatching this function. */
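/* For example (illustrative only): a version declared with
   __attribute__ ((target ("arch=core2"))) yields priority P_PROC_SSSE3 and,
   when PREDICATE_LIST is non-NULL, a check equivalent to
   __builtin_cpu_is ("core2"); a version with target ("avx2") yields P_AVX2
   together with a __builtin_cpu_supports ("avx2") check. */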
29936 static unsigned int
29937 get_builtin_code_for_version (tree decl, tree *predicate_list)
29939 tree attrs;
29940 struct cl_target_option cur_target;
29941 tree target_node;
29942 struct cl_target_option *new_target;
29943 const char *arg_str = NULL;
29944 const char *attrs_str = NULL;
29945 char *tok_str = NULL;
29946 char *token;
29948 /* Priority of i386 features, greater value is higher priority. This is
29949 used to decide the order in which function dispatch must happen. For
29950 instance, a version specialized for SSE4.2 should be checked for dispatch
29951 before a version for SSE3, as SSE4.2 implies SSE3. */
29952 enum feature_priority
29954 P_ZERO = 0,
29955 P_MMX,
29956 P_SSE,
29957 P_SSE2,
29958 P_SSE3,
29959 P_SSSE3,
29960 P_PROC_SSSE3,
29961 P_SSE4_a,
29962 P_PROC_SSE4_a,
29963 P_SSE4_1,
29964 P_SSE4_2,
29965 P_PROC_SSE4_2,
29966 P_POPCNT,
29967 P_AVX,
29968 P_AVX2,
29969 P_FMA,
29970 P_PROC_FMA
29973 enum feature_priority priority = P_ZERO;
29975 /* These are the target attribute strings for which a dispatcher is
29976 available, from fold_builtin_cpu. */
29978 static struct _feature_list
29980 const char *const name;
29981 const enum feature_priority priority;
29983 const feature_list[] =
29985 {"mmx", P_MMX},
29986 {"sse", P_SSE},
29987 {"sse2", P_SSE2},
29988 {"sse3", P_SSE3},
29989 {"ssse3", P_SSSE3},
29990 {"sse4.1", P_SSE4_1},
29991 {"sse4.2", P_SSE4_2},
29992 {"popcnt", P_POPCNT},
29993 {"avx", P_AVX},
29994 {"avx2", P_AVX2}
29998 static unsigned int NUM_FEATURES
29999 = sizeof (feature_list) / sizeof (struct _feature_list);
30001 unsigned int i;
30003 tree predicate_chain = NULL_TREE;
30004 tree predicate_decl, predicate_arg;
30006 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
30007 gcc_assert (attrs != NULL);
30009 attrs = TREE_VALUE (TREE_VALUE (attrs));
30011 gcc_assert (TREE_CODE (attrs) == STRING_CST);
30012 attrs_str = TREE_STRING_POINTER (attrs);
30014 /* Return priority zero for default function. */
30015 if (strcmp (attrs_str, "default") == 0)
30016 return 0;
30018 /* Handle arch= if specified. For priority, set it to be 1 more than
30019 the best instruction set the processor can handle. For instance, if
30020 there is a version for atom and a version for ssse3 (the highest ISA
30021 priority for atom), the atom version must be checked for dispatch
30022 before the ssse3 version. */
30023 if (strstr (attrs_str, "arch=") != NULL)
30025 cl_target_option_save (&cur_target, &global_options);
30026 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
30027 &global_options_set);
30029 gcc_assert (target_node);
30030 new_target = TREE_TARGET_OPTION (target_node);
30031 gcc_assert (new_target);
30033 if (new_target->arch_specified && new_target->arch > 0)
30035 switch (new_target->arch)
30037 case PROCESSOR_CORE2:
30038 arg_str = "core2";
30039 priority = P_PROC_SSSE3;
30040 break;
30041 case PROCESSOR_COREI7:
30042 arg_str = "corei7";
30043 priority = P_PROC_SSE4_2;
30044 break;
30045 case PROCESSOR_COREI7_AVX:
30046 arg_str = "corei7-avx";
30047 priority = P_PROC_SSE4_2;
30048 break;
30049 case PROCESSOR_ATOM:
30050 arg_str = "atom";
30051 priority = P_PROC_SSSE3;
30052 break;
30053 case PROCESSOR_AMDFAM10:
30054 arg_str = "amdfam10h";
30055 priority = P_PROC_SSE4_a;
30056 break;
30057 case PROCESSOR_BDVER1:
30058 arg_str = "bdver1";
30059 priority = P_PROC_FMA;
30060 break;
30061 case PROCESSOR_BDVER2:
30062 arg_str = "bdver2";
30063 priority = P_PROC_FMA;
30064 break;
30068 cl_target_option_restore (&global_options, &cur_target);
30070 if (predicate_list && arg_str == NULL)
30072 error_at (DECL_SOURCE_LOCATION (decl),
30073 "No dispatcher found for the versioning attributes");
30074 return 0;
30077 if (predicate_list)
30079 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
30080 /* For a C string literal the length includes the terminating NUL. */
30081 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
30082 predicate_chain = tree_cons (predicate_decl, predicate_arg,
30083 predicate_chain);
30087 /* Process feature name. */
30088 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
30089 strcpy (tok_str, attrs_str);
30090 token = strtok (tok_str, ",");
30091 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
30093 while (token != NULL)
30095 /* Do not process "arch=" */
30096 if (strncmp (token, "arch=", 5) == 0)
30098 token = strtok (NULL, ",");
30099 continue;
30101 for (i = 0; i < NUM_FEATURES; ++i)
30103 if (strcmp (token, feature_list[i].name) == 0)
30105 if (predicate_list)
30107 predicate_arg = build_string_literal (
30108 strlen (feature_list[i].name) + 1,
30109 feature_list[i].name);
30110 predicate_chain = tree_cons (predicate_decl, predicate_arg,
30111 predicate_chain);
30113 /* Find the maximum priority feature. */
30114 if (feature_list[i].priority > priority)
30115 priority = feature_list[i].priority;
30117 break;
30120 if (predicate_list && i == NUM_FEATURES)
30122 error_at (DECL_SOURCE_LOCATION (decl),
30123 "No dispatcher found for %s", token);
30124 return 0;
30126 token = strtok (NULL, ",");
30128 free (tok_str);
30130 if (predicate_list && predicate_chain == NULL_TREE)
30132 error_at (DECL_SOURCE_LOCATION (decl),
30133 "No dispatcher found for the versioning attributes : %s",
30134 attrs_str);
30135 return 0;
30137 else if (predicate_list)
30139 predicate_chain = nreverse (predicate_chain);
30140 *predicate_list = predicate_chain;
30143 return priority;
30146 /* This compares the priority of target features in function DECL1
30147 and DECL2. It returns a positive value if DECL1 has higher priority,
30148 a negative value if DECL2 has higher priority and 0 if they are the
30149 same. */
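/* For instance (illustrative): comparing a decl with target ("avx2")
   (priority P_AVX2) against one with target ("sse4.2") (priority P_SSE4_2)
   yields a positive value, so the avx2 version is considered for dispatch
   first. */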
30151 static int
30152 ix86_compare_version_priority (tree decl1, tree decl2)
30154 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
30155 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
30157 return (int)priority1 - (int)priority2;
30160 /* V1 and V2 point to function versions with different priorities
30161 based on the target ISA. This function compares their priorities. */
30163 static int
30164 feature_compare (const void *v1, const void *v2)
30166 typedef struct _function_version_info
30168 tree version_decl;
30169 tree predicate_chain;
30170 unsigned int dispatch_priority;
30171 } function_version_info;
30173 const function_version_info c1 = *(const function_version_info *)v1;
30174 const function_version_info c2 = *(const function_version_info *)v2;
30175 return (c2.dispatch_priority - c1.dispatch_priority);
30178 /* This function generates the dispatch function for
30179 multi-versioned functions. DISPATCH_DECL is the function which will
30180 contain the dispatch logic. FNDECLS is the vector of function choices
30181 for dispatch. EMPTY_BB is the basic block pointer
30182 in DISPATCH_DECL in which the dispatch code is generated. */
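/* Conceptually, the resolver body built here behaves like the following
   (illustrative sketch, not literal generated code):
     __builtin_cpu_init ();
     if (<predicates of the highest-priority version hold>)
       return &that_version;
     ...
     return &default_version;
   The per-version conditions are emitted by add_condition_to_bb above. */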
30184 static int
30185 dispatch_function_versions (tree dispatch_decl,
30186 void *fndecls_p,
30187 basic_block *empty_bb)
30189 tree default_decl;
30190 gimple ifunc_cpu_init_stmt;
30191 gimple_seq gseq;
30192 int ix;
30193 tree ele;
30194 vec<tree> *fndecls;
30195 unsigned int num_versions = 0;
30196 unsigned int actual_versions = 0;
30197 unsigned int i;
30199 struct _function_version_info
30201 tree version_decl;
30202 tree predicate_chain;
30203 unsigned int dispatch_priority;
30204 }*function_version_info;
30206 gcc_assert (dispatch_decl != NULL
30207 && fndecls_p != NULL
30208 && empty_bb != NULL);
30211 /* fndecls_p is actually a vector. */
30211 fndecls = static_cast<vec<tree> *> (fndecls_p);
30213 /* At least one more version other than the default. */
30214 num_versions = fndecls->length ();
30215 gcc_assert (num_versions >= 2);
30217 function_version_info = (struct _function_version_info *)
30218 XNEWVEC (struct _function_version_info, (num_versions - 1));
30220 /* The first version in the vector is the default decl. */
30221 default_decl = (*fndecls)[0];
30223 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
30225 gseq = bb_seq (*empty_bb);
30226 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
30227 constructors, so explicitly call __builtin_cpu_init here. */
30228 ifunc_cpu_init_stmt = gimple_build_call_vec (
30229 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
30230 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
30231 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
30232 set_bb_seq (*empty_bb, gseq);
30234 pop_cfun ();
30237 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
30239 tree version_decl = ele;
30240 tree predicate_chain = NULL_TREE;
30241 unsigned int priority;
30242 /* Get attribute string, parse it and find the right predicate decl.
30243 The predicate function could be a lengthy combination of many
30244 features, like arch-type and various isa-variants. */
30245 priority = get_builtin_code_for_version (version_decl,
30246 &predicate_chain);
30248 if (predicate_chain == NULL_TREE)
30249 continue;
30251 function_version_info [actual_versions].version_decl = version_decl;
30252 function_version_info [actual_versions].predicate_chain
30253 = predicate_chain;
30254 function_version_info [actual_versions].dispatch_priority = priority;
30255 actual_versions++;
30258 /* Sort the versions according to descending order of dispatch priority. The
30259 priority is based on the ISA. This is not a perfect solution. There
30260 could still be ambiguity. If more than one function version is suitable
30261 to execute, which one should be dispatched? In the future, allow the user
30262 to specify a dispatch priority next to the version. */
30263 qsort (function_version_info, actual_versions,
30264 sizeof (struct _function_version_info), feature_compare);
30266 for (i = 0; i < actual_versions; ++i)
30267 *empty_bb = add_condition_to_bb (dispatch_decl,
30268 function_version_info[i].version_decl,
30269 function_version_info[i].predicate_chain,
30270 *empty_bb);
30272 /* Dispatch the default version at the end. */
30273 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
30274 NULL, *empty_bb);
30276 free (function_version_info);
30277 return 0;
30280 /* Comparator function used by the qsort routine to sort the "target"
30281 attribute specification strings. */
30283 static int
30284 attr_strcmp (const void *v1, const void *v2)
30286 const char *c1 = *(char *const*)v1;
30287 const char *c2 = *(char *const*)v2;
30288 return strcmp (c1, c2);
30291 /* ARGLIST is the argument to the target attribute. This function tokenizes
30292 the comma-separated arguments, sorts them and returns a string which
30293 is a unique identifier for the comma-separated arguments. It also
30294 replaces non-identifier characters "=,-" with "_". */
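/* For example (illustrative): the attribute arguments "sse4.2,arch=core2"
   become the tokens "sse4.2" and "arch_core2", which sort and join into the
   single string "arch_core2_sse4.2". */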
30296 static char *
30297 sorted_attr_string (tree arglist)
30299 tree arg;
30300 size_t str_len_sum = 0;
30301 char **args = NULL;
30302 char *attr_str, *ret_str;
30303 char *attr = NULL;
30304 unsigned int argnum = 1;
30305 unsigned int i;
30307 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
30309 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
30310 size_t len = strlen (str);
30311 str_len_sum += len + 1;
30312 if (arg != arglist)
30313 argnum++;
30314 for (i = 0; i < strlen (str); i++)
30315 if (str[i] == ',')
30316 argnum++;
30319 attr_str = XNEWVEC (char, str_len_sum);
30320 str_len_sum = 0;
30321 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
30323 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
30324 size_t len = strlen (str);
30325 memcpy (attr_str + str_len_sum, str, len);
30326 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
30327 str_len_sum += len + 1;
30330 /* Replace "=,-" with "_". */
30331 for (i = 0; i < strlen (attr_str); i++)
30332 if (attr_str[i] == '=' || attr_str[i]== '-')
30333 attr_str[i] = '_';
30335 if (argnum == 1)
30336 return attr_str;
30338 args = XNEWVEC (char *, argnum);
30340 i = 0;
30341 attr = strtok (attr_str, ",");
30342 while (attr != NULL)
30344 args[i] = attr;
30345 i++;
30346 attr = strtok (NULL, ",");
30349 qsort (args, argnum, sizeof (char *), attr_strcmp);
30351 ret_str = XNEWVEC (char, str_len_sum);
30352 str_len_sum = 0;
30353 for (i = 0; i < argnum; i++)
30355 size_t len = strlen (args[i]);
30356 memcpy (ret_str + str_len_sum, args[i], len);
30357 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
30358 str_len_sum += len + 1;
30361 XDELETEVEC (args);
30362 XDELETEVEC (attr_str);
30363 return ret_str;
30366 /* This function changes the assembler name for functions that are
30367 versions. If DECL is a function version and has a "target"
30368 attribute, it appends the attribute string to its assembler name. */
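/* For example (illustrative): a version of foo declared with
   __attribute__ ((target ("avx"))) gets the assembler name "foo.avx", while
   the "default" version keeps its original assembler name. */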
30370 static tree
30371 ix86_mangle_function_version_assembler_name (tree decl, tree id)
30373 tree version_attr;
30374 const char *orig_name, *version_string;
30375 char *attr_str, *assembler_name;
30377 if (DECL_DECLARED_INLINE_P (decl)
30378 && lookup_attribute ("gnu_inline",
30379 DECL_ATTRIBUTES (decl)))
30380 error_at (DECL_SOURCE_LOCATION (decl),
30381 "Function versions cannot be marked as gnu_inline,"
30382 " bodies have to be generated");
30384 if (DECL_VIRTUAL_P (decl)
30385 || DECL_VINDEX (decl))
30386 sorry ("Virtual function multiversioning not supported");
30388 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
30390 /* target attribute string cannot be NULL. */
30391 gcc_assert (version_attr != NULL_TREE);
30393 orig_name = IDENTIFIER_POINTER (id);
30394 version_string
30395 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
30397 if (strcmp (version_string, "default") == 0)
30398 return id;
30400 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
30401 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
30403 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
30405 /* Allow assembler name to be modified if already set. */
30406 if (DECL_ASSEMBLER_NAME_SET_P (decl))
30407 SET_DECL_RTL (decl, NULL);
30409 tree ret = get_identifier (assembler_name);
30410 XDELETEVEC (attr_str);
30411 XDELETEVEC (assembler_name);
30412 return ret;
30415 /* This function returns true if FN1 and FN2 are versions of the same function,
30416 that is, the target strings of the function decls are different. This assumes
30417 that FN1 and FN2 have the same signature. */
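/* For example (illustrative): two declarations of foo, one with
   target ("avx") and one with target ("sse4.2"), are versions of each other;
   two declarations whose sorted target strings compare equal are not. */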
30419 static bool
30420 ix86_function_versions (tree fn1, tree fn2)
30422 tree attr1, attr2;
30423 char *target1, *target2;
30424 bool result;
30426 if (TREE_CODE (fn1) != FUNCTION_DECL
30427 || TREE_CODE (fn2) != FUNCTION_DECL)
30428 return false;
30430 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
30431 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
30433 /* At least one function decl should have the target attribute specified. */
30434 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
30435 return false;
30437 /* Diagnose missing target attribute if one of the decls is already
30438 multi-versioned. */
30439 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
30441 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
30443 if (attr2 != NULL_TREE)
30445 tree tem = fn1;
30446 fn1 = fn2;
30447 fn2 = tem;
30448 attr1 = attr2;
30450 error_at (DECL_SOURCE_LOCATION (fn2),
30451 "missing %<target%> attribute for multi-versioned %D",
30452 fn2);
30453 inform (DECL_SOURCE_LOCATION (fn1),
30454 "previous declaration of %D", fn1);
30455 /* Prevent diagnosing of the same error multiple times. */
30456 DECL_ATTRIBUTES (fn2)
30457 = tree_cons (get_identifier ("target"),
30458 copy_node (TREE_VALUE (attr1)),
30459 DECL_ATTRIBUTES (fn2));
30461 return false;
30464 target1 = sorted_attr_string (TREE_VALUE (attr1));
30465 target2 = sorted_attr_string (TREE_VALUE (attr2));
30467 /* The sorted target strings must be different for fn1 and fn2
30468 to be versions. */
30469 if (strcmp (target1, target2) == 0)
30470 result = false;
30471 else
30472 result = true;
30474 XDELETEVEC (target1);
30475 XDELETEVEC (target2);
30477 return result;
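/* Illustrative example (user code, not part of this file) of two decls
   that the predicate above treats as versions of the same function,
   because their sorted target strings differ:

       __attribute__ ((target ("default"))) int foo (void);
       __attribute__ ((target ("avx2")))    int foo (void);

   The name "foo" and the particular ISA string are only examples.  */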
30480 static tree
30481 ix86_mangle_decl_assembler_name (tree decl, tree id)
30483 /* For function version, add the target suffix to the assembler name. */
30484 if (TREE_CODE (decl) == FUNCTION_DECL
30485 && DECL_FUNCTION_VERSIONED (decl))
30486 id = ix86_mangle_function_version_assembler_name (decl, id);
30487 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
30488 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
30489 #endif
30491 return id;
30494 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
30495 is true, append the full path name of the source file. */
30497 static char *
30498 make_name (tree decl, const char *suffix, bool make_unique)
30500 char *global_var_name;
30501 int name_len;
30502 const char *name;
30503 const char *unique_name = NULL;
30505 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
30507 /* Get a unique name that can be used globally without any chances
30508 of collision at link time. */
30509 if (make_unique)
30510 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
30512 name_len = strlen (name) + strlen (suffix) + 2;
30514 if (make_unique)
30515 name_len += strlen (unique_name) + 1;
30516 global_var_name = XNEWVEC (char, name_len);
30518 /* Use '.' to concatenate names as it is demangler friendly. */
30519 if (make_unique)
30520 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
30521 suffix);
30522 else
30523 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
30525 return global_var_name;
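/* Sketch of the resulting names (illustrative only): for a public DECL
   named "foo" and SUFFIX "resolver" this yields "foo.resolver"; when
   MAKE_UNIQUE is set, a per-file unique string is also inserted, giving
   roughly "foo.<unique>.resolver".  These names are examples, not the
   output of any particular build.  */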
30528 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
30530 /* Make a dispatcher declaration for the multi-versioned function DECL.
30531 Calls to the function DECL will be replaced with calls to the dispatcher
30532 by the front-end. Return the decl created. */
30534 static tree
30535 make_dispatcher_decl (const tree decl)
30537 tree func_decl;
30538 char *func_name;
30539 tree fn_type, func_type;
30540 bool is_uniq = false;
30542 if (TREE_PUBLIC (decl) == 0)
30543 is_uniq = true;
30545 func_name = make_name (decl, "ifunc", is_uniq);
30547 fn_type = TREE_TYPE (decl);
30548 func_type = build_function_type (TREE_TYPE (fn_type),
30549 TYPE_ARG_TYPES (fn_type));
30551 func_decl = build_fn_decl (func_name, func_type);
30552 XDELETEVEC (func_name);
30553 TREE_USED (func_decl) = 1;
30554 DECL_CONTEXT (func_decl) = NULL_TREE;
30555 DECL_INITIAL (func_decl) = error_mark_node;
30556 DECL_ARTIFICIAL (func_decl) = 1;
30557 /* Mark this func as external; the resolver will flip it again if
30558 it gets generated. */
30559 DECL_EXTERNAL (func_decl) = 1;
30560 /* This will be an IFUNC; IFUNCs have to be externally visible. */
30561 TREE_PUBLIC (func_decl) = 1;
30563 return func_decl;
30566 #endif
30568 /* Returns true if DECL is multi-versioned and is the default function,
30569 that is, it is not tagged with a target-specific optimization. */
30571 static bool
30572 is_function_default_version (const tree decl)
30574 if (TREE_CODE (decl) != FUNCTION_DECL
30575 || !DECL_FUNCTION_VERSIONED (decl))
30576 return false;
30577 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
30578 gcc_assert (attr);
30579 attr = TREE_VALUE (TREE_VALUE (attr));
30580 return (TREE_CODE (attr) == STRING_CST
30581 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
30584 /* Make a dispatcher declaration for the multi-versioned function DECL.
30585 Calls to the function DECL will be replaced with calls to the dispatcher
30586 by the front-end. Returns the decl of the dispatcher function. */
30588 static tree
30589 ix86_get_function_versions_dispatcher (void *decl)
30591 tree fn = (tree) decl;
30592 struct cgraph_node *node = NULL;
30593 struct cgraph_node *default_node = NULL;
30594 struct cgraph_function_version_info *node_v = NULL;
30595 struct cgraph_function_version_info *first_v = NULL;
30597 tree dispatch_decl = NULL;
30599 struct cgraph_function_version_info *default_version_info = NULL;
30601 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
30603 node = cgraph_get_node (fn);
30604 gcc_assert (node != NULL);
30606 node_v = get_cgraph_node_version (node);
30607 gcc_assert (node_v != NULL);
30609 if (node_v->dispatcher_resolver != NULL)
30610 return node_v->dispatcher_resolver;
30612 /* Find the default version and make it the first node. */
30613 first_v = node_v;
30614 /* Go to the beginning of the chain. */
30615 while (first_v->prev != NULL)
30616 first_v = first_v->prev;
30617 default_version_info = first_v;
30618 while (default_version_info != NULL)
30620 if (is_function_default_version
30621 (default_version_info->this_node->decl))
30622 break;
30623 default_version_info = default_version_info->next;
30626 /* If there is no default node, just return NULL. */
30627 if (default_version_info == NULL)
30628 return NULL;
30630 /* Make default info the first node. */
30631 if (first_v != default_version_info)
30633 default_version_info->prev->next = default_version_info->next;
30634 if (default_version_info->next)
30635 default_version_info->next->prev = default_version_info->prev;
30636 first_v->prev = default_version_info;
30637 default_version_info->next = first_v;
30638 default_version_info->prev = NULL;
30641 default_node = default_version_info->this_node;
30643 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
30644 if (targetm.has_ifunc_p ())
30646 struct cgraph_function_version_info *it_v = NULL;
30647 struct cgraph_node *dispatcher_node = NULL;
30648 struct cgraph_function_version_info *dispatcher_version_info = NULL;
30650 /* Right now, the dispatching is done via ifunc. */
30651 dispatch_decl = make_dispatcher_decl (default_node->decl);
30653 dispatcher_node = cgraph_get_create_node (dispatch_decl);
30654 gcc_assert (dispatcher_node != NULL);
30655 dispatcher_node->dispatcher_function = 1;
30656 dispatcher_version_info
30657 = insert_new_cgraph_node_version (dispatcher_node);
30658 dispatcher_version_info->next = default_version_info;
30659 dispatcher_node->definition = 1;
30661 /* Set the dispatcher for all the versions. */
30662 it_v = default_version_info;
30663 while (it_v != NULL)
30665 it_v->dispatcher_resolver = dispatch_decl;
30666 it_v = it_v->next;
30669 else
30670 #endif
30672 error_at (DECL_SOURCE_LOCATION (default_node->decl),
30673 "multiversioning needs ifunc which is not supported "
30674 "on this target");
30677 return dispatch_decl;
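/* The ifunc-based dispatch set up above corresponds roughly to what could
   be written by hand; the sketch below is illustrative only, and the names
   foo, foo_resolver, foo_avx2 and foo_default are made up:

       static int foo_avx2 (void);
       static int foo_default (void);

       static void *foo_resolver (void)
       {
         __builtin_cpu_init ();
         if (__builtin_cpu_supports ("avx2"))
           return (void *) foo_avx2;
         return (void *) foo_default;
       }
       int foo (void) __attribute__ ((ifunc ("foo_resolver")));

   The actual resolver body is filled in later by
   ix86_generate_version_dispatcher_body via dispatch_function_versions.  */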
30680 /* Makes a function attribute of the form NAME(ARG_NAME) and chains
30681 it to CHAIN. */
30683 static tree
30684 make_attribute (const char *name, const char *arg_name, tree chain)
30686 tree attr_name;
30687 tree attr_arg_name;
30688 tree attr_args;
30689 tree attr;
30691 attr_name = get_identifier (name);
30692 attr_arg_name = build_string (strlen (arg_name), arg_name);
30693 attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
30694 attr = tree_cons (attr_name, attr_args, chain);
30695 return attr;
30698 /* Make the resolver function decl to dispatch the versions of
30699 a multi-versioned function, DEFAULT_DECL. Create an
30700 empty basic block in the resolver and store the pointer in
30701 EMPTY_BB. Return the decl of the resolver function. */
30703 static tree
30704 make_resolver_func (const tree default_decl,
30705 const tree dispatch_decl,
30706 basic_block *empty_bb)
30708 char *resolver_name;
30709 tree decl, type, decl_name, t;
30710 bool is_uniq = false;
30712 /* IFUNCs have to be globally visible. So, if the default_decl is
30713 not, then the name of the IFUNC should be made unique. */
30714 if (TREE_PUBLIC (default_decl) == 0)
30715 is_uniq = true;
30717 /* Append the filename to the resolver function if the versions are
30718 not externally visible. This is because the resolver function has
30719 to be externally visible for the loader to find it. So, appending
30720 the filename will prevent conflicts with a resolver function from
30721 another module which is based on the same version name. */
30722 resolver_name = make_name (default_decl, "resolver", is_uniq);
30724 /* The resolver function should return a (void *). */
30725 type = build_function_type_list (ptr_type_node, NULL_TREE);
30727 decl = build_fn_decl (resolver_name, type);
30728 decl_name = get_identifier (resolver_name);
30729 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
30731 DECL_NAME (decl) = decl_name;
30732 TREE_USED (decl) = 1;
30733 DECL_ARTIFICIAL (decl) = 1;
30734 DECL_IGNORED_P (decl) = 0;
30735 /* IFUNC resolvers have to be externally visible. */
30736 TREE_PUBLIC (decl) = 1;
30737 DECL_UNINLINABLE (decl) = 1;
30739 /* Resolver is not external, body is generated. */
30740 DECL_EXTERNAL (decl) = 0;
30741 DECL_EXTERNAL (dispatch_decl) = 0;
30743 DECL_CONTEXT (decl) = NULL_TREE;
30744 DECL_INITIAL (decl) = make_node (BLOCK);
30745 DECL_STATIC_CONSTRUCTOR (decl) = 0;
30747 if (DECL_COMDAT_GROUP (default_decl)
30748 || TREE_PUBLIC (default_decl))
30750 /* In this case, each translation unit with a call to this
30751 versioned function will put out a resolver. Ensure it
30752 is comdat to keep just one copy. */
30753 DECL_COMDAT (decl) = 1;
30754 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
30756 /* Build result decl and add to function_decl. */
30757 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
30758 DECL_ARTIFICIAL (t) = 1;
30759 DECL_IGNORED_P (t) = 1;
30760 DECL_RESULT (decl) = t;
30762 gimplify_function_tree (decl);
30763 push_cfun (DECL_STRUCT_FUNCTION (decl));
30764 *empty_bb = init_lowered_empty_function (decl, false);
30766 cgraph_add_new_function (decl, true);
30767 cgraph_call_function_insertion_hooks (cgraph_get_create_node (decl));
30769 pop_cfun ();
30771 gcc_assert (dispatch_decl != NULL);
30772 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
30773 DECL_ATTRIBUTES (dispatch_decl)
30774 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
30776 /* Create the alias for dispatch to resolver here. */
30777 /*cgraph_create_function_alias (dispatch_decl, decl);*/
30778 cgraph_same_body_alias (NULL, dispatch_decl, decl);
30779 XDELETEVEC (resolver_name);
30780 return decl;
30783 /* Generate the dispatching code body to dispatch multi-versioned function
30784 DECL. The target hook is called to process the "target" attributes and
30785 provide the code to dispatch the right function at run-time. NODE points
30786 to the dispatcher decl whose body will be created. */
30788 static tree
30789 ix86_generate_version_dispatcher_body (void *node_p)
30791 tree resolver_decl;
30792 basic_block empty_bb;
30793 tree default_ver_decl;
30794 struct cgraph_node *versn;
30795 struct cgraph_node *node;
30797 struct cgraph_function_version_info *node_version_info = NULL;
30798 struct cgraph_function_version_info *versn_info = NULL;
30800 node = (cgraph_node *)node_p;
30802 node_version_info = get_cgraph_node_version (node);
30803 gcc_assert (node->dispatcher_function
30804 && node_version_info != NULL);
30806 if (node_version_info->dispatcher_resolver)
30807 return node_version_info->dispatcher_resolver;
30809 /* The first version in the chain corresponds to the default version. */
30810 default_ver_decl = node_version_info->next->this_node->decl;
30812 /* node is going to be an alias, so remove the finalized bit. */
30813 node->definition = false;
30815 resolver_decl = make_resolver_func (default_ver_decl,
30816 node->decl, &empty_bb);
30818 node_version_info->dispatcher_resolver = resolver_decl;
30820 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
30822 stack_vec<tree, 2> fn_ver_vec;
30824 for (versn_info = node_version_info->next; versn_info;
30825 versn_info = versn_info->next)
30827 versn = versn_info->this_node;
30828 /* Check for virtual functions here again, as by this time it should
30829 have been determined if this function needs a vtable index or
30830 not. This happens for methods in derived classes that override
30831 virtual methods in base classes but are not explicitly marked as
30832 virtual. */
30833 if (DECL_VINDEX (versn->decl))
30834 sorry ("Virtual function multiversioning not supported");
30836 fn_ver_vec.safe_push (versn->decl);
30839 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
30840 rebuild_cgraph_edges ();
30841 pop_cfun ();
30842 return resolver_decl;
30844 /* This builds the processor_model struct type defined in
30845 libgcc/config/i386/cpuinfo.c */
30847 static tree
30848 build_processor_model_struct (void)
30850 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
30851 "__cpu_features"};
30852 tree field = NULL_TREE, field_chain = NULL_TREE;
30853 int i;
30854 tree type = make_node (RECORD_TYPE);
30856 /* The first 3 fields are unsigned int. */
30857 for (i = 0; i < 3; ++i)
30859 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
30860 get_identifier (field_name[i]), unsigned_type_node);
30861 if (field_chain != NULL_TREE)
30862 DECL_CHAIN (field) = field_chain;
30863 field_chain = field;
30866 /* The last field is an array of unsigned integers of size one. */
30867 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
30868 get_identifier (field_name[3]),
30869 build_array_type (unsigned_type_node,
30870 build_index_type (size_one_node)));
30871 if (field_chain != NULL_TREE)
30872 DECL_CHAIN (field) = field_chain;
30873 field_chain = field;
30875 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
30876 return type;
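/* For reference, the record built above is meant to mirror the struct
   defined in libgcc/config/i386/cpuinfo.c, approximately:

       struct __processor_model
       {
         unsigned int __cpu_vendor;
         unsigned int __cpu_type;
         unsigned int __cpu_subtype;
         unsigned int __cpu_features[1];
       };

   The libgcc copy is authoritative; this is only a reading aid.  */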
30879 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
30881 static tree
30882 make_var_decl (tree type, const char *name)
30884 tree new_decl;
30886 new_decl = build_decl (UNKNOWN_LOCATION,
30887 VAR_DECL,
30888 get_identifier (name),
30889 type);
30891 DECL_EXTERNAL (new_decl) = 1;
30892 TREE_STATIC (new_decl) = 1;
30893 TREE_PUBLIC (new_decl) = 1;
30894 DECL_INITIAL (new_decl) = 0;
30895 DECL_ARTIFICIAL (new_decl) = 0;
30896 DECL_PRESERVE_P (new_decl) = 1;
30898 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
30899 assemble_variable (new_decl, 0, 0, 0);
30901 return new_decl;
30904 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
30905 into a test of the __cpu_model variable defined in libgcc/config/i386/cpuinfo.c. */
30907 static tree
30908 fold_builtin_cpu (tree fndecl, tree *args)
30910 unsigned int i;
30911 enum ix86_builtins fn_code = (enum ix86_builtins)
30912 DECL_FUNCTION_CODE (fndecl);
30913 tree param_string_cst = NULL;
30915 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
30916 enum processor_features
30918 F_CMOV = 0,
30919 F_MMX,
30920 F_POPCNT,
30921 F_SSE,
30922 F_SSE2,
30923 F_SSE3,
30924 F_SSSE3,
30925 F_SSE4_1,
30926 F_SSE4_2,
30927 F_AVX,
30928 F_AVX2,
30929 F_MAX
30932 /* These are the values for vendor types and cpu types and subtypes
30933 in cpuinfo.c. The corresponding start value must be subtracted from
30934 cpu types and subtypes before use. */
30935 enum processor_model
30937 M_INTEL = 1,
30938 M_AMD,
30939 M_CPU_TYPE_START,
30940 M_INTEL_ATOM,
30941 M_INTEL_CORE2,
30942 M_INTEL_COREI7,
30943 M_AMDFAM10H,
30944 M_AMDFAM15H,
30945 M_INTEL_SLM,
30946 M_CPU_SUBTYPE_START,
30947 M_INTEL_COREI7_NEHALEM,
30948 M_INTEL_COREI7_WESTMERE,
30949 M_INTEL_COREI7_SANDYBRIDGE,
30950 M_AMDFAM10H_BARCELONA,
30951 M_AMDFAM10H_SHANGHAI,
30952 M_AMDFAM10H_ISTANBUL,
30953 M_AMDFAM15H_BDVER1,
30954 M_AMDFAM15H_BDVER2,
30955 M_AMDFAM15H_BDVER3,
30956 M_AMDFAM15H_BDVER4
30959 static struct _arch_names_table
30961 const char *const name;
30962 const enum processor_model model;
30964 const arch_names_table[] =
30966 {"amd", M_AMD},
30967 {"intel", M_INTEL},
30968 {"atom", M_INTEL_ATOM},
30969 {"slm", M_INTEL_SLM},
30970 {"core2", M_INTEL_CORE2},
30971 {"corei7", M_INTEL_COREI7},
30972 {"nehalem", M_INTEL_COREI7_NEHALEM},
30973 {"westmere", M_INTEL_COREI7_WESTMERE},
30974 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
30975 {"amdfam10h", M_AMDFAM10H},
30976 {"barcelona", M_AMDFAM10H_BARCELONA},
30977 {"shanghai", M_AMDFAM10H_SHANGHAI},
30978 {"istanbul", M_AMDFAM10H_ISTANBUL},
30979 {"amdfam15h", M_AMDFAM15H},
30980 {"bdver1", M_AMDFAM15H_BDVER1},
30981 {"bdver2", M_AMDFAM15H_BDVER2},
30982 {"bdver3", M_AMDFAM15H_BDVER3},
30983 {"bdver4", M_AMDFAM15H_BDVER4},
30986 static struct _isa_names_table
30988 const char *const name;
30989 const enum processor_features feature;
30991 const isa_names_table[] =
30993 {"cmov", F_CMOV},
30994 {"mmx", F_MMX},
30995 {"popcnt", F_POPCNT},
30996 {"sse", F_SSE},
30997 {"sse2", F_SSE2},
30998 {"sse3", F_SSE3},
30999 {"ssse3", F_SSSE3},
31000 {"sse4.1", F_SSE4_1},
31001 {"sse4.2", F_SSE4_2},
31002 {"avx", F_AVX},
31003 {"avx2", F_AVX2}
31006 tree __processor_model_type = build_processor_model_struct ();
31007 tree __cpu_model_var = make_var_decl (__processor_model_type,
31008 "__cpu_model");
31011 varpool_add_new_variable (__cpu_model_var);
31013 gcc_assert ((args != NULL) && (*args != NULL));
31015 param_string_cst = *args;
31016 while (param_string_cst
31017 && TREE_CODE (param_string_cst) != STRING_CST)
31019 /* *args must be an expr that can contain other EXPRs leading to a
31020 STRING_CST. */
31021 if (!EXPR_P (param_string_cst))
31023 error ("Parameter to builtin must be a string constant or literal");
31024 return integer_zero_node;
31026 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
31029 gcc_assert (param_string_cst);
31031 if (fn_code == IX86_BUILTIN_CPU_IS)
31033 tree ref;
31034 tree field;
31035 tree final;
31037 unsigned int field_val = 0;
31038 unsigned int NUM_ARCH_NAMES
31039 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
31041 for (i = 0; i < NUM_ARCH_NAMES; i++)
31042 if (strcmp (arch_names_table[i].name,
31043 TREE_STRING_POINTER (param_string_cst)) == 0)
31044 break;
31046 if (i == NUM_ARCH_NAMES)
31048 error ("Parameter to builtin not valid: %s",
31049 TREE_STRING_POINTER (param_string_cst));
31050 return integer_zero_node;
31053 field = TYPE_FIELDS (__processor_model_type);
31054 field_val = arch_names_table[i].model;
31056 /* CPU types are stored in the next field. */
31057 if (field_val > M_CPU_TYPE_START
31058 && field_val < M_CPU_SUBTYPE_START)
31060 field = DECL_CHAIN (field);
31061 field_val -= M_CPU_TYPE_START;
31064 /* CPU subtypes are stored in the next field. */
31065 if (field_val > M_CPU_SUBTYPE_START)
31067 field = DECL_CHAIN (DECL_CHAIN (field));
31068 field_val -= M_CPU_SUBTYPE_START;
31071 /* Get the appropriate field in __cpu_model. */
31072 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
31073 field, NULL_TREE);
31075 /* Check the value. */
31076 final = build2 (EQ_EXPR, unsigned_type_node, ref,
31077 build_int_cstu (unsigned_type_node, field_val));
31078 return build1 (CONVERT_EXPR, integer_type_node, final);
31080 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
31082 tree ref;
31083 tree array_elt;
31084 tree field;
31085 tree final;
31087 unsigned int field_val = 0;
31088 unsigned int NUM_ISA_NAMES
31089 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
31091 for (i = 0; i < NUM_ISA_NAMES; i++)
31092 if (strcmp (isa_names_table[i].name,
31093 TREE_STRING_POINTER (param_string_cst)) == 0)
31094 break;
31096 if (i == NUM_ISA_NAMES)
31098 error ("Parameter to builtin not valid: %s",
31099 TREE_STRING_POINTER (param_string_cst));
31100 return integer_zero_node;
31103 field = TYPE_FIELDS (__processor_model_type);
31104 /* Get the last field, which is __cpu_features. */
31105 while (DECL_CHAIN (field))
31106 field = DECL_CHAIN (field);
31108 /* Get the appropriate field: __cpu_model.__cpu_features */
31109 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
31110 field, NULL_TREE);
31112 /* Access the 0th element of __cpu_features array. */
31113 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
31114 integer_zero_node, NULL_TREE, NULL_TREE);
31116 field_val = (1 << isa_names_table[i].feature);
31117 /* Return __cpu_model.__cpu_features[0] & field_val */
31118 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
31119 build_int_cstu (unsigned_type_node, field_val));
31120 return build1 (CONVERT_EXPR, integer_type_node, final);
31122 gcc_unreachable ();
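/* Illustrative folding (a sketch, not actual compiler output): a call such
   as __builtin_cpu_is ("amd") folds to the equivalent of

       (int) (__cpu_model.__cpu_vendor == M_AMD)

   and __builtin_cpu_supports ("sse4.2") folds to the equivalent of

       (int) (__cpu_model.__cpu_features[0] & (1 << F_SSE4_2))

   using the enum values defined above.  The particular argument strings
   are only examples.  */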
31125 static tree
31126 ix86_fold_builtin (tree fndecl, int n_args,
31127 tree *args, bool ignore ATTRIBUTE_UNUSED)
31129 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
31131 enum ix86_builtins fn_code = (enum ix86_builtins)
31132 DECL_FUNCTION_CODE (fndecl);
31133 if (fn_code == IX86_BUILTIN_CPU_IS
31134 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
31136 gcc_assert (n_args == 1);
31137 return fold_builtin_cpu (fndecl, args);
31141 #ifdef SUBTARGET_FOLD_BUILTIN
31142 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
31143 #endif
31145 return NULL_TREE;
31148 /* Make builtins to detect cpu type and features supported. NAME is
31149 the builtin name, CODE is the builtin code, and FTYPE is the function
31150 type of the builtin. */
31152 static void
31153 make_cpu_type_builtin (const char* name, int code,
31154 enum ix86_builtin_func_type ftype, bool is_const)
31156 tree decl;
31157 tree type;
31159 type = ix86_get_builtin_func_type (ftype);
31160 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
31161 NULL, NULL_TREE);
31162 gcc_assert (decl != NULL_TREE);
31163 ix86_builtins[(int) code] = decl;
31164 TREE_READONLY (decl) = is_const;
31167 /* Make builtins to get CPU type and features supported. The created
31168 builtins are:
31170 __builtin_cpu_init (), to detect cpu type and features,
31171 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
31172 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
31175 static void
31176 ix86_init_platform_type_builtins (void)
31178 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
31179 INT_FTYPE_VOID, false);
31180 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
31181 INT_FTYPE_PCCHAR, true);
31182 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
31183 INT_FTYPE_PCCHAR, true);
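/* Illustrative use of the builtins registered above (user code, not part
   of this file); the function name and the argument strings are examples
   only:

       int bar (void)
       {
         __builtin_cpu_init ();
         if (__builtin_cpu_is ("corei7"))
           return 1;
         if (__builtin_cpu_supports ("popcnt"))
           return 2;
         return 0;
       }
*/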
31186 /* Internal method for ix86_init_builtins. */
31188 static void
31189 ix86_init_builtins_va_builtins_abi (void)
31191 tree ms_va_ref, sysv_va_ref;
31192 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
31193 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
31194 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
31195 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
31197 if (!TARGET_64BIT)
31198 return;
31199 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
31200 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
31201 ms_va_ref = build_reference_type (ms_va_list_type_node);
31202 sysv_va_ref =
31203 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
31205 fnvoid_va_end_ms =
31206 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
31207 fnvoid_va_start_ms =
31208 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
31209 fnvoid_va_end_sysv =
31210 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
31211 fnvoid_va_start_sysv =
31212 build_varargs_function_type_list (void_type_node, sysv_va_ref,
31213 NULL_TREE);
31214 fnvoid_va_copy_ms =
31215 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
31216 NULL_TREE);
31217 fnvoid_va_copy_sysv =
31218 build_function_type_list (void_type_node, sysv_va_ref,
31219 sysv_va_ref, NULL_TREE);
31221 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
31222 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
31223 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
31224 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
31225 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
31226 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
31227 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
31228 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
31229 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
31230 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
31231 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
31232 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
31235 static void
31236 ix86_init_builtin_types (void)
31238 tree float128_type_node, float80_type_node;
31240 /* The __float80 type. */
31241 float80_type_node = long_double_type_node;
31242 if (TYPE_MODE (float80_type_node) != XFmode)
31244 /* The __float80 type. */
31245 float80_type_node = make_node (REAL_TYPE);
31247 TYPE_PRECISION (float80_type_node) = 80;
31248 layout_type (float80_type_node);
31250 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
31252 /* The __float128 type. */
31253 float128_type_node = make_node (REAL_TYPE);
31254 TYPE_PRECISION (float128_type_node) = 128;
31255 layout_type (float128_type_node);
31256 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
31258 /* This macro is built by i386-builtin-types.awk. */
31259 DEFINE_BUILTIN_PRIMITIVE_TYPES;
31262 static void
31263 ix86_init_builtins (void)
31265 tree t;
31267 ix86_init_builtin_types ();
31269 /* Builtins to get CPU type and features. */
31270 ix86_init_platform_type_builtins ();
31272 /* TFmode support builtins. */
31273 def_builtin_const (0, "__builtin_infq",
31274 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
31275 def_builtin_const (0, "__builtin_huge_valq",
31276 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
31278 /* We will expand them to a normal call if SSE isn't available, since
31279 they are used by libgcc. */
31280 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
31281 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
31282 BUILT_IN_MD, "__fabstf2", NULL_TREE);
31283 TREE_READONLY (t) = 1;
31284 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
31286 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
31287 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
31288 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
31289 TREE_READONLY (t) = 1;
31290 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
31292 ix86_init_tm_builtins ();
31293 ix86_init_mmx_sse_builtins ();
31295 if (TARGET_LP64)
31296 ix86_init_builtins_va_builtins_abi ();
31298 #ifdef SUBTARGET_INIT_BUILTINS
31299 SUBTARGET_INIT_BUILTINS;
31300 #endif
31303 /* Return the ix86 builtin for CODE. */
31305 static tree
31306 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
31308 if (code >= IX86_BUILTIN_MAX)
31309 return error_mark_node;
31311 return ix86_builtins[code];
31314 /* Errors in the source file can cause expand_expr to return const0_rtx
31315 where we expect a vector. To avoid crashing, use one of the vector
31316 clear instructions. */
31317 static rtx
31318 safe_vector_operand (rtx x, enum machine_mode mode)
31320 if (x == const0_rtx)
31321 x = CONST0_RTX (mode);
31322 return x;
31325 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
31327 static rtx
31328 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
31330 rtx pat;
31331 tree arg0 = CALL_EXPR_ARG (exp, 0);
31332 tree arg1 = CALL_EXPR_ARG (exp, 1);
31333 rtx op0 = expand_normal (arg0);
31334 rtx op1 = expand_normal (arg1);
31335 enum machine_mode tmode = insn_data[icode].operand[0].mode;
31336 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
31337 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
31339 if (VECTOR_MODE_P (mode0))
31340 op0 = safe_vector_operand (op0, mode0);
31341 if (VECTOR_MODE_P (mode1))
31342 op1 = safe_vector_operand (op1, mode1);
31344 if (optimize || !target
31345 || GET_MODE (target) != tmode
31346 || !insn_data[icode].operand[0].predicate (target, tmode))
31347 target = gen_reg_rtx (tmode);
31349 if (GET_MODE (op1) == SImode && mode1 == TImode)
31351 rtx x = gen_reg_rtx (V4SImode);
31352 emit_insn (gen_sse2_loadd (x, op1));
31353 op1 = gen_lowpart (TImode, x);
31356 if (!insn_data[icode].operand[1].predicate (op0, mode0))
31357 op0 = copy_to_mode_reg (mode0, op0);
31358 if (!insn_data[icode].operand[2].predicate (op1, mode1))
31359 op1 = copy_to_mode_reg (mode1, op1);
31361 pat = GEN_FCN (icode) (target, op0, op1);
31362 if (! pat)
31363 return 0;
31365 emit_insn (pat);
31367 return target;
31370 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
31372 static rtx
31373 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
31374 enum ix86_builtin_func_type m_type,
31375 enum rtx_code sub_code)
31377 rtx pat;
31378 int i;
31379 int nargs;
31380 bool comparison_p = false;
31381 bool tf_p = false;
31382 bool last_arg_constant = false;
31383 int num_memory = 0;
31384 struct {
31385 rtx op;
31386 enum machine_mode mode;
31387 } args[4];
31389 enum machine_mode tmode = insn_data[icode].operand[0].mode;
31391 switch (m_type)
31393 case MULTI_ARG_4_DF2_DI_I:
31394 case MULTI_ARG_4_DF2_DI_I1:
31395 case MULTI_ARG_4_SF2_SI_I:
31396 case MULTI_ARG_4_SF2_SI_I1:
31397 nargs = 4;
31398 last_arg_constant = true;
31399 break;
31401 case MULTI_ARG_3_SF:
31402 case MULTI_ARG_3_DF:
31403 case MULTI_ARG_3_SF2:
31404 case MULTI_ARG_3_DF2:
31405 case MULTI_ARG_3_DI:
31406 case MULTI_ARG_3_SI:
31407 case MULTI_ARG_3_SI_DI:
31408 case MULTI_ARG_3_HI:
31409 case MULTI_ARG_3_HI_SI:
31410 case MULTI_ARG_3_QI:
31411 case MULTI_ARG_3_DI2:
31412 case MULTI_ARG_3_SI2:
31413 case MULTI_ARG_3_HI2:
31414 case MULTI_ARG_3_QI2:
31415 nargs = 3;
31416 break;
31418 case MULTI_ARG_2_SF:
31419 case MULTI_ARG_2_DF:
31420 case MULTI_ARG_2_DI:
31421 case MULTI_ARG_2_SI:
31422 case MULTI_ARG_2_HI:
31423 case MULTI_ARG_2_QI:
31424 nargs = 2;
31425 break;
31427 case MULTI_ARG_2_DI_IMM:
31428 case MULTI_ARG_2_SI_IMM:
31429 case MULTI_ARG_2_HI_IMM:
31430 case MULTI_ARG_2_QI_IMM:
31431 nargs = 2;
31432 last_arg_constant = true;
31433 break;
31435 case MULTI_ARG_1_SF:
31436 case MULTI_ARG_1_DF:
31437 case MULTI_ARG_1_SF2:
31438 case MULTI_ARG_1_DF2:
31439 case MULTI_ARG_1_DI:
31440 case MULTI_ARG_1_SI:
31441 case MULTI_ARG_1_HI:
31442 case MULTI_ARG_1_QI:
31443 case MULTI_ARG_1_SI_DI:
31444 case MULTI_ARG_1_HI_DI:
31445 case MULTI_ARG_1_HI_SI:
31446 case MULTI_ARG_1_QI_DI:
31447 case MULTI_ARG_1_QI_SI:
31448 case MULTI_ARG_1_QI_HI:
31449 nargs = 1;
31450 break;
31452 case MULTI_ARG_2_DI_CMP:
31453 case MULTI_ARG_2_SI_CMP:
31454 case MULTI_ARG_2_HI_CMP:
31455 case MULTI_ARG_2_QI_CMP:
31456 nargs = 2;
31457 comparison_p = true;
31458 break;
31460 case MULTI_ARG_2_SF_TF:
31461 case MULTI_ARG_2_DF_TF:
31462 case MULTI_ARG_2_DI_TF:
31463 case MULTI_ARG_2_SI_TF:
31464 case MULTI_ARG_2_HI_TF:
31465 case MULTI_ARG_2_QI_TF:
31466 nargs = 2;
31467 tf_p = true;
31468 break;
31470 default:
31471 gcc_unreachable ();
31474 if (optimize || !target
31475 || GET_MODE (target) != tmode
31476 || !insn_data[icode].operand[0].predicate (target, tmode))
31477 target = gen_reg_rtx (tmode);
31479 gcc_assert (nargs <= 4);
31481 for (i = 0; i < nargs; i++)
31483 tree arg = CALL_EXPR_ARG (exp, i);
31484 rtx op = expand_normal (arg);
31485 int adjust = (comparison_p) ? 1 : 0;
31486 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
31488 if (last_arg_constant && i == nargs - 1)
31490 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
31492 enum insn_code new_icode = icode;
31493 switch (icode)
31495 case CODE_FOR_xop_vpermil2v2df3:
31496 case CODE_FOR_xop_vpermil2v4sf3:
31497 case CODE_FOR_xop_vpermil2v4df3:
31498 case CODE_FOR_xop_vpermil2v8sf3:
31499 error ("the last argument must be a 2-bit immediate");
31500 return gen_reg_rtx (tmode);
31501 case CODE_FOR_xop_rotlv2di3:
31502 new_icode = CODE_FOR_rotlv2di3;
31503 goto xop_rotl;
31504 case CODE_FOR_xop_rotlv4si3:
31505 new_icode = CODE_FOR_rotlv4si3;
31506 goto xop_rotl;
31507 case CODE_FOR_xop_rotlv8hi3:
31508 new_icode = CODE_FOR_rotlv8hi3;
31509 goto xop_rotl;
31510 case CODE_FOR_xop_rotlv16qi3:
31511 new_icode = CODE_FOR_rotlv16qi3;
31512 xop_rotl:
31513 if (CONST_INT_P (op))
31515 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
31516 op = GEN_INT (INTVAL (op) & mask);
31517 gcc_checking_assert
31518 (insn_data[icode].operand[i + 1].predicate (op, mode));
31520 else
31522 gcc_checking_assert
31523 (nargs == 2
31524 && insn_data[new_icode].operand[0].mode == tmode
31525 && insn_data[new_icode].operand[1].mode == tmode
31526 && insn_data[new_icode].operand[2].mode == mode
31527 && insn_data[new_icode].operand[0].predicate
31528 == insn_data[icode].operand[0].predicate
31529 && insn_data[new_icode].operand[1].predicate
31530 == insn_data[icode].operand[1].predicate);
31531 icode = new_icode;
31532 goto non_constant;
31534 break;
31535 default:
31536 gcc_unreachable ();
31540 else
31542 non_constant:
31543 if (VECTOR_MODE_P (mode))
31544 op = safe_vector_operand (op, mode);
31546 /* If we aren't optimizing, only allow one memory operand to be
31547 generated. */
31548 if (memory_operand (op, mode))
31549 num_memory++;
31551 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
31553 if (optimize
31554 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
31555 || num_memory > 1)
31556 op = force_reg (mode, op);
31559 args[i].op = op;
31560 args[i].mode = mode;
31563 switch (nargs)
31565 case 1:
31566 pat = GEN_FCN (icode) (target, args[0].op);
31567 break;
31569 case 2:
31570 if (tf_p)
31571 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
31572 GEN_INT ((int)sub_code));
31573 else if (! comparison_p)
31574 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
31575 else
31577 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
31578 args[0].op,
31579 args[1].op);
31581 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
31583 break;
31585 case 3:
31586 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
31587 break;
31589 case 4:
31590 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
31591 break;
31593 default:
31594 gcc_unreachable ();
31597 if (! pat)
31598 return 0;
31600 emit_insn (pat);
31601 return target;
31604 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
31605 insns with vec_merge. */
31607 static rtx
31608 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
31609 rtx target)
31611 rtx pat;
31612 tree arg0 = CALL_EXPR_ARG (exp, 0);
31613 rtx op1, op0 = expand_normal (arg0);
31614 enum machine_mode tmode = insn_data[icode].operand[0].mode;
31615 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
31617 if (optimize || !target
31618 || GET_MODE (target) != tmode
31619 || !insn_data[icode].operand[0].predicate (target, tmode))
31620 target = gen_reg_rtx (tmode);
31622 if (VECTOR_MODE_P (mode0))
31623 op0 = safe_vector_operand (op0, mode0);
31625 if ((optimize && !register_operand (op0, mode0))
31626 || !insn_data[icode].operand[1].predicate (op0, mode0))
31627 op0 = copy_to_mode_reg (mode0, op0);
31629 op1 = op0;
31630 if (!insn_data[icode].operand[2].predicate (op1, mode0))
31631 op1 = copy_to_mode_reg (mode0, op1);
31633 pat = GEN_FCN (icode) (target, op0, op1);
31634 if (! pat)
31635 return 0;
31636 emit_insn (pat);
31637 return target;
31640 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
31642 static rtx
31643 ix86_expand_sse_compare (const struct builtin_description *d,
31644 tree exp, rtx target, bool swap)
31646 rtx pat;
31647 tree arg0 = CALL_EXPR_ARG (exp, 0);
31648 tree arg1 = CALL_EXPR_ARG (exp, 1);
31649 rtx op0 = expand_normal (arg0);
31650 rtx op1 = expand_normal (arg1);
31651 rtx op2;
31652 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31653 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31654 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
31655 enum rtx_code comparison = d->comparison;
31657 if (VECTOR_MODE_P (mode0))
31658 op0 = safe_vector_operand (op0, mode0);
31659 if (VECTOR_MODE_P (mode1))
31660 op1 = safe_vector_operand (op1, mode1);
31662 /* Swap operands if we have a comparison that isn't available in
31663 hardware. */
31664 if (swap)
31666 rtx tmp = gen_reg_rtx (mode1);
31667 emit_move_insn (tmp, op1);
31668 op1 = op0;
31669 op0 = tmp;
31672 if (optimize || !target
31673 || GET_MODE (target) != tmode
31674 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31675 target = gen_reg_rtx (tmode);
31677 if ((optimize && !register_operand (op0, mode0))
31678 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
31679 op0 = copy_to_mode_reg (mode0, op0);
31680 if ((optimize && !register_operand (op1, mode1))
31681 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
31682 op1 = copy_to_mode_reg (mode1, op1);
31684 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
31685 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
31686 if (! pat)
31687 return 0;
31688 emit_insn (pat);
31689 return target;
31692 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
31694 static rtx
31695 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
31696 rtx target)
31698 rtx pat;
31699 tree arg0 = CALL_EXPR_ARG (exp, 0);
31700 tree arg1 = CALL_EXPR_ARG (exp, 1);
31701 rtx op0 = expand_normal (arg0);
31702 rtx op1 = expand_normal (arg1);
31703 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
31704 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
31705 enum rtx_code comparison = d->comparison;
31707 if (VECTOR_MODE_P (mode0))
31708 op0 = safe_vector_operand (op0, mode0);
31709 if (VECTOR_MODE_P (mode1))
31710 op1 = safe_vector_operand (op1, mode1);
31712 /* Swap operands if we have a comparison that isn't available in
31713 hardware. */
31714 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
31716 rtx tmp = op1;
31717 op1 = op0;
31718 op0 = tmp;
31721 target = gen_reg_rtx (SImode);
31722 emit_move_insn (target, const0_rtx);
31723 target = gen_rtx_SUBREG (QImode, target, 0);
31725 if ((optimize && !register_operand (op0, mode0))
31726 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31727 op0 = copy_to_mode_reg (mode0, op0);
31728 if ((optimize && !register_operand (op1, mode1))
31729 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31730 op1 = copy_to_mode_reg (mode1, op1);
31732 pat = GEN_FCN (d->icode) (op0, op1);
31733 if (! pat)
31734 return 0;
31735 emit_insn (pat);
31736 emit_insn (gen_rtx_SET (VOIDmode,
31737 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31738 gen_rtx_fmt_ee (comparison, QImode,
31739 SET_DEST (pat),
31740 const0_rtx)));
31742 return SUBREG_REG (target);
31745 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
31747 static rtx
31748 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
31749 rtx target)
31751 rtx pat;
31752 tree arg0 = CALL_EXPR_ARG (exp, 0);
31753 rtx op1, op0 = expand_normal (arg0);
31754 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31755 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31757 if (optimize || target == 0
31758 || GET_MODE (target) != tmode
31759 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31760 target = gen_reg_rtx (tmode);
31762 if (VECTOR_MODE_P (mode0))
31763 op0 = safe_vector_operand (op0, mode0);
31765 if ((optimize && !register_operand (op0, mode0))
31766 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31767 op0 = copy_to_mode_reg (mode0, op0);
31769 op1 = GEN_INT (d->comparison);
31771 pat = GEN_FCN (d->icode) (target, op0, op1);
31772 if (! pat)
31773 return 0;
31774 emit_insn (pat);
31775 return target;
31778 static rtx
31779 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
31780 tree exp, rtx target)
31782 rtx pat;
31783 tree arg0 = CALL_EXPR_ARG (exp, 0);
31784 tree arg1 = CALL_EXPR_ARG (exp, 1);
31785 rtx op0 = expand_normal (arg0);
31786 rtx op1 = expand_normal (arg1);
31787 rtx op2;
31788 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31789 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31790 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
31792 if (optimize || target == 0
31793 || GET_MODE (target) != tmode
31794 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31795 target = gen_reg_rtx (tmode);
31797 op0 = safe_vector_operand (op0, mode0);
31798 op1 = safe_vector_operand (op1, mode1);
31800 if ((optimize && !register_operand (op0, mode0))
31801 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31802 op0 = copy_to_mode_reg (mode0, op0);
31803 if ((optimize && !register_operand (op1, mode1))
31804 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31805 op1 = copy_to_mode_reg (mode1, op1);
31807 op2 = GEN_INT (d->comparison);
31809 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
31810 if (! pat)
31811 return 0;
31812 emit_insn (pat);
31813 return target;
31816 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
31818 static rtx
31819 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
31820 rtx target)
31822 rtx pat;
31823 tree arg0 = CALL_EXPR_ARG (exp, 0);
31824 tree arg1 = CALL_EXPR_ARG (exp, 1);
31825 rtx op0 = expand_normal (arg0);
31826 rtx op1 = expand_normal (arg1);
31827 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
31828 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
31829 enum rtx_code comparison = d->comparison;
31831 if (VECTOR_MODE_P (mode0))
31832 op0 = safe_vector_operand (op0, mode0);
31833 if (VECTOR_MODE_P (mode1))
31834 op1 = safe_vector_operand (op1, mode1);
31836 target = gen_reg_rtx (SImode);
31837 emit_move_insn (target, const0_rtx);
31838 target = gen_rtx_SUBREG (QImode, target, 0);
31840 if ((optimize && !register_operand (op0, mode0))
31841 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31842 op0 = copy_to_mode_reg (mode0, op0);
31843 if ((optimize && !register_operand (op1, mode1))
31844 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31845 op1 = copy_to_mode_reg (mode1, op1);
31847 pat = GEN_FCN (d->icode) (op0, op1);
31848 if (! pat)
31849 return 0;
31850 emit_insn (pat);
31851 emit_insn (gen_rtx_SET (VOIDmode,
31852 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31853 gen_rtx_fmt_ee (comparison, QImode,
31854 SET_DEST (pat),
31855 const0_rtx)));
31857 return SUBREG_REG (target);
31860 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
31862 static rtx
31863 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
31864 tree exp, rtx target)
31866 rtx pat;
31867 tree arg0 = CALL_EXPR_ARG (exp, 0);
31868 tree arg1 = CALL_EXPR_ARG (exp, 1);
31869 tree arg2 = CALL_EXPR_ARG (exp, 2);
31870 tree arg3 = CALL_EXPR_ARG (exp, 3);
31871 tree arg4 = CALL_EXPR_ARG (exp, 4);
31872 rtx scratch0, scratch1;
31873 rtx op0 = expand_normal (arg0);
31874 rtx op1 = expand_normal (arg1);
31875 rtx op2 = expand_normal (arg2);
31876 rtx op3 = expand_normal (arg3);
31877 rtx op4 = expand_normal (arg4);
31878 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
31880 tmode0 = insn_data[d->icode].operand[0].mode;
31881 tmode1 = insn_data[d->icode].operand[1].mode;
31882 modev2 = insn_data[d->icode].operand[2].mode;
31883 modei3 = insn_data[d->icode].operand[3].mode;
31884 modev4 = insn_data[d->icode].operand[4].mode;
31885 modei5 = insn_data[d->icode].operand[5].mode;
31886 modeimm = insn_data[d->icode].operand[6].mode;
31888 if (VECTOR_MODE_P (modev2))
31889 op0 = safe_vector_operand (op0, modev2);
31890 if (VECTOR_MODE_P (modev4))
31891 op2 = safe_vector_operand (op2, modev4);
31893 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
31894 op0 = copy_to_mode_reg (modev2, op0);
31895 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
31896 op1 = copy_to_mode_reg (modei3, op1);
31897 if ((optimize && !register_operand (op2, modev4))
31898 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
31899 op2 = copy_to_mode_reg (modev4, op2);
31900 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
31901 op3 = copy_to_mode_reg (modei5, op3);
31903 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
31905 error ("the fifth argument must be an 8-bit immediate");
31906 return const0_rtx;
31909 if (d->code == IX86_BUILTIN_PCMPESTRI128)
31911 if (optimize || !target
31912 || GET_MODE (target) != tmode0
31913 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
31914 target = gen_reg_rtx (tmode0);
31916 scratch1 = gen_reg_rtx (tmode1);
31918 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
31920 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
31922 if (optimize || !target
31923 || GET_MODE (target) != tmode1
31924 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
31925 target = gen_reg_rtx (tmode1);
31927 scratch0 = gen_reg_rtx (tmode0);
31929 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
31931 else
31933 gcc_assert (d->flag);
31935 scratch0 = gen_reg_rtx (tmode0);
31936 scratch1 = gen_reg_rtx (tmode1);
31938 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
31941 if (! pat)
31942 return 0;
31944 emit_insn (pat);
31946 if (d->flag)
31948 target = gen_reg_rtx (SImode);
31949 emit_move_insn (target, const0_rtx);
31950 target = gen_rtx_SUBREG (QImode, target, 0);
31952 emit_insn
31953 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31954 gen_rtx_fmt_ee (EQ, QImode,
31955 gen_rtx_REG ((enum machine_mode) d->flag,
31956 FLAGS_REG),
31957 const0_rtx)));
31958 return SUBREG_REG (target);
31960 else
31961 return target;
31965 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
31967 static rtx
31968 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
31969 tree exp, rtx target)
31971 rtx pat;
31972 tree arg0 = CALL_EXPR_ARG (exp, 0);
31973 tree arg1 = CALL_EXPR_ARG (exp, 1);
31974 tree arg2 = CALL_EXPR_ARG (exp, 2);
31975 rtx scratch0, scratch1;
31976 rtx op0 = expand_normal (arg0);
31977 rtx op1 = expand_normal (arg1);
31978 rtx op2 = expand_normal (arg2);
31979 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
31981 tmode0 = insn_data[d->icode].operand[0].mode;
31982 tmode1 = insn_data[d->icode].operand[1].mode;
31983 modev2 = insn_data[d->icode].operand[2].mode;
31984 modev3 = insn_data[d->icode].operand[3].mode;
31985 modeimm = insn_data[d->icode].operand[4].mode;
31987 if (VECTOR_MODE_P (modev2))
31988 op0 = safe_vector_operand (op0, modev2);
31989 if (VECTOR_MODE_P (modev3))
31990 op1 = safe_vector_operand (op1, modev3);
31992 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
31993 op0 = copy_to_mode_reg (modev2, op0);
31994 if ((optimize && !register_operand (op1, modev3))
31995 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
31996 op1 = copy_to_mode_reg (modev3, op1);
31998 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
32000 error ("the third argument must be an 8-bit immediate");
32001 return const0_rtx;
32004 if (d->code == IX86_BUILTIN_PCMPISTRI128)
32006 if (optimize || !target
32007 || GET_MODE (target) != tmode0
32008 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
32009 target = gen_reg_rtx (tmode0);
32011 scratch1 = gen_reg_rtx (tmode1);
32013 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
32015 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
32017 if (optimize || !target
32018 || GET_MODE (target) != tmode1
32019 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
32020 target = gen_reg_rtx (tmode1);
32022 scratch0 = gen_reg_rtx (tmode0);
32024 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
32026 else
32028 gcc_assert (d->flag);
32030 scratch0 = gen_reg_rtx (tmode0);
32031 scratch1 = gen_reg_rtx (tmode1);
32033 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
32036 if (! pat)
32037 return 0;
32039 emit_insn (pat);
32041 if (d->flag)
32043 target = gen_reg_rtx (SImode);
32044 emit_move_insn (target, const0_rtx);
32045 target = gen_rtx_SUBREG (QImode, target, 0);
32047 emit_insn
32048 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
32049 gen_rtx_fmt_ee (EQ, QImode,
32050 gen_rtx_REG ((enum machine_mode) d->flag,
32051 FLAGS_REG),
32052 const0_rtx)));
32053 return SUBREG_REG (target);
32055 else
32056 return target;
32059 /* Subroutine of ix86_expand_builtin to take care of insns with
32060 a variable number of operands. */
32062 static rtx
32063 ix86_expand_args_builtin (const struct builtin_description *d,
32064 tree exp, rtx target)
32066 rtx pat, real_target;
32067 unsigned int i, nargs;
32068 unsigned int nargs_constant = 0;
32069 int num_memory = 0;
32070 struct
32072 rtx op;
32073 enum machine_mode mode;
32074 } args[4];
32075 bool last_arg_count = false;
32076 enum insn_code icode = d->icode;
32077 const struct insn_data_d *insn_p = &insn_data[icode];
32078 enum machine_mode tmode = insn_p->operand[0].mode;
32079 enum machine_mode rmode = VOIDmode;
32080 bool swap = false;
32081 enum rtx_code comparison = d->comparison;
32083 switch ((enum ix86_builtin_func_type) d->flag)
32085 case V2DF_FTYPE_V2DF_ROUND:
32086 case V4DF_FTYPE_V4DF_ROUND:
32087 case V4SF_FTYPE_V4SF_ROUND:
32088 case V8SF_FTYPE_V8SF_ROUND:
32089 case V4SI_FTYPE_V4SF_ROUND:
32090 case V8SI_FTYPE_V8SF_ROUND:
32091 return ix86_expand_sse_round (d, exp, target);
32092 case V4SI_FTYPE_V2DF_V2DF_ROUND:
32093 case V8SI_FTYPE_V4DF_V4DF_ROUND:
32094 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
32095 case INT_FTYPE_V8SF_V8SF_PTEST:
32096 case INT_FTYPE_V4DI_V4DI_PTEST:
32097 case INT_FTYPE_V4DF_V4DF_PTEST:
32098 case INT_FTYPE_V4SF_V4SF_PTEST:
32099 case INT_FTYPE_V2DI_V2DI_PTEST:
32100 case INT_FTYPE_V2DF_V2DF_PTEST:
32101 return ix86_expand_sse_ptest (d, exp, target);
32102 case FLOAT128_FTYPE_FLOAT128:
32103 case FLOAT_FTYPE_FLOAT:
32104 case INT_FTYPE_INT:
32105 case UINT64_FTYPE_INT:
32106 case UINT16_FTYPE_UINT16:
32107 case INT64_FTYPE_INT64:
32108 case INT64_FTYPE_V4SF:
32109 case INT64_FTYPE_V2DF:
32110 case INT_FTYPE_V16QI:
32111 case INT_FTYPE_V8QI:
32112 case INT_FTYPE_V8SF:
32113 case INT_FTYPE_V4DF:
32114 case INT_FTYPE_V4SF:
32115 case INT_FTYPE_V2DF:
32116 case INT_FTYPE_V32QI:
32117 case V16QI_FTYPE_V16QI:
32118 case V8SI_FTYPE_V8SF:
32119 case V8SI_FTYPE_V4SI:
32120 case V8HI_FTYPE_V8HI:
32121 case V8HI_FTYPE_V16QI:
32122 case V8QI_FTYPE_V8QI:
32123 case V8SF_FTYPE_V8SF:
32124 case V8SF_FTYPE_V8SI:
32125 case V8SF_FTYPE_V4SF:
32126 case V8SF_FTYPE_V8HI:
32127 case V4SI_FTYPE_V4SI:
32128 case V4SI_FTYPE_V16QI:
32129 case V4SI_FTYPE_V4SF:
32130 case V4SI_FTYPE_V8SI:
32131 case V4SI_FTYPE_V8HI:
32132 case V4SI_FTYPE_V4DF:
32133 case V4SI_FTYPE_V2DF:
32134 case V4HI_FTYPE_V4HI:
32135 case V4DF_FTYPE_V4DF:
32136 case V4DF_FTYPE_V4SI:
32137 case V4DF_FTYPE_V4SF:
32138 case V4DF_FTYPE_V2DF:
32139 case V4SF_FTYPE_V4SF:
32140 case V4SF_FTYPE_V4SI:
32141 case V4SF_FTYPE_V8SF:
32142 case V4SF_FTYPE_V4DF:
32143 case V4SF_FTYPE_V8HI:
32144 case V4SF_FTYPE_V2DF:
32145 case V2DI_FTYPE_V2DI:
32146 case V2DI_FTYPE_V16QI:
32147 case V2DI_FTYPE_V8HI:
32148 case V2DI_FTYPE_V4SI:
32149 case V2DF_FTYPE_V2DF:
32150 case V2DF_FTYPE_V4SI:
32151 case V2DF_FTYPE_V4DF:
32152 case V2DF_FTYPE_V4SF:
32153 case V2DF_FTYPE_V2SI:
32154 case V2SI_FTYPE_V2SI:
32155 case V2SI_FTYPE_V4SF:
32156 case V2SI_FTYPE_V2SF:
32157 case V2SI_FTYPE_V2DF:
32158 case V2SF_FTYPE_V2SF:
32159 case V2SF_FTYPE_V2SI:
32160 case V32QI_FTYPE_V32QI:
32161 case V32QI_FTYPE_V16QI:
32162 case V16HI_FTYPE_V16HI:
32163 case V16HI_FTYPE_V8HI:
32164 case V8SI_FTYPE_V8SI:
32165 case V16HI_FTYPE_V16QI:
32166 case V8SI_FTYPE_V16QI:
32167 case V4DI_FTYPE_V16QI:
32168 case V8SI_FTYPE_V8HI:
32169 case V4DI_FTYPE_V8HI:
32170 case V4DI_FTYPE_V4SI:
32171 case V4DI_FTYPE_V2DI:
32172 nargs = 1;
32173 break;
32174 case V4SF_FTYPE_V4SF_VEC_MERGE:
32175 case V2DF_FTYPE_V2DF_VEC_MERGE:
32176 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
32177 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
32178 case V16QI_FTYPE_V16QI_V16QI:
32179 case V16QI_FTYPE_V8HI_V8HI:
32180 case V8QI_FTYPE_V8QI_V8QI:
32181 case V8QI_FTYPE_V4HI_V4HI:
32182 case V8HI_FTYPE_V8HI_V8HI:
32183 case V8HI_FTYPE_V16QI_V16QI:
32184 case V8HI_FTYPE_V4SI_V4SI:
32185 case V8SF_FTYPE_V8SF_V8SF:
32186 case V8SF_FTYPE_V8SF_V8SI:
32187 case V4SI_FTYPE_V4SI_V4SI:
32188 case V4SI_FTYPE_V8HI_V8HI:
32189 case V4SI_FTYPE_V4SF_V4SF:
32190 case V4SI_FTYPE_V2DF_V2DF:
32191 case V4HI_FTYPE_V4HI_V4HI:
32192 case V4HI_FTYPE_V8QI_V8QI:
32193 case V4HI_FTYPE_V2SI_V2SI:
32194 case V4DF_FTYPE_V4DF_V4DF:
32195 case V4DF_FTYPE_V4DF_V4DI:
32196 case V4SF_FTYPE_V4SF_V4SF:
32197 case V4SF_FTYPE_V4SF_V4SI:
32198 case V4SF_FTYPE_V4SF_V2SI:
32199 case V4SF_FTYPE_V4SF_V2DF:
32200 case V4SF_FTYPE_V4SF_DI:
32201 case V4SF_FTYPE_V4SF_SI:
32202 case V2DI_FTYPE_V2DI_V2DI:
32203 case V2DI_FTYPE_V16QI_V16QI:
32204 case V2DI_FTYPE_V4SI_V4SI:
32205 case V2UDI_FTYPE_V4USI_V4USI:
32206 case V2DI_FTYPE_V2DI_V16QI:
32207 case V2DI_FTYPE_V2DF_V2DF:
32208 case V2SI_FTYPE_V2SI_V2SI:
32209 case V2SI_FTYPE_V4HI_V4HI:
32210 case V2SI_FTYPE_V2SF_V2SF:
32211 case V2DF_FTYPE_V2DF_V2DF:
32212 case V2DF_FTYPE_V2DF_V4SF:
32213 case V2DF_FTYPE_V2DF_V2DI:
32214 case V2DF_FTYPE_V2DF_DI:
32215 case V2DF_FTYPE_V2DF_SI:
32216 case V2SF_FTYPE_V2SF_V2SF:
32217 case V1DI_FTYPE_V1DI_V1DI:
32218 case V1DI_FTYPE_V8QI_V8QI:
32219 case V1DI_FTYPE_V2SI_V2SI:
32220 case V32QI_FTYPE_V16HI_V16HI:
32221 case V16HI_FTYPE_V8SI_V8SI:
32222 case V32QI_FTYPE_V32QI_V32QI:
32223 case V16HI_FTYPE_V32QI_V32QI:
32224 case V16HI_FTYPE_V16HI_V16HI:
32225 case V8SI_FTYPE_V4DF_V4DF:
32226 case V8SI_FTYPE_V8SI_V8SI:
32227 case V8SI_FTYPE_V16HI_V16HI:
32228 case V4DI_FTYPE_V4DI_V4DI:
32229 case V4DI_FTYPE_V8SI_V8SI:
32230 case V4UDI_FTYPE_V8USI_V8USI:
32231 if (comparison == UNKNOWN)
32232 return ix86_expand_binop_builtin (icode, exp, target);
32233 nargs = 2;
32234 break;
32235 case V4SF_FTYPE_V4SF_V4SF_SWAP:
32236 case V2DF_FTYPE_V2DF_V2DF_SWAP:
32237 gcc_assert (comparison != UNKNOWN);
32238 nargs = 2;
32239 swap = true;
32240 break;
32241 case V16HI_FTYPE_V16HI_V8HI_COUNT:
32242 case V16HI_FTYPE_V16HI_SI_COUNT:
32243 case V8SI_FTYPE_V8SI_V4SI_COUNT:
32244 case V8SI_FTYPE_V8SI_SI_COUNT:
32245 case V4DI_FTYPE_V4DI_V2DI_COUNT:
32246 case V4DI_FTYPE_V4DI_INT_COUNT:
32247 case V8HI_FTYPE_V8HI_V8HI_COUNT:
32248 case V8HI_FTYPE_V8HI_SI_COUNT:
32249 case V4SI_FTYPE_V4SI_V4SI_COUNT:
32250 case V4SI_FTYPE_V4SI_SI_COUNT:
32251 case V4HI_FTYPE_V4HI_V4HI_COUNT:
32252 case V4HI_FTYPE_V4HI_SI_COUNT:
32253 case V2DI_FTYPE_V2DI_V2DI_COUNT:
32254 case V2DI_FTYPE_V2DI_SI_COUNT:
32255 case V2SI_FTYPE_V2SI_V2SI_COUNT:
32256 case V2SI_FTYPE_V2SI_SI_COUNT:
32257 case V1DI_FTYPE_V1DI_V1DI_COUNT:
32258 case V1DI_FTYPE_V1DI_SI_COUNT:
32259 nargs = 2;
32260 last_arg_count = true;
32261 break;
32262 case UINT64_FTYPE_UINT64_UINT64:
32263 case UINT_FTYPE_UINT_UINT:
32264 case UINT_FTYPE_UINT_USHORT:
32265 case UINT_FTYPE_UINT_UCHAR:
32266 case UINT16_FTYPE_UINT16_INT:
32267 case UINT8_FTYPE_UINT8_INT:
32268 nargs = 2;
32269 break;
32270 case V2DI_FTYPE_V2DI_INT_CONVERT:
32271 nargs = 2;
32272 rmode = V1TImode;
32273 nargs_constant = 1;
32274 break;
32275 case V4DI_FTYPE_V4DI_INT_CONVERT:
32276 nargs = 2;
32277 rmode = V2TImode;
32278 nargs_constant = 1;
32279 break;
32280 case V8HI_FTYPE_V8HI_INT:
32281 case V8HI_FTYPE_V8SF_INT:
32282 case V8HI_FTYPE_V4SF_INT:
32283 case V8SF_FTYPE_V8SF_INT:
32284 case V4SI_FTYPE_V4SI_INT:
32285 case V4SI_FTYPE_V8SI_INT:
32286 case V4HI_FTYPE_V4HI_INT:
32287 case V4DF_FTYPE_V4DF_INT:
32288 case V4SF_FTYPE_V4SF_INT:
32289 case V4SF_FTYPE_V8SF_INT:
32290 case V2DI_FTYPE_V2DI_INT:
32291 case V2DF_FTYPE_V2DF_INT:
32292 case V2DF_FTYPE_V4DF_INT:
32293 case V16HI_FTYPE_V16HI_INT:
32294 case V8SI_FTYPE_V8SI_INT:
32295 case V4DI_FTYPE_V4DI_INT:
32296 case V2DI_FTYPE_V4DI_INT:
32297 nargs = 2;
32298 nargs_constant = 1;
32299 break;
32300 case V16QI_FTYPE_V16QI_V16QI_V16QI:
32301 case V8SF_FTYPE_V8SF_V8SF_V8SF:
32302 case V4DF_FTYPE_V4DF_V4DF_V4DF:
32303 case V4SF_FTYPE_V4SF_V4SF_V4SF:
32304 case V2DF_FTYPE_V2DF_V2DF_V2DF:
32305 case V32QI_FTYPE_V32QI_V32QI_V32QI:
32306 nargs = 3;
32307 break;
32308 case V32QI_FTYPE_V32QI_V32QI_INT:
32309 case V16HI_FTYPE_V16HI_V16HI_INT:
32310 case V16QI_FTYPE_V16QI_V16QI_INT:
32311 case V4DI_FTYPE_V4DI_V4DI_INT:
32312 case V8HI_FTYPE_V8HI_V8HI_INT:
32313 case V8SI_FTYPE_V8SI_V8SI_INT:
32314 case V8SI_FTYPE_V8SI_V4SI_INT:
32315 case V8SF_FTYPE_V8SF_V8SF_INT:
32316 case V8SF_FTYPE_V8SF_V4SF_INT:
32317 case V4SI_FTYPE_V4SI_V4SI_INT:
32318 case V4DF_FTYPE_V4DF_V4DF_INT:
32319 case V4DF_FTYPE_V4DF_V2DF_INT:
32320 case V4SF_FTYPE_V4SF_V4SF_INT:
32321 case V2DI_FTYPE_V2DI_V2DI_INT:
32322 case V4DI_FTYPE_V4DI_V2DI_INT:
32323 case V2DF_FTYPE_V2DF_V2DF_INT:
32324 nargs = 3;
32325 nargs_constant = 1;
32326 break;
32327 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
32328 nargs = 3;
32329 rmode = V4DImode;
32330 nargs_constant = 1;
32331 break;
32332 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
32333 nargs = 3;
32334 rmode = V2DImode;
32335 nargs_constant = 1;
32336 break;
32337 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
32338 nargs = 3;
32339 rmode = DImode;
32340 nargs_constant = 1;
32341 break;
32342 case V2DI_FTYPE_V2DI_UINT_UINT:
32343 nargs = 3;
32344 nargs_constant = 2;
32345 break;
32346 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
32347 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
32348 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
32349 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
32350 nargs = 4;
32351 nargs_constant = 1;
32352 break;
32353 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
32354 nargs = 4;
32355 nargs_constant = 2;
32356 break;
32357 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
32358 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
32359 nargs = 4;
32360 break;
32361 default:
32362 gcc_unreachable ();
32365 gcc_assert (nargs <= ARRAY_SIZE (args));
32367 if (comparison != UNKNOWN)
32369 gcc_assert (nargs == 2);
32370 return ix86_expand_sse_compare (d, exp, target, swap);
32373 if (rmode == VOIDmode || rmode == tmode)
32375 if (optimize
32376 || target == 0
32377 || GET_MODE (target) != tmode
32378 || !insn_p->operand[0].predicate (target, tmode))
32379 target = gen_reg_rtx (tmode);
32380 real_target = target;
32382 else
32384 real_target = gen_reg_rtx (tmode);
32385 target = simplify_gen_subreg (rmode, real_target, tmode, 0);
32388 for (i = 0; i < nargs; i++)
32390 tree arg = CALL_EXPR_ARG (exp, i);
32391 rtx op = expand_normal (arg);
32392 enum machine_mode mode = insn_p->operand[i + 1].mode;
32393 bool match = insn_p->operand[i + 1].predicate (op, mode);
32395 if (last_arg_count && (i + 1) == nargs)
32397 /* SIMD shift insns take either an 8-bit immediate or a
32398 register as the count. But the builtin functions take an int as
32399 the count. If the count doesn't match, we put it in a register. */
32400 if (!match)
32402 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
32403 if (!insn_p->operand[i + 1].predicate (op, mode))
32404 op = copy_to_reg (op);
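/* Illustrative sketch (emmintrin.h name assumed): a user-level call such as
     __m128i r = _mm_slli_epi32 (v, n);
   passes the count as a plain int.  When n is not an immediate accepted
   by the insn predicate, the code above narrows the count to SImode and,
   if necessary, copies it into a register so the shift pattern still
   matches.  */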
32407 else if ((nargs - i) <= nargs_constant)
32409 if (!match)
32410 switch (icode)
32412 case CODE_FOR_avx2_inserti128:
32413 case CODE_FOR_avx2_extracti128:
32414 error ("the last argument must be an 1-bit immediate");
32415 return const0_rtx;
32417 case CODE_FOR_sse4_1_roundsd:
32418 case CODE_FOR_sse4_1_roundss:
32420 case CODE_FOR_sse4_1_roundpd:
32421 case CODE_FOR_sse4_1_roundps:
32422 case CODE_FOR_avx_roundpd256:
32423 case CODE_FOR_avx_roundps256:
32425 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
32426 case CODE_FOR_sse4_1_roundps_sfix:
32427 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
32428 case CODE_FOR_avx_roundps_sfix256:
32430 case CODE_FOR_sse4_1_blendps:
32431 case CODE_FOR_avx_blendpd256:
32432 case CODE_FOR_avx_vpermilv4df:
32433 error ("the last argument must be a 4-bit immediate");
32434 return const0_rtx;
32436 case CODE_FOR_sse4_1_blendpd:
32437 case CODE_FOR_avx_vpermilv2df:
32438 case CODE_FOR_xop_vpermil2v2df3:
32439 case CODE_FOR_xop_vpermil2v4sf3:
32440 case CODE_FOR_xop_vpermil2v4df3:
32441 case CODE_FOR_xop_vpermil2v8sf3:
32442 error ("the last argument must be a 2-bit immediate");
32443 return const0_rtx;
32445 case CODE_FOR_avx_vextractf128v4df:
32446 case CODE_FOR_avx_vextractf128v8sf:
32447 case CODE_FOR_avx_vextractf128v8si:
32448 case CODE_FOR_avx_vinsertf128v4df:
32449 case CODE_FOR_avx_vinsertf128v8sf:
32450 case CODE_FOR_avx_vinsertf128v8si:
32451 error ("the last argument must be a 1-bit immediate");
32452 return const0_rtx;
32454 case CODE_FOR_avx_vmcmpv2df3:
32455 case CODE_FOR_avx_vmcmpv4sf3:
32456 case CODE_FOR_avx_cmpv2df3:
32457 case CODE_FOR_avx_cmpv4sf3:
32458 case CODE_FOR_avx_cmpv4df3:
32459 case CODE_FOR_avx_cmpv8sf3:
32460 error ("the last argument must be a 5-bit immediate");
32461 return const0_rtx;
32463 default:
32464 switch (nargs_constant)
32466 case 2:
32467 if ((nargs - i) == nargs_constant)
32469 error ("the next to last argument must be an 8-bit immediate");
32470 break;
32472 case 1:
32473 error ("the last argument must be an 8-bit immediate");
32474 break;
32475 default:
32476 gcc_unreachable ();
32478 return const0_rtx;
32481 else
32483 if (VECTOR_MODE_P (mode))
32484 op = safe_vector_operand (op, mode);
32486 /* If we aren't optimizing, only allow one memory operand to
32487 be generated. */
32488 if (memory_operand (op, mode))
32489 num_memory++;
32491 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
32493 if (optimize || !match || num_memory > 1)
32494 op = copy_to_mode_reg (mode, op);
32496 else
32498 op = copy_to_reg (op);
32499 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
32503 args[i].op = op;
32504 args[i].mode = mode;
32507 switch (nargs)
32509 case 1:
32510 pat = GEN_FCN (icode) (real_target, args[0].op);
32511 break;
32512 case 2:
32513 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
32514 break;
32515 case 3:
32516 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
32517 args[2].op);
32518 break;
32519 case 4:
32520 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
32521 args[2].op, args[3].op);
32522 break;
32523 default:
32524 gcc_unreachable ();
32527 if (! pat)
32528 return 0;
32530 emit_insn (pat);
32531 return target;
32534 /* Subroutine of ix86_expand_builtin to take care of special insns
32535 with variable number of operands. */
32537 static rtx
32538 ix86_expand_special_args_builtin (const struct builtin_description *d,
32539 tree exp, rtx target)
32541 tree arg;
32542 rtx pat, op;
32543 unsigned int i, nargs, arg_adjust, memory;
32544 bool aligned_mem = false;
32545 struct
32547 rtx op;
32548 enum machine_mode mode;
32549 } args[3];
32550 enum insn_code icode = d->icode;
32551 bool last_arg_constant = false;
32552 const struct insn_data_d *insn_p = &insn_data[icode];
32553 enum machine_mode tmode = insn_p->operand[0].mode;
32554 enum { load, store } klass;
32556 switch ((enum ix86_builtin_func_type) d->flag)
32558 case VOID_FTYPE_VOID:
32559 emit_insn (GEN_FCN (icode) (target));
32560 return 0;
32561 case VOID_FTYPE_UINT64:
32562 case VOID_FTYPE_UNSIGNED:
32563 nargs = 0;
32564 klass = store;
32565 memory = 0;
32566 break;
32568 case INT_FTYPE_VOID:
32569 case UINT64_FTYPE_VOID:
32570 case UNSIGNED_FTYPE_VOID:
32571 nargs = 0;
32572 klass = load;
32573 memory = 0;
32574 break;
32575 case UINT64_FTYPE_PUNSIGNED:
32576 case V2DI_FTYPE_PV2DI:
32577 case V4DI_FTYPE_PV4DI:
32578 case V32QI_FTYPE_PCCHAR:
32579 case V16QI_FTYPE_PCCHAR:
32580 case V8SF_FTYPE_PCV4SF:
32581 case V8SF_FTYPE_PCFLOAT:
32582 case V4SF_FTYPE_PCFLOAT:
32583 case V4DF_FTYPE_PCV2DF:
32584 case V4DF_FTYPE_PCDOUBLE:
32585 case V2DF_FTYPE_PCDOUBLE:
32586 case VOID_FTYPE_PVOID:
32587 nargs = 1;
32588 klass = load;
32589 memory = 0;
32590 switch (icode)
32592 case CODE_FOR_sse4_1_movntdqa:
32593 case CODE_FOR_avx2_movntdqa:
32594 aligned_mem = true;
32595 break;
32596 default:
32597 break;
32599 break;
32600 case VOID_FTYPE_PV2SF_V4SF:
32601 case VOID_FTYPE_PV4DI_V4DI:
32602 case VOID_FTYPE_PV2DI_V2DI:
32603 case VOID_FTYPE_PCHAR_V32QI:
32604 case VOID_FTYPE_PCHAR_V16QI:
32605 case VOID_FTYPE_PFLOAT_V8SF:
32606 case VOID_FTYPE_PFLOAT_V4SF:
32607 case VOID_FTYPE_PDOUBLE_V4DF:
32608 case VOID_FTYPE_PDOUBLE_V2DF:
32609 case VOID_FTYPE_PLONGLONG_LONGLONG:
32610 case VOID_FTYPE_PULONGLONG_ULONGLONG:
32611 case VOID_FTYPE_PINT_INT:
32612 nargs = 1;
32613 klass = store;
32614 /* Reserve memory operand for target. */
32615 memory = ARRAY_SIZE (args);
32616 switch (icode)
32618 /* These builtins and instructions require the memory
32619 to be properly aligned. */
32620 case CODE_FOR_avx_movntv4di:
32621 case CODE_FOR_sse2_movntv2di:
32622 case CODE_FOR_avx_movntv8sf:
32623 case CODE_FOR_sse_movntv4sf:
32624 case CODE_FOR_sse4a_vmmovntv4sf:
32625 case CODE_FOR_avx_movntv4df:
32626 case CODE_FOR_sse2_movntv2df:
32627 case CODE_FOR_sse4a_vmmovntv2df:
32628 case CODE_FOR_sse2_movntidi:
32629 case CODE_FOR_sse_movntq:
32630 case CODE_FOR_sse2_movntisi:
32631 aligned_mem = true;
32632 break;
32633 default:
32634 break;
32636 break;
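/* Illustrative sketch (immintrin.h name assumed): the movnt patterns
   listed above back non-temporal store intrinsics whose address must be
   naturally aligned, e.g.
     float buf[8] __attribute__ ((aligned (32)));
     _mm256_stream_ps (buf, x);
   The aligned_mem flag set here is consumed below when the destination
   MEM is built, so its recorded alignment is at least GET_MODE_ALIGNMENT
   and ix86_legitimate_combined_insn will not reject the insn.  */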
32637 case V4SF_FTYPE_V4SF_PCV2SF:
32638 case V2DF_FTYPE_V2DF_PCDOUBLE:
32639 nargs = 2;
32640 klass = load;
32641 memory = 1;
32642 break;
32643 case V8SF_FTYPE_PCV8SF_V8SI:
32644 case V4DF_FTYPE_PCV4DF_V4DI:
32645 case V4SF_FTYPE_PCV4SF_V4SI:
32646 case V2DF_FTYPE_PCV2DF_V2DI:
32647 case V8SI_FTYPE_PCV8SI_V8SI:
32648 case V4DI_FTYPE_PCV4DI_V4DI:
32649 case V4SI_FTYPE_PCV4SI_V4SI:
32650 case V2DI_FTYPE_PCV2DI_V2DI:
32651 nargs = 2;
32652 klass = load;
32653 memory = 0;
32654 break;
32655 case VOID_FTYPE_PV8SF_V8SI_V8SF:
32656 case VOID_FTYPE_PV4DF_V4DI_V4DF:
32657 case VOID_FTYPE_PV4SF_V4SI_V4SF:
32658 case VOID_FTYPE_PV2DF_V2DI_V2DF:
32659 case VOID_FTYPE_PV8SI_V8SI_V8SI:
32660 case VOID_FTYPE_PV4DI_V4DI_V4DI:
32661 case VOID_FTYPE_PV4SI_V4SI_V4SI:
32662 case VOID_FTYPE_PV2DI_V2DI_V2DI:
32663 nargs = 2;
32664 klass = store;
32665 /* Reserve memory operand for target. */
32666 memory = ARRAY_SIZE (args);
32667 break;
32668 case VOID_FTYPE_UINT_UINT_UINT:
32669 case VOID_FTYPE_UINT64_UINT_UINT:
32670 case UCHAR_FTYPE_UINT_UINT_UINT:
32671 case UCHAR_FTYPE_UINT64_UINT_UINT:
32672 nargs = 3;
32673 klass = load;
32674 memory = ARRAY_SIZE (args);
32675 last_arg_constant = true;
32676 break;
32677 default:
32678 gcc_unreachable ();
32681 gcc_assert (nargs <= ARRAY_SIZE (args));
32683 if (klass == store)
32685 arg = CALL_EXPR_ARG (exp, 0);
32686 op = expand_normal (arg);
32687 gcc_assert (target == 0);
32688 if (memory)
32690 op = ix86_zero_extend_to_Pmode (op);
32691 target = gen_rtx_MEM (tmode, op);
32692 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
32693 on it. Try to improve it using get_pointer_alignment,
32694 and if the special builtin is one that requires strict
32695 mode alignment, also from its GET_MODE_ALIGNMENT.
32696 Failure to do so could lead to ix86_legitimate_combined_insn
32697 rejecting all changes to such insns. */
32698 unsigned int align = get_pointer_alignment (arg);
32699 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
32700 align = GET_MODE_ALIGNMENT (tmode);
32701 if (MEM_ALIGN (target) < align)
32702 set_mem_align (target, align);
32704 else
32705 target = force_reg (tmode, op);
32706 arg_adjust = 1;
32708 else
32710 arg_adjust = 0;
32711 if (optimize
32712 || target == 0
32713 || !register_operand (target, tmode)
32714 || GET_MODE (target) != tmode)
32715 target = gen_reg_rtx (tmode);
32718 for (i = 0; i < nargs; i++)
32720 enum machine_mode mode = insn_p->operand[i + 1].mode;
32721 bool match;
32723 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
32724 op = expand_normal (arg);
32725 match = insn_p->operand[i + 1].predicate (op, mode);
32727 if (last_arg_constant && (i + 1) == nargs)
32729 if (!match)
32731 if (icode == CODE_FOR_lwp_lwpvalsi3
32732 || icode == CODE_FOR_lwp_lwpinssi3
32733 || icode == CODE_FOR_lwp_lwpvaldi3
32734 || icode == CODE_FOR_lwp_lwpinsdi3)
32735 error ("the last argument must be a 32-bit immediate");
32736 else
32737 error ("the last argument must be an 8-bit immediate");
32738 return const0_rtx;
32741 else
32743 if (i == memory)
32745 /* This must be the memory operand. */
32746 op = ix86_zero_extend_to_Pmode (op);
32747 op = gen_rtx_MEM (mode, op);
32748 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
32749 on it. Try to improve it using get_pointer_alignment,
32750 and if the special builtin is one that requires strict
32751 mode alignment, also from its GET_MODE_ALIGNMENT.
32752 Failure to do so could lead to ix86_legitimate_combined_insn
32753 rejecting all changes to such insns. */
32754 unsigned int align = get_pointer_alignment (arg);
32755 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
32756 align = GET_MODE_ALIGNMENT (mode);
32757 if (MEM_ALIGN (op) < align)
32758 set_mem_align (op, align);
32760 else
32762 /* This must be a register operand. */
32763 if (VECTOR_MODE_P (mode))
32764 op = safe_vector_operand (op, mode);
32766 gcc_assert (GET_MODE (op) == mode
32767 || GET_MODE (op) == VOIDmode);
32768 op = copy_to_mode_reg (mode, op);
32772 args[i].op = op;
32773 args[i].mode = mode;
32776 switch (nargs)
32778 case 0:
32779 pat = GEN_FCN (icode) (target);
32780 break;
32781 case 1:
32782 pat = GEN_FCN (icode) (target, args[0].op);
32783 break;
32784 case 2:
32785 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
32786 break;
32787 case 3:
32788 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
32789 break;
32790 default:
32791 gcc_unreachable ();
32794 if (! pat)
32795 return 0;
32796 emit_insn (pat);
32797 return klass == store ? 0 : target;
32800 /* Return the integer constant in ARG. Constrain it to be in the range
32801 of the subparts of VEC_TYPE; issue an error if not. */
32803 static int
32804 get_element_number (tree vec_type, tree arg)
32806 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
32808 if (!tree_fits_uhwi_p (arg)
32809 || (elt = tree_to_uhwi (arg), elt > max))
32811 error ("selector must be an integer constant in the range 0..%wi", max);
32812 return 0;
32815 return elt;
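/* Illustrative sketch (builtin name assumed from the usual ia32 naming):
   for a four-element vector type the valid selectors are 0..3, so
     __v4sf v = { 1.0f, 2.0f, 3.0f, 4.0f };
     float f = __builtin_ia32_vec_ext_v4sf (v, 2);
   is accepted by get_element_number above, while a selector of 4 or more
   draws the "selector must be an integer constant in the range 0..3"
   error and element 0 is used instead.  */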
32818 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32819 ix86_expand_vector_init. We DO have language-level syntax for this, in
32820 the form of (type){ init-list }. Except that since we can't place emms
32821 instructions from inside the compiler, we can't allow the use of MMX
32822 registers unless the user explicitly asks for it. So we do *not* define
32823 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
32824 we have builtins invoked by mmintrin.h that give us license to emit
32825 these sorts of instructions. */
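/* Illustrative sketch (builtin name assumed from the usual ia32 naming):
     __v2si v = __builtin_ia32_vec_init_v2si (1, 2);
   is expanded by the function below through ix86_expand_vector_init
   rather than through a generic (type){ ... } constructor, so MMX
   registers are used only when the user explicitly asked for them via
   mmintrin.h.  */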
32827 static rtx
32828 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
32830 enum machine_mode tmode = TYPE_MODE (type);
32831 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
32832 int i, n_elt = GET_MODE_NUNITS (tmode);
32833 rtvec v = rtvec_alloc (n_elt);
32835 gcc_assert (VECTOR_MODE_P (tmode));
32836 gcc_assert (call_expr_nargs (exp) == n_elt);
32838 for (i = 0; i < n_elt; ++i)
32840 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
32841 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
32844 if (!target || !register_operand (target, tmode))
32845 target = gen_reg_rtx (tmode);
32847 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
32848 return target;
32851 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32852 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
32853 had a language-level syntax for referencing vector elements. */
32855 static rtx
32856 ix86_expand_vec_ext_builtin (tree exp, rtx target)
32858 enum machine_mode tmode, mode0;
32859 tree arg0, arg1;
32860 int elt;
32861 rtx op0;
32863 arg0 = CALL_EXPR_ARG (exp, 0);
32864 arg1 = CALL_EXPR_ARG (exp, 1);
32866 op0 = expand_normal (arg0);
32867 elt = get_element_number (TREE_TYPE (arg0), arg1);
32869 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
32870 mode0 = TYPE_MODE (TREE_TYPE (arg0));
32871 gcc_assert (VECTOR_MODE_P (mode0));
32873 op0 = force_reg (mode0, op0);
32875 if (optimize || !target || !register_operand (target, tmode))
32876 target = gen_reg_rtx (tmode);
32878 ix86_expand_vector_extract (true, target, op0, elt);
32880 return target;
32883 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32884 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
32885 a language-level syntax for referencing vector elements. */
32887 static rtx
32888 ix86_expand_vec_set_builtin (tree exp)
32890 enum machine_mode tmode, mode1;
32891 tree arg0, arg1, arg2;
32892 int elt;
32893 rtx op0, op1, target;
32895 arg0 = CALL_EXPR_ARG (exp, 0);
32896 arg1 = CALL_EXPR_ARG (exp, 1);
32897 arg2 = CALL_EXPR_ARG (exp, 2);
32899 tmode = TYPE_MODE (TREE_TYPE (arg0));
32900 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
32901 gcc_assert (VECTOR_MODE_P (tmode));
32903 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
32904 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
32905 elt = get_element_number (TREE_TYPE (arg0), arg2);
32907 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
32908 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
32910 op0 = force_reg (tmode, op0);
32911 op1 = force_reg (mode1, op1);
32913 /* OP0 is the source of these builtin functions and shouldn't be
32914 modified. Create a copy, use it and return it as the target. */
32915 target = gen_reg_rtx (tmode);
32916 emit_move_insn (target, op0);
32917 ix86_expand_vector_set (true, target, op1, elt);
32919 return target;
32922 /* Expand an expression EXP that calls a built-in function,
32923 with result going to TARGET if that's convenient
32924 (and in mode MODE if that's convenient).
32925 SUBTARGET may be used as the target for computing one of EXP's operands.
32926 IGNORE is nonzero if the value is to be ignored. */
32928 static rtx
32929 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
32930 enum machine_mode mode, int ignore)
32932 const struct builtin_description *d;
32933 size_t i;
32934 enum insn_code icode;
32935 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
32936 tree arg0, arg1, arg2, arg3, arg4;
32937 rtx op0, op1, op2, op3, op4, pat, insn;
32938 enum machine_mode mode0, mode1, mode2, mode3, mode4;
32939 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
32941 /* For CPU builtins that can be folded, fold first and expand the fold. */
32942 switch (fcode)
32944 case IX86_BUILTIN_CPU_INIT:
32946 /* Make it call __cpu_indicator_init in libgcc. */
32947 tree call_expr, fndecl, type;
32948 type = build_function_type_list (integer_type_node, NULL_TREE);
32949 fndecl = build_fn_decl ("__cpu_indicator_init", type);
32950 call_expr = build_call_expr (fndecl, 0);
32951 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
32953 case IX86_BUILTIN_CPU_IS:
32954 case IX86_BUILTIN_CPU_SUPPORTS:
32956 tree arg0 = CALL_EXPR_ARG (exp, 0);
32957 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
32958 gcc_assert (fold_expr != NULL_TREE);
32959 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
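/* Illustrative sketch of the user-level interface handled above:
     __builtin_cpu_init ();
     if (__builtin_cpu_is ("intel") && __builtin_cpu_supports ("avx2"))
       use_avx2_path ();
   CPU_INIT becomes a call to __cpu_indicator_init in libgcc, while
   CPU_IS / CPU_SUPPORTS are folded by fold_builtin_cpu and the folded
   expression is expanded as ordinary code.  use_avx2_path stands for a
   hypothetical user function.  */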
32963 /* Determine whether the builtin function is available under the current ISA.
32964 Originally the builtin was not created if it wasn't applicable to the
32965 current ISA based on the command-line switches. With function-specific
32966 options, we need to check in the context of the function making the call
32967 whether it is supported. */
32968 if (ix86_builtins_isa[fcode].isa
32969 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
32971 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
32972 NULL, (enum fpmath_unit) 0, false);
32974 if (!opts)
32975 error ("%qE needs unknown isa option", fndecl);
32976 else
32978 gcc_assert (opts != NULL);
32979 error ("%qE needs isa option %s", fndecl, opts);
32980 free (opts);
32982 return const0_rtx;
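/* Illustrative sketch: assuming -mno-avx2 on the command line (and an
   immintrin.h that permits intrinsics under the target attribute), the
   check above accepts the first function and rejects the second:
     __attribute__ ((target ("avx2")))
     __m256i ok (__m256i a, __m256i b) { return _mm256_add_epi32 (a, b); }
     __m256i bad (__m256i a, __m256i b) { return _mm256_add_epi32 (a, b); }
   The call in the second function is diagnosed with the "needs isa
   option" error because the AVX2 builtin is not enabled in that
   function's ISA flags.  */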
32985 switch (fcode)
32987 case IX86_BUILTIN_MASKMOVQ:
32988 case IX86_BUILTIN_MASKMOVDQU:
32989 icode = (fcode == IX86_BUILTIN_MASKMOVQ
32990 ? CODE_FOR_mmx_maskmovq
32991 : CODE_FOR_sse2_maskmovdqu);
32992 /* Note the arg order is different from the operand order. */
32993 arg1 = CALL_EXPR_ARG (exp, 0);
32994 arg2 = CALL_EXPR_ARG (exp, 1);
32995 arg0 = CALL_EXPR_ARG (exp, 2);
32996 op0 = expand_normal (arg0);
32997 op1 = expand_normal (arg1);
32998 op2 = expand_normal (arg2);
32999 mode0 = insn_data[icode].operand[0].mode;
33000 mode1 = insn_data[icode].operand[1].mode;
33001 mode2 = insn_data[icode].operand[2].mode;
33003 op0 = ix86_zero_extend_to_Pmode (op0);
33004 op0 = gen_rtx_MEM (mode1, op0);
33006 if (!insn_data[icode].operand[0].predicate (op0, mode0))
33007 op0 = copy_to_mode_reg (mode0, op0);
33008 if (!insn_data[icode].operand[1].predicate (op1, mode1))
33009 op1 = copy_to_mode_reg (mode1, op1);
33010 if (!insn_data[icode].operand[2].predicate (op2, mode2))
33011 op2 = copy_to_mode_reg (mode2, op2);
33012 pat = GEN_FCN (icode) (op0, op1, op2);
33013 if (! pat)
33014 return 0;
33015 emit_insn (pat);
33016 return 0;
33018 case IX86_BUILTIN_LDMXCSR:
33019 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
33020 target = assign_386_stack_local (SImode, SLOT_TEMP);
33021 emit_move_insn (target, op0);
33022 emit_insn (gen_sse_ldmxcsr (target));
33023 return 0;
33025 case IX86_BUILTIN_STMXCSR:
33026 target = assign_386_stack_local (SImode, SLOT_TEMP);
33027 emit_insn (gen_sse_stmxcsr (target));
33028 return copy_to_mode_reg (SImode, target);
33030 case IX86_BUILTIN_CLFLUSH:
33031 arg0 = CALL_EXPR_ARG (exp, 0);
33032 op0 = expand_normal (arg0);
33033 icode = CODE_FOR_sse2_clflush;
33034 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
33035 op0 = ix86_zero_extend_to_Pmode (op0);
33037 emit_insn (gen_sse2_clflush (op0));
33038 return 0;
33040 case IX86_BUILTIN_MONITOR:
33041 arg0 = CALL_EXPR_ARG (exp, 0);
33042 arg1 = CALL_EXPR_ARG (exp, 1);
33043 arg2 = CALL_EXPR_ARG (exp, 2);
33044 op0 = expand_normal (arg0);
33045 op1 = expand_normal (arg1);
33046 op2 = expand_normal (arg2);
33047 if (!REG_P (op0))
33048 op0 = ix86_zero_extend_to_Pmode (op0);
33049 if (!REG_P (op1))
33050 op1 = copy_to_mode_reg (SImode, op1);
33051 if (!REG_P (op2))
33052 op2 = copy_to_mode_reg (SImode, op2);
33053 emit_insn (ix86_gen_monitor (op0, op1, op2));
33054 return 0;
33056 case IX86_BUILTIN_MWAIT:
33057 arg0 = CALL_EXPR_ARG (exp, 0);
33058 arg1 = CALL_EXPR_ARG (exp, 1);
33059 op0 = expand_normal (arg0);
33060 op1 = expand_normal (arg1);
33061 if (!REG_P (op0))
33062 op0 = copy_to_mode_reg (SImode, op0);
33063 if (!REG_P (op1))
33064 op1 = copy_to_mode_reg (SImode, op1);
33065 emit_insn (gen_sse3_mwait (op0, op1));
33066 return 0;
33068 case IX86_BUILTIN_VEC_INIT_V2SI:
33069 case IX86_BUILTIN_VEC_INIT_V4HI:
33070 case IX86_BUILTIN_VEC_INIT_V8QI:
33071 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
33073 case IX86_BUILTIN_VEC_EXT_V2DF:
33074 case IX86_BUILTIN_VEC_EXT_V2DI:
33075 case IX86_BUILTIN_VEC_EXT_V4SF:
33076 case IX86_BUILTIN_VEC_EXT_V4SI:
33077 case IX86_BUILTIN_VEC_EXT_V8HI:
33078 case IX86_BUILTIN_VEC_EXT_V2SI:
33079 case IX86_BUILTIN_VEC_EXT_V4HI:
33080 case IX86_BUILTIN_VEC_EXT_V16QI:
33081 return ix86_expand_vec_ext_builtin (exp, target);
33083 case IX86_BUILTIN_VEC_SET_V2DI:
33084 case IX86_BUILTIN_VEC_SET_V4SF:
33085 case IX86_BUILTIN_VEC_SET_V4SI:
33086 case IX86_BUILTIN_VEC_SET_V8HI:
33087 case IX86_BUILTIN_VEC_SET_V4HI:
33088 case IX86_BUILTIN_VEC_SET_V16QI:
33089 return ix86_expand_vec_set_builtin (exp);
33091 case IX86_BUILTIN_INFQ:
33092 case IX86_BUILTIN_HUGE_VALQ:
33094 REAL_VALUE_TYPE inf;
33095 rtx tmp;
33097 real_inf (&inf);
33098 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
33100 tmp = validize_mem (force_const_mem (mode, tmp));
33102 if (target == 0)
33103 target = gen_reg_rtx (mode);
33105 emit_move_insn (target, tmp);
33106 return target;
33109 case IX86_BUILTIN_RDPMC:
33110 case IX86_BUILTIN_RDTSC:
33111 case IX86_BUILTIN_RDTSCP:
33113 op0 = gen_reg_rtx (DImode);
33114 op1 = gen_reg_rtx (DImode);
33116 if (fcode == IX86_BUILTIN_RDPMC)
33118 arg0 = CALL_EXPR_ARG (exp, 0);
33119 op2 = expand_normal (arg0);
33120 if (!register_operand (op2, SImode))
33121 op2 = copy_to_mode_reg (SImode, op2);
33123 insn = (TARGET_64BIT
33124 ? gen_rdpmc_rex64 (op0, op1, op2)
33125 : gen_rdpmc (op0, op2));
33126 emit_insn (insn);
33128 else if (fcode == IX86_BUILTIN_RDTSC)
33130 insn = (TARGET_64BIT
33131 ? gen_rdtsc_rex64 (op0, op1)
33132 : gen_rdtsc (op0));
33133 emit_insn (insn);
33135 else
33137 op2 = gen_reg_rtx (SImode);
33139 insn = (TARGET_64BIT
33140 ? gen_rdtscp_rex64 (op0, op1, op2)
33141 : gen_rdtscp (op0, op2));
33142 emit_insn (insn);
33144 arg0 = CALL_EXPR_ARG (exp, 0);
33145 op4 = expand_normal (arg0);
33146 if (!address_operand (op4, VOIDmode))
33148 op4 = convert_memory_address (Pmode, op4);
33149 op4 = copy_addr_to_reg (op4);
33151 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
33154 if (target == 0)
33156 /* mode is VOIDmode if __builtin_rd* has been called
33157 without an lhs. */
33158 if (mode == VOIDmode)
33159 return target;
33160 target = gen_reg_rtx (mode);
33163 if (TARGET_64BIT)
33165 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
33166 op1, 1, OPTAB_DIRECT);
33167 op0 = expand_simple_binop (DImode, IOR, op0, op1,
33168 op0, 1, OPTAB_DIRECT);
33171 emit_move_insn (target, op0);
33172 return target;
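/* Illustrative sketch: on TARGET_64BIT the two halves produced by
   rdtsc/rdpmc are combined exactly like
     result = ((unsigned long long) hi << 32) | lo;
   which is what the ASHIFT/IOR pair above emits before the final move
   into the target.  */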
33174 case IX86_BUILTIN_FXSAVE:
33175 case IX86_BUILTIN_FXRSTOR:
33176 case IX86_BUILTIN_FXSAVE64:
33177 case IX86_BUILTIN_FXRSTOR64:
33178 case IX86_BUILTIN_FNSTENV:
33179 case IX86_BUILTIN_FLDENV:
33180 case IX86_BUILTIN_FNSTSW:
33181 mode0 = BLKmode;
33182 switch (fcode)
33184 case IX86_BUILTIN_FXSAVE:
33185 icode = CODE_FOR_fxsave;
33186 break;
33187 case IX86_BUILTIN_FXRSTOR:
33188 icode = CODE_FOR_fxrstor;
33189 break;
33190 case IX86_BUILTIN_FXSAVE64:
33191 icode = CODE_FOR_fxsave64;
33192 break;
33193 case IX86_BUILTIN_FXRSTOR64:
33194 icode = CODE_FOR_fxrstor64;
33195 break;
33196 case IX86_BUILTIN_FNSTENV:
33197 icode = CODE_FOR_fnstenv;
33198 break;
33199 case IX86_BUILTIN_FLDENV:
33200 icode = CODE_FOR_fldenv;
33201 break;
33202 case IX86_BUILTIN_FNSTSW:
33203 icode = CODE_FOR_fnstsw;
33204 mode0 = HImode;
33205 break;
33206 default:
33207 gcc_unreachable ();
33210 arg0 = CALL_EXPR_ARG (exp, 0);
33211 op0 = expand_normal (arg0);
33213 if (!address_operand (op0, VOIDmode))
33215 op0 = convert_memory_address (Pmode, op0);
33216 op0 = copy_addr_to_reg (op0);
33218 op0 = gen_rtx_MEM (mode0, op0);
33220 pat = GEN_FCN (icode) (op0);
33221 if (pat)
33222 emit_insn (pat);
33223 return 0;
33225 case IX86_BUILTIN_XSAVE:
33226 case IX86_BUILTIN_XRSTOR:
33227 case IX86_BUILTIN_XSAVE64:
33228 case IX86_BUILTIN_XRSTOR64:
33229 case IX86_BUILTIN_XSAVEOPT:
33230 case IX86_BUILTIN_XSAVEOPT64:
33231 arg0 = CALL_EXPR_ARG (exp, 0);
33232 arg1 = CALL_EXPR_ARG (exp, 1);
33233 op0 = expand_normal (arg0);
33234 op1 = expand_normal (arg1);
33236 if (!address_operand (op0, VOIDmode))
33238 op0 = convert_memory_address (Pmode, op0);
33239 op0 = copy_addr_to_reg (op0);
33241 op0 = gen_rtx_MEM (BLKmode, op0);
33243 op1 = force_reg (DImode, op1);
33245 if (TARGET_64BIT)
33247 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
33248 NULL, 1, OPTAB_DIRECT);
33249 switch (fcode)
33251 case IX86_BUILTIN_XSAVE:
33252 icode = CODE_FOR_xsave_rex64;
33253 break;
33254 case IX86_BUILTIN_XRSTOR:
33255 icode = CODE_FOR_xrstor_rex64;
33256 break;
33257 case IX86_BUILTIN_XSAVE64:
33258 icode = CODE_FOR_xsave64;
33259 break;
33260 case IX86_BUILTIN_XRSTOR64:
33261 icode = CODE_FOR_xrstor64;
33262 break;
33263 case IX86_BUILTIN_XSAVEOPT:
33264 icode = CODE_FOR_xsaveopt_rex64;
33265 break;
33266 case IX86_BUILTIN_XSAVEOPT64:
33267 icode = CODE_FOR_xsaveopt64;
33268 break;
33269 default:
33270 gcc_unreachable ();
33273 op2 = gen_lowpart (SImode, op2);
33274 op1 = gen_lowpart (SImode, op1);
33275 pat = GEN_FCN (icode) (op0, op1, op2);
33277 else
33279 switch (fcode)
33281 case IX86_BUILTIN_XSAVE:
33282 icode = CODE_FOR_xsave;
33283 break;
33284 case IX86_BUILTIN_XRSTOR:
33285 icode = CODE_FOR_xrstor;
33286 break;
33287 case IX86_BUILTIN_XSAVEOPT:
33288 icode = CODE_FOR_xsaveopt;
33289 break;
33290 default:
33291 gcc_unreachable ();
33293 pat = GEN_FCN (icode) (op0, op1);
33296 if (pat)
33297 emit_insn (pat);
33298 return 0;
33300 case IX86_BUILTIN_LLWPCB:
33301 arg0 = CALL_EXPR_ARG (exp, 0);
33302 op0 = expand_normal (arg0);
33303 icode = CODE_FOR_lwp_llwpcb;
33304 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
33305 op0 = ix86_zero_extend_to_Pmode (op0);
33306 emit_insn (gen_lwp_llwpcb (op0));
33307 return 0;
33309 case IX86_BUILTIN_SLWPCB:
33310 icode = CODE_FOR_lwp_slwpcb;
33311 if (!target
33312 || !insn_data[icode].operand[0].predicate (target, Pmode))
33313 target = gen_reg_rtx (Pmode);
33314 emit_insn (gen_lwp_slwpcb (target));
33315 return target;
33317 case IX86_BUILTIN_BEXTRI32:
33318 case IX86_BUILTIN_BEXTRI64:
33319 arg0 = CALL_EXPR_ARG (exp, 0);
33320 arg1 = CALL_EXPR_ARG (exp, 1);
33321 op0 = expand_normal (arg0);
33322 op1 = expand_normal (arg1);
33323 icode = (fcode == IX86_BUILTIN_BEXTRI32
33324 ? CODE_FOR_tbm_bextri_si
33325 : CODE_FOR_tbm_bextri_di);
33326 if (!CONST_INT_P (op1))
33328 error ("last argument must be an immediate");
33329 return const0_rtx;
33331 else
33333 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
33334 unsigned char lsb_index = INTVAL (op1) & 0xFF;
33335 op1 = GEN_INT (length);
33336 op2 = GEN_INT (lsb_index);
33337 pat = GEN_FCN (icode) (target, op0, op1, op2);
33338 if (pat)
33339 emit_insn (pat);
33340 return target;
33343 case IX86_BUILTIN_RDRAND16_STEP:
33344 icode = CODE_FOR_rdrandhi_1;
33345 mode0 = HImode;
33346 goto rdrand_step;
33348 case IX86_BUILTIN_RDRAND32_STEP:
33349 icode = CODE_FOR_rdrandsi_1;
33350 mode0 = SImode;
33351 goto rdrand_step;
33353 case IX86_BUILTIN_RDRAND64_STEP:
33354 icode = CODE_FOR_rdranddi_1;
33355 mode0 = DImode;
33357 rdrand_step:
33358 op0 = gen_reg_rtx (mode0);
33359 emit_insn (GEN_FCN (icode) (op0));
33361 arg0 = CALL_EXPR_ARG (exp, 0);
33362 op1 = expand_normal (arg0);
33363 if (!address_operand (op1, VOIDmode))
33365 op1 = convert_memory_address (Pmode, op1);
33366 op1 = copy_addr_to_reg (op1);
33368 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
33370 op1 = gen_reg_rtx (SImode);
33371 emit_move_insn (op1, CONST1_RTX (SImode));
33373 /* Emit SImode conditional move. */
33374 if (mode0 == HImode)
33376 op2 = gen_reg_rtx (SImode);
33377 emit_insn (gen_zero_extendhisi2 (op2, op0));
33379 else if (mode0 == SImode)
33380 op2 = op0;
33381 else
33382 op2 = gen_rtx_SUBREG (SImode, op0, 0);
33384 if (target == 0)
33385 target = gen_reg_rtx (SImode);
33387 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
33388 const0_rtx);
33389 emit_insn (gen_rtx_SET (VOIDmode, target,
33390 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
33391 return target;
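/* Illustrative sketch (immintrin.h name assumed) of the pattern served
   by the rdrand expansion above:
     unsigned int r;
     while (!_rdrand32_step (&r))
       ;   // retry until the hardware has a random value ready
   The random value is written through the pointer by the move to memory
   above, and the conditional move on the carry flag produces the 0/1
   success result.  */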
33393 case IX86_BUILTIN_RDSEED16_STEP:
33394 icode = CODE_FOR_rdseedhi_1;
33395 mode0 = HImode;
33396 goto rdseed_step;
33398 case IX86_BUILTIN_RDSEED32_STEP:
33399 icode = CODE_FOR_rdseedsi_1;
33400 mode0 = SImode;
33401 goto rdseed_step;
33403 case IX86_BUILTIN_RDSEED64_STEP:
33404 icode = CODE_FOR_rdseeddi_1;
33405 mode0 = DImode;
33407 rdseed_step:
33408 op0 = gen_reg_rtx (mode0);
33409 emit_insn (GEN_FCN (icode) (op0));
33411 arg0 = CALL_EXPR_ARG (exp, 0);
33412 op1 = expand_normal (arg0);
33413 if (!address_operand (op1, VOIDmode))
33415 op1 = convert_memory_address (Pmode, op1);
33416 op1 = copy_addr_to_reg (op1);
33418 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
33420 op2 = gen_reg_rtx (QImode);
33422 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
33423 const0_rtx);
33424 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
33426 if (target == 0)
33427 target = gen_reg_rtx (SImode);
33429 emit_insn (gen_zero_extendqisi2 (target, op2));
33430 return target;
33432 case IX86_BUILTIN_ADDCARRYX32:
33433 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
33434 mode0 = SImode;
33435 goto addcarryx;
33437 case IX86_BUILTIN_ADDCARRYX64:
33438 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
33439 mode0 = DImode;
33441 addcarryx:
33442 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
33443 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
33444 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
33445 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
33447 op0 = gen_reg_rtx (QImode);
33449 /* Generate CF from input operand. */
33450 op1 = expand_normal (arg0);
33451 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
33452 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
33454 /* Generate an ADCX (or ADC) instruction to compute X + Y + CF. */
33455 op2 = expand_normal (arg1);
33456 op3 = expand_normal (arg2);
33458 if (!REG_P (op2))
33459 op2 = copy_to_mode_reg (mode0, op2);
33460 if (!REG_P (op3))
33461 op3 = copy_to_mode_reg (mode0, op3);
33463 op0 = gen_reg_rtx (mode0);
33465 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
33466 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
33467 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
33469 /* Store the result. */
33470 op4 = expand_normal (arg3);
33471 if (!address_operand (op4, VOIDmode))
33473 op4 = convert_memory_address (Pmode, op4);
33474 op4 = copy_addr_to_reg (op4);
33476 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
33478 /* Return current CF value. */
33479 if (target == 0)
33480 target = gen_reg_rtx (QImode);
33482 PUT_MODE (pat, QImode);
33483 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
33484 return target;
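/* Illustrative sketch (immintrin.h name assumed): the expansion above
   backs carry-chained multi-word additions such as
     unsigned int lo, hi;
     unsigned char c;
     c = _addcarryx_u32 (0, a0, b0, &lo);
     c = _addcarryx_u32 (c, a1, b1, &hi);
   addqi3_cc regenerates CF from the incoming carry byte, the adcx/adc
   pattern adds src1 + src2 + CF, and the final SET extracts the
   resulting CF as the return value.  a0, a1, b0 and b1 stand for the
   caller's operands.  */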
33486 case IX86_BUILTIN_READ_FLAGS:
33487 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
33489 if (optimize
33490 || target == NULL_RTX
33491 || !nonimmediate_operand (target, word_mode)
33492 || GET_MODE (target) != word_mode)
33493 target = gen_reg_rtx (word_mode);
33495 emit_insn (gen_pop (target));
33496 return target;
33498 case IX86_BUILTIN_WRITE_FLAGS:
33500 arg0 = CALL_EXPR_ARG (exp, 0);
33501 op0 = expand_normal (arg0);
33502 if (!general_no_elim_operand (op0, word_mode))
33503 op0 = copy_to_mode_reg (word_mode, op0);
33505 emit_insn (gen_push (op0));
33506 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
33507 return 0;
33509 case IX86_BUILTIN_GATHERSIV2DF:
33510 icode = CODE_FOR_avx2_gathersiv2df;
33511 goto gather_gen;
33512 case IX86_BUILTIN_GATHERSIV4DF:
33513 icode = CODE_FOR_avx2_gathersiv4df;
33514 goto gather_gen;
33515 case IX86_BUILTIN_GATHERDIV2DF:
33516 icode = CODE_FOR_avx2_gatherdiv2df;
33517 goto gather_gen;
33518 case IX86_BUILTIN_GATHERDIV4DF:
33519 icode = CODE_FOR_avx2_gatherdiv4df;
33520 goto gather_gen;
33521 case IX86_BUILTIN_GATHERSIV4SF:
33522 icode = CODE_FOR_avx2_gathersiv4sf;
33523 goto gather_gen;
33524 case IX86_BUILTIN_GATHERSIV8SF:
33525 icode = CODE_FOR_avx2_gathersiv8sf;
33526 goto gather_gen;
33527 case IX86_BUILTIN_GATHERDIV4SF:
33528 icode = CODE_FOR_avx2_gatherdiv4sf;
33529 goto gather_gen;
33530 case IX86_BUILTIN_GATHERDIV8SF:
33531 icode = CODE_FOR_avx2_gatherdiv8sf;
33532 goto gather_gen;
33533 case IX86_BUILTIN_GATHERSIV2DI:
33534 icode = CODE_FOR_avx2_gathersiv2di;
33535 goto gather_gen;
33536 case IX86_BUILTIN_GATHERSIV4DI:
33537 icode = CODE_FOR_avx2_gathersiv4di;
33538 goto gather_gen;
33539 case IX86_BUILTIN_GATHERDIV2DI:
33540 icode = CODE_FOR_avx2_gatherdiv2di;
33541 goto gather_gen;
33542 case IX86_BUILTIN_GATHERDIV4DI:
33543 icode = CODE_FOR_avx2_gatherdiv4di;
33544 goto gather_gen;
33545 case IX86_BUILTIN_GATHERSIV4SI:
33546 icode = CODE_FOR_avx2_gathersiv4si;
33547 goto gather_gen;
33548 case IX86_BUILTIN_GATHERSIV8SI:
33549 icode = CODE_FOR_avx2_gathersiv8si;
33550 goto gather_gen;
33551 case IX86_BUILTIN_GATHERDIV4SI:
33552 icode = CODE_FOR_avx2_gatherdiv4si;
33553 goto gather_gen;
33554 case IX86_BUILTIN_GATHERDIV8SI:
33555 icode = CODE_FOR_avx2_gatherdiv8si;
33556 goto gather_gen;
33557 case IX86_BUILTIN_GATHERALTSIV4DF:
33558 icode = CODE_FOR_avx2_gathersiv4df;
33559 goto gather_gen;
33560 case IX86_BUILTIN_GATHERALTDIV8SF:
33561 icode = CODE_FOR_avx2_gatherdiv8sf;
33562 goto gather_gen;
33563 case IX86_BUILTIN_GATHERALTSIV4DI:
33564 icode = CODE_FOR_avx2_gathersiv4di;
33565 goto gather_gen;
33566 case IX86_BUILTIN_GATHERALTDIV8SI:
33567 icode = CODE_FOR_avx2_gatherdiv8si;
33568 goto gather_gen;
33570 gather_gen:
33571 arg0 = CALL_EXPR_ARG (exp, 0);
33572 arg1 = CALL_EXPR_ARG (exp, 1);
33573 arg2 = CALL_EXPR_ARG (exp, 2);
33574 arg3 = CALL_EXPR_ARG (exp, 3);
33575 arg4 = CALL_EXPR_ARG (exp, 4);
33576 op0 = expand_normal (arg0);
33577 op1 = expand_normal (arg1);
33578 op2 = expand_normal (arg2);
33579 op3 = expand_normal (arg3);
33580 op4 = expand_normal (arg4);
33581 /* Note the arg order is different from the operand order. */
33582 mode0 = insn_data[icode].operand[1].mode;
33583 mode2 = insn_data[icode].operand[3].mode;
33584 mode3 = insn_data[icode].operand[4].mode;
33585 mode4 = insn_data[icode].operand[5].mode;
33587 if (target == NULL_RTX
33588 || GET_MODE (target) != insn_data[icode].operand[0].mode)
33589 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
33590 else
33591 subtarget = target;
33593 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
33594 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
33596 rtx half = gen_reg_rtx (V4SImode);
33597 if (!nonimmediate_operand (op2, V8SImode))
33598 op2 = copy_to_mode_reg (V8SImode, op2);
33599 emit_insn (gen_vec_extract_lo_v8si (half, op2));
33600 op2 = half;
33602 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
33603 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
33605 rtx (*gen) (rtx, rtx);
33606 rtx half = gen_reg_rtx (mode0);
33607 if (mode0 == V4SFmode)
33608 gen = gen_vec_extract_lo_v8sf;
33609 else
33610 gen = gen_vec_extract_lo_v8si;
33611 if (!nonimmediate_operand (op0, GET_MODE (op0)))
33612 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
33613 emit_insn (gen (half, op0));
33614 op0 = half;
33615 if (!nonimmediate_operand (op3, GET_MODE (op3)))
33616 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
33617 emit_insn (gen (half, op3));
33618 op3 = half;
33621 /* Force the memory operand to use only a base register here. We
33622 don't want to do that to the memory operands of other builtin
33623 functions. */
33624 op1 = ix86_zero_extend_to_Pmode (op1);
33626 if (!insn_data[icode].operand[1].predicate (op0, mode0))
33627 op0 = copy_to_mode_reg (mode0, op0);
33628 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
33629 op1 = copy_to_mode_reg (Pmode, op1);
33630 if (!insn_data[icode].operand[3].predicate (op2, mode2))
33631 op2 = copy_to_mode_reg (mode2, op2);
33632 if (!insn_data[icode].operand[4].predicate (op3, mode3))
33633 op3 = copy_to_mode_reg (mode3, op3);
33634 if (!insn_data[icode].operand[5].predicate (op4, mode4))
33636 error ("last argument must be scale 1, 2, 4, 8");
33637 return const0_rtx;
33640 /* Optimize. If mask is known to have all high bits set,
33641 replace op0 with pc_rtx to signal that the instruction
33642 overwrites the whole destination and doesn't use its
33643 previous contents. */
33644 if (optimize)
33646 if (TREE_CODE (arg3) == VECTOR_CST)
33648 unsigned int negative = 0;
33649 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
33651 tree cst = VECTOR_CST_ELT (arg3, i);
33652 if (TREE_CODE (cst) == INTEGER_CST
33653 && tree_int_cst_sign_bit (cst))
33654 negative++;
33655 else if (TREE_CODE (cst) == REAL_CST
33656 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
33657 negative++;
33659 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
33660 op0 = pc_rtx;
33662 else if (TREE_CODE (arg3) == SSA_NAME)
33664 /* Recognize also when mask is like:
33665 __v2df src = _mm_setzero_pd ();
33666 __v2df mask = _mm_cmpeq_pd (src, src);
33668 __v8sf src = _mm256_setzero_ps ();
33669 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
33670 as that is a cheaper way to load all ones into
33671 a register than having to load a constant from
33672 memory. */
33673 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
33674 if (is_gimple_call (def_stmt))
33676 tree fndecl = gimple_call_fndecl (def_stmt);
33677 if (fndecl
33678 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
33679 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
33681 case IX86_BUILTIN_CMPPD:
33682 case IX86_BUILTIN_CMPPS:
33683 case IX86_BUILTIN_CMPPD256:
33684 case IX86_BUILTIN_CMPPS256:
33685 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
33686 break;
33687 /* FALLTHRU */
33688 case IX86_BUILTIN_CMPEQPD:
33689 case IX86_BUILTIN_CMPEQPS:
33690 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
33691 && initializer_zerop (gimple_call_arg (def_stmt,
33692 1)))
33693 op0 = pc_rtx;
33694 break;
33695 default:
33696 break;
33702 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
33703 if (! pat)
33704 return const0_rtx;
33705 emit_insn (pat);
33707 if (fcode == IX86_BUILTIN_GATHERDIV8SF
33708 || fcode == IX86_BUILTIN_GATHERDIV8SI)
33710 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
33711 ? V4SFmode : V4SImode;
33712 if (target == NULL_RTX)
33713 target = gen_reg_rtx (tmode);
33714 if (tmode == V4SFmode)
33715 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
33716 else
33717 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
33719 else
33720 target = subtarget;
33722 return target;
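/* Illustrative sketch (avx2intrin.h names assumed): a call that reaches
   the gather expansion above looks like
     __m128i idx = _mm_set_epi32 (6, 4, 2, 0);
     __m256d v = _mm256_i32gather_pd (base, idx, 8);
   where the scale argument must be 1, 2, 4 or 8.  For the
   GATHERDIV8SF/GATHERDIV8SI variants only the low half of the 256-bit
   destination is defined, which is why the low 128 bits are extracted
   into the user-visible target above.  */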
33724 case IX86_BUILTIN_XABORT:
33725 icode = CODE_FOR_xabort;
33726 arg0 = CALL_EXPR_ARG (exp, 0);
33727 op0 = expand_normal (arg0);
33728 mode0 = insn_data[icode].operand[0].mode;
33729 if (!insn_data[icode].operand[0].predicate (op0, mode0))
33731 error ("the xabort's argument must be an 8-bit immediate");
33732 return const0_rtx;
33734 emit_insn (gen_xabort (op0));
33735 return 0;
33737 default:
33738 break;
33741 for (i = 0, d = bdesc_special_args;
33742 i < ARRAY_SIZE (bdesc_special_args);
33743 i++, d++)
33744 if (d->code == fcode)
33745 return ix86_expand_special_args_builtin (d, exp, target);
33747 for (i = 0, d = bdesc_args;
33748 i < ARRAY_SIZE (bdesc_args);
33749 i++, d++)
33750 if (d->code == fcode)
33751 switch (fcode)
33753 case IX86_BUILTIN_FABSQ:
33754 case IX86_BUILTIN_COPYSIGNQ:
33755 if (!TARGET_SSE)
33756 /* Emit a normal call if SSE isn't available. */
33757 return expand_call (exp, target, ignore);
33758 default:
33759 return ix86_expand_args_builtin (d, exp, target);
33762 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
33763 if (d->code == fcode)
33764 return ix86_expand_sse_comi (d, exp, target);
33766 for (i = 0, d = bdesc_pcmpestr;
33767 i < ARRAY_SIZE (bdesc_pcmpestr);
33768 i++, d++)
33769 if (d->code == fcode)
33770 return ix86_expand_sse_pcmpestr (d, exp, target);
33772 for (i = 0, d = bdesc_pcmpistr;
33773 i < ARRAY_SIZE (bdesc_pcmpistr);
33774 i++, d++)
33775 if (d->code == fcode)
33776 return ix86_expand_sse_pcmpistr (d, exp, target);
33778 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
33779 if (d->code == fcode)
33780 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
33781 (enum ix86_builtin_func_type)
33782 d->flag, d->comparison);
33784 gcc_unreachable ();
33787 /* This returns the target-specific builtin with code CODE if
33788 current_function_decl has visibility on this builtin, which is checked
33789 using isa flags. Returns NULL_TREE otherwise. */
33791 static tree ix86_get_builtin (enum ix86_builtins code)
33793 struct cl_target_option *opts;
33794 tree target_tree = NULL_TREE;
33796 /* Determine the isa flags of current_function_decl. */
33798 if (current_function_decl)
33799 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
33801 if (target_tree == NULL)
33802 target_tree = target_option_default_node;
33804 opts = TREE_TARGET_OPTION (target_tree);
33806 if (ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
33807 return ix86_builtin_decl (code, true);
33808 else
33809 return NULL_TREE;
33812 /* Returns a function decl for a vectorized version of the builtin function
33813 with builtin function code FN and the result vector type TYPE, or NULL_TREE
33814 if it is not available. */
33816 static tree
33817 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
33818 tree type_in)
33820 enum machine_mode in_mode, out_mode;
33821 int in_n, out_n;
33822 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
33824 if (TREE_CODE (type_out) != VECTOR_TYPE
33825 || TREE_CODE (type_in) != VECTOR_TYPE
33826 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
33827 return NULL_TREE;
33829 out_mode = TYPE_MODE (TREE_TYPE (type_out));
33830 out_n = TYPE_VECTOR_SUBPARTS (type_out);
33831 in_mode = TYPE_MODE (TREE_TYPE (type_in));
33832 in_n = TYPE_VECTOR_SUBPARTS (type_in);
33834 switch (fn)
33836 case BUILT_IN_SQRT:
33837 if (out_mode == DFmode && in_mode == DFmode)
33839 if (out_n == 2 && in_n == 2)
33840 return ix86_get_builtin (IX86_BUILTIN_SQRTPD);
33841 else if (out_n == 4 && in_n == 4)
33842 return ix86_get_builtin (IX86_BUILTIN_SQRTPD256);
33844 break;
33846 case BUILT_IN_SQRTF:
33847 if (out_mode == SFmode && in_mode == SFmode)
33849 if (out_n == 4 && in_n == 4)
33850 return ix86_get_builtin (IX86_BUILTIN_SQRTPS_NR);
33851 else if (out_n == 8 && in_n == 8)
33852 return ix86_get_builtin (IX86_BUILTIN_SQRTPS_NR256);
33854 break;
33856 case BUILT_IN_IFLOOR:
33857 case BUILT_IN_LFLOOR:
33858 case BUILT_IN_LLFLOOR:
33859 /* The round insn does not trap on denormals. */
33860 if (flag_trapping_math || !TARGET_ROUND)
33861 break;
33863 if (out_mode == SImode && in_mode == DFmode)
33865 if (out_n == 4 && in_n == 2)
33866 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
33867 else if (out_n == 8 && in_n == 4)
33868 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
33870 break;
33872 case BUILT_IN_IFLOORF:
33873 case BUILT_IN_LFLOORF:
33874 case BUILT_IN_LLFLOORF:
33875 /* The round insn does not trap on denormals. */
33876 if (flag_trapping_math || !TARGET_ROUND)
33877 break;
33879 if (out_mode == SImode && in_mode == SFmode)
33881 if (out_n == 4 && in_n == 4)
33882 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
33883 else if (out_n == 8 && in_n == 8)
33884 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
33886 break;
33888 case BUILT_IN_ICEIL:
33889 case BUILT_IN_LCEIL:
33890 case BUILT_IN_LLCEIL:
33891 /* The round insn does not trap on denormals. */
33892 if (flag_trapping_math || !TARGET_ROUND)
33893 break;
33895 if (out_mode == SImode && in_mode == DFmode)
33897 if (out_n == 4 && in_n == 2)
33898 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
33899 else if (out_n == 8 && in_n == 4)
33900 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
33902 break;
33904 case BUILT_IN_ICEILF:
33905 case BUILT_IN_LCEILF:
33906 case BUILT_IN_LLCEILF:
33907 /* The round insn does not trap on denormals. */
33908 if (flag_trapping_math || !TARGET_ROUND)
33909 break;
33911 if (out_mode == SImode && in_mode == SFmode)
33913 if (out_n == 4 && in_n == 4)
33914 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
33915 else if (out_n == 8 && in_n == 8)
33916 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
33918 break;
33920 case BUILT_IN_IRINT:
33921 case BUILT_IN_LRINT:
33922 case BUILT_IN_LLRINT:
33923 if (out_mode == SImode && in_mode == DFmode)
33925 if (out_n == 4 && in_n == 2)
33926 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
33927 else if (out_n == 8 && in_n == 4)
33928 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
33930 break;
33932 case BUILT_IN_IRINTF:
33933 case BUILT_IN_LRINTF:
33934 case BUILT_IN_LLRINTF:
33935 if (out_mode == SImode && in_mode == SFmode)
33937 if (out_n == 4 && in_n == 4)
33938 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
33939 else if (out_n == 8 && in_n == 8)
33940 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
33942 break;
33944 case BUILT_IN_IROUND:
33945 case BUILT_IN_LROUND:
33946 case BUILT_IN_LLROUND:
33947 /* The round insn does not trap on denormals. */
33948 if (flag_trapping_math || !TARGET_ROUND)
33949 break;
33951 if (out_mode == SImode && in_mode == DFmode)
33953 if (out_n == 4 && in_n == 2)
33954 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
33955 else if (out_n == 8 && in_n == 4)
33956 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
33958 break;
33960 case BUILT_IN_IROUNDF:
33961 case BUILT_IN_LROUNDF:
33962 case BUILT_IN_LLROUNDF:
33963 /* The round insn does not trap on denormals. */
33964 if (flag_trapping_math || !TARGET_ROUND)
33965 break;
33967 if (out_mode == SImode && in_mode == SFmode)
33969 if (out_n == 4 && in_n == 4)
33970 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
33971 else if (out_n == 8 && in_n == 8)
33972 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
33974 break;
33976 case BUILT_IN_COPYSIGN:
33977 if (out_mode == DFmode && in_mode == DFmode)
33979 if (out_n == 2 && in_n == 2)
33980 return ix86_get_builtin (IX86_BUILTIN_CPYSGNPD);
33981 else if (out_n == 4 && in_n == 4)
33982 return ix86_get_builtin (IX86_BUILTIN_CPYSGNPD256);
33984 break;
33986 case BUILT_IN_COPYSIGNF:
33987 if (out_mode == SFmode && in_mode == SFmode)
33989 if (out_n == 4 && in_n == 4)
33990 return ix86_get_builtin (IX86_BUILTIN_CPYSGNPS);
33991 else if (out_n == 8 && in_n == 8)
33992 return ix86_get_builtin (IX86_BUILTIN_CPYSGNPS256);
33994 break;
33996 case BUILT_IN_FLOOR:
33997 /* The round insn does not trap on denormals. */
33998 if (flag_trapping_math || !TARGET_ROUND)
33999 break;
34001 if (out_mode == DFmode && in_mode == DFmode)
34003 if (out_n == 2 && in_n == 2)
34004 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
34005 else if (out_n == 4 && in_n == 4)
34006 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
34008 break;
34010 case BUILT_IN_FLOORF:
34011 /* The round insn does not trap on denormals. */
34012 if (flag_trapping_math || !TARGET_ROUND)
34013 break;
34015 if (out_mode == SFmode && in_mode == SFmode)
34017 if (out_n == 4 && in_n == 4)
34018 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
34019 else if (out_n == 8 && in_n == 8)
34020 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
34022 break;
34024 case BUILT_IN_CEIL:
34025 /* The round insn does not trap on denormals. */
34026 if (flag_trapping_math || !TARGET_ROUND)
34027 break;
34029 if (out_mode == DFmode && in_mode == DFmode)
34031 if (out_n == 2 && in_n == 2)
34032 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
34033 else if (out_n == 4 && in_n == 4)
34034 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
34036 break;
34038 case BUILT_IN_CEILF:
34039 /* The round insn does not trap on denormals. */
34040 if (flag_trapping_math || !TARGET_ROUND)
34041 break;
34043 if (out_mode == SFmode && in_mode == SFmode)
34045 if (out_n == 4 && in_n == 4)
34046 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
34047 else if (out_n == 8 && in_n == 8)
34048 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
34050 break;
34052 case BUILT_IN_TRUNC:
34053 /* The round insn does not trap on denormals. */
34054 if (flag_trapping_math || !TARGET_ROUND)
34055 break;
34057 if (out_mode == DFmode && in_mode == DFmode)
34059 if (out_n == 2 && in_n == 2)
34060 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
34061 else if (out_n == 4 && in_n == 4)
34062 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
34064 break;
34066 case BUILT_IN_TRUNCF:
34067 /* The round insn does not trap on denormals. */
34068 if (flag_trapping_math || !TARGET_ROUND)
34069 break;
34071 if (out_mode == SFmode && in_mode == SFmode)
34073 if (out_n == 4 && in_n == 4)
34074 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
34075 else if (out_n == 8 && in_n == 8)
34076 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
34078 break;
34080 case BUILT_IN_RINT:
34081 /* The round insn does not trap on denormals. */
34082 if (flag_trapping_math || !TARGET_ROUND)
34083 break;
34085 if (out_mode == DFmode && in_mode == DFmode)
34087 if (out_n == 2 && in_n == 2)
34088 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
34089 else if (out_n == 4 && in_n == 4)
34090 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
34092 break;
34094 case BUILT_IN_RINTF:
34095 /* The round insn does not trap on denormals. */
34096 if (flag_trapping_math || !TARGET_ROUND)
34097 break;
34099 if (out_mode == SFmode && in_mode == SFmode)
34101 if (out_n == 4 && in_n == 4)
34102 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
34103 else if (out_n == 8 && in_n == 8)
34104 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
34106 break;
34108 case BUILT_IN_ROUND:
34109 /* The round insn does not trap on denormals. */
34110 if (flag_trapping_math || !TARGET_ROUND)
34111 break;
34113 if (out_mode == DFmode && in_mode == DFmode)
34115 if (out_n == 2 && in_n == 2)
34116 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ);
34117 else if (out_n == 4 && in_n == 4)
34118 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ256);
34120 break;
34122 case BUILT_IN_ROUNDF:
34123 /* The round insn does not trap on denormals. */
34124 if (flag_trapping_math || !TARGET_ROUND)
34125 break;
34127 if (out_mode == SFmode && in_mode == SFmode)
34129 if (out_n == 4 && in_n == 4)
34130 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ);
34131 else if (out_n == 8 && in_n == 8)
34132 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ256);
34134 break;
34136 case BUILT_IN_FMA:
34137 if (out_mode == DFmode && in_mode == DFmode)
34139 if (out_n == 2 && in_n == 2)
34140 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
34141 if (out_n == 4 && in_n == 4)
34142 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
34144 break;
34146 case BUILT_IN_FMAF:
34147 if (out_mode == SFmode && in_mode == SFmode)
34149 if (out_n == 4 && in_n == 4)
34150 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
34151 if (out_n == 8 && in_n == 8)
34152 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
34154 break;
34156 default:
34157 break;
34160 /* Dispatch to a handler for a vectorization library. */
34161 if (ix86_veclib_handler)
34162 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
34163 type_in);
34165 return NULL_TREE;
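/* Illustrative sketch: the hook above is what lets the vectorizer turn
     void f (double *a, int n)
     {
       for (int i = 0; i < n; i++)
         a[i] = __builtin_sqrt (a[i]);
     }
   into code using IX86_BUILTIN_SQRTPD (or IX86_BUILTIN_SQRTPD256 with
   AVX), assuming options such as -O3 -mavx -fno-math-errno that make the
   scalar call vectorizable in the first place.  */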
34168 /* Handler for an SVML-style interface to
34169 a library with vectorized intrinsics. */
34171 static tree
34172 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
34174 char name[20];
34175 tree fntype, new_fndecl, args;
34176 unsigned arity;
34177 const char *bname;
34178 enum machine_mode el_mode, in_mode;
34179 int n, in_n;
34181 /* The SVML library is suitable for unsafe math only. */
34182 if (!flag_unsafe_math_optimizations)
34183 return NULL_TREE;
34185 el_mode = TYPE_MODE (TREE_TYPE (type_out));
34186 n = TYPE_VECTOR_SUBPARTS (type_out);
34187 in_mode = TYPE_MODE (TREE_TYPE (type_in));
34188 in_n = TYPE_VECTOR_SUBPARTS (type_in);
34189 if (el_mode != in_mode
34190 || n != in_n)
34191 return NULL_TREE;
34193 switch (fn)
34195 case BUILT_IN_EXP:
34196 case BUILT_IN_LOG:
34197 case BUILT_IN_LOG10:
34198 case BUILT_IN_POW:
34199 case BUILT_IN_TANH:
34200 case BUILT_IN_TAN:
34201 case BUILT_IN_ATAN:
34202 case BUILT_IN_ATAN2:
34203 case BUILT_IN_ATANH:
34204 case BUILT_IN_CBRT:
34205 case BUILT_IN_SINH:
34206 case BUILT_IN_SIN:
34207 case BUILT_IN_ASINH:
34208 case BUILT_IN_ASIN:
34209 case BUILT_IN_COSH:
34210 case BUILT_IN_COS:
34211 case BUILT_IN_ACOSH:
34212 case BUILT_IN_ACOS:
34213 if (el_mode != DFmode || n != 2)
34214 return NULL_TREE;
34215 break;
34217 case BUILT_IN_EXPF:
34218 case BUILT_IN_LOGF:
34219 case BUILT_IN_LOG10F:
34220 case BUILT_IN_POWF:
34221 case BUILT_IN_TANHF:
34222 case BUILT_IN_TANF:
34223 case BUILT_IN_ATANF:
34224 case BUILT_IN_ATAN2F:
34225 case BUILT_IN_ATANHF:
34226 case BUILT_IN_CBRTF:
34227 case BUILT_IN_SINHF:
34228 case BUILT_IN_SINF:
34229 case BUILT_IN_ASINHF:
34230 case BUILT_IN_ASINF:
34231 case BUILT_IN_COSHF:
34232 case BUILT_IN_COSF:
34233 case BUILT_IN_ACOSHF:
34234 case BUILT_IN_ACOSF:
34235 if (el_mode != SFmode || n != 4)
34236 return NULL_TREE;
34237 break;
34239 default:
34240 return NULL_TREE;
34243 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
34245 if (fn == BUILT_IN_LOGF)
34246 strcpy (name, "vmlsLn4");
34247 else if (fn == BUILT_IN_LOG)
34248 strcpy (name, "vmldLn2");
34249 else if (n == 4)
34251 sprintf (name, "vmls%s", bname+10);
34252 name[strlen (name)-1] = '4';
34254 else
34255 sprintf (name, "vmld%s2", bname+10);
34257 /* Convert the first letter of the math function name to uppercase. */
34258 name[4] &= ~0x20;
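/* For example BUILT_IN_SINF ("__builtin_sinf") becomes "vmlsSin4" and
   BUILT_IN_ATAN2 becomes "vmldAtan22", while the log functions are
   special-cased above because SVML spells them "Ln" rather than
   "Log".  */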
34260 arity = 0;
34261 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
34262 args;
34263 args = TREE_CHAIN (args))
34264 arity++;
34266 if (arity == 1)
34267 fntype = build_function_type_list (type_out, type_in, NULL);
34268 else
34269 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
34271 /* Build a function declaration for the vectorized function. */
34272 new_fndecl = build_decl (BUILTINS_LOCATION,
34273 FUNCTION_DECL, get_identifier (name), fntype);
34274 TREE_PUBLIC (new_fndecl) = 1;
34275 DECL_EXTERNAL (new_fndecl) = 1;
34276 DECL_IS_NOVOPS (new_fndecl) = 1;
34277 TREE_READONLY (new_fndecl) = 1;
34279 return new_fndecl;
34282 /* Handler for an ACML-style interface to
34283 a library with vectorized intrinsics. */
34285 static tree
34286 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
34288 char name[20] = "__vr.._";
34289 tree fntype, new_fndecl, args;
34290 unsigned arity;
34291 const char *bname;
34292 enum machine_mode el_mode, in_mode;
34293 int n, in_n;
34295 /* The ACML is 64-bit only and suitable for unsafe math only, as
34296 it does not correctly support parts of IEEE with the required
34297 precision, such as denormals. */
34298 if (!TARGET_64BIT
34299 || !flag_unsafe_math_optimizations)
34300 return NULL_TREE;
34302 el_mode = TYPE_MODE (TREE_TYPE (type_out));
34303 n = TYPE_VECTOR_SUBPARTS (type_out);
34304 in_mode = TYPE_MODE (TREE_TYPE (type_in));
34305 in_n = TYPE_VECTOR_SUBPARTS (type_in);
34306 if (el_mode != in_mode
34307 || n != in_n)
34308 return NULL_TREE;
34310 switch (fn)
34312 case BUILT_IN_SIN:
34313 case BUILT_IN_COS:
34314 case BUILT_IN_EXP:
34315 case BUILT_IN_LOG:
34316 case BUILT_IN_LOG2:
34317 case BUILT_IN_LOG10:
34318 name[4] = 'd';
34319 name[5] = '2';
34320 if (el_mode != DFmode
34321 || n != 2)
34322 return NULL_TREE;
34323 break;
34325 case BUILT_IN_SINF:
34326 case BUILT_IN_COSF:
34327 case BUILT_IN_EXPF:
34328 case BUILT_IN_POWF:
34329 case BUILT_IN_LOGF:
34330 case BUILT_IN_LOG2F:
34331 case BUILT_IN_LOG10F:
34332 name[4] = 's';
34333 name[5] = '4';
34334 if (el_mode != SFmode
34335 || n != 4)
34336 return NULL_TREE;
34337 break;
34339 default:
34340 return NULL_TREE;
34343 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
34344 sprintf (name + 7, "%s", bname+10);
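/* Worked example (assuming the builtin decls are named "__builtin_sin"
   and "__builtin_sinf", so bname+10 skips the "__builtin_" prefix):
   BUILT_IN_SIN maps to "__vrd2_sin" and BUILT_IN_SINF to "__vrs4_sinf".  */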
34346 arity = 0;
34347 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
34348 args;
34349 args = TREE_CHAIN (args))
34350 arity++;
34352 if (arity == 1)
34353 fntype = build_function_type_list (type_out, type_in, NULL);
34354 else
34355 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
34357 /* Build a function declaration for the vectorized function. */
34358 new_fndecl = build_decl (BUILTINS_LOCATION,
34359 FUNCTION_DECL, get_identifier (name), fntype);
34360 TREE_PUBLIC (new_fndecl) = 1;
34361 DECL_EXTERNAL (new_fndecl) = 1;
34362 DECL_IS_NOVOPS (new_fndecl) = 1;
34363 TREE_READONLY (new_fndecl) = 1;
34365 return new_fndecl;
34368 /* Returns a decl of a function that implements gather load with
34369 memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
34370 Return NULL_TREE if it is not available. */
34372 static tree
34373 ix86_vectorize_builtin_gather (const_tree mem_vectype,
34374 const_tree index_type, int scale)
34376 bool si;
34377 enum ix86_builtins code;
34379 if (! TARGET_AVX2)
34380 return NULL_TREE;
34382 if ((TREE_CODE (index_type) != INTEGER_TYPE
34383 && !POINTER_TYPE_P (index_type))
34384 || (TYPE_MODE (index_type) != SImode
34385 && TYPE_MODE (index_type) != DImode))
34386 return NULL_TREE;
34388 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
34389 return NULL_TREE;
34391 /* v*gather* insn sign extends index to pointer mode. */
34392 if (TYPE_PRECISION (index_type) < POINTER_SIZE
34393 && TYPE_UNSIGNED (index_type))
34394 return NULL_TREE;
34396 if (scale <= 0
34397 || scale > 8
34398 || (scale & (scale - 1)) != 0)
34399 return NULL_TREE;
34401 si = TYPE_MODE (index_type) == SImode;
34402 switch (TYPE_MODE (mem_vectype))
34404 case V2DFmode:
34405 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
34406 break;
34407 case V4DFmode:
34408 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
34409 break;
34410 case V2DImode:
34411 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
34412 break;
34413 case V4DImode:
34414 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
34415 break;
34416 case V4SFmode:
34417 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
34418 break;
34419 case V8SFmode:
34420 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
34421 break;
34422 case V4SImode:
34423 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
34424 break;
34425 case V8SImode:
34426 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
34427 break;
34428 default:
34429 return NULL_TREE;
34432 return ix86_get_builtin (code);
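/* For illustration: a V4SF destination with a SImode index vector selects
   IX86_BUILTIN_GATHERSIV4SF, while the same destination with a DImode
   index selects IX86_BUILTIN_GATHERDIV4SF.  */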
34435 /* Returns a code for a target-specific builtin that implements
34436 reciprocal of the function, or NULL_TREE if not available. */
34438 static tree
34439 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
34440 bool sqrt ATTRIBUTE_UNUSED)
34442 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
34443 && flag_finite_math_only && !flag_trapping_math
34444 && flag_unsafe_math_optimizations))
34445 return NULL_TREE;
34447 if (md_fn)
34448 /* Machine dependent builtins. */
34449 switch (fn)
34451 /* Vectorized version of sqrt to rsqrt conversion. */
34452 case IX86_BUILTIN_SQRTPS_NR:
34453 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
34455 case IX86_BUILTIN_SQRTPS_NR256:
34456 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
34458 default:
34459 return NULL_TREE;
34461 else
34462 /* Normal builtins. */
34463 switch (fn)
34465 /* Sqrt to rsqrt conversion. */
34466 case BUILT_IN_SQRTF:
34467 return ix86_get_builtin (IX86_BUILTIN_RSQRTF);
34469 default:
34470 return NULL_TREE;
34474 /* Helper for avx_vpermilps256_operand et al. This is also used by
34475 the expansion functions to turn the parallel back into a mask.
34476 The return value is 0 for no match and the imm8+1 for a match. */
34478 int
34479 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
34481 unsigned i, nelt = GET_MODE_NUNITS (mode);
34482 unsigned mask = 0;
34483 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
34485 if (XVECLEN (par, 0) != (int) nelt)
34486 return 0;
34488 /* Validate that all of the elements are constants, and not totally
34489 out of range. Copy the data into an integral array to make the
34490 subsequent checks easier. */
34491 for (i = 0; i < nelt; ++i)
34493 rtx er = XVECEXP (par, 0, i);
34494 unsigned HOST_WIDE_INT ei;
34496 if (!CONST_INT_P (er))
34497 return 0;
34498 ei = INTVAL (er);
34499 if (ei >= nelt)
34500 return 0;
34501 ipar[i] = ei;
34504 switch (mode)
34506 case V4DFmode:
34507 /* In the 256-bit DFmode case, we can only move elements within
34508 a 128-bit lane. */
34509 for (i = 0; i < 2; ++i)
34511 if (ipar[i] >= 2)
34512 return 0;
34513 mask |= ipar[i] << i;
34515 for (i = 2; i < 4; ++i)
34517 if (ipar[i] < 2)
34518 return 0;
34519 mask |= (ipar[i] - 2) << i;
34521 break;
34523 case V8SFmode:
34524 /* In the 256-bit SFmode case, we have full freedom of movement
34525 within the low 128-bit lane, but the high 128-bit lane must
34526 mirror the exact same pattern. */
34527 for (i = 0; i < 4; ++i)
34528 if (ipar[i] + 4 != ipar[i + 4])
34529 return 0;
34530 nelt = 4;
34531 /* FALLTHRU */
34533 case V2DFmode:
34534 case V4SFmode:
34535 /* In the 128-bit case, we've full freedom in the placement of
34536 the elements from the source operand. */
34537 for (i = 0; i < nelt; ++i)
34538 mask |= ipar[i] << (i * (nelt / 2));
34539 break;
34541 default:
34542 gcc_unreachable ();
34545 /* Make sure success has a non-zero value by adding one. */
34546 return mask + 1;
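/* Worked example: for V4SFmode a parallel of (1 0 3 2) yields
   mask = 1<<0 | 0<<2 | 3<<4 | 2<<6 = 0xb1, so the return value is 0xb2
   (imm8 + 1).  */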
34549 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
34550 the expansion functions to turn the parallel back into a mask.
34551 The return value is 0 for no match and the imm8+1 for a match. */
34553 int
34554 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
34556 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
34557 unsigned mask = 0;
34558 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
34560 if (XVECLEN (par, 0) != (int) nelt)
34561 return 0;
34563 /* Validate that all of the elements are constants, and not totally
34564 out of range. Copy the data into an integral array to make the
34565 subsequent checks easier. */
34566 for (i = 0; i < nelt; ++i)
34568 rtx er = XVECEXP (par, 0, i);
34569 unsigned HOST_WIDE_INT ei;
34571 if (!CONST_INT_P (er))
34572 return 0;
34573 ei = INTVAL (er);
34574 if (ei >= 2 * nelt)
34575 return 0;
34576 ipar[i] = ei;
34579 /* Validate that the elements within each half of the permute are consecutive. */
34580 for (i = 0; i < nelt2 - 1; ++i)
34581 if (ipar[i] + 1 != ipar[i + 1])
34582 return 0;
34583 for (i = nelt2; i < nelt - 1; ++i)
34584 if (ipar[i] + 1 != ipar[i + 1])
34585 return 0;
34587 /* Reconstruct the mask. */
34588 for (i = 0; i < 2; ++i)
34590 unsigned e = ipar[i * nelt2];
34591 if (e % nelt2)
34592 return 0;
34593 e /= nelt2;
34594 mask |= e << (i * 4);
34597 /* Make sure success has a non-zero value by adding one. */
34598 return mask + 1;
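/* Worked example: for V8SFmode a parallel of (8 9 10 11 4 5 6 7) takes
   the second operand's low lane and the first operand's high lane; the
   loop above computes e = 8/4 = 2 and e = 4/4 = 1, so mask = 2 | (1 << 4)
   = 0x12 and the return value is 0x13.  */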
34601 /* Store OPERAND to the memory after reload is completed. This means
34602 that we can't easily use assign_stack_local. */
34603 rtx
34604 ix86_force_to_memory (enum machine_mode mode, rtx operand)
34606 rtx result;
34608 gcc_assert (reload_completed);
34609 if (ix86_using_red_zone ())
34611 result = gen_rtx_MEM (mode,
34612 gen_rtx_PLUS (Pmode,
34613 stack_pointer_rtx,
34614 GEN_INT (-RED_ZONE_SIZE)));
34615 emit_move_insn (result, operand);
34617 else if (TARGET_64BIT)
34619 switch (mode)
34621 case HImode:
34622 case SImode:
34623 operand = gen_lowpart (DImode, operand);
34624 /* FALLTHRU */
34625 case DImode:
34626 emit_insn (
34627 gen_rtx_SET (VOIDmode,
34628 gen_rtx_MEM (DImode,
34629 gen_rtx_PRE_DEC (DImode,
34630 stack_pointer_rtx)),
34631 operand));
34632 break;
34633 default:
34634 gcc_unreachable ();
34636 result = gen_rtx_MEM (mode, stack_pointer_rtx);
34638 else
34640 switch (mode)
34642 case DImode:
34644 rtx operands[2];
34645 split_double_mode (mode, &operand, 1, operands, operands + 1);
34646 emit_insn (
34647 gen_rtx_SET (VOIDmode,
34648 gen_rtx_MEM (SImode,
34649 gen_rtx_PRE_DEC (Pmode,
34650 stack_pointer_rtx)),
34651 operands[1]));
34652 emit_insn (
34653 gen_rtx_SET (VOIDmode,
34654 gen_rtx_MEM (SImode,
34655 gen_rtx_PRE_DEC (Pmode,
34656 stack_pointer_rtx)),
34657 operands[0]));
34659 break;
34660 case HImode:
34661 /* Store HImodes as SImodes. */
34662 operand = gen_lowpart (SImode, operand);
34663 /* FALLTHRU */
34664 case SImode:
34665 emit_insn (
34666 gen_rtx_SET (VOIDmode,
34667 gen_rtx_MEM (GET_MODE (operand),
34668 gen_rtx_PRE_DEC (SImode,
34669 stack_pointer_rtx)),
34670 operand));
34671 break;
34672 default:
34673 gcc_unreachable ();
34675 result = gen_rtx_MEM (mode, stack_pointer_rtx);
34677 return result;
34680 /* Free operand from the memory. */
34681 void
34682 ix86_free_from_memory (enum machine_mode mode)
34684 if (!ix86_using_red_zone ())
34686 int size;
34688 if (mode == DImode || TARGET_64BIT)
34689 size = 8;
34690 else
34691 size = 4;
34692 /* Use LEA to deallocate stack space. In peephole2 it will be converted
34693 to a pop or add instruction if registers are available. */
34694 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
34695 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
34696 GEN_INT (size))));
34700 /* Return a register priority for hard reg REGNO. */
34701 static int
34702 ix86_register_priority (int hard_regno)
34704 /* ebp or r13 as the base always wants a displacement, and r12 as the
34705 base always wants an index. So discourage their use in an
34706 address. */
34707 if (hard_regno == R12_REG || hard_regno == R13_REG)
34708 return 0;
34709 if (hard_regno == BP_REG)
34710 return 1;
34711 /* New x86-64 int registers result in bigger code size. Discourage
34712 them. */
34713 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
34714 return 2;
34715 /* New x86-64 SSE registers result in bigger code size. Discourage
34716 them. */
34717 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
34718 return 2;
34719 /* Usage of AX register results in smaller code. Prefer it. */
34720 if (hard_regno == 0)
34721 return 4;
34722 return 3;
34725 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
34727 Put float CONST_DOUBLE in the constant pool instead of fp regs.
34728 QImode must go into class Q_REGS.
34729 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
34730 movdf to do mem-to-mem moves through integer regs. */
34732 static reg_class_t
34733 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
34735 enum machine_mode mode = GET_MODE (x);
34737 /* We're only allowed to return a subclass of CLASS. Many of the
34738 following checks fail for NO_REGS, so eliminate that early. */
34739 if (regclass == NO_REGS)
34740 return NO_REGS;
34742 /* All classes can load zeros. */
34743 if (x == CONST0_RTX (mode))
34744 return regclass;
34746 /* Force constants into memory if we are loading a (nonzero) constant into
34747 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
34748 instructions to load from a constant. */
34749 if (CONSTANT_P (x)
34750 && (MAYBE_MMX_CLASS_P (regclass)
34751 || MAYBE_SSE_CLASS_P (regclass)
34752 || MAYBE_MASK_CLASS_P (regclass)))
34753 return NO_REGS;
34755 /* Prefer SSE regs only, if we can use them for math. */
34756 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
34757 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
34759 /* Floating-point constants need more complex checks. */
34760 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
34762 /* General regs can load everything. */
34763 if (reg_class_subset_p (regclass, GENERAL_REGS))
34764 return regclass;
34766 /* Floats can load 0 and 1 plus some others. Note that we eliminated
34767 zero above. We only want to wind up preferring 80387 registers if
34768 we plan on doing computation with them. */
34769 if (TARGET_80387
34770 && standard_80387_constant_p (x) > 0)
34772 /* Limit class to non-sse. */
34773 if (regclass == FLOAT_SSE_REGS)
34774 return FLOAT_REGS;
34775 if (regclass == FP_TOP_SSE_REGS)
34776 return FP_TOP_REG;
34777 if (regclass == FP_SECOND_SSE_REGS)
34778 return FP_SECOND_REG;
34779 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
34780 return regclass;
34783 return NO_REGS;
34786 /* Generally when we see PLUS here, it's the function invariant
34787 (plus soft-fp const_int), which can only be computed into general
34788 regs. */
34789 if (GET_CODE (x) == PLUS)
34790 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
34792 /* QImode constants are easy to load, but non-constant QImode data
34793 must go into Q_REGS. */
34794 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
34796 if (reg_class_subset_p (regclass, Q_REGS))
34797 return regclass;
34798 if (reg_class_subset_p (Q_REGS, regclass))
34799 return Q_REGS;
34800 return NO_REGS;
34803 return regclass;
34806 /* Discourage putting floating-point values in SSE registers unless
34807 SSE math is being used, and likewise for the 387 registers. */
34808 static reg_class_t
34809 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
34811 enum machine_mode mode = GET_MODE (x);
34813 /* Restrict the output reload class to the register bank that we are doing
34814 math on. If we would like not to return a subset of CLASS, reject this
34815 alternative: if reload cannot do this, it will still use its choice. */
34816 mode = GET_MODE (x);
34817 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
34818 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
34820 if (X87_FLOAT_MODE_P (mode))
34822 if (regclass == FP_TOP_SSE_REGS)
34823 return FP_TOP_REG;
34824 else if (regclass == FP_SECOND_SSE_REGS)
34825 return FP_SECOND_REG;
34826 else
34827 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
34830 return regclass;
34833 static reg_class_t
34834 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
34835 enum machine_mode mode, secondary_reload_info *sri)
34837 /* Double-word spills from general registers to non-offsettable memory
34838 references (zero-extended addresses) require special handling. */
34839 if (TARGET_64BIT
34840 && MEM_P (x)
34841 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
34842 && INTEGER_CLASS_P (rclass)
34843 && !offsettable_memref_p (x))
34845 sri->icode = (in_p
34846 ? CODE_FOR_reload_noff_load
34847 : CODE_FOR_reload_noff_store);
34848 /* Add the cost of moving address to a temporary. */
34849 sri->extra_cost = 1;
34851 return NO_REGS;
34854 /* QImode spills from non-QI registers require
34855 an intermediate register on 32-bit targets. */
34856 if (mode == QImode
34857 && (MAYBE_MASK_CLASS_P (rclass)
34858 || (!TARGET_64BIT && !in_p
34859 && INTEGER_CLASS_P (rclass)
34860 && MAYBE_NON_Q_CLASS_P (rclass))))
34862 int regno;
34864 if (REG_P (x))
34865 regno = REGNO (x);
34866 else
34867 regno = -1;
34869 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
34870 regno = true_regnum (x);
34872 /* Return Q_REGS if the operand is in memory. */
34873 if (regno == -1)
34874 return Q_REGS;
34877 /* This condition handles the corner case where an expression involving
34878 pointers gets vectorized. We're trying to use the address of a
34879 stack slot as a vector initializer.
34881 (set (reg:V2DI 74 [ vect_cst_.2 ])
34882 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
34884 Eventually frame gets turned into sp+offset like this:
34886 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34887 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
34888 (const_int 392 [0x188]))))
34890 That later gets turned into:
34892 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34893 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
34894 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
34896 We'll have the following reload recorded:
34898 Reload 0: reload_in (DI) =
34899 (plus:DI (reg/f:DI 7 sp)
34900 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
34901 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34902 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
34903 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
34904 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34905 reload_reg_rtx: (reg:V2DI 22 xmm1)
34907 Which isn't going to work since SSE instructions can't handle scalar
34908 additions. Returning GENERAL_REGS forces the addition into integer
34909 register and reload can handle subsequent reloads without problems. */
34911 if (in_p && GET_CODE (x) == PLUS
34912 && SSE_CLASS_P (rclass)
34913 && SCALAR_INT_MODE_P (mode))
34914 return GENERAL_REGS;
34916 return NO_REGS;
34919 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
34921 static bool
34922 ix86_class_likely_spilled_p (reg_class_t rclass)
34924 switch (rclass)
34926 case AREG:
34927 case DREG:
34928 case CREG:
34929 case BREG:
34930 case AD_REGS:
34931 case SIREG:
34932 case DIREG:
34933 case SSE_FIRST_REG:
34934 case FP_TOP_REG:
34935 case FP_SECOND_REG:
34936 return true;
34938 default:
34939 break;
34942 return false;
34945 /* If we are copying between general and FP registers, we need a memory
34946 location. The same is true for SSE and MMX registers.
34948 To optimize register_move_cost performance, allow inline variant.
34950 The macro can't work reliably when one of the CLASSES is class containing
34951 registers from multiple units (SSE, MMX, integer). We avoid this by never
34952 combining those units in single alternative in the machine description.
34953 Ensure that this constraint holds to avoid unexpected surprises.
34955 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
34956 enforce these sanity checks. */
34958 static inline bool
34959 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
34960 enum machine_mode mode, int strict)
34962 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
34963 return false;
34964 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
34965 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
34966 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
34967 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
34968 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
34969 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
34971 gcc_assert (!strict || lra_in_progress);
34972 return true;
34975 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
34976 return true;
34978 /* ??? This is a lie. We do have moves between mmx/general, and for
34979 mmx/sse2. But by saying we need secondary memory we discourage the
34980 register allocator from using the mmx registers unless needed. */
34981 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
34982 return true;
34984 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
34986 /* SSE1 doesn't have any direct moves from other classes. */
34987 if (!TARGET_SSE2)
34988 return true;
34990 /* If the target says that inter-unit moves are more expensive
34991 than moving through memory, then don't generate them. */
34992 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
34993 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
34994 return true;
34996 /* Between SSE and general, we have moves no larger than word size. */
34997 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34998 return true;
35001 return false;
35004 bool
35005 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
35006 enum machine_mode mode, int strict)
35008 return inline_secondary_memory_needed (class1, class2, mode, strict);
35011 /* Implement the TARGET_CLASS_MAX_NREGS hook.
35013 On the 80386, this is the size of MODE in words,
35014 except in the FP regs, where a single reg is always enough. */
35016 static unsigned char
35017 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
35019 if (MAYBE_INTEGER_CLASS_P (rclass))
35021 if (mode == XFmode)
35022 return (TARGET_64BIT ? 2 : 3);
35023 else if (mode == XCmode)
35024 return (TARGET_64BIT ? 4 : 6);
35025 else
35026 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
35028 else
35030 if (COMPLEX_MODE_P (mode))
35031 return 2;
35032 else
35033 return 1;
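/* For illustration: DImode in an integer class needs two registers on a
   32-bit target ((8 + 3) / 4), while XFmode in FLOAT_REGS falls into the
   final branch and needs only one.  */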
35037 /* Return true if the registers in CLASS cannot represent the change from
35038 modes FROM to TO. */
35040 bool
35041 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
35042 enum reg_class regclass)
35044 if (from == to)
35045 return false;
35047 /* x87 registers can't do subreg at all, as all values are reformatted
35048 to extended precision. */
35049 if (MAYBE_FLOAT_CLASS_P (regclass))
35050 return true;
35052 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
35054 /* Vector registers do not support QI or HImode loads. If we don't
35055 disallow a change to these modes, reload will assume it's ok to
35056 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
35057 the vec_dupv4hi pattern. */
35058 if (GET_MODE_SIZE (from) < 4)
35059 return true;
35061 /* Vector registers do not support subreg with nonzero offsets, which
35062 are otherwise valid for integer registers. Since we can't see
35063 whether we have a nonzero offset from here, prohibit all
35064 nonparadoxical subregs changing size. */
35065 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
35066 return true;
35069 return false;
35072 /* Return the cost of moving data of mode M between a
35073 register and memory. A value of 2 is the default; this cost is
35074 relative to those in `REGISTER_MOVE_COST'.
35076 This function is used extensively by register_move_cost that is used to
35077 build tables at startup. Make it inline in this case.
35078 When IN is 2, return maximum of in and out move cost.
35080 If moving between registers and memory is more expensive than
35081 between two registers, you should define this macro to express the
35082 relative cost.
35084 Also model the increased cost of moving QImode registers in non
35085 Q_REGS classes.  */
35087 static inline int
35088 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
35089 int in)
35091 int cost;
35092 if (FLOAT_CLASS_P (regclass))
35094 int index;
35095 switch (mode)
35097 case SFmode:
35098 index = 0;
35099 break;
35100 case DFmode:
35101 index = 1;
35102 break;
35103 case XFmode:
35104 index = 2;
35105 break;
35106 default:
35107 return 100;
35109 if (in == 2)
35110 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
35111 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
35113 if (SSE_CLASS_P (regclass))
35115 int index;
35116 switch (GET_MODE_SIZE (mode))
35118 case 4:
35119 index = 0;
35120 break;
35121 case 8:
35122 index = 1;
35123 break;
35124 case 16:
35125 index = 2;
35126 break;
35127 default:
35128 return 100;
35130 if (in == 2)
35131 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
35132 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
35134 if (MMX_CLASS_P (regclass))
35136 int index;
35137 switch (GET_MODE_SIZE (mode))
35139 case 4:
35140 index = 0;
35141 break;
35142 case 8:
35143 index = 1;
35144 break;
35145 default:
35146 return 100;
35148 if (in == 2)
35149 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
35150 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
35152 switch (GET_MODE_SIZE (mode))
35154 case 1:
35155 if (Q_CLASS_P (regclass) || TARGET_64BIT)
35157 if (!in)
35158 return ix86_cost->int_store[0];
35159 if (TARGET_PARTIAL_REG_DEPENDENCY
35160 && optimize_function_for_speed_p (cfun))
35161 cost = ix86_cost->movzbl_load;
35162 else
35163 cost = ix86_cost->int_load[0];
35164 if (in == 2)
35165 return MAX (cost, ix86_cost->int_store[0]);
35166 return cost;
35168 else
35170 if (in == 2)
35171 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
35172 if (in)
35173 return ix86_cost->movzbl_load;
35174 else
35175 return ix86_cost->int_store[0] + 4;
35177 break;
35178 case 2:
35179 if (in == 2)
35180 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
35181 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
35182 default:
35183 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
35184 if (mode == TFmode)
35185 mode = XFmode;
35186 if (in == 2)
35187 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
35188 else if (in)
35189 cost = ix86_cost->int_load[2];
35190 else
35191 cost = ix86_cost->int_store[2];
35192 return (cost * (((int) GET_MODE_SIZE (mode)
35193 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
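/* For illustration: in the default (integer) case a DImode move on a
   32-bit target uses the 32-bit load/store cost scaled by two, since
   (8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD is 2 when UNITS_PER_WORD
   is 4.  */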
35197 static int
35198 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
35199 bool in)
35201 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
35205 /* Return the cost of moving data from a register in class CLASS1 to
35206 one in class CLASS2.
35208 It is not required that the cost always equal 2 when FROM is the same as TO;
35209 on some machines it is expensive to move between registers if they are not
35210 general registers. */
35212 static int
35213 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
35214 reg_class_t class2_i)
35216 enum reg_class class1 = (enum reg_class) class1_i;
35217 enum reg_class class2 = (enum reg_class) class2_i;
35219 /* In case we require secondary memory, compute cost of the store followed
35220 by load. In order to avoid bad register allocation choices, we need
35221 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
35223 if (inline_secondary_memory_needed (class1, class2, mode, 0))
35225 int cost = 1;
35227 cost += inline_memory_move_cost (mode, class1, 2);
35228 cost += inline_memory_move_cost (mode, class2, 2);
35230 /* When copying from a general purpose register we may emit multiple
35231 stores followed by a single load, causing a memory size mismatch stall.
35232 Count this as an arbitrarily high cost of 20. */
35233 if (targetm.class_max_nregs (class1, mode)
35234 > targetm.class_max_nregs (class2, mode))
35235 cost += 20;
35237 /* In the case of FP/MMX moves, the registers actually overlap, and we
35238 have to switch modes in order to treat them differently. */
35239 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
35240 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
35241 cost += 20;
35243 return cost;
35246 /* Moves between SSE/MMX and integer unit are expensive. */
35247 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
35248 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
35250 /* ??? By keeping returned value relatively high, we limit the number
35251 of moves between integer and MMX/SSE registers for all targets.
35252 Additionally, high value prevents problem with x86_modes_tieable_p(),
35253 where integer modes in MMX/SSE registers are not tieable
35254 because of missing QImode and HImode moves to, from or between
35255 MMX/SSE registers. */
35256 return MAX (8, ix86_cost->mmxsse_to_integer);
35258 if (MAYBE_FLOAT_CLASS_P (class1))
35259 return ix86_cost->fp_move;
35260 if (MAYBE_SSE_CLASS_P (class1))
35261 return ix86_cost->sse_move;
35262 if (MAYBE_MMX_CLASS_P (class1))
35263 return ix86_cost->mmx_move;
35264 return 2;
35267 /* Return TRUE if hard register REGNO can hold a value of machine-mode
35268 MODE. */
35270 bool
35271 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
35273 /* Flags and only flags can only hold CCmode values. */
35274 if (CC_REGNO_P (regno))
35275 return GET_MODE_CLASS (mode) == MODE_CC;
35276 if (GET_MODE_CLASS (mode) == MODE_CC
35277 || GET_MODE_CLASS (mode) == MODE_RANDOM
35278 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
35279 return false;
35280 if (STACK_REGNO_P (regno))
35281 return VALID_FP_MODE_P (mode);
35282 if (MASK_REGNO_P (regno))
35283 return VALID_MASK_REG_MODE (mode);
35284 if (SSE_REGNO_P (regno))
35286 /* We implement the move patterns for all vector modes into and
35287 out of SSE registers, even when no operation instructions
35288 are available. */
35290 /* For AVX-512 we allow, regardless of regno:
35291 - XI mode
35292 - any of 512-bit wide vector mode
35293 - any scalar mode. */
35294 if (TARGET_AVX512F
35295 && (mode == XImode
35296 || VALID_AVX512F_REG_MODE (mode)
35297 || VALID_AVX512F_SCALAR_MODE (mode)))
35298 return true;
35300 /* xmm16-xmm31 are only available for AVX-512. */
35301 if (EXT_REX_SSE_REGNO_P (regno))
35302 return false;
35304 /* OImode move is available only when AVX is enabled. */
35305 return ((TARGET_AVX && mode == OImode)
35306 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
35307 || VALID_SSE_REG_MODE (mode)
35308 || VALID_SSE2_REG_MODE (mode)
35309 || VALID_MMX_REG_MODE (mode)
35310 || VALID_MMX_REG_MODE_3DNOW (mode));
35312 if (MMX_REGNO_P (regno))
35314 /* We implement the move patterns for 3DNOW modes even in MMX mode,
35315 so if the register is available at all, then we can move data of
35316 the given mode into or out of it. */
35317 return (VALID_MMX_REG_MODE (mode)
35318 || VALID_MMX_REG_MODE_3DNOW (mode));
35321 if (mode == QImode)
35323 /* Take care for QImode values - they can be in non-QI regs,
35324 but then they do cause partial register stalls. */
35325 if (ANY_QI_REGNO_P (regno))
35326 return true;
35327 if (!TARGET_PARTIAL_REG_STALL)
35328 return true;
35329 /* LRA checks if the hard register is OK for the given mode.
35330 QImode values can live in non-QI regs, so we allow all
35331 registers here. */
35332 if (lra_in_progress)
35333 return true;
35334 return !can_create_pseudo_p ();
35336 /* We handle both integer and floats in the general purpose registers. */
35337 else if (VALID_INT_MODE_P (mode))
35338 return true;
35339 else if (VALID_FP_MODE_P (mode))
35340 return true;
35341 else if (VALID_DFP_MODE_P (mode))
35342 return true;
35343 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
35344 on to use that value in smaller contexts, this can easily force a
35345 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
35346 supporting DImode, allow it. */
35347 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
35348 return true;
35350 return false;
35353 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
35354 tieable integer mode. */
35356 static bool
35357 ix86_tieable_integer_mode_p (enum machine_mode mode)
35359 switch (mode)
35361 case HImode:
35362 case SImode:
35363 return true;
35365 case QImode:
35366 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
35368 case DImode:
35369 return TARGET_64BIT;
35371 default:
35372 return false;
35376 /* Return true if MODE1 is accessible in a register that can hold MODE2
35377 without copying. That is, all register classes that can hold MODE2
35378 can also hold MODE1. */
35380 bool
35381 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
35383 if (mode1 == mode2)
35384 return true;
35386 if (ix86_tieable_integer_mode_p (mode1)
35387 && ix86_tieable_integer_mode_p (mode2))
35388 return true;
35390 /* MODE2 being XFmode implies fp stack or general regs, which means we
35391 can tie any smaller floating point modes to it. Note that we do not
35392 tie this with TFmode. */
35393 if (mode2 == XFmode)
35394 return mode1 == SFmode || mode1 == DFmode;
35396 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
35397 that we can tie it with SFmode. */
35398 if (mode2 == DFmode)
35399 return mode1 == SFmode;
35401 /* If MODE2 is only appropriate for an SSE register, then tie with
35402 any other mode acceptable to SSE registers. */
35403 if (GET_MODE_SIZE (mode2) == 32
35404 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
35405 return (GET_MODE_SIZE (mode1) == 32
35406 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
35407 if (GET_MODE_SIZE (mode2) == 16
35408 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
35409 return (GET_MODE_SIZE (mode1) == 16
35410 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
35412 /* If MODE2 is appropriate for an MMX register, then tie
35413 with any other mode acceptable to MMX registers. */
35414 if (GET_MODE_SIZE (mode2) == 8
35415 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
35416 return (GET_MODE_SIZE (mode1) == 8
35417 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
35419 return false;
35422 /* Return the cost of moving between two registers of mode MODE. */
35424 static int
35425 ix86_set_reg_reg_cost (enum machine_mode mode)
35427 unsigned int units = UNITS_PER_WORD;
35429 switch (GET_MODE_CLASS (mode))
35431 default:
35432 break;
35434 case MODE_CC:
35435 units = GET_MODE_SIZE (CCmode);
35436 break;
35438 case MODE_FLOAT:
35439 if ((TARGET_SSE && mode == TFmode)
35440 || (TARGET_80387 && mode == XFmode)
35441 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
35442 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
35443 units = GET_MODE_SIZE (mode);
35444 break;
35446 case MODE_COMPLEX_FLOAT:
35447 if ((TARGET_SSE && mode == TCmode)
35448 || (TARGET_80387 && mode == XCmode)
35449 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
35450 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
35451 units = GET_MODE_SIZE (mode);
35452 break;
35454 case MODE_VECTOR_INT:
35455 case MODE_VECTOR_FLOAT:
35456 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
35457 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
35458 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
35459 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
35460 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
35461 units = GET_MODE_SIZE (mode);
35464 /* Return the cost of moving between two registers of mode MODE,
35465 assuming that the move will be in pieces of at most UNITS bytes. */
35466 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
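/* For illustration: a DImode register copy on a 32-bit target keeps
   units == UNITS_PER_WORD == 4 and therefore costs COSTS_N_INSNS (2),
   one insn per word-sized piece.  */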
35469 /* Compute a (partial) cost for rtx X. Return true if the complete
35470 cost has been computed, and false if subexpressions should be
35471 scanned. In either case, *TOTAL contains the cost result. */
35473 static bool
35474 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
35475 bool speed)
35477 enum rtx_code code = (enum rtx_code) code_i;
35478 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
35479 enum machine_mode mode = GET_MODE (x);
35480 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
35482 switch (code)
35484 case SET:
35485 if (register_operand (SET_DEST (x), VOIDmode)
35486 && reg_or_0_operand (SET_SRC (x), VOIDmode))
35488 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
35489 return true;
35491 return false;
35493 case CONST_INT:
35494 case CONST:
35495 case LABEL_REF:
35496 case SYMBOL_REF:
35497 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
35498 *total = 3;
35499 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
35500 *total = 2;
35501 else if (flag_pic && SYMBOLIC_CONST (x)
35502 && (!TARGET_64BIT
35503 || (GET_CODE (x) != LABEL_REF
35504 && (GET_CODE (x) != SYMBOL_REF
35505 || !SYMBOL_REF_LOCAL_P (x)))))
35506 *total = 1;
35507 else
35508 *total = 0;
35509 return true;
35511 case CONST_DOUBLE:
35512 if (mode == VOIDmode)
35514 *total = 0;
35515 return true;
35517 switch (standard_80387_constant_p (x))
35519 case 1: /* 0.0 */
35520 *total = 1;
35521 return true;
35522 default: /* Other constants */
35523 *total = 2;
35524 return true;
35525 case 0:
35526 case -1:
35527 break;
35529 if (SSE_FLOAT_MODE_P (mode))
35531 case CONST_VECTOR:
35532 switch (standard_sse_constant_p (x))
35534 case 0:
35535 break;
35536 case 1: /* 0: xor eliminates false dependency */
35537 *total = 0;
35538 return true;
35539 default: /* -1: cmp contains false dependency */
35540 *total = 1;
35541 return true;
35544 /* Fall back to (MEM (SYMBOL_REF)), since that's where
35545 it'll probably end up. Add a penalty for size. */
35546 *total = (COSTS_N_INSNS (1)
35547 + (flag_pic != 0 && !TARGET_64BIT)
35548 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
35549 return true;
35551 case ZERO_EXTEND:
35552 /* The zero extension is often completely free on x86_64, so make
35553 it as cheap as possible. */
35554 if (TARGET_64BIT && mode == DImode
35555 && GET_MODE (XEXP (x, 0)) == SImode)
35556 *total = 1;
35557 else if (TARGET_ZERO_EXTEND_WITH_AND)
35558 *total = cost->add;
35559 else
35560 *total = cost->movzx;
35561 return false;
35563 case SIGN_EXTEND:
35564 *total = cost->movsx;
35565 return false;
35567 case ASHIFT:
35568 if (SCALAR_INT_MODE_P (mode)
35569 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
35570 && CONST_INT_P (XEXP (x, 1)))
35572 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
35573 if (value == 1)
35575 *total = cost->add;
35576 return false;
35578 if ((value == 2 || value == 3)
35579 && cost->lea <= cost->shift_const)
35581 *total = cost->lea;
35582 return false;
35585 /* FALLTHRU */
35587 case ROTATE:
35588 case ASHIFTRT:
35589 case LSHIFTRT:
35590 case ROTATERT:
35591 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
35593 /* ??? Should be SSE vector operation cost. */
35594 /* At least for published AMD latencies, this really is the same
35595 as the latency for a simple fpu operation like fabs. */
35596 /* V*QImode is emulated with 1-11 insns. */
35597 if (mode == V16QImode || mode == V32QImode)
35599 int count = 11;
35600 if (TARGET_XOP && mode == V16QImode)
35602 /* For XOP we use vpshab, which requires a broadcast of the
35603 value to the variable shift insn. For constants this
35604 means a V16Q const in mem; even when we can perform the
35605 shift with one insn set the cost to prefer paddb. */
35606 if (CONSTANT_P (XEXP (x, 1)))
35608 *total = (cost->fabs
35609 + rtx_cost (XEXP (x, 0), code, 0, speed)
35610 + (speed ? 2 : COSTS_N_BYTES (16)));
35611 return true;
35613 count = 3;
35615 else if (TARGET_SSSE3)
35616 count = 7;
35617 *total = cost->fabs * count;
35619 else
35620 *total = cost->fabs;
35622 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
35624 if (CONST_INT_P (XEXP (x, 1)))
35626 if (INTVAL (XEXP (x, 1)) > 32)
35627 *total = cost->shift_const + COSTS_N_INSNS (2);
35628 else
35629 *total = cost->shift_const * 2;
35631 else
35633 if (GET_CODE (XEXP (x, 1)) == AND)
35634 *total = cost->shift_var * 2;
35635 else
35636 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
35639 else
35641 if (CONST_INT_P (XEXP (x, 1)))
35642 *total = cost->shift_const;
35643 else if (GET_CODE (XEXP (x, 1)) == SUBREG
35644 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
35646 /* Return the cost after shift-and truncation. */
35647 *total = cost->shift_var;
35648 return true;
35650 else
35651 *total = cost->shift_var;
35653 return false;
35655 case FMA:
35657 rtx sub;
35659 gcc_assert (FLOAT_MODE_P (mode));
35660 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
35662 /* ??? SSE scalar/vector cost should be used here. */
35663 /* ??? Bald assumption that fma has the same cost as fmul. */
35664 *total = cost->fmul;
35665 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
35667 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
35668 sub = XEXP (x, 0);
35669 if (GET_CODE (sub) == NEG)
35670 sub = XEXP (sub, 0);
35671 *total += rtx_cost (sub, FMA, 0, speed);
35673 sub = XEXP (x, 2);
35674 if (GET_CODE (sub) == NEG)
35675 sub = XEXP (sub, 0);
35676 *total += rtx_cost (sub, FMA, 2, speed);
35677 return true;
35680 case MULT:
35681 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35683 /* ??? SSE scalar cost should be used here. */
35684 *total = cost->fmul;
35685 return false;
35687 else if (X87_FLOAT_MODE_P (mode))
35689 *total = cost->fmul;
35690 return false;
35692 else if (FLOAT_MODE_P (mode))
35694 /* ??? SSE vector cost should be used here. */
35695 *total = cost->fmul;
35696 return false;
35698 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
35700 /* V*QImode is emulated with 7-13 insns. */
35701 if (mode == V16QImode || mode == V32QImode)
35703 int extra = 11;
35704 if (TARGET_XOP && mode == V16QImode)
35705 extra = 5;
35706 else if (TARGET_SSSE3)
35707 extra = 6;
35708 *total = cost->fmul * 2 + cost->fabs * extra;
35710 /* V*DImode is emulated with 5-8 insns. */
35711 else if (mode == V2DImode || mode == V4DImode)
35713 if (TARGET_XOP && mode == V2DImode)
35714 *total = cost->fmul * 2 + cost->fabs * 3;
35715 else
35716 *total = cost->fmul * 3 + cost->fabs * 5;
35718 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
35719 insns, including two PMULUDQ. */
35720 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
35721 *total = cost->fmul * 2 + cost->fabs * 5;
35722 else
35723 *total = cost->fmul;
35724 return false;
35726 else
35728 rtx op0 = XEXP (x, 0);
35729 rtx op1 = XEXP (x, 1);
35730 int nbits;
35731 if (CONST_INT_P (XEXP (x, 1)))
35733 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
35734 for (nbits = 0; value != 0; value &= value - 1)
35735 nbits++;
35737 else
35738 /* This is arbitrary. */
35739 nbits = 7;
35741 /* Compute costs correctly for widening multiplication. */
35742 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
35743 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
35744 == GET_MODE_SIZE (mode))
35746 int is_mulwiden = 0;
35747 enum machine_mode inner_mode = GET_MODE (op0);
35749 if (GET_CODE (op0) == GET_CODE (op1))
35750 is_mulwiden = 1, op1 = XEXP (op1, 0);
35751 else if (CONST_INT_P (op1))
35753 if (GET_CODE (op0) == SIGN_EXTEND)
35754 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
35755 == INTVAL (op1);
35756 else
35757 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
35760 if (is_mulwiden)
35761 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
35764 *total = (cost->mult_init[MODE_INDEX (mode)]
35765 + nbits * cost->mult_bit
35766 + rtx_cost (op0, outer_code, opno, speed)
35767 + rtx_cost (op1, outer_code, opno, speed));
35769 return true;
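/* For illustration: nbits above is the population count of a constant
   multiplier, so multiplying by 10 (binary 1010) gives nbits == 2 and a
   cost of mult_init plus 2 * mult_bit plus the costs of the operands.  */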
35772 case DIV:
35773 case UDIV:
35774 case MOD:
35775 case UMOD:
35776 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35777 /* ??? SSE cost should be used here. */
35778 *total = cost->fdiv;
35779 else if (X87_FLOAT_MODE_P (mode))
35780 *total = cost->fdiv;
35781 else if (FLOAT_MODE_P (mode))
35782 /* ??? SSE vector cost should be used here. */
35783 *total = cost->fdiv;
35784 else
35785 *total = cost->divide[MODE_INDEX (mode)];
35786 return false;
35788 case PLUS:
35789 if (GET_MODE_CLASS (mode) == MODE_INT
35790 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
35792 if (GET_CODE (XEXP (x, 0)) == PLUS
35793 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
35794 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
35795 && CONSTANT_P (XEXP (x, 1)))
35797 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
35798 if (val == 2 || val == 4 || val == 8)
35800 *total = cost->lea;
35801 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
35802 outer_code, opno, speed);
35803 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
35804 outer_code, opno, speed);
35805 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35806 return true;
35809 else if (GET_CODE (XEXP (x, 0)) == MULT
35810 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
35812 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
35813 if (val == 2 || val == 4 || val == 8)
35815 *total = cost->lea;
35816 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
35817 outer_code, opno, speed);
35818 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35819 return true;
35822 else if (GET_CODE (XEXP (x, 0)) == PLUS)
35824 *total = cost->lea;
35825 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
35826 outer_code, opno, speed);
35827 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
35828 outer_code, opno, speed);
35829 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35830 return true;
35833 /* FALLTHRU */
35835 case MINUS:
35836 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35838 /* ??? SSE cost should be used here. */
35839 *total = cost->fadd;
35840 return false;
35842 else if (X87_FLOAT_MODE_P (mode))
35844 *total = cost->fadd;
35845 return false;
35847 else if (FLOAT_MODE_P (mode))
35849 /* ??? SSE vector cost should be used here. */
35850 *total = cost->fadd;
35851 return false;
35853 /* FALLTHRU */
35855 case AND:
35856 case IOR:
35857 case XOR:
35858 if (GET_MODE_CLASS (mode) == MODE_INT
35859 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
35861 *total = (cost->add * 2
35862 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
35863 << (GET_MODE (XEXP (x, 0)) != DImode))
35864 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
35865 << (GET_MODE (XEXP (x, 1)) != DImode)));
35866 return true;
35868 /* FALLTHRU */
35870 case NEG:
35871 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35873 /* ??? SSE cost should be used here. */
35874 *total = cost->fchs;
35875 return false;
35877 else if (X87_FLOAT_MODE_P (mode))
35879 *total = cost->fchs;
35880 return false;
35882 else if (FLOAT_MODE_P (mode))
35884 /* ??? SSE vector cost should be used here. */
35885 *total = cost->fchs;
35886 return false;
35888 /* FALLTHRU */
35890 case NOT:
35891 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
35893 /* ??? Should be SSE vector operation cost. */
35894 /* At least for published AMD latencies, this really is the same
35895 as the latency for a simple fpu operation like fabs. */
35896 *total = cost->fabs;
35898 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
35899 *total = cost->add * 2;
35900 else
35901 *total = cost->add;
35902 return false;
35904 case COMPARE:
35905 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
35906 && XEXP (XEXP (x, 0), 1) == const1_rtx
35907 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
35908 && XEXP (x, 1) == const0_rtx)
35910 /* This kind of construct is implemented using test[bwl].
35911 Treat it as if we had an AND. */
35912 *total = (cost->add
35913 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
35914 + rtx_cost (const1_rtx, outer_code, opno, speed));
35915 return true;
35917 return false;
35919 case FLOAT_EXTEND:
35920 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
35921 *total = 0;
35922 return false;
35924 case ABS:
35925 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35926 /* ??? SSE cost should be used here. */
35927 *total = cost->fabs;
35928 else if (X87_FLOAT_MODE_P (mode))
35929 *total = cost->fabs;
35930 else if (FLOAT_MODE_P (mode))
35931 /* ??? SSE vector cost should be used here. */
35932 *total = cost->fabs;
35933 return false;
35935 case SQRT:
35936 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35937 /* ??? SSE cost should be used here. */
35938 *total = cost->fsqrt;
35939 else if (X87_FLOAT_MODE_P (mode))
35940 *total = cost->fsqrt;
35941 else if (FLOAT_MODE_P (mode))
35942 /* ??? SSE vector cost should be used here. */
35943 *total = cost->fsqrt;
35944 return false;
35946 case UNSPEC:
35947 if (XINT (x, 1) == UNSPEC_TP)
35948 *total = 0;
35949 return false;
35951 case VEC_SELECT:
35952 case VEC_CONCAT:
35953 case VEC_MERGE:
35954 case VEC_DUPLICATE:
35955 /* ??? Assume all of these vector manipulation patterns are
35956 recognizable, in which case they all pretty much have the
35957 same cost. */
35958 *total = cost->fabs;
35959 return true;
35961 default:
35962 return false;
35966 #if TARGET_MACHO
35968 static int current_machopic_label_num;
35970 /* Given a symbol name and its associated stub, write out the
35971 definition of the stub. */
35973 void
35974 machopic_output_stub (FILE *file, const char *symb, const char *stub)
35976 unsigned int length;
35977 char *binder_name, *symbol_name, lazy_ptr_name[32];
35978 int label = ++current_machopic_label_num;
35980 /* For 64-bit we shouldn't get here. */
35981 gcc_assert (!TARGET_64BIT);
35983 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
35984 symb = targetm.strip_name_encoding (symb);
35986 length = strlen (stub);
35987 binder_name = XALLOCAVEC (char, length + 32);
35988 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
35990 length = strlen (symb);
35991 symbol_name = XALLOCAVEC (char, length + 32);
35992 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
35994 sprintf (lazy_ptr_name, "L%d$lz", label);
35996 if (MACHOPIC_ATT_STUB)
35997 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
35998 else if (MACHOPIC_PURE)
35999 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
36000 else
36001 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
36003 fprintf (file, "%s:\n", stub);
36004 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
36006 if (MACHOPIC_ATT_STUB)
36008 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
36010 else if (MACHOPIC_PURE)
36012 /* PIC stub. */
36013 /* 25-byte PIC stub using "CALL get_pc_thunk". */
36014 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
36015 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
36016 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
36017 label, lazy_ptr_name, label);
36018 fprintf (file, "\tjmp\t*%%ecx\n");
36020 else
36021 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
36023 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
36024 it needs no stub-binding-helper. */
36025 if (MACHOPIC_ATT_STUB)
36026 return;
36028 fprintf (file, "%s:\n", binder_name);
36030 if (MACHOPIC_PURE)
36032 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
36033 fprintf (file, "\tpushl\t%%ecx\n");
36035 else
36036 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
36038 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
36040 /* N.B. Keep the correspondence of these
36041 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
36042 old-pic/new-pic/non-pic stubs; altering this will break
36043 compatibility with existing dylibs. */
36044 if (MACHOPIC_PURE)
36046 /* 25-byte PIC stub using "CALL get_pc_thunk". */
36047 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
36049 else
36050 /* 16-byte -mdynamic-no-pic stub. */
36051 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
36053 fprintf (file, "%s:\n", lazy_ptr_name);
36054 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
36055 fprintf (file, ASM_LONG "%s\n", binder_name);
36057 #endif /* TARGET_MACHO */
36059 /* Order the registers for register allocator. */
36061 void
36062 x86_order_regs_for_local_alloc (void)
36064 int pos = 0;
36065 int i;
36067 /* First allocate the local general purpose registers. */
36068 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
36069 if (GENERAL_REGNO_P (i) && call_used_regs[i])
36070 reg_alloc_order [pos++] = i;
36072 /* Global general purpose registers. */
36073 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
36074 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
36075 reg_alloc_order [pos++] = i;
36077 /* x87 registers come first in case we are doing FP math
36078 using them. */
36079 if (!TARGET_SSE_MATH)
36080 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
36081 reg_alloc_order [pos++] = i;
36083 /* SSE registers. */
36084 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
36085 reg_alloc_order [pos++] = i;
36086 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
36087 reg_alloc_order [pos++] = i;
36089 /* Extended REX SSE registers. */
36090 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
36091 reg_alloc_order [pos++] = i;
36093 /* Mask register. */
36094 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
36095 reg_alloc_order [pos++] = i;
36097 /* x87 registers. */
36098 if (TARGET_SSE_MATH)
36099 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
36100 reg_alloc_order [pos++] = i;
36102 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
36103 reg_alloc_order [pos++] = i;
36105 /* Initialize the rest of array as we do not allocate some registers
36106 at all. */
36107 while (pos < FIRST_PSEUDO_REGISTER)
36108 reg_alloc_order [pos++] = 0;
36111 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
36112 in struct attribute_spec handler. */
36113 static tree
36114 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
36115 tree args,
36116 int flags ATTRIBUTE_UNUSED,
36117 bool *no_add_attrs)
36119 if (TREE_CODE (*node) != FUNCTION_TYPE
36120 && TREE_CODE (*node) != METHOD_TYPE
36121 && TREE_CODE (*node) != FIELD_DECL
36122 && TREE_CODE (*node) != TYPE_DECL)
36124 warning (OPT_Wattributes, "%qE attribute only applies to functions",
36125 name);
36126 *no_add_attrs = true;
36127 return NULL_TREE;
36129 if (TARGET_64BIT)
36131 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
36132 name);
36133 *no_add_attrs = true;
36134 return NULL_TREE;
36136 if (is_attribute_p ("callee_pop_aggregate_return", name))
36138 tree cst;
36140 cst = TREE_VALUE (args);
36141 if (TREE_CODE (cst) != INTEGER_CST)
36143 warning (OPT_Wattributes,
36144 "%qE attribute requires an integer constant argument",
36145 name);
36146 *no_add_attrs = true;
36148 else if (compare_tree_int (cst, 0) != 0
36149 && compare_tree_int (cst, 1) != 0)
36151 warning (OPT_Wattributes,
36152 "argument to %qE attribute is neither zero, nor one",
36153 name);
36154 *no_add_attrs = true;
36157 return NULL_TREE;
36160 return NULL_TREE;
36163 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
36164 struct attribute_spec.handler. */
36165 static tree
36166 ix86_handle_abi_attribute (tree *node, tree name,
36167 tree args ATTRIBUTE_UNUSED,
36168 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
36170 if (TREE_CODE (*node) != FUNCTION_TYPE
36171 && TREE_CODE (*node) != METHOD_TYPE
36172 && TREE_CODE (*node) != FIELD_DECL
36173 && TREE_CODE (*node) != TYPE_DECL)
36175 warning (OPT_Wattributes, "%qE attribute only applies to functions",
36176 name);
36177 *no_add_attrs = true;
36178 return NULL_TREE;
36181 /* Can combine regparm with all attributes but fastcall. */
36182 if (is_attribute_p ("ms_abi", name))
36184 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
36186 error ("ms_abi and sysv_abi attributes are not compatible");
36189 return NULL_TREE;
36191 else if (is_attribute_p ("sysv_abi", name))
36193 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
36195 error ("ms_abi and sysv_abi attributes are not compatible");
36198 return NULL_TREE;
36201 return NULL_TREE;
36204 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
36205 struct attribute_spec.handler. */
36206 static tree
36207 ix86_handle_struct_attribute (tree *node, tree name,
36208 tree args ATTRIBUTE_UNUSED,
36209 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
36211 tree *type = NULL;
36212 if (DECL_P (*node))
36214 if (TREE_CODE (*node) == TYPE_DECL)
36215 type = &TREE_TYPE (*node);
36217 else
36218 type = node;
36220 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
36222 warning (OPT_Wattributes, "%qE attribute ignored",
36223 name);
36224 *no_add_attrs = true;
36227 else if ((is_attribute_p ("ms_struct", name)
36228 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
36229 || ((is_attribute_p ("gcc_struct", name)
36230 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
36232 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
36233 name);
36234 *no_add_attrs = true;
36237 return NULL_TREE;
36240 static tree
36241 ix86_handle_fndecl_attribute (tree *node, tree name,
36242 tree args ATTRIBUTE_UNUSED,
36243 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
36245 if (TREE_CODE (*node) != FUNCTION_DECL)
36247 warning (OPT_Wattributes, "%qE attribute only applies to functions",
36248 name);
36249 *no_add_attrs = true;
36251 return NULL_TREE;
36254 static bool
36255 ix86_ms_bitfield_layout_p (const_tree record_type)
36257 return ((TARGET_MS_BITFIELD_LAYOUT
36258 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
36259 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
36262 /* Returns an expression indicating where the this parameter is
36263 located on entry to the FUNCTION. */
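/* For example (illustrative, not exhaustive): under the 64-bit SysV ABI the
   this pointer arrives in %rdi, or in %rsi when the return value is passed
   by invisible reference, since the hidden return pointer then occupies the
   first register; the MS ABI uses %rcx/%rdx analogously.  */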
36265 static rtx
36266 x86_this_parameter (tree function)
36268 tree type = TREE_TYPE (function);
36269 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
36270 int nregs;
36272 if (TARGET_64BIT)
36274 const int *parm_regs;
36276 if (ix86_function_type_abi (type) == MS_ABI)
36277 parm_regs = x86_64_ms_abi_int_parameter_registers;
36278 else
36279 parm_regs = x86_64_int_parameter_registers;
36280 return gen_rtx_REG (Pmode, parm_regs[aggr]);
36283 nregs = ix86_function_regparm (type, function);
36285 if (nregs > 0 && !stdarg_p (type))
36287 int regno;
36288 unsigned int ccvt = ix86_get_callcvt (type);
36290 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
36291 regno = aggr ? DX_REG : CX_REG;
36292 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
36294 regno = CX_REG;
36295 if (aggr)
36296 return gen_rtx_MEM (SImode,
36297 plus_constant (Pmode, stack_pointer_rtx, 4));
36299 else
36301 regno = AX_REG;
36302 if (aggr)
36304 regno = DX_REG;
36305 if (nregs == 1)
36306 return gen_rtx_MEM (SImode,
36307 plus_constant (Pmode,
36308 stack_pointer_rtx, 4));
36311 return gen_rtx_REG (SImode, regno);
36314 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
36315 aggr ? 8 : 4));
36318 /* Determine whether x86_output_mi_thunk can succeed. */
36320 static bool
36321 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
36322 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
36323 HOST_WIDE_INT vcall_offset, const_tree function)
36325 /* 64-bit can handle anything. */
36326 if (TARGET_64BIT)
36327 return true;
36329 /* For 32-bit, everything's fine if we have one free register. */
36330 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
36331 return true;
36333 /* Need a free register for vcall_offset. */
36334 if (vcall_offset)
36335 return false;
36337 /* Need a free register for GOT references. */
36338 if (flag_pic && !targetm.binds_local_p (function))
36339 return false;
36341 /* Otherwise ok. */
36342 return true;
36345 /* Output the assembler code for a thunk function. THUNK_DECL is the
36346 declaration for the thunk function itself, FUNCTION is the decl for
36347 the target function. DELTA is an immediate constant offset to be
36348 added to THIS. If VCALL_OFFSET is nonzero, the word at
36349 *(*this + vcall_offset) should be added to THIS. */
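/* As a rough illustration (not necessarily the exact sequence emitted), a
   32-bit non-PIC thunk with DELTA == 4, no VCALL_OFFSET and a stack-passed
   this pointer reduces to something like:

	addl	$4, 4(%esp)
	jmp	<target function>

   A nonzero VCALL_OFFSET additionally loads *this, fetches the word at that
   address plus VCALL_OFFSET and adds it to this before the tail call.  */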
36351 static void
36352 x86_output_mi_thunk (FILE *file,
36353 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
36354 HOST_WIDE_INT vcall_offset, tree function)
36356 rtx this_param = x86_this_parameter (function);
36357 rtx this_reg, tmp, fnaddr;
36358 unsigned int tmp_regno;
36360 if (TARGET_64BIT)
36361 tmp_regno = R10_REG;
36362 else
36364 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
36365 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
36366 tmp_regno = AX_REG;
36367 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
36368 tmp_regno = DX_REG;
36369 else
36370 tmp_regno = CX_REG;
36373 emit_note (NOTE_INSN_PROLOGUE_END);
36375 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
36376 pull it in now and let DELTA benefit. */
36377 if (REG_P (this_param))
36378 this_reg = this_param;
36379 else if (vcall_offset)
36381 /* Put the this parameter into %eax. */
36382 this_reg = gen_rtx_REG (Pmode, AX_REG);
36383 emit_move_insn (this_reg, this_param);
36385 else
36386 this_reg = NULL_RTX;
36388 /* Adjust the this parameter by a fixed constant. */
36389 if (delta)
36391 rtx delta_rtx = GEN_INT (delta);
36392 rtx delta_dst = this_reg ? this_reg : this_param;
36394 if (TARGET_64BIT)
36396 if (!x86_64_general_operand (delta_rtx, Pmode))
36398 tmp = gen_rtx_REG (Pmode, tmp_regno);
36399 emit_move_insn (tmp, delta_rtx);
36400 delta_rtx = tmp;
36404 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
36407 /* Adjust the this parameter by a value stored in the vtable. */
36408 if (vcall_offset)
36410 rtx vcall_addr, vcall_mem, this_mem;
36412 tmp = gen_rtx_REG (Pmode, tmp_regno);
36414 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
36415 if (Pmode != ptr_mode)
36416 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
36417 emit_move_insn (tmp, this_mem);
36419 /* Adjust the this parameter. */
36420 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
36421 if (TARGET_64BIT
36422 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
36424 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
36425 emit_move_insn (tmp2, GEN_INT (vcall_offset));
36426 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
36429 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
36430 if (Pmode != ptr_mode)
36431 emit_insn (gen_addsi_1_zext (this_reg,
36432 gen_rtx_REG (ptr_mode,
36433 REGNO (this_reg)),
36434 vcall_mem));
36435 else
36436 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
36439 /* If necessary, drop THIS back to its stack slot. */
36440 if (this_reg && this_reg != this_param)
36441 emit_move_insn (this_param, this_reg);
36443 fnaddr = XEXP (DECL_RTL (function), 0);
36444 if (TARGET_64BIT)
36446 if (!flag_pic || targetm.binds_local_p (function)
36447 || TARGET_PECOFF)
36449 else
36451 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
36452 tmp = gen_rtx_CONST (Pmode, tmp);
36453 fnaddr = gen_rtx_MEM (Pmode, tmp);
36456 else
36458 if (!flag_pic || targetm.binds_local_p (function))
36460 #if TARGET_MACHO
36461 else if (TARGET_MACHO)
36463 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
36464 fnaddr = XEXP (fnaddr, 0);
36466 #endif /* TARGET_MACHO */
36467 else
36469 tmp = gen_rtx_REG (Pmode, CX_REG);
36470 output_set_got (tmp, NULL_RTX);
36472 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
36473 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
36474 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
36478 /* Our sibling call patterns do not allow memories, because we have no
36479 predicate that can distinguish between frame and non-frame memory.
36480 For our purposes here, we can get away with (ab)using a jump pattern,
36481 because we're going to do no optimization. */
36482 if (MEM_P (fnaddr))
36483 emit_jump_insn (gen_indirect_jump (fnaddr));
36484 else
36486 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
36487 fnaddr = legitimize_pic_address (fnaddr,
36488 gen_rtx_REG (Pmode, tmp_regno));
36490 if (!sibcall_insn_operand (fnaddr, word_mode))
36492 tmp = gen_rtx_REG (word_mode, tmp_regno);
36493 if (GET_MODE (fnaddr) != word_mode)
36494 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
36495 emit_move_insn (tmp, fnaddr);
36496 fnaddr = tmp;
36499 tmp = gen_rtx_MEM (QImode, fnaddr);
36500 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
36501 tmp = emit_call_insn (tmp);
36502 SIBLING_CALL_P (tmp) = 1;
36504 emit_barrier ();
36506 /* Emit just enough of rest_of_compilation to get the insns emitted.
36507 Note that use_thunk calls assemble_start_function et al. */
36508 tmp = get_insns ();
36509 shorten_branches (tmp);
36510 final_start_function (tmp, file, 1);
36511 final (tmp, file, 1);
36512 final_end_function ();
36515 static void
36516 x86_file_start (void)
36518 default_file_start ();
36519 #if TARGET_MACHO
36520 darwin_file_start ();
36521 #endif
36522 if (X86_FILE_START_VERSION_DIRECTIVE)
36523 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
36524 if (X86_FILE_START_FLTUSED)
36525 fputs ("\t.global\t__fltused\n", asm_out_file);
36526 if (ix86_asm_dialect == ASM_INTEL)
36527 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
36531 x86_field_alignment (tree field, int computed)
36533 enum machine_mode mode;
36534 tree type = TREE_TYPE (field);
36536 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
36537 return computed;
36538 mode = TYPE_MODE (strip_array_types (type));
36539 if (mode == DFmode || mode == DCmode
36540 || GET_MODE_CLASS (mode) == MODE_INT
36541 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
36542 return MIN (32, computed);
36543 return computed;
36546 /* Output assembler code to FILE to increment profiler label # LABELNO
36547 for profiling a function entry. */
36548 void
36549 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
36551 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
36552 : MCOUNT_NAME);
36554 if (TARGET_64BIT)
36556 #ifndef NO_PROFILE_COUNTERS
36557 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
36558 #endif
36560 if (!TARGET_PECOFF && flag_pic)
36561 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
36562 else
36563 fprintf (file, "\tcall\t%s\n", mcount_name);
36565 else if (flag_pic)
36567 #ifndef NO_PROFILE_COUNTERS
36568 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
36569 LPREFIX, labelno);
36570 #endif
36571 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
36573 else
36575 #ifndef NO_PROFILE_COUNTERS
36576 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
36577 LPREFIX, labelno);
36578 #endif
36579 fprintf (file, "\tcall\t%s\n", mcount_name);
36583 /* We don't have exact information about the insn sizes, but we may assume
36584 quite safely that we are informed about all 1 byte insns and memory
36585 address sizes. This is enough to eliminate unnecessary padding in
36586 99% of cases. */
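/* For instance, a direct call to a known symbol is counted as exactly
   5 bytes (opcode plus rel32); see the CALL_P case below.  */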
36588 static int
36589 min_insn_size (rtx insn)
36591 int l = 0, len;
36593 if (!INSN_P (insn) || !active_insn_p (insn))
36594 return 0;
36596 /* Discard alignments we've emitted and jump instructions. */
36597 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
36598 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
36599 return 0;
36601 /* Important case - calls are always 5 bytes.
36602 It is common to have many calls in a row. */
36603 if (CALL_P (insn)
36604 && symbolic_reference_mentioned_p (PATTERN (insn))
36605 && !SIBLING_CALL_P (insn))
36606 return 5;
36607 len = get_attr_length (insn);
36608 if (len <= 1)
36609 return 1;
36611 /* For normal instructions we rely on get_attr_length being exact,
36612 with a few exceptions. */
36613 if (!JUMP_P (insn))
36615 enum attr_type type = get_attr_type (insn);
36617 switch (type)
36619 case TYPE_MULTI:
36620 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
36621 || asm_noperands (PATTERN (insn)) >= 0)
36622 return 0;
36623 break;
36624 case TYPE_OTHER:
36625 case TYPE_FCMP:
36626 break;
36627 default:
36628 /* Otherwise trust get_attr_length. */
36629 return len;
36632 l = get_attr_length_address (insn);
36633 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
36634 l = 4;
36636 if (l)
36637 return 1+l;
36638 else
36639 return 2;
36642 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
36644 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in a
36645 16 byte window. */
36647 static void
36648 ix86_avoid_jump_mispredicts (void)
36650 rtx insn, start = get_insns ();
36651 int nbytes = 0, njumps = 0;
36652 int isjump = 0;
36654 /* Look for all minimal intervals of instructions containing 4 jumps.
36655 The intervals are bounded by START and INSN. NBYTES is the total
36656 size of instructions in the interval including INSN and not including
36657 START. When NBYTES is smaller than 16 bytes, it is possible
36658 that the ends of START and INSN end up in the same 16 byte page.
36660 The smallest offset in the page at which INSN can start is the case where START
36661 ends at offset 0. The offset of INSN is then NBYTES - sizeof (INSN).
36662 We add a p2align to the 16 byte window with maxskip 15 - NBYTES + sizeof (INSN).
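/* Illustrative numbers: if the interval, including a 2 byte fourth jump,
   adds up to NBYTES == 10, a pad of 15 - 10 + 2 = 7 bytes is emitted,
   which pushes the fourth jump out of the 16 byte window shared with the
   other three.  */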
36664 for (insn = start; insn; insn = NEXT_INSN (insn))
36666 int min_size;
36668 if (LABEL_P (insn))
36670 int align = label_to_alignment (insn);
36671 int max_skip = label_to_max_skip (insn);
36673 if (max_skip > 15)
36674 max_skip = 15;
36675 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
36676 already in the current 16 byte page, because otherwise
36677 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
36678 bytes to reach 16 byte boundary. */
36679 if (align <= 0
36680 || (align <= 3 && max_skip != (1 << align) - 1))
36681 max_skip = 0;
36682 if (dump_file)
36683 fprintf (dump_file, "Label %i with max_skip %i\n",
36684 INSN_UID (insn), max_skip);
36685 if (max_skip)
36687 while (nbytes + max_skip >= 16)
36689 start = NEXT_INSN (start);
36690 if (JUMP_P (start) || CALL_P (start))
36691 njumps--, isjump = 1;
36692 else
36693 isjump = 0;
36694 nbytes -= min_insn_size (start);
36697 continue;
36700 min_size = min_insn_size (insn);
36701 nbytes += min_size;
36702 if (dump_file)
36703 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
36704 INSN_UID (insn), min_size);
36705 if (JUMP_P (insn) || CALL_P (insn))
36706 njumps++;
36707 else
36708 continue;
36710 while (njumps > 3)
36712 start = NEXT_INSN (start);
36713 if (JUMP_P (start) || CALL_P (start))
36714 njumps--, isjump = 1;
36715 else
36716 isjump = 0;
36717 nbytes -= min_insn_size (start);
36719 gcc_assert (njumps >= 0);
36720 if (dump_file)
36721 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
36722 INSN_UID (start), INSN_UID (insn), nbytes);
36724 if (njumps == 3 && isjump && nbytes < 16)
36726 int padsize = 15 - nbytes + min_insn_size (insn);
36728 if (dump_file)
36729 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
36730 INSN_UID (insn), padsize);
36731 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
36735 #endif
36737 /* AMD Athlon works faster
36738 when RET is not the destination of a conditional jump or directly preceded
36739 by another jump instruction. We avoid the penalty by inserting a NOP just
36740 before the RET instructions in such cases. */
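/* (In practice the code below replaces the return with its longer encoding,
   gen_simple_return_internal_long, rather than emitting a separate NOP;
   it serves the same purpose.)  */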
36741 static void
36742 ix86_pad_returns (void)
36744 edge e;
36745 edge_iterator ei;
36747 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
36749 basic_block bb = e->src;
36750 rtx ret = BB_END (bb);
36751 rtx prev;
36752 bool replace = false;
36754 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
36755 || optimize_bb_for_size_p (bb))
36756 continue;
36757 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
36758 if (active_insn_p (prev) || LABEL_P (prev))
36759 break;
36760 if (prev && LABEL_P (prev))
36762 edge e;
36763 edge_iterator ei;
36765 FOR_EACH_EDGE (e, ei, bb->preds)
36766 if (EDGE_FREQUENCY (e) && e->src->index >= 0
36767 && !(e->flags & EDGE_FALLTHRU))
36769 replace = true;
36770 break;
36773 if (!replace)
36775 prev = prev_active_insn (ret);
36776 if (prev
36777 && ((JUMP_P (prev) && any_condjump_p (prev))
36778 || CALL_P (prev)))
36779 replace = true;
36780 /* Empty functions get branch mispredict even when
36781 the jump destination is not visible to us. */
36782 if (!prev && !optimize_function_for_size_p (cfun))
36783 replace = true;
36785 if (replace)
36787 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
36788 delete_insn (ret);
36793 /* Count the minimum number of instructions in BB. Return 4 if the
36794 number of instructions >= 4. */
36796 static int
36797 ix86_count_insn_bb (basic_block bb)
36799 rtx insn;
36800 int insn_count = 0;
36802 /* Count number of instructions in this block. Return 4 if the number
36803 of instructions >= 4. */
36804 FOR_BB_INSNS (bb, insn)
36806 /* Only happens in exit blocks. */
36807 if (JUMP_P (insn)
36808 && ANY_RETURN_P (PATTERN (insn)))
36809 break;
36811 if (NONDEBUG_INSN_P (insn)
36812 && GET_CODE (PATTERN (insn)) != USE
36813 && GET_CODE (PATTERN (insn)) != CLOBBER)
36815 insn_count++;
36816 if (insn_count >= 4)
36817 return insn_count;
36821 return insn_count;
36825 /* Count the minimum number of instructions in code path in BB.
36826 Return 4 if the number of instructions >= 4. */
36828 static int
36829 ix86_count_insn (basic_block bb)
36831 edge e;
36832 edge_iterator ei;
36833 int min_prev_count;
36835 /* Only bother counting instructions along paths with no
36836 more than 2 basic blocks between entry and exit. Given
36837 that BB has an edge to exit, determine if a predecessor
36838 of BB has an edge from entry. If so, compute the number
36839 of instructions in the predecessor block. If there
36840 happen to be multiple such blocks, compute the minimum. */
36841 min_prev_count = 4;
36842 FOR_EACH_EDGE (e, ei, bb->preds)
36844 edge prev_e;
36845 edge_iterator prev_ei;
36847 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
36849 min_prev_count = 0;
36850 break;
36852 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
36854 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
36856 int count = ix86_count_insn_bb (e->src);
36857 if (count < min_prev_count)
36858 min_prev_count = count;
36859 break;
36864 if (min_prev_count < 4)
36865 min_prev_count += ix86_count_insn_bb (bb);
36867 return min_prev_count;
36870 /* Pad short function to 4 instructions. */
36872 static void
36873 ix86_pad_short_function (void)
36875 edge e;
36876 edge_iterator ei;
36878 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
36880 rtx ret = BB_END (e->src);
36881 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
36883 int insn_count = ix86_count_insn (e->src);
36885 /* Pad short function. */
36886 if (insn_count < 4)
36888 rtx insn = ret;
36890 /* Find epilogue. */
36891 while (insn
36892 && (!NOTE_P (insn)
36893 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
36894 insn = PREV_INSN (insn);
36896 if (!insn)
36897 insn = ret;
36899 /* Two NOPs count as one instruction. */
36900 insn_count = 2 * (4 - insn_count);
36901 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
36907 /* Fix up a Windows system unwinder issue. If an EH region falls through into
36908 the epilogue, the Windows system unwinder will apply epilogue logic and
36909 produce incorrect offsets. This can be avoided by adding a nop between
36910 the last insn that can throw and the first insn of the epilogue. */
36912 static void
36913 ix86_seh_fixup_eh_fallthru (void)
36915 edge e;
36916 edge_iterator ei;
36918 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
36920 rtx insn, next;
36922 /* Find the beginning of the epilogue. */
36923 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
36924 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
36925 break;
36926 if (insn == NULL)
36927 continue;
36929 /* We only care about preceding insns that can throw. */
36930 insn = prev_active_insn (insn);
36931 if (insn == NULL || !can_throw_internal (insn))
36932 continue;
36934 /* Do not separate calls from their debug information. */
36935 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
36936 if (NOTE_P (next)
36937 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
36938 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
36939 insn = next;
36940 else
36941 break;
36943 emit_insn_after (gen_nops (const1_rtx), insn);
36947 /* Implement machine specific optimizations. We implement padding of returns
36948 for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */
36949 static void
36950 ix86_reorg (void)
36952 /* We are freeing block_for_insn in the toplev to keep compatibility
36953 with old MDEP_REORGS that are not CFG based. Recompute it now. */
36954 compute_bb_for_insn ();
36956 if (TARGET_SEH && current_function_has_exception_handlers ())
36957 ix86_seh_fixup_eh_fallthru ();
36959 if (optimize && optimize_function_for_speed_p (cfun))
36961 if (TARGET_PAD_SHORT_FUNCTION)
36962 ix86_pad_short_function ();
36963 else if (TARGET_PAD_RETURNS)
36964 ix86_pad_returns ();
36965 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
36966 if (TARGET_FOUR_JUMP_LIMIT)
36967 ix86_avoid_jump_mispredicts ();
36968 #endif
36972 /* Return nonzero when a QImode register that must be represented via a REX
36973 prefix is used. */
36974 bool
36975 x86_extended_QIreg_mentioned_p (rtx insn)
36977 int i;
36978 extract_insn_cached (insn);
36979 for (i = 0; i < recog_data.n_operands; i++)
36980 if (GENERAL_REG_P (recog_data.operand[i])
36981 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
36982 return true;
36983 return false;
36986 /* Return nonzero when P points to a register encoded via a REX prefix.
36987 Called via for_each_rtx. */
36988 static int
36989 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
36991 unsigned int regno;
36992 if (!REG_P (*p))
36993 return 0;
36994 regno = REGNO (*p);
36995 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
36998 /* Return true when INSN mentions a register that must be encoded using a
36999 REX prefix. */
37000 bool
37001 x86_extended_reg_mentioned_p (rtx insn)
37003 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
37004 extended_reg_mentioned_1, NULL);
37007 /* If profitable, negate (without causing overflow) integer constant
37008 of mode MODE at location LOC. Return true in this case. */
37009 bool
37010 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
37012 HOST_WIDE_INT val;
37014 if (!CONST_INT_P (*loc))
37015 return false;
37017 switch (mode)
37019 case DImode:
37020 /* DImode x86_64 constants must fit in 32 bits. */
37021 gcc_assert (x86_64_immediate_operand (*loc, mode));
37023 mode = SImode;
37024 break;
37026 case SImode:
37027 case HImode:
37028 case QImode:
37029 break;
37031 default:
37032 gcc_unreachable ();
37035 /* Avoid overflows. */
37036 if (mode_signbit_p (mode, *loc))
37037 return false;
37039 val = INTVAL (*loc);
37041 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
37042 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
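/* For instance, "add $128" needs a 32-bit immediate while "sub $-128" still
   fits the sign-extended 8-bit immediate form, so 128 is negated here even
   though it is positive, and -128 is left alone.  */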
37043 if ((val < 0 && val != -128)
37044 || val == 128)
37046 *loc = GEN_INT (-val);
37047 return true;
37050 return false;
37053 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
37054 optabs would emit if we didn't have TFmode patterns. */
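/* Sketch of the trick used below for inputs with the sign bit set, where a
   plain signed conversion would be wrong: halve the value while keeping the
   rounding bit, i0 = (in >> 1) | (in & 1), convert that as signed, and
   double the result, out = f0 + f0.  */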
37056 void
37057 x86_emit_floatuns (rtx operands[2])
37059 rtx neglab, donelab, i0, i1, f0, in, out;
37060 enum machine_mode mode, inmode;
37062 inmode = GET_MODE (operands[1]);
37063 gcc_assert (inmode == SImode || inmode == DImode);
37065 out = operands[0];
37066 in = force_reg (inmode, operands[1]);
37067 mode = GET_MODE (out);
37068 neglab = gen_label_rtx ();
37069 donelab = gen_label_rtx ();
37070 f0 = gen_reg_rtx (mode);
37072 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
37074 expand_float (out, in, 0);
37076 emit_jump_insn (gen_jump (donelab));
37077 emit_barrier ();
37079 emit_label (neglab);
37081 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
37082 1, OPTAB_DIRECT);
37083 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
37084 1, OPTAB_DIRECT);
37085 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
37087 expand_float (f0, i0, 0);
37089 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
37091 emit_label (donelab);
37094 /* AVX512F does support 64-byte integer vector operations,
37095 thus the longest vector we are faced with is V64QImode. */
37096 #define MAX_VECT_LEN 64
37098 struct expand_vec_perm_d
37100 rtx target, op0, op1;
37101 unsigned char perm[MAX_VECT_LEN];
37102 enum machine_mode vmode;
37103 unsigned char nelt;
37104 bool one_operand_p;
37105 bool testing_p;
37108 static bool canonicalize_perm (struct expand_vec_perm_d *d);
37109 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
37110 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
37112 /* Get a vector mode of the same size as the original but with elements
37113 twice as wide. This is only guaranteed to apply to integral vectors. */
37115 static inline enum machine_mode
37116 get_mode_wider_vector (enum machine_mode o)
37118 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
37119 enum machine_mode n = GET_MODE_WIDER_MODE (o);
37120 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
37121 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
37122 return n;
37125 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
37126 with all elements equal to VAR. Return true if successful. */
37128 static bool
37129 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
37130 rtx target, rtx val)
37132 bool ok;
37134 switch (mode)
37136 case V2SImode:
37137 case V2SFmode:
37138 if (!mmx_ok)
37139 return false;
37140 /* FALLTHRU */
37142 case V4DFmode:
37143 case V4DImode:
37144 case V8SFmode:
37145 case V8SImode:
37146 case V2DFmode:
37147 case V2DImode:
37148 case V4SFmode:
37149 case V4SImode:
37151 rtx insn, dup;
37153 /* First attempt to recognize VAL as-is. */
37154 dup = gen_rtx_VEC_DUPLICATE (mode, val);
37155 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
37156 if (recog_memoized (insn) < 0)
37158 rtx seq;
37159 /* If that fails, force VAL into a register. */
37161 start_sequence ();
37162 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
37163 seq = get_insns ();
37164 end_sequence ();
37165 if (seq)
37166 emit_insn_before (seq, insn);
37168 ok = recog_memoized (insn) >= 0;
37169 gcc_assert (ok);
37172 return true;
37174 case V4HImode:
37175 if (!mmx_ok)
37176 return false;
37177 if (TARGET_SSE || TARGET_3DNOW_A)
37179 rtx x;
37181 val = gen_lowpart (SImode, val);
37182 x = gen_rtx_TRUNCATE (HImode, val);
37183 x = gen_rtx_VEC_DUPLICATE (mode, x);
37184 emit_insn (gen_rtx_SET (VOIDmode, target, x));
37185 return true;
37187 goto widen;
37189 case V8QImode:
37190 if (!mmx_ok)
37191 return false;
37192 goto widen;
37194 case V8HImode:
37195 if (TARGET_SSE2)
37197 struct expand_vec_perm_d dperm;
37198 rtx tmp1, tmp2;
37200 permute:
37201 memset (&dperm, 0, sizeof (dperm));
37202 dperm.target = target;
37203 dperm.vmode = mode;
37204 dperm.nelt = GET_MODE_NUNITS (mode);
37205 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
37206 dperm.one_operand_p = true;
37208 /* Extend to SImode using a paradoxical SUBREG. */
37209 tmp1 = gen_reg_rtx (SImode);
37210 emit_move_insn (tmp1, gen_lowpart (SImode, val));
37212 /* Insert the SImode value as low element of a V4SImode vector. */
37213 tmp2 = gen_reg_rtx (V4SImode);
37214 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
37215 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
37217 ok = (expand_vec_perm_1 (&dperm)
37218 || expand_vec_perm_broadcast_1 (&dperm));
37219 gcc_assert (ok);
37220 return ok;
37222 goto widen;
37224 case V16QImode:
37225 if (TARGET_SSE2)
37226 goto permute;
37227 goto widen;
37229 widen:
37230 /* Replicate the value once into the next wider mode and recurse. */
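/* Illustrative example (MMX path, no SSE2): splatting a QImode value into
   V8QImode widens it to HImode as val | (val << 8), recursively splats that
   over V4HImode, and takes the V8QImode lowpart of the result.  */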
37232 enum machine_mode smode, wsmode, wvmode;
37233 rtx x;
37235 smode = GET_MODE_INNER (mode);
37236 wvmode = get_mode_wider_vector (mode);
37237 wsmode = GET_MODE_INNER (wvmode);
37239 val = convert_modes (wsmode, smode, val, true);
37240 x = expand_simple_binop (wsmode, ASHIFT, val,
37241 GEN_INT (GET_MODE_BITSIZE (smode)),
37242 NULL_RTX, 1, OPTAB_LIB_WIDEN);
37243 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
37245 x = gen_reg_rtx (wvmode);
37246 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
37247 gcc_assert (ok);
37248 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
37249 return ok;
37252 case V16HImode:
37253 case V32QImode:
37255 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
37256 rtx x = gen_reg_rtx (hvmode);
37258 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
37259 gcc_assert (ok);
37261 x = gen_rtx_VEC_CONCAT (mode, x, x);
37262 emit_insn (gen_rtx_SET (VOIDmode, target, x));
37264 return true;
37266 default:
37267 return false;
37271 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
37272 whose ONE_VAR element is VAR, and other elements are zero. Return true
37273 if successful. */
37275 static bool
37276 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
37277 rtx target, rtx var, int one_var)
37279 enum machine_mode vsimode;
37280 rtx new_target;
37281 rtx x, tmp;
37282 bool use_vector_set = false;
37284 switch (mode)
37286 case V2DImode:
37287 /* For SSE4.1, we normally use vector set. But if the second
37288 element is zero and inter-unit moves are OK, we use movq
37289 instead. */
37290 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
37291 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
37292 && one_var == 0));
37293 break;
37294 case V16QImode:
37295 case V4SImode:
37296 case V4SFmode:
37297 use_vector_set = TARGET_SSE4_1;
37298 break;
37299 case V8HImode:
37300 use_vector_set = TARGET_SSE2;
37301 break;
37302 case V4HImode:
37303 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
37304 break;
37305 case V32QImode:
37306 case V16HImode:
37307 case V8SImode:
37308 case V8SFmode:
37309 case V4DFmode:
37310 use_vector_set = TARGET_AVX;
37311 break;
37312 case V4DImode:
37313 /* Use ix86_expand_vector_set in 64bit mode only. */
37314 use_vector_set = TARGET_AVX && TARGET_64BIT;
37315 break;
37316 default:
37317 break;
37320 if (use_vector_set)
37322 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
37323 var = force_reg (GET_MODE_INNER (mode), var);
37324 ix86_expand_vector_set (mmx_ok, target, var, one_var);
37325 return true;
37328 switch (mode)
37330 case V2SFmode:
37331 case V2SImode:
37332 if (!mmx_ok)
37333 return false;
37334 /* FALLTHRU */
37336 case V2DFmode:
37337 case V2DImode:
37338 if (one_var != 0)
37339 return false;
37340 var = force_reg (GET_MODE_INNER (mode), var);
37341 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
37342 emit_insn (gen_rtx_SET (VOIDmode, target, x));
37343 return true;
37345 case V4SFmode:
37346 case V4SImode:
37347 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
37348 new_target = gen_reg_rtx (mode);
37349 else
37350 new_target = target;
37351 var = force_reg (GET_MODE_INNER (mode), var);
37352 x = gen_rtx_VEC_DUPLICATE (mode, var);
37353 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
37354 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
37355 if (one_var != 0)
37357 /* We need to shuffle the value to the correct position, so
37358 create a new pseudo to store the intermediate result. */
37360 /* With SSE2, we can use the integer shuffle insns. */
37361 if (mode != V4SFmode && TARGET_SSE2)
37363 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
37364 const1_rtx,
37365 GEN_INT (one_var == 1 ? 0 : 1),
37366 GEN_INT (one_var == 2 ? 0 : 1),
37367 GEN_INT (one_var == 3 ? 0 : 1)));
37368 if (target != new_target)
37369 emit_move_insn (target, new_target);
37370 return true;
37373 /* Otherwise convert the intermediate result to V4SFmode and
37374 use the SSE1 shuffle instructions. */
37375 if (mode != V4SFmode)
37377 tmp = gen_reg_rtx (V4SFmode);
37378 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
37380 else
37381 tmp = new_target;
37383 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
37384 const1_rtx,
37385 GEN_INT (one_var == 1 ? 0 : 1),
37386 GEN_INT (one_var == 2 ? 0+4 : 1+4),
37387 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
37389 if (mode != V4SFmode)
37390 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
37391 else if (tmp != target)
37392 emit_move_insn (target, tmp);
37394 else if (target != new_target)
37395 emit_move_insn (target, new_target);
37396 return true;
37398 case V8HImode:
37399 case V16QImode:
37400 vsimode = V4SImode;
37401 goto widen;
37402 case V4HImode:
37403 case V8QImode:
37404 if (!mmx_ok)
37405 return false;
37406 vsimode = V2SImode;
37407 goto widen;
37408 widen:
37409 if (one_var != 0)
37410 return false;
37412 /* Zero extend the variable element to SImode and recurse. */
37413 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
37415 x = gen_reg_rtx (vsimode);
37416 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
37417 var, one_var))
37418 gcc_unreachable ();
37420 emit_move_insn (target, gen_lowpart (mode, x));
37421 return true;
37423 default:
37424 return false;
37428 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
37429 consisting of the values in VALS. It is known that all elements
37430 except ONE_VAR are constants. Return true if successful. */
37432 static bool
37433 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
37434 rtx target, rtx vals, int one_var)
37436 rtx var = XVECEXP (vals, 0, one_var);
37437 enum machine_mode wmode;
37438 rtx const_vec, x;
37440 const_vec = copy_rtx (vals);
37441 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
37442 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
37444 switch (mode)
37446 case V2DFmode:
37447 case V2DImode:
37448 case V2SFmode:
37449 case V2SImode:
37450 /* For the two element vectors, it's just as easy to use
37451 the general case. */
37452 return false;
37454 case V4DImode:
37455 /* Use ix86_expand_vector_set in 64bit mode only. */
37456 if (!TARGET_64BIT)
37457 return false;
37458 case V4DFmode:
37459 case V8SFmode:
37460 case V8SImode:
37461 case V16HImode:
37462 case V32QImode:
37463 case V4SFmode:
37464 case V4SImode:
37465 case V8HImode:
37466 case V4HImode:
37467 break;
37469 case V16QImode:
37470 if (TARGET_SSE4_1)
37471 break;
37472 wmode = V8HImode;
37473 goto widen;
37474 case V8QImode:
37475 wmode = V4HImode;
37476 goto widen;
37477 widen:
37478 /* There's no way to set one QImode entry easily. Combine
37479 the variable value with its adjacent constant value, and
37480 promote to an HImode set. */
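/* E.g. for one_var == 5 the variable byte is shifted into the high half of
   an HImode value, combined with the constant byte at index 4, and element
   5 >> 1 == 2 of the wider vector is then set.  */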
37481 x = XVECEXP (vals, 0, one_var ^ 1);
37482 if (one_var & 1)
37484 var = convert_modes (HImode, QImode, var, true);
37485 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
37486 NULL_RTX, 1, OPTAB_LIB_WIDEN);
37487 x = GEN_INT (INTVAL (x) & 0xff);
37489 else
37491 var = convert_modes (HImode, QImode, var, true);
37492 x = gen_int_mode (INTVAL (x) << 8, HImode);
37494 if (x != const0_rtx)
37495 var = expand_simple_binop (HImode, IOR, var, x, var,
37496 1, OPTAB_LIB_WIDEN);
37498 x = gen_reg_rtx (wmode);
37499 emit_move_insn (x, gen_lowpart (wmode, const_vec));
37500 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
37502 emit_move_insn (target, gen_lowpart (mode, x));
37503 return true;
37505 default:
37506 return false;
37509 emit_move_insn (target, const_vec);
37510 ix86_expand_vector_set (mmx_ok, target, var, one_var);
37511 return true;
37514 /* A subroutine of ix86_expand_vector_init_general. Use vector
37515 concatenate to handle the most general case: all values variable,
37516 and none identical. */
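/* For instance, an 8-element V8SFmode build pairs the scalars into four
   V2SFmode vectors, concatenates those into two V4SFmode halves, and then
   concatenates the halves into the V8SFmode target (the "half" label below
   handles the recursion).  */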
37518 static void
37519 ix86_expand_vector_init_concat (enum machine_mode mode,
37520 rtx target, rtx *ops, int n)
37522 enum machine_mode cmode, hmode = VOIDmode;
37523 rtx first[8], second[4];
37524 rtvec v;
37525 int i, j;
37527 switch (n)
37529 case 2:
37530 switch (mode)
37532 case V8SImode:
37533 cmode = V4SImode;
37534 break;
37535 case V8SFmode:
37536 cmode = V4SFmode;
37537 break;
37538 case V4DImode:
37539 cmode = V2DImode;
37540 break;
37541 case V4DFmode:
37542 cmode = V2DFmode;
37543 break;
37544 case V4SImode:
37545 cmode = V2SImode;
37546 break;
37547 case V4SFmode:
37548 cmode = V2SFmode;
37549 break;
37550 case V2DImode:
37551 cmode = DImode;
37552 break;
37553 case V2SImode:
37554 cmode = SImode;
37555 break;
37556 case V2DFmode:
37557 cmode = DFmode;
37558 break;
37559 case V2SFmode:
37560 cmode = SFmode;
37561 break;
37562 default:
37563 gcc_unreachable ();
37566 if (!register_operand (ops[1], cmode))
37567 ops[1] = force_reg (cmode, ops[1]);
37568 if (!register_operand (ops[0], cmode))
37569 ops[0] = force_reg (cmode, ops[0]);
37570 emit_insn (gen_rtx_SET (VOIDmode, target,
37571 gen_rtx_VEC_CONCAT (mode, ops[0],
37572 ops[1])));
37573 break;
37575 case 4:
37576 switch (mode)
37578 case V4DImode:
37579 cmode = V2DImode;
37580 break;
37581 case V4DFmode:
37582 cmode = V2DFmode;
37583 break;
37584 case V4SImode:
37585 cmode = V2SImode;
37586 break;
37587 case V4SFmode:
37588 cmode = V2SFmode;
37589 break;
37590 default:
37591 gcc_unreachable ();
37593 goto half;
37595 case 8:
37596 switch (mode)
37598 case V8SImode:
37599 cmode = V2SImode;
37600 hmode = V4SImode;
37601 break;
37602 case V8SFmode:
37603 cmode = V2SFmode;
37604 hmode = V4SFmode;
37605 break;
37606 default:
37607 gcc_unreachable ();
37609 goto half;
37611 half:
37612 /* FIXME: We process inputs backward to help RA. PR 36222. */
37613 i = n - 1;
37614 j = (n >> 1) - 1;
37615 for (; i > 0; i -= 2, j--)
37617 first[j] = gen_reg_rtx (cmode);
37618 v = gen_rtvec (2, ops[i - 1], ops[i]);
37619 ix86_expand_vector_init (false, first[j],
37620 gen_rtx_PARALLEL (cmode, v));
37623 n >>= 1;
37624 if (n > 2)
37626 gcc_assert (hmode != VOIDmode);
37627 for (i = j = 0; i < n; i += 2, j++)
37629 second[j] = gen_reg_rtx (hmode);
37630 ix86_expand_vector_init_concat (hmode, second [j],
37631 &first [i], 2);
37633 n >>= 1;
37634 ix86_expand_vector_init_concat (mode, target, second, n);
37636 else
37637 ix86_expand_vector_init_concat (mode, target, first, n);
37638 break;
37640 default:
37641 gcc_unreachable ();
37645 /* A subroutine of ix86_expand_vector_init_general. Use vector
37646 interleave to handle the most general case: all values variable,
37647 and none identical. */
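/* E.g. for V16QImode the code below packs the bytes two at a time into
   eight vectors, reinterprets them as V8HImode, and interleaves the low
   halves through V4SImode and V2DImode until a single full vector
   remains.  */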
37649 static void
37650 ix86_expand_vector_init_interleave (enum machine_mode mode,
37651 rtx target, rtx *ops, int n)
37653 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
37654 int i, j;
37655 rtx op0, op1;
37656 rtx (*gen_load_even) (rtx, rtx, rtx);
37657 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
37658 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
37660 switch (mode)
37662 case V8HImode:
37663 gen_load_even = gen_vec_setv8hi;
37664 gen_interleave_first_low = gen_vec_interleave_lowv4si;
37665 gen_interleave_second_low = gen_vec_interleave_lowv2di;
37666 inner_mode = HImode;
37667 first_imode = V4SImode;
37668 second_imode = V2DImode;
37669 third_imode = VOIDmode;
37670 break;
37671 case V16QImode:
37672 gen_load_even = gen_vec_setv16qi;
37673 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
37674 gen_interleave_second_low = gen_vec_interleave_lowv4si;
37675 inner_mode = QImode;
37676 first_imode = V8HImode;
37677 second_imode = V4SImode;
37678 third_imode = V2DImode;
37679 break;
37680 default:
37681 gcc_unreachable ();
37684 for (i = 0; i < n; i++)
37686 /* Extend the odd element to SImode using a paradoxical SUBREG. */
37687 op0 = gen_reg_rtx (SImode);
37688 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
37690 /* Insert the SImode value as low element of V4SImode vector. */
37691 op1 = gen_reg_rtx (V4SImode);
37692 op0 = gen_rtx_VEC_MERGE (V4SImode,
37693 gen_rtx_VEC_DUPLICATE (V4SImode,
37694 op0),
37695 CONST0_RTX (V4SImode),
37696 const1_rtx);
37697 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
37699 /* Cast the V4SImode vector back to a vector in the original mode. */
37700 op0 = gen_reg_rtx (mode);
37701 emit_move_insn (op0, gen_lowpart (mode, op1));
37703 /* Load even elements into the second position. */
37704 emit_insn (gen_load_even (op0,
37705 force_reg (inner_mode,
37706 ops [i + i + 1]),
37707 const1_rtx));
37709 /* Cast vector to FIRST_IMODE vector. */
37710 ops[i] = gen_reg_rtx (first_imode);
37711 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
37714 /* Interleave low FIRST_IMODE vectors. */
37715 for (i = j = 0; i < n; i += 2, j++)
37717 op0 = gen_reg_rtx (first_imode);
37718 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
37720 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
37721 ops[j] = gen_reg_rtx (second_imode);
37722 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
37725 /* Interleave low SECOND_IMODE vectors. */
37726 switch (second_imode)
37728 case V4SImode:
37729 for (i = j = 0; i < n / 2; i += 2, j++)
37731 op0 = gen_reg_rtx (second_imode);
37732 emit_insn (gen_interleave_second_low (op0, ops[i],
37733 ops[i + 1]));
37735 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
37736 vector. */
37737 ops[j] = gen_reg_rtx (third_imode);
37738 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
37740 second_imode = V2DImode;
37741 gen_interleave_second_low = gen_vec_interleave_lowv2di;
37742 /* FALLTHRU */
37744 case V2DImode:
37745 op0 = gen_reg_rtx (second_imode);
37746 emit_insn (gen_interleave_second_low (op0, ops[0],
37747 ops[1]));
37749 /* Cast the SECOND_IMODE vector back to a vector in the original
37750 mode. */
37751 emit_insn (gen_rtx_SET (VOIDmode, target,
37752 gen_lowpart (mode, op0)));
37753 break;
37755 default:
37756 gcc_unreachable ();
37760 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
37761 all values variable, and none identical. */
37763 static void
37764 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
37765 rtx target, rtx vals)
37767 rtx ops[32], op0, op1;
37768 enum machine_mode half_mode = VOIDmode;
37769 int n, i;
37771 switch (mode)
37773 case V2SFmode:
37774 case V2SImode:
37775 if (!mmx_ok && !TARGET_SSE)
37776 break;
37777 /* FALLTHRU */
37779 case V8SFmode:
37780 case V8SImode:
37781 case V4DFmode:
37782 case V4DImode:
37783 case V4SFmode:
37784 case V4SImode:
37785 case V2DFmode:
37786 case V2DImode:
37787 n = GET_MODE_NUNITS (mode);
37788 for (i = 0; i < n; i++)
37789 ops[i] = XVECEXP (vals, 0, i);
37790 ix86_expand_vector_init_concat (mode, target, ops, n);
37791 return;
37793 case V32QImode:
37794 half_mode = V16QImode;
37795 goto half;
37797 case V16HImode:
37798 half_mode = V8HImode;
37799 goto half;
37801 half:
37802 n = GET_MODE_NUNITS (mode);
37803 for (i = 0; i < n; i++)
37804 ops[i] = XVECEXP (vals, 0, i);
37805 op0 = gen_reg_rtx (half_mode);
37806 op1 = gen_reg_rtx (half_mode);
37807 ix86_expand_vector_init_interleave (half_mode, op0, ops,
37808 n >> 2);
37809 ix86_expand_vector_init_interleave (half_mode, op1,
37810 &ops [n >> 1], n >> 2);
37811 emit_insn (gen_rtx_SET (VOIDmode, target,
37812 gen_rtx_VEC_CONCAT (mode, op0, op1)));
37813 return;
37815 case V16QImode:
37816 if (!TARGET_SSE4_1)
37817 break;
37818 /* FALLTHRU */
37820 case V8HImode:
37821 if (!TARGET_SSE2)
37822 break;
37824 /* Don't use ix86_expand_vector_init_interleave if we can't
37825 move from GPR to SSE register directly. */
37826 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
37827 break;
37829 n = GET_MODE_NUNITS (mode);
37830 for (i = 0; i < n; i++)
37831 ops[i] = XVECEXP (vals, 0, i);
37832 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
37833 return;
37835 case V4HImode:
37836 case V8QImode:
37837 break;
37839 default:
37840 gcc_unreachable ();
37844 int i, j, n_elts, n_words, n_elt_per_word;
37845 enum machine_mode inner_mode;
37846 rtx words[4], shift;
37848 inner_mode = GET_MODE_INNER (mode);
37849 n_elts = GET_MODE_NUNITS (mode);
37850 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
37851 n_elt_per_word = n_elts / n_words;
37852 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
37854 for (i = 0; i < n_words; ++i)
37856 rtx word = NULL_RTX;
37858 for (j = 0; j < n_elt_per_word; ++j)
37860 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
37861 elt = convert_modes (word_mode, inner_mode, elt, true);
37863 if (j == 0)
37864 word = elt;
37865 else
37867 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
37868 word, 1, OPTAB_LIB_WIDEN);
37869 word = expand_simple_binop (word_mode, IOR, word, elt,
37870 word, 1, OPTAB_LIB_WIDEN);
37874 words[i] = word;
37877 if (n_words == 1)
37878 emit_move_insn (target, gen_lowpart (mode, words[0]));
37879 else if (n_words == 2)
37881 rtx tmp = gen_reg_rtx (mode);
37882 emit_clobber (tmp);
37883 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
37884 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
37885 emit_move_insn (target, tmp);
37887 else if (n_words == 4)
37889 rtx tmp = gen_reg_rtx (V4SImode);
37890 gcc_assert (word_mode == SImode);
37891 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
37892 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
37893 emit_move_insn (target, gen_lowpart (mode, tmp));
37895 else
37896 gcc_unreachable ();
37900 /* Initialize vector TARGET via VALS. Suppress the use of MMX
37901 instructions unless MMX_OK is true. */
37903 void
37904 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
37906 enum machine_mode mode = GET_MODE (target);
37907 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37908 int n_elts = GET_MODE_NUNITS (mode);
37909 int n_var = 0, one_var = -1;
37910 bool all_same = true, all_const_zero = true;
37911 int i;
37912 rtx x;
37914 for (i = 0; i < n_elts; ++i)
37916 x = XVECEXP (vals, 0, i);
37917 if (!(CONST_INT_P (x)
37918 || GET_CODE (x) == CONST_DOUBLE
37919 || GET_CODE (x) == CONST_FIXED))
37920 n_var++, one_var = i;
37921 else if (x != CONST0_RTX (inner_mode))
37922 all_const_zero = false;
37923 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
37924 all_same = false;
37927 /* Constants are best loaded from the constant pool. */
37928 if (n_var == 0)
37930 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
37931 return;
37934 /* If all values are identical, broadcast the value. */
37935 if (all_same
37936 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
37937 XVECEXP (vals, 0, 0)))
37938 return;
37940 /* Values where only one field is non-constant are best loaded from
37941 the pool and overwritten via move later. */
37942 if (n_var == 1)
37944 if (all_const_zero
37945 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
37946 XVECEXP (vals, 0, one_var),
37947 one_var))
37948 return;
37950 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
37951 return;
37954 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
37957 void
37958 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
37960 enum machine_mode mode = GET_MODE (target);
37961 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37962 enum machine_mode half_mode;
37963 bool use_vec_merge = false;
37964 rtx tmp;
37965 static rtx (*gen_extract[6][2]) (rtx, rtx)
37967 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
37968 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
37969 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
37970 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
37971 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
37972 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
37974 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
37976 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
37977 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
37978 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
37979 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
37980 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
37981 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
37983 int i, j, n;
37985 switch (mode)
37987 case V2SFmode:
37988 case V2SImode:
37989 if (mmx_ok)
37991 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
37992 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
37993 if (elt == 0)
37994 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
37995 else
37996 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
37997 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37998 return;
38000 break;
38002 case V2DImode:
38003 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
38004 if (use_vec_merge)
38005 break;
38007 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
38008 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
38009 if (elt == 0)
38010 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
38011 else
38012 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
38013 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
38014 return;
38016 case V2DFmode:
38018 rtx op0, op1;
38020 /* For the two element vectors, we implement a VEC_CONCAT with
38021 the extraction of the other element. */
38023 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
38024 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
38026 if (elt == 0)
38027 op0 = val, op1 = tmp;
38028 else
38029 op0 = tmp, op1 = val;
38031 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
38032 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
38034 return;
38036 case V4SFmode:
38037 use_vec_merge = TARGET_SSE4_1;
38038 if (use_vec_merge)
38039 break;
38041 switch (elt)
38043 case 0:
38044 use_vec_merge = true;
38045 break;
38047 case 1:
38048 /* tmp = target = A B C D */
38049 tmp = copy_to_reg (target);
38050 /* target = A A B B */
38051 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
38052 /* target = X A B B */
38053 ix86_expand_vector_set (false, target, val, 0);
38054 /* target = A X C D */
38055 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
38056 const1_rtx, const0_rtx,
38057 GEN_INT (2+4), GEN_INT (3+4)));
38058 return;
38060 case 2:
38061 /* tmp = target = A B C D */
38062 tmp = copy_to_reg (target);
38063 /* tmp = X B C D */
38064 ix86_expand_vector_set (false, tmp, val, 0);
38065 /* target = A B X D */
38066 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
38067 const0_rtx, const1_rtx,
38068 GEN_INT (0+4), GEN_INT (3+4)));
38069 return;
38071 case 3:
38072 /* tmp = target = A B C D */
38073 tmp = copy_to_reg (target);
38074 /* tmp = X B C D */
38075 ix86_expand_vector_set (false, tmp, val, 0);
38076 /* target = A B C X */
38077 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
38078 const0_rtx, const1_rtx,
38079 GEN_INT (2+4), GEN_INT (0+4)));
38080 return;
38082 default:
38083 gcc_unreachable ();
38085 break;
38087 case V4SImode:
38088 use_vec_merge = TARGET_SSE4_1;
38089 if (use_vec_merge)
38090 break;
38092 /* Element 0 handled by vec_merge below. */
38093 if (elt == 0)
38095 use_vec_merge = true;
38096 break;
38099 if (TARGET_SSE2)
38101 /* With SSE2, use integer shuffles to swap element 0 and ELT,
38102 store into element 0, then shuffle them back. */
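/* E.g. for elt == 2 the order below is {2, 1, 0, 3}; applying the same
   pshufd again after the element 0 store undoes the swap.  */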
38104 rtx order[4];
38106 order[0] = GEN_INT (elt);
38107 order[1] = const1_rtx;
38108 order[2] = const2_rtx;
38109 order[3] = GEN_INT (3);
38110 order[elt] = const0_rtx;
38112 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
38113 order[1], order[2], order[3]));
38115 ix86_expand_vector_set (false, target, val, 0);
38117 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
38118 order[1], order[2], order[3]));
38120 else
38122 /* For SSE1, we have to reuse the V4SF code. */
38123 rtx t = gen_reg_rtx (V4SFmode);
38124 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
38125 emit_move_insn (target, gen_lowpart (mode, t));
38127 return;
38129 case V8HImode:
38130 use_vec_merge = TARGET_SSE2;
38131 break;
38132 case V4HImode:
38133 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
38134 break;
38136 case V16QImode:
38137 use_vec_merge = TARGET_SSE4_1;
38138 break;
38140 case V8QImode:
38141 break;
38143 case V32QImode:
38144 half_mode = V16QImode;
38145 j = 0;
38146 n = 16;
38147 goto half;
38149 case V16HImode:
38150 half_mode = V8HImode;
38151 j = 1;
38152 n = 8;
38153 goto half;
38155 case V8SImode:
38156 half_mode = V4SImode;
38157 j = 2;
38158 n = 4;
38159 goto half;
38161 case V4DImode:
38162 half_mode = V2DImode;
38163 j = 3;
38164 n = 2;
38165 goto half;
38167 case V8SFmode:
38168 half_mode = V4SFmode;
38169 j = 4;
38170 n = 4;
38171 goto half;
38173 case V4DFmode:
38174 half_mode = V2DFmode;
38175 j = 5;
38176 n = 2;
38177 goto half;
38179 half:
38180 /* Compute offset. */
38181 i = elt / n;
38182 elt %= n;
38184 gcc_assert (i <= 1);
38186 /* Extract the half. */
38187 tmp = gen_reg_rtx (half_mode);
38188 emit_insn (gen_extract[j][i] (tmp, target));
38190 /* Put val in tmp at elt. */
38191 ix86_expand_vector_set (false, tmp, val, elt);
38193 /* Put it back. */
38194 emit_insn (gen_insert[j][i] (target, target, tmp));
38195 return;
38197 default:
38198 break;
38201 if (use_vec_merge)
38203 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
38204 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
38205 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
38207 else
38209 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
38211 emit_move_insn (mem, target);
38213 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
38214 emit_move_insn (tmp, val);
38216 emit_move_insn (target, mem);
38220 void
38221 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
38223 enum machine_mode mode = GET_MODE (vec);
38224 enum machine_mode inner_mode = GET_MODE_INNER (mode);
38225 bool use_vec_extr = false;
38226 rtx tmp;
38228 switch (mode)
38230 case V2SImode:
38231 case V2SFmode:
38232 if (!mmx_ok)
38233 break;
38234 /* FALLTHRU */
38236 case V2DFmode:
38237 case V2DImode:
38238 use_vec_extr = true;
38239 break;
38241 case V4SFmode:
38242 use_vec_extr = TARGET_SSE4_1;
38243 if (use_vec_extr)
38244 break;
38246 switch (elt)
38248 case 0:
38249 tmp = vec;
38250 break;
38252 case 1:
38253 case 3:
38254 tmp = gen_reg_rtx (mode);
38255 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
38256 GEN_INT (elt), GEN_INT (elt),
38257 GEN_INT (elt+4), GEN_INT (elt+4)));
38258 break;
38260 case 2:
38261 tmp = gen_reg_rtx (mode);
38262 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
38263 break;
38265 default:
38266 gcc_unreachable ();
38268 vec = tmp;
38269 use_vec_extr = true;
38270 elt = 0;
38271 break;
38273 case V4SImode:
38274 use_vec_extr = TARGET_SSE4_1;
38275 if (use_vec_extr)
38276 break;
38278 if (TARGET_SSE2)
38280 switch (elt)
38282 case 0:
38283 tmp = vec;
38284 break;
38286 case 1:
38287 case 3:
38288 tmp = gen_reg_rtx (mode);
38289 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
38290 GEN_INT (elt), GEN_INT (elt),
38291 GEN_INT (elt), GEN_INT (elt)));
38292 break;
38294 case 2:
38295 tmp = gen_reg_rtx (mode);
38296 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
38297 break;
38299 default:
38300 gcc_unreachable ();
38302 vec = tmp;
38303 use_vec_extr = true;
38304 elt = 0;
38306 else
38308 /* For SSE1, we have to reuse the V4SF code. */
38309 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
38310 gen_lowpart (V4SFmode, vec), elt);
38311 return;
38313 break;
38315 case V8HImode:
38316 use_vec_extr = TARGET_SSE2;
38317 break;
38318 case V4HImode:
38319 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
38320 break;
38322 case V16QImode:
38323 use_vec_extr = TARGET_SSE4_1;
38324 break;
38326 case V8SFmode:
38327 if (TARGET_AVX)
38329 tmp = gen_reg_rtx (V4SFmode);
38330 if (elt < 4)
38331 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
38332 else
38333 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
38334 ix86_expand_vector_extract (false, target, tmp, elt & 3);
38335 return;
38337 break;
38339 case V4DFmode:
38340 if (TARGET_AVX)
38342 tmp = gen_reg_rtx (V2DFmode);
38343 if (elt < 2)
38344 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
38345 else
38346 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
38347 ix86_expand_vector_extract (false, target, tmp, elt & 1);
38348 return;
38350 break;
38352 case V32QImode:
38353 if (TARGET_AVX)
38355 tmp = gen_reg_rtx (V16QImode);
38356 if (elt < 16)
38357 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
38358 else
38359 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
38360 ix86_expand_vector_extract (false, target, tmp, elt & 15);
38361 return;
38363 break;
38365 case V16HImode:
38366 if (TARGET_AVX)
38368 tmp = gen_reg_rtx (V8HImode);
38369 if (elt < 8)
38370 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
38371 else
38372 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
38373 ix86_expand_vector_extract (false, target, tmp, elt & 7);
38374 return;
38376 break;
38378 case V8SImode:
38379 if (TARGET_AVX)
38381 tmp = gen_reg_rtx (V4SImode);
38382 if (elt < 4)
38383 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
38384 else
38385 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
38386 ix86_expand_vector_extract (false, target, tmp, elt & 3);
38387 return;
38389 break;
38391 case V4DImode:
38392 if (TARGET_AVX)
38394 tmp = gen_reg_rtx (V2DImode);
38395 if (elt < 2)
38396 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
38397 else
38398 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
38399 ix86_expand_vector_extract (false, target, tmp, elt & 1);
38400 return;
38402 break;
38404 case V8QImode:
38405 /* ??? Could extract the appropriate HImode element and shift. */
38406 default:
38407 break;
38410 if (use_vec_extr)
38412 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
38413 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
38415 /* Let the rtl optimizers know about the zero extension performed. */
38416 if (inner_mode == QImode || inner_mode == HImode)
38418 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
38419 target = gen_lowpart (SImode, target);
38422 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
38424 else
38426 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
38428 emit_move_insn (mem, vec);
38430 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
38431 emit_move_insn (target, tmp);
38435 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
38436 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
38437 The upper bits of DEST are undefined, though they shouldn't cause
38438 exceptions (some bits from src or all zeros are ok). */
38440 static void
38441 emit_reduc_half (rtx dest, rtx src, int i)
38443 rtx tem, d = dest;
38444 switch (GET_MODE (src))
38446 case V4SFmode:
38447 if (i == 128)
38448 tem = gen_sse_movhlps (dest, src, src);
38449 else
38450 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
38451 GEN_INT (1 + 4), GEN_INT (1 + 4));
38452 break;
38453 case V2DFmode:
38454 tem = gen_vec_interleave_highv2df (dest, src, src);
38455 break;
38456 case V16QImode:
38457 case V8HImode:
38458 case V4SImode:
38459 case V2DImode:
38460 d = gen_reg_rtx (V1TImode);
38461 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
38462 GEN_INT (i / 2));
38463 break;
38464 case V8SFmode:
38465 if (i == 256)
38466 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
38467 else
38468 tem = gen_avx_shufps256 (dest, src, src,
38469 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
38470 break;
38471 case V4DFmode:
38472 if (i == 256)
38473 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
38474 else
38475 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
38476 break;
38477 case V32QImode:
38478 case V16HImode:
38479 case V8SImode:
38480 case V4DImode:
38481 if (i == 256)
38483 if (GET_MODE (dest) != V4DImode)
38484 d = gen_reg_rtx (V4DImode);
38485 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
38486 gen_lowpart (V4DImode, src),
38487 const1_rtx);
38489 else
38491 d = gen_reg_rtx (V2TImode);
38492 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
38493 GEN_INT (i / 2));
38495 break;
38496 default:
38497 gcc_unreachable ();
38499 emit_insn (tem);
38500 if (d != dest)
38501 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
38504 /* Expand a vector reduction. FN is the binary pattern to reduce;
38505 DEST is the destination; IN is the input vector. */
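/* An illustrative C-like sketch of the halving loop below (BITS and
   EBITS stand for the vector and element width in bits; the real code
   emits RTL rather than C):

     for (i = BITS; i > EBITS; i >>= 1)
       {
         half = high i/2 bits of vec moved down;    (emit_reduc_half)
         vec = fn (half, vec);
       }

   and the reduction result ends up in element 0 of the last vec.  */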
38507 void
38508 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
38510 rtx half, dst, vec = in;
38511 enum machine_mode mode = GET_MODE (in);
38512 int i;
38514 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
38515 if (TARGET_SSE4_1
38516 && mode == V8HImode
38517 && fn == gen_uminv8hi3)
38519 emit_insn (gen_sse4_1_phminposuw (dest, in));
38520 return;
38523 for (i = GET_MODE_BITSIZE (mode);
38524 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
38525 i >>= 1)
38527 half = gen_reg_rtx (mode);
38528 emit_reduc_half (half, vec, i);
38529 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
38530 dst = dest;
38531 else
38532 dst = gen_reg_rtx (mode);
38533 emit_insn (fn (dst, half, vec));
38534 vec = dst;
38538 /* Target hook for scalar_mode_supported_p. */
38539 static bool
38540 ix86_scalar_mode_supported_p (enum machine_mode mode)
38542 if (DECIMAL_FLOAT_MODE_P (mode))
38543 return default_decimal_float_supported_p ();
38544 else if (mode == TFmode)
38545 return true;
38546 else
38547 return default_scalar_mode_supported_p (mode);
38550 /* Implements target hook vector_mode_supported_p. */
38551 static bool
38552 ix86_vector_mode_supported_p (enum machine_mode mode)
38554 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
38555 return true;
38556 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
38557 return true;
38558 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
38559 return true;
38560 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
38561 return true;
38562 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
38563 return true;
38564 return false;
38567 /* Target hook for c_mode_for_suffix. */
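/* For example, with this hook the literal 1.0q is parsed as a
   __float128 (TFmode) constant and 1.0w as a __float80 (XFmode)
   constant.  */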
38568 static enum machine_mode
38569 ix86_c_mode_for_suffix (char suffix)
38571 if (suffix == 'q')
38572 return TFmode;
38573 if (suffix == 'w')
38574 return XFmode;
38576 return VOIDmode;
38579 /* Worker function for TARGET_MD_ASM_CLOBBERS.
38581 We do this in the new i386 backend to maintain source compatibility
38582 with the old cc0-based compiler. */
38584 static tree
38585 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
38586 tree inputs ATTRIBUTE_UNUSED,
38587 tree clobbers)
38589 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
38590 clobbers);
38591 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
38592 clobbers);
38593 return clobbers;
38596 /* Implements the target hook targetm.asm.encode_section_info. */
38598 static void ATTRIBUTE_UNUSED
38599 ix86_encode_section_info (tree decl, rtx rtl, int first)
38601 default_encode_section_info (decl, rtl, first);
38603 if (TREE_CODE (decl) == VAR_DECL
38604 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
38605 && ix86_in_large_data_p (decl))
38606 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
38609 /* Worker function for REVERSE_CONDITION. */
38611 enum rtx_code
38612 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
38614 return (mode != CCFPmode && mode != CCFPUmode
38615 ? reverse_condition (code)
38616 : reverse_condition_maybe_unordered (code));
38619 /* Output code to perform an x87 FP register move, from OPERANDS[1]
38620 to OPERANDS[0]. */
38622 const char *
38623 output_387_reg_move (rtx insn, rtx *operands)
38625 if (REG_P (operands[0]))
38627 if (REG_P (operands[1])
38628 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
38630 if (REGNO (operands[0]) == FIRST_STACK_REG)
38631 return output_387_ffreep (operands, 0);
38632 return "fstp\t%y0";
38634 if (STACK_TOP_P (operands[0]))
38635 return "fld%Z1\t%y1";
38636 return "fst\t%y0";
38638 else if (MEM_P (operands[0]))
38640 gcc_assert (REG_P (operands[1]));
38641 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
38642 return "fstp%Z0\t%y0";
38643 else
38645 /* There is no non-popping store to memory for XFmode.
38646 So if we need one, follow the store with a load. */
38647 if (GET_MODE (operands[0]) == XFmode)
38648 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
38649 else
38650 return "fst%Z0\t%y0";
38653 else
38654 gcc_unreachable();
38657 /* Output code to perform a conditional jump to LABEL, if C2 flag in
38658 FP status register is set. */
38660 void
38661 ix86_emit_fp_unordered_jump (rtx label)
38663 rtx reg = gen_reg_rtx (HImode);
38664 rtx temp;
38666 emit_insn (gen_x86_fnstsw_1 (reg));
38668 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
38670 emit_insn (gen_x86_sahf_1 (reg));
38672 temp = gen_rtx_REG (CCmode, FLAGS_REG);
38673 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
38675 else
38677 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
38679 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
38680 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
38683 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
38684 gen_rtx_LABEL_REF (VOIDmode, label),
38685 pc_rtx);
38686 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
38688 emit_jump_insn (temp);
38689 predict_jump (REG_BR_PROB_BASE * 10 / 100);
38692 /* Output code to perform a log1p XFmode calculation. */
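/* A rough C-like sketch of the sequence emitted below (illustration
   only; the cutoff constant is 1 - sqrt (2) / 2, and fldln2 provides
   the ln (2) factor so that ln2 * log2 (x) == ln (x)):

     if (fabs (op1) < 0.2928932188134524756...)
       op0 = fyl2xp1 (op1, ln2);		ln2 * log2 (1 + op1)
     else
       op0 = fyl2x (1.0 + op1, ln2);		ln2 * log2 (1 + op1)
*/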
38694 void ix86_emit_i387_log1p (rtx op0, rtx op1)
38696 rtx label1 = gen_label_rtx ();
38697 rtx label2 = gen_label_rtx ();
38699 rtx tmp = gen_reg_rtx (XFmode);
38700 rtx tmp2 = gen_reg_rtx (XFmode);
38701 rtx test;
38703 emit_insn (gen_absxf2 (tmp, op1));
38704 test = gen_rtx_GE (VOIDmode, tmp,
38705 CONST_DOUBLE_FROM_REAL_VALUE (
38706 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
38707 XFmode));
38708 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
38710 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
38711 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
38712 emit_jump (label2);
38714 emit_label (label1);
38715 emit_move_insn (tmp, CONST1_RTX (XFmode));
38716 emit_insn (gen_addxf3 (tmp, op1, tmp));
38717 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
38718 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
38720 emit_label (label2);
38723 /* Emit x87 code for rounding OP1 to the nearest integer, with halfway cases rounded away from zero, storing the result in OP0. */
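/* Illustrative C equivalent of the sequence emitted below (a sketch;
   the sign is actually taken from the fxam status bits rather than
   computed with signbit):

     res = floor (fabs (op1) + 0.5);
     if (signbit (op1))
       res = -res;
     op0 = res;  */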
38724 void ix86_emit_i387_round (rtx op0, rtx op1)
38726 enum machine_mode inmode = GET_MODE (op1);
38727 enum machine_mode outmode = GET_MODE (op0);
38728 rtx e1, e2, res, tmp, tmp1, half;
38729 rtx scratch = gen_reg_rtx (HImode);
38730 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
38731 rtx jump_label = gen_label_rtx ();
38732 rtx insn;
38733 rtx (*gen_abs) (rtx, rtx);
38734 rtx (*gen_neg) (rtx, rtx);
38736 switch (inmode)
38738 case SFmode:
38739 gen_abs = gen_abssf2;
38740 break;
38741 case DFmode:
38742 gen_abs = gen_absdf2;
38743 break;
38744 case XFmode:
38745 gen_abs = gen_absxf2;
38746 break;
38747 default:
38748 gcc_unreachable ();
38751 switch (outmode)
38753 case SFmode:
38754 gen_neg = gen_negsf2;
38755 break;
38756 case DFmode:
38757 gen_neg = gen_negdf2;
38758 break;
38759 case XFmode:
38760 gen_neg = gen_negxf2;
38761 break;
38762 case HImode:
38763 gen_neg = gen_neghi2;
38764 break;
38765 case SImode:
38766 gen_neg = gen_negsi2;
38767 break;
38768 case DImode:
38769 gen_neg = gen_negdi2;
38770 break;
38771 default:
38772 gcc_unreachable ();
38775 e1 = gen_reg_rtx (inmode);
38776 e2 = gen_reg_rtx (inmode);
38777 res = gen_reg_rtx (outmode);
38779 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
38781 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
38783 /* scratch = fxam(op1) */
38784 emit_insn (gen_rtx_SET (VOIDmode, scratch,
38785 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
38786 UNSPEC_FXAM)));
38787 /* e1 = fabs(op1) */
38788 emit_insn (gen_abs (e1, op1));
38790 /* e2 = e1 + 0.5 */
38791 half = force_reg (inmode, half);
38792 emit_insn (gen_rtx_SET (VOIDmode, e2,
38793 gen_rtx_PLUS (inmode, e1, half)));
38795 /* res = floor(e2) */
38796 if (inmode != XFmode)
38798 tmp1 = gen_reg_rtx (XFmode);
38800 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
38801 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
38803 else
38804 tmp1 = e2;
38806 switch (outmode)
38808 case SFmode:
38809 case DFmode:
38811 rtx tmp0 = gen_reg_rtx (XFmode);
38813 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
38815 emit_insn (gen_rtx_SET (VOIDmode, res,
38816 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
38817 UNSPEC_TRUNC_NOOP)));
38819 break;
38820 case XFmode:
38821 emit_insn (gen_frndintxf2_floor (res, tmp1));
38822 break;
38823 case HImode:
38824 emit_insn (gen_lfloorxfhi2 (res, tmp1));
38825 break;
38826 case SImode:
38827 emit_insn (gen_lfloorxfsi2 (res, tmp1));
38828 break;
38829 case DImode:
38830 emit_insn (gen_lfloorxfdi2 (res, tmp1));
38831 break;
38832 default:
38833 gcc_unreachable ();
38836 /* flags = signbit(a) */
38837 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
38839 /* if (flags) then res = -res */
38840 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
38841 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
38842 gen_rtx_LABEL_REF (VOIDmode, jump_label),
38843 pc_rtx);
38844 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
38845 predict_jump (REG_BR_PROB_BASE * 50 / 100);
38846 JUMP_LABEL (insn) = jump_label;
38848 emit_insn (gen_neg (res, res));
38850 emit_label (jump_label);
38851 LABEL_NUSES (jump_label) = 1;
38853 emit_move_insn (op0, res);
38856 /* Output code to perform a Newton-Raphson approximation of a single precision
38857 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
38859 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
38861 rtx x0, x1, e0, e1;
38863 x0 = gen_reg_rtx (mode);
38864 e0 = gen_reg_rtx (mode);
38865 e1 = gen_reg_rtx (mode);
38866 x1 = gen_reg_rtx (mode);
38868 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
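/* Equivalently, this is one Newton-Raphson refinement of the hardware
   reciprocal estimate: with x0 = rcp (b), x1 = x0 * (2 - b * x0), and
   the result is a * x1 (a sketch; the error of the rcp estimate is
   ignored here).  */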
38870 b = force_reg (mode, b);
38872 /* x0 = rcp(b) estimate */
38873 emit_insn (gen_rtx_SET (VOIDmode, x0,
38874 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
38875 UNSPEC_RCP)));
38876 /* e0 = x0 * b */
38877 emit_insn (gen_rtx_SET (VOIDmode, e0,
38878 gen_rtx_MULT (mode, x0, b)));
38880 /* e0 = x0 * e0 */
38881 emit_insn (gen_rtx_SET (VOIDmode, e0,
38882 gen_rtx_MULT (mode, x0, e0)));
38884 /* e1 = x0 + x0 */
38885 emit_insn (gen_rtx_SET (VOIDmode, e1,
38886 gen_rtx_PLUS (mode, x0, x0)));
38888 /* x1 = e1 - e0 */
38889 emit_insn (gen_rtx_SET (VOIDmode, x1,
38890 gen_rtx_MINUS (mode, e1, e0)));
38892 /* res = a * x1 */
38893 emit_insn (gen_rtx_SET (VOIDmode, res,
38894 gen_rtx_MULT (mode, a, x1)));
38897 /* Output code to perform a Newton-Raphson approximation of a
38898 single precision floating point [reciprocal] square root. */
38900 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
38901 bool recip)
38903 rtx x0, e0, e1, e2, e3, mthree, mhalf;
38904 REAL_VALUE_TYPE r;
38906 x0 = gen_reg_rtx (mode);
38907 e0 = gen_reg_rtx (mode);
38908 e1 = gen_reg_rtx (mode);
38909 e2 = gen_reg_rtx (mode);
38910 e3 = gen_reg_rtx (mode);
38912 real_from_integer (&r, VOIDmode, -3, -1, 0);
38913 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
38915 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
38916 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
38918 if (VECTOR_MODE_P (mode))
38920 mthree = ix86_build_const_vector (mode, true, mthree);
38921 mhalf = ix86_build_const_vector (mode, true, mhalf);
38924 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
38925 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
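/* Equivalently, one Newton-Raphson step for rsqrt: with x0 = rsqrt (a),
   x1 = 0.5 * x0 * (3 - a * x0 * x0); for sqrt the result is a * x1,
   which is what folding the extra factor of a into e0 below achieves
   (sketch only).  */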
38927 a = force_reg (mode, a);
38929 /* x0 = rsqrt(a) estimate */
38930 emit_insn (gen_rtx_SET (VOIDmode, x0,
38931 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
38932 UNSPEC_RSQRT)));
38934 /* If a == 0.0, zero out the rsqrt estimate (which would be infinity) to avoid producing a NaN for sqrt (0.0). */
38935 if (!recip)
38937 rtx zero, mask;
38939 zero = gen_reg_rtx (mode);
38940 mask = gen_reg_rtx (mode);
38942 zero = force_reg (mode, CONST0_RTX(mode));
38943 emit_insn (gen_rtx_SET (VOIDmode, mask,
38944 gen_rtx_NE (mode, zero, a)));
38946 emit_insn (gen_rtx_SET (VOIDmode, x0,
38947 gen_rtx_AND (mode, x0, mask)));
38950 /* e0 = x0 * a */
38951 emit_insn (gen_rtx_SET (VOIDmode, e0,
38952 gen_rtx_MULT (mode, x0, a)));
38953 /* e1 = e0 * x0 */
38954 emit_insn (gen_rtx_SET (VOIDmode, e1,
38955 gen_rtx_MULT (mode, e0, x0)));
38957 /* e2 = e1 - 3. */
38958 mthree = force_reg (mode, mthree);
38959 emit_insn (gen_rtx_SET (VOIDmode, e2,
38960 gen_rtx_PLUS (mode, e1, mthree)));
38962 mhalf = force_reg (mode, mhalf);
38963 if (recip)
38964 /* e3 = -.5 * x0 */
38965 emit_insn (gen_rtx_SET (VOIDmode, e3,
38966 gen_rtx_MULT (mode, x0, mhalf)));
38967 else
38968 /* e3 = -.5 * e0 */
38969 emit_insn (gen_rtx_SET (VOIDmode, e3,
38970 gen_rtx_MULT (mode, e0, mhalf)));
38971 /* ret = e2 * e3 */
38972 emit_insn (gen_rtx_SET (VOIDmode, res,
38973 gen_rtx_MULT (mode, e2, e3)));
38976 #ifdef TARGET_SOLARIS
38977 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
38979 static void
38980 i386_solaris_elf_named_section (const char *name, unsigned int flags,
38981 tree decl)
38983 /* With Binutils 2.15, the "@unwind" marker must be specified on
38984 every occurrence of the ".eh_frame" section, not just the first
38985 one. */
38986 if (TARGET_64BIT
38987 && strcmp (name, ".eh_frame") == 0)
38989 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
38990 flags & SECTION_WRITE ? "aw" : "a");
38991 return;
38994 #ifndef USE_GAS
38995 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
38997 solaris_elf_asm_comdat_section (name, flags, decl);
38998 return;
39000 #endif
39002 default_elf_asm_named_section (name, flags, decl);
39004 #endif /* TARGET_SOLARIS */
39006 /* Return the mangling of TYPE if it is an extended fundamental type. */
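/* For illustration (following the Itanium C++ ABI composition rules):
   with these manglings a C++ declaration
     void f (__float128);
   is emitted as _Z1fg, and void f (long double) as _Z1fe.  */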
39008 static const char *
39009 ix86_mangle_type (const_tree type)
39011 type = TYPE_MAIN_VARIANT (type);
39013 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
39014 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
39015 return NULL;
39017 switch (TYPE_MODE (type))
39019 case TFmode:
39020 /* __float128 is "g". */
39021 return "g";
39022 case XFmode:
39023 /* "long double" or __float80 is "e". */
39024 return "e";
39025 default:
39026 return NULL;
39030 /* For 32-bit code we can save PIC register setup by using
39031 __stack_chk_fail_local hidden function instead of calling
39032 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
39033 register, so it is better to call __stack_chk_fail directly. */
39035 static tree ATTRIBUTE_UNUSED
39036 ix86_stack_protect_fail (void)
39038 return TARGET_64BIT
39039 ? default_external_stack_protect_fail ()
39040 : default_hidden_stack_protect_fail ();
39043 /* Select a format to encode pointers in exception handling data. CODE
39044 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
39045 true if the symbol may be affected by dynamic relocations.
39047 ??? All x86 object file formats are capable of representing this.
39048 After all, the relocation needed is the same as for the call insn.
39049 Whether or not a particular assembler allows us to enter such, I
39050 guess we'll have to see. */
39051 int
39052 asm_preferred_eh_data_format (int code, int global)
39054 if (flag_pic)
39056 int type = DW_EH_PE_sdata8;
39057 if (!TARGET_64BIT
39058 || ix86_cmodel == CM_SMALL_PIC
39059 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
39060 type = DW_EH_PE_sdata4;
39061 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
39063 if (ix86_cmodel == CM_SMALL
39064 || (ix86_cmodel == CM_MEDIUM && code))
39065 return DW_EH_PE_udata4;
39066 return DW_EH_PE_absptr;
39069 /* Expand copysign from SIGN to the positive value ABS_VALUE
39070 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
39071 the sign-bit. */
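/* In scalar terms this amounts to (a sketch):

     result = abs_value | (sign & SIGNBIT_MASK);

   i.e. result = copysign (abs_value, sign), assuming ABS_VALUE already
   has its sign bit clear; SIGNBIT_MASK is either built here or obtained
   as the complement of MASK.  */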
39072 static void
39073 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
39075 enum machine_mode mode = GET_MODE (sign);
39076 rtx sgn = gen_reg_rtx (mode);
39077 if (mask == NULL_RTX)
39079 enum machine_mode vmode;
39081 if (mode == SFmode)
39082 vmode = V4SFmode;
39083 else if (mode == DFmode)
39084 vmode = V2DFmode;
39085 else
39086 vmode = mode;
39088 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
39089 if (!VECTOR_MODE_P (mode))
39091 /* We need to generate a scalar mode mask in this case. */
39092 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
39093 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
39094 mask = gen_reg_rtx (mode);
39095 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
39098 else
39099 mask = gen_rtx_NOT (mode, mask);
39100 emit_insn (gen_rtx_SET (VOIDmode, sgn,
39101 gen_rtx_AND (mode, mask, sign)));
39102 emit_insn (gen_rtx_SET (VOIDmode, result,
39103 gen_rtx_IOR (mode, abs_value, sgn)));
39106 /* Expand fabs (OP0) and return a new rtx that holds the result. The
39107 mask for masking out the sign-bit is stored in *SMASK, if that is
39108 non-null. */
39109 static rtx
39110 ix86_expand_sse_fabs (rtx op0, rtx *smask)
39112 enum machine_mode vmode, mode = GET_MODE (op0);
39113 rtx xa, mask;
39115 xa = gen_reg_rtx (mode);
39116 if (mode == SFmode)
39117 vmode = V4SFmode;
39118 else if (mode == DFmode)
39119 vmode = V2DFmode;
39120 else
39121 vmode = mode;
39122 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
39123 if (!VECTOR_MODE_P (mode))
39125 /* We need to generate a scalar mode mask in this case. */
39126 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
39127 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
39128 mask = gen_reg_rtx (mode);
39129 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
39131 emit_insn (gen_rtx_SET (VOIDmode, xa,
39132 gen_rtx_AND (mode, op0, mask)));
39134 if (smask)
39135 *smask = mask;
39137 return xa;
39140 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
39141 swapping the operands if SWAP_OPERANDS is true. The expanded
39142 code is a forward jump to a newly created label in case the
39143 comparison is true. The generated label rtx is returned. */
39144 static rtx
39145 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
39146 bool swap_operands)
39148 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
39149 rtx label, tmp;
39151 if (swap_operands)
39153 tmp = op0;
39154 op0 = op1;
39155 op1 = tmp;
39158 label = gen_label_rtx ();
39159 tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
39160 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39161 gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
39162 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
39163 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
39164 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
39165 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
39166 JUMP_LABEL (tmp) = label;
39168 return label;
39171 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
39172 using comparison code CODE. Operands are swapped for the comparison if
39173 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
39174 static rtx
39175 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
39176 bool swap_operands)
39178 rtx (*insn)(rtx, rtx, rtx, rtx);
39179 enum machine_mode mode = GET_MODE (op0);
39180 rtx mask = gen_reg_rtx (mode);
39182 if (swap_operands)
39184 rtx tmp = op0;
39185 op0 = op1;
39186 op1 = tmp;
39189 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
39191 emit_insn (insn (mask, op0, op1,
39192 gen_rtx_fmt_ee (code, mode, op0, op1)));
39193 return mask;
39196 /* Generate and return a rtx of mode MODE for 2**n where n is the number
39197 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
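/* That is 2**52 for DFmode and 2**23 for SFmode.  Adding and then
   subtracting this constant rounds any nonnegative value smaller than
   the constant to an integer in the current rounding mode, which is the
   trick the expanders below rely on.  */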
39198 static rtx
39199 ix86_gen_TWO52 (enum machine_mode mode)
39201 REAL_VALUE_TYPE TWO52r;
39202 rtx TWO52;
39204 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
39205 TWO52 = const_double_from_real_value (TWO52r, mode);
39206 TWO52 = force_reg (mode, TWO52);
39208 return TWO52;
39211 /* Expand SSE sequence for computing lround from OP1 storing
39212 into OP0. */
39213 void
39214 ix86_expand_lround (rtx op0, rtx op1)
39216 /* C code for the stuff we're doing below:
39217 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
39218 return (long)tmp;
39220 enum machine_mode mode = GET_MODE (op1);
39221 const struct real_format *fmt;
39222 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
39223 rtx adj;
39225 /* load nextafter (0.5, 0.0) */
39226 fmt = REAL_MODE_FORMAT (mode);
39227 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
39228 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
39230 /* adj = copysign (0.5, op1) */
39231 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
39232 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
39234 /* adj = op1 + adj */
39235 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
39237 /* op0 = (imode)adj */
39238 expand_fix (op0, adj, 0);
39241 /* Expand SSE2 sequence for computing lfloor or lceil (depending on
39242 DO_FLOOR) from OPERAND1, storing the result into OPERAND0. */
39243 void
39244 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
39246 /* C code for the stuff we're doing below (for do_floor):
39247 xi = (long)op1;
39248 xi -= (double)xi > op1 ? 1 : 0;
39249 return xi;
39251 enum machine_mode fmode = GET_MODE (op1);
39252 enum machine_mode imode = GET_MODE (op0);
39253 rtx ireg, freg, label, tmp;
39255 /* reg = (long)op1 */
39256 ireg = gen_reg_rtx (imode);
39257 expand_fix (ireg, op1, 0);
39259 /* freg = (double)reg */
39260 freg = gen_reg_rtx (fmode);
39261 expand_float (freg, ireg, 0);
39263 /* ireg = (freg > op1) ? ireg - 1 : ireg */
39264 label = ix86_expand_sse_compare_and_jump (UNLE,
39265 freg, op1, !do_floor);
39266 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
39267 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
39268 emit_move_insn (ireg, tmp);
39270 emit_label (label);
39271 LABEL_NUSES (label) = 1;
39273 emit_move_insn (op0, ireg);
39276 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
39277 result in OPERAND0. */
39278 void
39279 ix86_expand_rint (rtx operand0, rtx operand1)
39281 /* C code for the stuff we're doing below:
39282 xa = fabs (operand1);
39283 if (!isless (xa, 2**52))
39284 return operand1;
39285 xa = xa + 2**52 - 2**52;
39286 return copysign (xa, operand1);
39288 enum machine_mode mode = GET_MODE (operand0);
39289 rtx res, xa, label, TWO52, mask;
39291 res = gen_reg_rtx (mode);
39292 emit_move_insn (res, operand1);
39294 /* xa = abs (operand1) */
39295 xa = ix86_expand_sse_fabs (res, &mask);
39297 /* if (!isless (xa, TWO52)) goto label; */
39298 TWO52 = ix86_gen_TWO52 (mode);
39299 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39301 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
39302 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
39304 ix86_sse_copysign_to_positive (res, xa, res, mask);
39306 emit_label (label);
39307 LABEL_NUSES (label) = 1;
39309 emit_move_insn (operand0, res);
39312 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
39313 into OPERAND0. */
39314 void
39315 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
39317 /* C code for the stuff we expand below.
39318 double xa = fabs (x), x2;
39319 if (!isless (xa, TWO52))
39320 return x;
39321 xa = xa + TWO52 - TWO52;
39322 x2 = copysign (xa, x);
39323 Compensate. Floor:
39324 if (x2 > x)
39325 x2 -= 1;
39326 Compensate. Ceil:
39327 if (x2 < x)
39328 x2 -= -1;
39329 return x2;
39331 enum machine_mode mode = GET_MODE (operand0);
39332 rtx xa, TWO52, tmp, label, one, res, mask;
39334 TWO52 = ix86_gen_TWO52 (mode);
39336 /* Temporary for holding the result, initialized to the input
39337 operand to ease control flow. */
39338 res = gen_reg_rtx (mode);
39339 emit_move_insn (res, operand1);
39341 /* xa = abs (operand1) */
39342 xa = ix86_expand_sse_fabs (res, &mask);
39344 /* if (!isless (xa, TWO52)) goto label; */
39345 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39347 /* xa = xa + TWO52 - TWO52; */
39348 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
39349 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
39351 /* xa = copysign (xa, operand1) */
39352 ix86_sse_copysign_to_positive (xa, xa, res, mask);
39354 /* generate 1.0 or -1.0 */
39355 one = force_reg (mode,
39356 const_double_from_real_value (do_floor
39357 ? dconst1 : dconstm1, mode));
39359 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
39360 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
39361 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39362 gen_rtx_AND (mode, one, tmp)));
39363 /* We always need to subtract here to preserve signed zero. */
39364 tmp = expand_simple_binop (mode, MINUS,
39365 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39366 emit_move_insn (res, tmp);
39368 emit_label (label);
39369 LABEL_NUSES (label) = 1;
39371 emit_move_insn (operand0, res);
39374 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
39375 into OPERAND0. */
39376 void
39377 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
39379 /* C code for the stuff we expand below.
39380 double xa = fabs (x), x2;
39381 if (!isless (xa, TWO52))
39382 return x;
39383 x2 = (double)(long)x;
39384 Compensate. Floor:
39385 if (x2 > x)
39386 x2 -= 1;
39387 Compensate. Ceil:
39388 if (x2 < x)
39389 x2 += 1;
39390 if (HONOR_SIGNED_ZEROS (mode))
39391 return copysign (x2, x);
39392 return x2;
39394 enum machine_mode mode = GET_MODE (operand0);
39395 rtx xa, xi, TWO52, tmp, label, one, res, mask;
39397 TWO52 = ix86_gen_TWO52 (mode);
39399 /* Temporary for holding the result, initialized to the input
39400 operand to ease control flow. */
39401 res = gen_reg_rtx (mode);
39402 emit_move_insn (res, operand1);
39404 /* xa = abs (operand1) */
39405 xa = ix86_expand_sse_fabs (res, &mask);
39407 /* if (!isless (xa, TWO52)) goto label; */
39408 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39410 /* xa = (double)(long)x */
39411 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
39412 expand_fix (xi, res, 0);
39413 expand_float (xa, xi, 0);
39415 /* generate 1.0 */
39416 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
39418 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
39419 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
39420 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39421 gen_rtx_AND (mode, one, tmp)));
39422 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
39423 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39424 emit_move_insn (res, tmp);
39426 if (HONOR_SIGNED_ZEROS (mode))
39427 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
39429 emit_label (label);
39430 LABEL_NUSES (label) = 1;
39432 emit_move_insn (operand0, res);
39435 /* Expand SSE sequence for computing round from OPERAND1 storing
39436 into OPERAND0. Sequence that works without relying on DImode truncation
39437 via cvttsd2siq that is only available on 64bit targets. */
39438 void
39439 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
39441 /* C code for the stuff we expand below.
39442 double xa = fabs (x), xa2, x2;
39443 if (!isless (xa, TWO52))
39444 return x;
39445 Using the absolute value and copying back sign makes
39446 -0.0 -> -0.0 correct.
39447 xa2 = xa + TWO52 - TWO52;
39448 Compensate.
39449 dxa = xa2 - xa;
39450 if (dxa <= -0.5)
39451 xa2 += 1;
39452 else if (dxa > 0.5)
39453 xa2 -= 1;
39454 x2 = copysign (xa2, x);
39455 return x2;
39457 enum machine_mode mode = GET_MODE (operand0);
39458 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
39460 TWO52 = ix86_gen_TWO52 (mode);
39462 /* Temporary for holding the result, initialized to the input
39463 operand to ease control flow. */
39464 res = gen_reg_rtx (mode);
39465 emit_move_insn (res, operand1);
39467 /* xa = abs (operand1) */
39468 xa = ix86_expand_sse_fabs (res, &mask);
39470 /* if (!isless (xa, TWO52)) goto label; */
39471 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39473 /* xa2 = xa + TWO52 - TWO52; */
39474 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
39475 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
39477 /* dxa = xa2 - xa; */
39478 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
39480 /* generate 0.5, 1.0 and -0.5 */
39481 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
39482 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
39483 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
39484 0, OPTAB_DIRECT);
39486 /* Compensate. */
39487 tmp = gen_reg_rtx (mode);
39488 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
39489 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
39490 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39491 gen_rtx_AND (mode, one, tmp)));
39492 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39493 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
39494 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
39495 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39496 gen_rtx_AND (mode, one, tmp)));
39497 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39499 /* res = copysign (xa2, operand1) */
39500 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
39502 emit_label (label);
39503 LABEL_NUSES (label) = 1;
39505 emit_move_insn (operand0, res);
39508 /* Expand SSE sequence for computing trunc from OPERAND1 storing
39509 into OPERAND0. */
39510 void
39511 ix86_expand_trunc (rtx operand0, rtx operand1)
39513 /* C code for SSE variant we expand below.
39514 double xa = fabs (x), x2;
39515 if (!isless (xa, TWO52))
39516 return x;
39517 x2 = (double)(long)x;
39518 if (HONOR_SIGNED_ZEROS (mode))
39519 return copysign (x2, x);
39520 return x2;
39522 enum machine_mode mode = GET_MODE (operand0);
39523 rtx xa, xi, TWO52, label, res, mask;
39525 TWO52 = ix86_gen_TWO52 (mode);
39527 /* Temporary for holding the result, initialized to the input
39528 operand to ease control flow. */
39529 res = gen_reg_rtx (mode);
39530 emit_move_insn (res, operand1);
39532 /* xa = abs (operand1) */
39533 xa = ix86_expand_sse_fabs (res, &mask);
39535 /* if (!isless (xa, TWO52)) goto label; */
39536 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39538 /* x = (double)(long)x */
39539 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
39540 expand_fix (xi, res, 0);
39541 expand_float (res, xi, 0);
39543 if (HONOR_SIGNED_ZEROS (mode))
39544 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
39546 emit_label (label);
39547 LABEL_NUSES (label) = 1;
39549 emit_move_insn (operand0, res);
39552 /* Expand SSE sequence for computing trunc from OPERAND1 storing
39553 into OPERAND0. */
39554 void
39555 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
39557 enum machine_mode mode = GET_MODE (operand0);
39558 rtx xa, mask, TWO52, label, one, res, smask, tmp;
39560 /* C code for SSE variant we expand below.
39561 double xa = fabs (x), x2;
39562 if (!isless (xa, TWO52))
39563 return x;
39564 xa2 = xa + TWO52 - TWO52;
39565 Compensate:
39566 if (xa2 > xa)
39567 xa2 -= 1.0;
39568 x2 = copysign (xa2, x);
39569 return x2;
39572 TWO52 = ix86_gen_TWO52 (mode);
39574 /* Temporary for holding the result, initialized to the input
39575 operand to ease control flow. */
39576 res = gen_reg_rtx (mode);
39577 emit_move_insn (res, operand1);
39579 /* xa = abs (operand1) */
39580 xa = ix86_expand_sse_fabs (res, &smask);
39582 /* if (!isless (xa, TWO52)) goto label; */
39583 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39585 /* res = xa + TWO52 - TWO52; */
39586 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
39587 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
39588 emit_move_insn (res, tmp);
39590 /* generate 1.0 */
39591 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
39593 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
39594 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
39595 emit_insn (gen_rtx_SET (VOIDmode, mask,
39596 gen_rtx_AND (mode, mask, one)));
39597 tmp = expand_simple_binop (mode, MINUS,
39598 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
39599 emit_move_insn (res, tmp);
39601 /* res = copysign (res, operand1) */
39602 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
39604 emit_label (label);
39605 LABEL_NUSES (label) = 1;
39607 emit_move_insn (operand0, res);
39610 /* Expand SSE sequence for computing round from OPERAND1 storing
39611 into OPERAND0. */
39612 void
39613 ix86_expand_round (rtx operand0, rtx operand1)
39615 /* C code for the stuff we're doing below:
39616 double xa = fabs (x);
39617 if (!isless (xa, TWO52))
39618 return x;
39619 xa = (double)(long)(xa + nextafter (0.5, 0.0));
39620 return copysign (xa, x);
39622 enum machine_mode mode = GET_MODE (operand0);
39623 rtx res, TWO52, xa, label, xi, half, mask;
39624 const struct real_format *fmt;
39625 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
39627 /* Temporary for holding the result, initialized to the input
39628 operand to ease control flow. */
39629 res = gen_reg_rtx (mode);
39630 emit_move_insn (res, operand1);
39632 TWO52 = ix86_gen_TWO52 (mode);
39633 xa = ix86_expand_sse_fabs (res, &mask);
39634 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39636 /* load nextafter (0.5, 0.0) */
39637 fmt = REAL_MODE_FORMAT (mode);
39638 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
39639 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
39641 /* xa = xa + 0.5 */
39642 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
39643 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
39645 /* xa = (double)(int64_t)xa */
39646 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
39647 expand_fix (xi, xa, 0);
39648 expand_float (xa, xi, 0);
39650 /* res = copysign (xa, operand1) */
39651 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
39653 emit_label (label);
39654 LABEL_NUSES (label) = 1;
39656 emit_move_insn (operand0, res);
39659 /* Expand SSE sequence for computing round
39660 from OP1 storing into OP0 using sse4 round insn. */
39661 void
39662 ix86_expand_round_sse4 (rtx op0, rtx op1)
39664 enum machine_mode mode = GET_MODE (op0);
39665 rtx e1, e2, res, half;
39666 const struct real_format *fmt;
39667 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
39668 rtx (*gen_copysign) (rtx, rtx, rtx);
39669 rtx (*gen_round) (rtx, rtx, rtx);
39671 switch (mode)
39673 case SFmode:
39674 gen_copysign = gen_copysignsf3;
39675 gen_round = gen_sse4_1_roundsf2;
39676 break;
39677 case DFmode:
39678 gen_copysign = gen_copysigndf3;
39679 gen_round = gen_sse4_1_rounddf2;
39680 break;
39681 default:
39682 gcc_unreachable ();
39685 /* round (a) = trunc (a + copysign (0.5, a)) */
39687 /* load nextafter (0.5, 0.0) */
39688 fmt = REAL_MODE_FORMAT (mode);
39689 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
39690 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
39691 half = const_double_from_real_value (pred_half, mode);
39693 /* e1 = copysign (0.5, op1) */
39694 e1 = gen_reg_rtx (mode);
39695 emit_insn (gen_copysign (e1, half, op1));
39697 /* e2 = op1 + e1 */
39698 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
39700 /* res = trunc (e2) */
39701 res = gen_reg_rtx (mode);
39702 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
39704 emit_move_insn (op0, res);
39708 /* Table of valid machine attributes. */
39709 static const struct attribute_spec ix86_attribute_table[] =
39711 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
39712 affects_type_identity } */
39713 /* Stdcall attribute says callee is responsible for popping arguments
39714 if they are not variable. */
39715 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39716 true },
39717 /* Fastcall attribute says callee is responsible for popping arguments
39718 if they are not variable. */
39719 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39720 true },
39721 /* Thiscall attribute says callee is responsible for popping arguments
39722 if they are not variable. */
39723 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39724 true },
39725 /* Cdecl attribute says the callee is a normal C declaration */
39726 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39727 true },
39728 /* Regparm attribute specifies how many integer arguments are to be
39729 passed in registers. */
39730 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
39731 true },
39732 /* Sseregparm attribute says we are using x86_64 calling conventions
39733 for FP arguments. */
39734 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39735 true },
39736 /* The transactional memory builtins are implicitly regparm or fastcall
39737 depending on the ABI. Override the generic do-nothing attribute that
39738 these builtins were declared with. */
39739 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
39740 true },
39741 /* force_align_arg_pointer says this function realigns the stack at entry. */
39742 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
39743 false, true, true, ix86_handle_cconv_attribute, false },
39744 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
39745 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
39746 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
39747 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
39748 false },
39749 #endif
39750 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
39751 false },
39752 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
39753 false },
39754 #ifdef SUBTARGET_ATTRIBUTE_TABLE
39755 SUBTARGET_ATTRIBUTE_TABLE,
39756 #endif
39757 /* ms_abi and sysv_abi calling convention function attributes. */
39758 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
39759 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
39760 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
39761 false },
39762 { "callee_pop_aggregate_return", 1, 1, false, true, true,
39763 ix86_handle_callee_pop_aggregate_return, true },
39764 /* End element. */
39765 { NULL, 0, 0, false, false, false, NULL, false }
39768 /* Implement targetm.vectorize.builtin_vectorization_cost. */
39769 static int
39770 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
39771 tree vectype,
39772 int misalign ATTRIBUTE_UNUSED)
39774 unsigned elements;
39776 switch (type_of_cost)
39778 case scalar_stmt:
39779 return ix86_cost->scalar_stmt_cost;
39781 case scalar_load:
39782 return ix86_cost->scalar_load_cost;
39784 case scalar_store:
39785 return ix86_cost->scalar_store_cost;
39787 case vector_stmt:
39788 return ix86_cost->vec_stmt_cost;
39790 case vector_load:
39791 return ix86_cost->vec_align_load_cost;
39793 case vector_store:
39794 return ix86_cost->vec_store_cost;
39796 case vec_to_scalar:
39797 return ix86_cost->vec_to_scalar_cost;
39799 case scalar_to_vec:
39800 return ix86_cost->scalar_to_vec_cost;
39802 case unaligned_load:
39803 case unaligned_store:
39804 return ix86_cost->vec_unalign_load_cost;
39806 case cond_branch_taken:
39807 return ix86_cost->cond_taken_branch_cost;
39809 case cond_branch_not_taken:
39810 return ix86_cost->cond_not_taken_branch_cost;
39812 case vec_perm:
39813 case vec_promote_demote:
39814 return ix86_cost->vec_stmt_cost;
39816 case vec_construct:
39817 elements = TYPE_VECTOR_SUBPARTS (vectype);
39818 return elements / 2 + 1;
39820 default:
39821 gcc_unreachable ();
39825 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
39826 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
39827 insn every time. */
39829 static GTY(()) rtx vselect_insn;
39831 /* Initialize vselect_insn. */
39833 static void
39834 init_vselect_insn (void)
39836 unsigned i;
39837 rtx x;
39839 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
39840 for (i = 0; i < MAX_VECT_LEN; ++i)
39841 XVECEXP (x, 0, i) = const0_rtx;
39842 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
39843 const0_rtx), x);
39844 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
39845 start_sequence ();
39846 vselect_insn = emit_insn (x);
39847 end_sequence ();
39850 /* Construct (set target (vec_select op0 (parallel perm))) and
39851 return true if that's a valid instruction in the active ISA. */
39853 static bool
39854 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
39855 unsigned nelt, bool testing_p)
39857 unsigned int i;
39858 rtx x, save_vconcat;
39859 int icode;
39861 if (vselect_insn == NULL_RTX)
39862 init_vselect_insn ();
39864 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
39865 PUT_NUM_ELEM (XVEC (x, 0), nelt);
39866 for (i = 0; i < nelt; ++i)
39867 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
39868 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
39869 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
39870 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
39871 SET_DEST (PATTERN (vselect_insn)) = target;
39872 icode = recog_memoized (vselect_insn);
39874 if (icode >= 0 && !testing_p)
39875 emit_insn (copy_rtx (PATTERN (vselect_insn)));
39877 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
39878 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
39879 INSN_CODE (vselect_insn) = -1;
39881 return icode >= 0;
39884 /* Similar, but generate a vec_concat from op0 and op1 as well. */
39886 static bool
39887 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
39888 const unsigned char *perm, unsigned nelt,
39889 bool testing_p)
39891 enum machine_mode v2mode;
39892 rtx x;
39893 bool ok;
39895 if (vselect_insn == NULL_RTX)
39896 init_vselect_insn ();
39898 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
39899 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
39900 PUT_MODE (x, v2mode);
39901 XEXP (x, 0) = op0;
39902 XEXP (x, 1) = op1;
39903 ok = expand_vselect (target, x, perm, nelt, testing_p);
39904 XEXP (x, 0) = const0_rtx;
39905 XEXP (x, 1) = const0_rtx;
39906 return ok;
39909 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39910 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
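/* A blend keeps every element in place and only chooses which operand
   it is taken from, i.e. each permutation index is either I or
   I + NELT.  For instance, in V4SFmode {0, 5, 2, 7} is a blend while
   {1, 5, 2, 7} is not.  */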
39912 static bool
39913 expand_vec_perm_blend (struct expand_vec_perm_d *d)
39915 enum machine_mode vmode = d->vmode;
39916 unsigned i, mask, nelt = d->nelt;
39917 rtx target, op0, op1, x;
39918 rtx rperm[32], vperm;
39920 if (d->one_operand_p)
39921 return false;
39922 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
39924 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
39926 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
39928 else
39929 return false;
39931 /* This is a blend, not a permute. Elements must stay in their
39932 respective lanes. */
39933 for (i = 0; i < nelt; ++i)
39935 unsigned e = d->perm[i];
39936 if (!(e == i || e == i + nelt))
39937 return false;
39940 if (d->testing_p)
39941 return true;
39943 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
39944 decision should be extracted elsewhere, so that we only try that
39945 sequence once all budget==3 options have been tried. */
39946 target = d->target;
39947 op0 = d->op0;
39948 op1 = d->op1;
39949 mask = 0;
39951 switch (vmode)
39953 case V4DFmode:
39954 case V8SFmode:
39955 case V2DFmode:
39956 case V4SFmode:
39957 case V8HImode:
39958 case V8SImode:
39959 for (i = 0; i < nelt; ++i)
39960 mask |= (d->perm[i] >= nelt) << i;
39961 break;
39963 case V2DImode:
39964 for (i = 0; i < 2; ++i)
39965 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
39966 vmode = V8HImode;
39967 goto do_subreg;
39969 case V4SImode:
39970 for (i = 0; i < 4; ++i)
39971 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
39972 vmode = V8HImode;
39973 goto do_subreg;
39975 case V16QImode:
39976 /* See if bytes move in pairs so we can use pblendw with
39977 an immediate argument, rather than pblendvb with a vector
39978 argument. */
39979 for (i = 0; i < 16; i += 2)
39980 if (d->perm[i] + 1 != d->perm[i + 1])
39982 use_pblendvb:
39983 for (i = 0; i < nelt; ++i)
39984 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
39986 finish_pblendvb:
39987 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
39988 vperm = force_reg (vmode, vperm);
39990 if (GET_MODE_SIZE (vmode) == 16)
39991 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
39992 else
39993 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
39994 if (target != d->target)
39995 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
39996 return true;
39999 for (i = 0; i < 8; ++i)
40000 mask |= (d->perm[i * 2] >= 16) << i;
40001 vmode = V8HImode;
40002 /* FALLTHRU */
40004 do_subreg:
40005 target = gen_reg_rtx (vmode);
40006 op0 = gen_lowpart (vmode, op0);
40007 op1 = gen_lowpart (vmode, op1);
40008 break;
40010 case V32QImode:
40011 /* See if bytes move in pairs. If not, vpblendvb must be used. */
40012 for (i = 0; i < 32; i += 2)
40013 if (d->perm[i] + 1 != d->perm[i + 1])
40014 goto use_pblendvb;
40015 /* See if bytes move in quadruplets. If yes, vpblendd
40016 with immediate can be used. */
40017 for (i = 0; i < 32; i += 4)
40018 if (d->perm[i] + 2 != d->perm[i + 2])
40019 break;
40020 if (i < 32)
40022 /* See if bytes move the same in both lanes. If yes,
40023 vpblendw with immediate can be used. */
40024 for (i = 0; i < 16; i += 2)
40025 if (d->perm[i] + 16 != d->perm[i + 16])
40026 goto use_pblendvb;
40028 /* Use vpblendw. */
40029 for (i = 0; i < 16; ++i)
40030 mask |= (d->perm[i * 2] >= 32) << i;
40031 vmode = V16HImode;
40032 goto do_subreg;
40035 /* Use vpblendd. */
40036 for (i = 0; i < 8; ++i)
40037 mask |= (d->perm[i * 4] >= 32) << i;
40038 vmode = V8SImode;
40039 goto do_subreg;
40041 case V16HImode:
40042 /* See if words move in pairs. If yes, vpblendd can be used. */
40043 for (i = 0; i < 16; i += 2)
40044 if (d->perm[i] + 1 != d->perm[i + 1])
40045 break;
40046 if (i < 16)
40048 /* See if words move the same in both lanes. If not,
40049 vpblendvb must be used. */
40050 for (i = 0; i < 8; i++)
40051 if (d->perm[i] + 8 != d->perm[i + 8])
40053 /* Use vpblendvb. */
40054 for (i = 0; i < 32; ++i)
40055 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
40057 vmode = V32QImode;
40058 nelt = 32;
40059 target = gen_reg_rtx (vmode);
40060 op0 = gen_lowpart (vmode, op0);
40061 op1 = gen_lowpart (vmode, op1);
40062 goto finish_pblendvb;
40065 /* Use vpblendw. */
40066 for (i = 0; i < 16; ++i)
40067 mask |= (d->perm[i] >= 16) << i;
40068 break;
40071 /* Use vpblendd. */
40072 for (i = 0; i < 8; ++i)
40073 mask |= (d->perm[i * 2] >= 16) << i;
40074 vmode = V8SImode;
40075 goto do_subreg;
40077 case V4DImode:
40078 /* Use vpblendd. */
40079 for (i = 0; i < 4; ++i)
40080 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
40081 vmode = V8SImode;
40082 goto do_subreg;
40084 default:
40085 gcc_unreachable ();
40088 /* This matches five different patterns with the different modes. */
40089 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
40090 x = gen_rtx_SET (VOIDmode, target, x);
40091 emit_insn (x);
40092 if (target != d->target)
40093 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
40095 return true;
40098 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
40099 in terms of the variable form of vpermilps.
40101 Note that we will have already failed the immediate input vpermilps,
40102 which requires that the high and low part shuffle be identical; the
40103 variable form doesn't require that. */
40105 static bool
40106 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
40108 rtx rperm[8], vperm;
40109 unsigned i;
40111 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
40112 return false;
40114 /* We can only permute within each 128-bit lane. */
40115 for (i = 0; i < 8; ++i)
40117 unsigned e = d->perm[i];
40118 if (i < 4 ? e >= 4 : e < 4)
40119 return false;
40122 if (d->testing_p)
40123 return true;
40125 for (i = 0; i < 8; ++i)
40127 unsigned e = d->perm[i];
40129 /* Within each 128-bit lane, the elements of op0 are numbered
40130 from 0 and the elements of op1 are numbered from 4. */
40131 if (e >= 8 + 4)
40132 e -= 8;
40133 else if (e >= 4)
40134 e -= 4;
40136 rperm[i] = GEN_INT (e);
40139 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
40140 vperm = force_reg (V8SImode, vperm);
40141 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
40143 return true;
40146 /* Return true if permutation D can be performed as VMODE permutation
40147 instead. */
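/* For example, a V16QImode permutation whose indices come in aligned
   groups of four consecutive bytes (0 1 2 3, 12 13 14 15, ...) can just
   as well be carried out as a V4SImode permutation of the corresponding
   32-bit words.  */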
40149 static bool
40150 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
40152 unsigned int i, j, chunk;
40154 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
40155 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
40156 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
40157 return false;
40159 if (GET_MODE_NUNITS (vmode) >= d->nelt)
40160 return true;
40162 chunk = d->nelt / GET_MODE_NUNITS (vmode);
40163 for (i = 0; i < d->nelt; i += chunk)
40164 if (d->perm[i] & (chunk - 1))
40165 return false;
40166 else
40167 for (j = 1; j < chunk; ++j)
40168 if (d->perm[i] + j != d->perm[i + j])
40169 return false;
40171 return true;
40174 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
40175 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
40177 static bool
40178 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
40180 unsigned i, nelt, eltsz, mask;
40181 unsigned char perm[32];
40182 enum machine_mode vmode = V16QImode;
40183 rtx rperm[32], vperm, target, op0, op1;
40185 nelt = d->nelt;
40187 if (!d->one_operand_p)
40189 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
40191 if (TARGET_AVX2
40192 && valid_perm_using_mode_p (V2TImode, d))
40194 if (d->testing_p)
40195 return true;
40197 /* Use vperm2i128 insn. The pattern uses
40198 V4DImode instead of V2TImode. */
40199 target = d->target;
40200 if (d->vmode != V4DImode)
40201 target = gen_reg_rtx (V4DImode);
40202 op0 = gen_lowpart (V4DImode, d->op0);
40203 op1 = gen_lowpart (V4DImode, d->op1);
40204 rperm[0]
40205 = GEN_INT ((d->perm[0] / (nelt / 2))
40206 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
40207 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
40208 if (target != d->target)
40209 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
40210 return true;
40212 return false;
40215 else
40217 if (GET_MODE_SIZE (d->vmode) == 16)
40219 if (!TARGET_SSSE3)
40220 return false;
40222 else if (GET_MODE_SIZE (d->vmode) == 32)
40224 if (!TARGET_AVX2)
40225 return false;
40227 /* V4DImode should already have been handled through
40228 expand_vselect by the vpermq instruction. */
40229 gcc_assert (d->vmode != V4DImode);
40231 vmode = V32QImode;
40232 if (d->vmode == V8SImode
40233 || d->vmode == V16HImode
40234 || d->vmode == V32QImode)
40236 /* First see if vpermq can be used for
40237 V8SImode/V16HImode/V32QImode. */
40238 if (valid_perm_using_mode_p (V4DImode, d))
40240 for (i = 0; i < 4; i++)
40241 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
40242 if (d->testing_p)
40243 return true;
40244 target = gen_reg_rtx (V4DImode);
40245 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
40246 perm, 4, false))
40248 emit_move_insn (d->target,
40249 gen_lowpart (d->vmode, target));
40250 return true;
40252 return false;
40255 /* Next see if vpermd can be used. */
40256 if (valid_perm_using_mode_p (V8SImode, d))
40257 vmode = V8SImode;
40259 /* Or if vpermps can be used. */
40260 else if (d->vmode == V8SFmode)
40261 vmode = V8SImode;
40263 if (vmode == V32QImode)
40265 /* vpshufb only works within 128-bit lanes; it is not
40266 possible to shuffle bytes across lanes. */
40267 for (i = 0; i < nelt; ++i)
40268 if ((d->perm[i] ^ i) & (nelt / 2))
40269 return false;
40272 else
40273 return false;
40276 if (d->testing_p)
40277 return true;
40279 if (vmode == V8SImode)
40280 for (i = 0; i < 8; ++i)
40281 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
40282 else
40284 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40285 if (!d->one_operand_p)
40286 mask = 2 * nelt - 1;
40287 else if (vmode == V16QImode)
40288 mask = nelt - 1;
40289 else
40290 mask = nelt / 2 - 1;
40292 for (i = 0; i < nelt; ++i)
40294 unsigned j, e = d->perm[i] & mask;
40295 for (j = 0; j < eltsz; ++j)
40296 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
40300 vperm = gen_rtx_CONST_VECTOR (vmode,
40301 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
40302 vperm = force_reg (vmode, vperm);
40304 target = d->target;
40305 if (d->vmode != vmode)
40306 target = gen_reg_rtx (vmode);
40307 op0 = gen_lowpart (vmode, d->op0);
40308 if (d->one_operand_p)
40310 if (vmode == V16QImode)
40311 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
40312 else if (vmode == V32QImode)
40313 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
40314 else if (vmode == V8SFmode)
40315 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
40316 else
40317 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
40319 else
40321 op1 = gen_lowpart (vmode, d->op1);
40322 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
40324 if (target != d->target)
40325 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
40327 return true;
40330 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
40331 in a single instruction. */
40333 static bool
40334 expand_vec_perm_1 (struct expand_vec_perm_d *d)
40336 unsigned i, nelt = d->nelt;
40337 unsigned char perm2[MAX_VECT_LEN];
40339 /* Check plain VEC_SELECT first, because AVX has instructions that could
40340 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
40341 input where SEL+CONCAT may not. */
40342 if (d->one_operand_p)
40344 int mask = nelt - 1;
40345 bool identity_perm = true;
40346 bool broadcast_perm = true;
40348 for (i = 0; i < nelt; i++)
40350 perm2[i] = d->perm[i] & mask;
40351 if (perm2[i] != i)
40352 identity_perm = false;
40353 if (perm2[i])
40354 broadcast_perm = false;
40357 if (identity_perm)
40359 if (!d->testing_p)
40360 emit_move_insn (d->target, d->op0);
40361 return true;
40363 else if (broadcast_perm && TARGET_AVX2)
40365 /* Use vpbroadcast{b,w,d}. */
40366 rtx (*gen) (rtx, rtx) = NULL;
40367 switch (d->vmode)
40369 case V32QImode:
40370 gen = gen_avx2_pbroadcastv32qi_1;
40371 break;
40372 case V16HImode:
40373 gen = gen_avx2_pbroadcastv16hi_1;
40374 break;
40375 case V8SImode:
40376 gen = gen_avx2_pbroadcastv8si_1;
40377 break;
40378 case V16QImode:
40379 gen = gen_avx2_pbroadcastv16qi;
40380 break;
40381 case V8HImode:
40382 gen = gen_avx2_pbroadcastv8hi;
40383 break;
40384 case V8SFmode:
40385 gen = gen_avx2_vec_dupv8sf_1;
40386 break;
40387 /* For other modes prefer other shuffles this function creates. */
40388 default: break;
40390 if (gen != NULL)
40392 if (!d->testing_p)
40393 emit_insn (gen (d->target, d->op0));
40394 return true;
40398 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
40399 return true;
40401 /* There are plenty of patterns in sse.md that are written for
40402 SEL+CONCAT and are not replicated for a single op. Perhaps
40403 that should be changed, to avoid the nastiness here. */
40405 /* Recognize interleave style patterns, which means incrementing
40406 every other permutation operand. */
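/* E.g. a single-operand V4SImode permutation { 0, 0, 1, 1 } becomes
   { 0, 4, 1, 5 } here, which matches punpckldq of the operand with itself. */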
40407 for (i = 0; i < nelt; i += 2)
40409 perm2[i] = d->perm[i] & mask;
40410 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
40412 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
40413 d->testing_p))
40414 return true;
40416 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
40417 if (nelt >= 4)
40419 for (i = 0; i < nelt; i += 4)
40421 perm2[i + 0] = d->perm[i + 0] & mask;
40422 perm2[i + 1] = d->perm[i + 1] & mask;
40423 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
40424 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
40427 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
40428 d->testing_p))
40429 return true;
40433 /* Finally, try the fully general two operand permute. */
40434 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
40435 d->testing_p))
40436 return true;
40438 /* Recognize interleave style patterns with reversed operands. */
40439 if (!d->one_operand_p)
40441 for (i = 0; i < nelt; ++i)
40443 unsigned e = d->perm[i];
40444 if (e >= nelt)
40445 e -= nelt;
40446 else
40447 e += nelt;
40448 perm2[i] = e;
40451 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
40452 d->testing_p))
40453 return true;
40456 /* Try the SSE4.1 blend variable merge instructions. */
40457 if (expand_vec_perm_blend (d))
40458 return true;
40460 /* Try one of the AVX vpermil variable permutations. */
40461 if (expand_vec_perm_vpermil (d))
40462 return true;
40464 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
40465 vpshufb, vpermd, vpermps or vpermq variable permutation. */
40466 if (expand_vec_perm_pshufb (d))
40467 return true;
40469 return false;
40472 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
40473 in terms of a pair of pshuflw + pshufhw instructions. */
40475 static bool
40476 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
40478 unsigned char perm2[MAX_VECT_LEN];
40479 unsigned i;
40480 bool ok;
40482 if (d->vmode != V8HImode || !d->one_operand_p)
40483 return false;
40485 /* The two permutations only operate in 64-bit lanes. */
40486 for (i = 0; i < 4; ++i)
40487 if (d->perm[i] >= 4)
40488 return false;
40489 for (i = 4; i < 8; ++i)
40490 if (d->perm[i] < 4)
40491 return false;
40493 if (d->testing_p)
40494 return true;
40496 /* Emit the pshuflw. */
40497 memcpy (perm2, d->perm, 4);
40498 for (i = 4; i < 8; ++i)
40499 perm2[i] = i;
40500 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
40501 gcc_assert (ok);
40503 /* Emit the pshufhw. */
40504 memcpy (perm2 + 4, d->perm + 4, 4);
40505 for (i = 0; i < 4; ++i)
40506 perm2[i] = i;
40507 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
40508 gcc_assert (ok);
40510 return true;
40513 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40514 the permutation using the SSSE3 palignr instruction. This succeeds
40515 when all of the elements in PERM fit within one vector and we merely
40516 need to shift them down so that a single vector permutation has a
40517 chance to succeed. */
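/* For example, the two-operand V8HImode permutation { 3, 4, ..., 10 } reads a
   contiguous window of the concatenated operands; once palignr has shifted that
   window down by min == 3 elements, the remaining single-operand permutation is
   the identity. */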
40519 static bool
40520 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
40522 unsigned i, nelt = d->nelt;
40523 unsigned min, max;
40524 bool in_order, ok;
40525 rtx shift, target;
40526 struct expand_vec_perm_d dcopy;
40528 /* Even with AVX, palignr only operates on 128-bit vectors. */
40529 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
40530 return false;
40532 min = nelt, max = 0;
40533 for (i = 0; i < nelt; ++i)
40535 unsigned e = d->perm[i];
40536 if (e < min)
40537 min = e;
40538 if (e > max)
40539 max = e;
40541 if (min == 0 || max - min >= nelt)
40542 return false;
40544 /* Given that we have SSSE3, we know we'll be able to implement the
40545 single operand permutation after the palignr with pshufb. */
40546 if (d->testing_p)
40547 return true;
40549 dcopy = *d;
40550 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
40551 target = gen_reg_rtx (TImode);
40552 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
40553 gen_lowpart (TImode, d->op0), shift));
40555 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
40556 dcopy.one_operand_p = true;
40558 in_order = true;
40559 for (i = 0; i < nelt; ++i)
40561 unsigned e = dcopy.perm[i] - min;
40562 if (e != i)
40563 in_order = false;
40564 dcopy.perm[i] = e;
40567 /* Test for the degenerate case where the alignment by itself
40568 produces the desired permutation. */
40569 if (in_order)
40571 emit_move_insn (d->target, dcopy.op0);
40572 return true;
40575 ok = expand_vec_perm_1 (&dcopy);
40576 gcc_assert (ok);
40578 return ok;
40581 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
40583 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40584 a two vector permutation into a single vector permutation by using
40585 an interleave operation to merge the vectors. */
40587 static bool
40588 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
40590 struct expand_vec_perm_d dremap, dfinal;
40591 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
40592 unsigned HOST_WIDE_INT contents;
40593 unsigned char remap[2 * MAX_VECT_LEN];
40594 rtx seq;
40595 bool ok, same_halves = false;
40597 if (GET_MODE_SIZE (d->vmode) == 16)
40599 if (d->one_operand_p)
40600 return false;
40602 else if (GET_MODE_SIZE (d->vmode) == 32)
40604 if (!TARGET_AVX)
40605 return false;
40606 /* For 32-byte modes allow even the d->one_operand_p case.
40607 The lack of cross-lane shuffling in some instructions
40608 might prevent a single insn shuffle. */
40609 dfinal = *d;
40610 dfinal.testing_p = true;
40611 /* If expand_vec_perm_interleave3 can expand this into
40612 a 3 insn sequence, give up and let it be expanded as
40613 a 3 insn sequence. While that is one insn longer,
40614 it doesn't need a memory operand, and in the common
40615 case where the interleave low and interleave high
40616 permutations with the same operands are adjacent, the
40617 pair needs only 4 insns after CSE. */
40618 if (expand_vec_perm_interleave3 (&dfinal))
40619 return false;
40621 else
40622 return false;
40624 /* Examine from whence the elements come. */
40625 contents = 0;
40626 for (i = 0; i < nelt; ++i)
40627 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
40629 memset (remap, 0xff, sizeof (remap));
40630 dremap = *d;
40632 if (GET_MODE_SIZE (d->vmode) == 16)
40634 unsigned HOST_WIDE_INT h1, h2, h3, h4;
40636 /* Split the two input vectors into 4 halves. */
40637 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
40638 h2 = h1 << nelt2;
40639 h3 = h2 << nelt2;
40640 h4 = h3 << nelt2;
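/* E.g. for V8HImode: h1 = 0x000f and h2 = 0x00f0 cover the low and high half
   of op0, h3 = 0x0f00 and h4 = 0xf000 the low and high half of op1. */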
40642 /* If the elements are all from the low halves, use interleave low; similarly
40643 for interleave high. If the elements are from mis-matched halves, we
40644 can use shufps for V4SF/V4SI or do a DImode shuffle. */
40645 if ((contents & (h1 | h3)) == contents)
40647 /* punpckl* */
40648 for (i = 0; i < nelt2; ++i)
40650 remap[i] = i * 2;
40651 remap[i + nelt] = i * 2 + 1;
40652 dremap.perm[i * 2] = i;
40653 dremap.perm[i * 2 + 1] = i + nelt;
40655 if (!TARGET_SSE2 && d->vmode == V4SImode)
40656 dremap.vmode = V4SFmode;
40658 else if ((contents & (h2 | h4)) == contents)
40660 /* punpckh* */
40661 for (i = 0; i < nelt2; ++i)
40663 remap[i + nelt2] = i * 2;
40664 remap[i + nelt + nelt2] = i * 2 + 1;
40665 dremap.perm[i * 2] = i + nelt2;
40666 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
40668 if (!TARGET_SSE2 && d->vmode == V4SImode)
40669 dremap.vmode = V4SFmode;
40671 else if ((contents & (h1 | h4)) == contents)
40673 /* shufps */
40674 for (i = 0; i < nelt2; ++i)
40676 remap[i] = i;
40677 remap[i + nelt + nelt2] = i + nelt2;
40678 dremap.perm[i] = i;
40679 dremap.perm[i + nelt2] = i + nelt + nelt2;
40681 if (nelt != 4)
40683 /* shufpd */
40684 dremap.vmode = V2DImode;
40685 dremap.nelt = 2;
40686 dremap.perm[0] = 0;
40687 dremap.perm[1] = 3;
40690 else if ((contents & (h2 | h3)) == contents)
40692 /* shufps */
40693 for (i = 0; i < nelt2; ++i)
40695 remap[i + nelt2] = i;
40696 remap[i + nelt] = i + nelt2;
40697 dremap.perm[i] = i + nelt2;
40698 dremap.perm[i + nelt2] = i + nelt;
40700 if (nelt != 4)
40702 /* shufpd */
40703 dremap.vmode = V2DImode;
40704 dremap.nelt = 2;
40705 dremap.perm[0] = 1;
40706 dremap.perm[1] = 2;
40709 else
40710 return false;
40712 else
40714 unsigned int nelt4 = nelt / 4, nzcnt = 0;
40715 unsigned HOST_WIDE_INT q[8];
40716 unsigned int nonzero_halves[4];
40718 /* Split the two input vectors into 8 quarters. */
40719 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
40720 for (i = 1; i < 8; ++i)
40721 q[i] = q[0] << (nelt4 * i);
40722 for (i = 0; i < 4; ++i)
40723 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
40725 nonzero_halves[nzcnt] = i;
40726 ++nzcnt;
40729 if (nzcnt == 1)
40731 gcc_assert (d->one_operand_p);
40732 nonzero_halves[1] = nonzero_halves[0];
40733 same_halves = true;
40735 else if (d->one_operand_p)
40737 gcc_assert (nonzero_halves[0] == 0);
40738 gcc_assert (nonzero_halves[1] == 1);
40741 if (nzcnt <= 2)
40743 if (d->perm[0] / nelt2 == nonzero_halves[1])
40745 /* Attempt to increase the likelihood that dfinal
40746 shuffle will be intra-lane. */
40747 char tmph = nonzero_halves[0];
40748 nonzero_halves[0] = nonzero_halves[1];
40749 nonzero_halves[1] = tmph;
40752 /* vperm2f128 or vperm2i128. */
40753 for (i = 0; i < nelt2; ++i)
40755 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
40756 remap[i + nonzero_halves[0] * nelt2] = i;
40757 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
40758 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
40761 if (d->vmode != V8SFmode
40762 && d->vmode != V4DFmode
40763 && d->vmode != V8SImode)
40765 dremap.vmode = V8SImode;
40766 dremap.nelt = 8;
40767 for (i = 0; i < 4; ++i)
40769 dremap.perm[i] = i + nonzero_halves[0] * 4;
40770 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
40774 else if (d->one_operand_p)
40775 return false;
40776 else if (TARGET_AVX2
40777 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
40779 /* vpunpckl* */
40780 for (i = 0; i < nelt4; ++i)
40782 remap[i] = i * 2;
40783 remap[i + nelt] = i * 2 + 1;
40784 remap[i + nelt2] = i * 2 + nelt2;
40785 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
40786 dremap.perm[i * 2] = i;
40787 dremap.perm[i * 2 + 1] = i + nelt;
40788 dremap.perm[i * 2 + nelt2] = i + nelt2;
40789 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
40792 else if (TARGET_AVX2
40793 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
40795 /* vpunpckh* */
40796 for (i = 0; i < nelt4; ++i)
40798 remap[i + nelt4] = i * 2;
40799 remap[i + nelt + nelt4] = i * 2 + 1;
40800 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
40801 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
40802 dremap.perm[i * 2] = i + nelt4;
40803 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
40804 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
40805 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
40808 else
40809 return false;
40812 /* Use the remapping array set up above to move the elements from their
40813 swizzled locations into their final destinations. */
40814 dfinal = *d;
40815 for (i = 0; i < nelt; ++i)
40817 unsigned e = remap[d->perm[i]];
40818 gcc_assert (e < nelt);
40819 /* If same_halves is true, both halves of the remapped vector are the
40820 same. Avoid cross-lane accesses if possible. */
40821 if (same_halves && i >= nelt2)
40823 gcc_assert (e < nelt2);
40824 dfinal.perm[i] = e + nelt2;
40826 else
40827 dfinal.perm[i] = e;
40829 dremap.target = gen_reg_rtx (dremap.vmode);
40830 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
40831 dfinal.op1 = dfinal.op0;
40832 dfinal.one_operand_p = true;
40834 /* Test if the final remap can be done with a single insn. For V4SFmode or
40835 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
40836 start_sequence ();
40837 ok = expand_vec_perm_1 (&dfinal);
40838 seq = get_insns ();
40839 end_sequence ();
40841 if (!ok)
40842 return false;
40844 if (d->testing_p)
40845 return true;
40847 if (dremap.vmode != dfinal.vmode)
40849 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
40850 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
40853 ok = expand_vec_perm_1 (&dremap);
40854 gcc_assert (ok);
40856 emit_insn (seq);
40857 return true;
40860 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40861 a single vector cross-lane permutation into vpermq followed
40862 by any of the single insn permutations. */
40864 static bool
40865 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
40867 struct expand_vec_perm_d dremap, dfinal;
40868 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
40869 unsigned contents[2];
40870 bool ok;
40872 if (!(TARGET_AVX2
40873 && (d->vmode == V32QImode || d->vmode == V16HImode)
40874 && d->one_operand_p))
40875 return false;
40877 contents[0] = 0;
40878 contents[1] = 0;
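/* Record which 64-bit quarters of the operand each half of the permutation
   reads; the vpermq below can bring at most two distinct quarters into each
   128-bit lane of the intermediate. */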
40879 for (i = 0; i < nelt2; ++i)
40881 contents[0] |= 1u << (d->perm[i] / nelt4);
40882 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
40885 for (i = 0; i < 2; ++i)
40887 unsigned int cnt = 0;
40888 for (j = 0; j < 4; ++j)
40889 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
40890 return false;
40893 if (d->testing_p)
40894 return true;
40896 dremap = *d;
40897 dremap.vmode = V4DImode;
40898 dremap.nelt = 4;
40899 dremap.target = gen_reg_rtx (V4DImode);
40900 dremap.op0 = gen_lowpart (V4DImode, d->op0);
40901 dremap.op1 = dremap.op0;
40902 dremap.one_operand_p = true;
40903 for (i = 0; i < 2; ++i)
40905 unsigned int cnt = 0;
40906 for (j = 0; j < 4; ++j)
40907 if ((contents[i] & (1u << j)) != 0)
40908 dremap.perm[2 * i + cnt++] = j;
40909 for (; cnt < 2; ++cnt)
40910 dremap.perm[2 * i + cnt] = 0;
40913 dfinal = *d;
40914 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
40915 dfinal.op1 = dfinal.op0;
40916 dfinal.one_operand_p = true;
40917 for (i = 0, j = 0; i < nelt; ++i)
40919 if (i == nelt2)
40920 j = 2;
40921 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
40922 if ((d->perm[i] / nelt4) == dremap.perm[j])
40924 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
40925 dfinal.perm[i] |= nelt4;
40926 else
40927 gcc_unreachable ();
40930 ok = expand_vec_perm_1 (&dremap);
40931 gcc_assert (ok);
40933 ok = expand_vec_perm_1 (&dfinal);
40934 gcc_assert (ok);
40936 return true;
40939 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
40940 a vector permutation using two instructions, vperm2f128 resp.
40941 vperm2i128 followed by any single in-lane permutation. */
40943 static bool
40944 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
40946 struct expand_vec_perm_d dfirst, dsecond;
40947 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
40948 bool ok;
40950 if (!TARGET_AVX
40951 || GET_MODE_SIZE (d->vmode) != 32
40952 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
40953 return false;
40955 dsecond = *d;
40956 dsecond.one_operand_p = false;
40957 dsecond.testing_p = true;
40959 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
40960 immediate. For perm < 16 the second permutation uses
40961 d->op0 as first operand, for perm >= 16 it uses d->op1
40962 as first operand. The second operand is the result of
40963 vperm2[fi]128. */
40964 for (perm = 0; perm < 32; perm++)
40966 /* Ignore permutations which do not move anything cross-lane. */
40967 if (perm < 16)
40969 /* The second shuffle for e.g. V4DFmode has
40970 0123 and ABCD operands.
40971 Ignore AB23, as 23 is already in the second lane
40972 of the first operand. */
40973 if ((perm & 0xc) == (1 << 2)) continue;
40974 /* And 01CD, as 01 is in the first lane of the first
40975 operand. */
40976 if ((perm & 3) == 0) continue;
40977 /* And 4567, as then the vperm2[fi]128 doesn't change
40978 anything on the original 4567 second operand. */
40979 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
40981 else
40983 /* The second shuffle for e.g. V4DFmode has
40984 4567 and ABCD operands.
40985 Ignore AB67, as 67 is already in the second lane
40986 of the first operand. */
40987 if ((perm & 0xc) == (3 << 2)) continue;
40988 /* And 45CD, as 45 is in the first lane of the first
40989 operand. */
40990 if ((perm & 3) == 2) continue;
40991 /* And 0123, as then the vperm2[fi]128 doesn't change
40992 anything on the original 0123 first operand. */
40993 if ((perm & 0xf) == (1 << 2)) continue;
40996 for (i = 0; i < nelt; i++)
40998 j = d->perm[i] / nelt2;
40999 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
41000 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
41001 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
41002 dsecond.perm[i] = d->perm[i] & (nelt - 1);
41003 else
41004 break;
41007 if (i == nelt)
41009 start_sequence ();
41010 ok = expand_vec_perm_1 (&dsecond);
41011 end_sequence ();
41013 else
41014 ok = false;
41016 if (ok)
41018 if (d->testing_p)
41019 return true;
41021 /* Found a usable second shuffle. dfirst will be
41022 vperm2f128 on d->op0 and d->op1. */
41023 dsecond.testing_p = false;
41024 dfirst = *d;
41025 dfirst.target = gen_reg_rtx (d->vmode);
41026 for (i = 0; i < nelt; i++)
41027 dfirst.perm[i] = (i & (nelt2 - 1))
41028 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
41030 ok = expand_vec_perm_1 (&dfirst);
41031 gcc_assert (ok);
41033 /* And dsecond is some single insn shuffle, taking
41034 d->op0 and result of vperm2f128 (if perm < 16) or
41035 d->op1 and result of vperm2f128 (otherwise). */
41036 dsecond.op1 = dfirst.target;
41037 if (perm >= 16)
41038 dsecond.op0 = dfirst.op1;
41040 ok = expand_vec_perm_1 (&dsecond);
41041 gcc_assert (ok);
41043 return true;
41046 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
41047 if (d->one_operand_p)
41048 return false;
41051 return false;
41054 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
41055 a two vector permutation using 2 intra-lane interleave insns
41056 and cross-lane shuffle for 32-byte vectors. */
41058 static bool
41059 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
41061 unsigned i, nelt;
41062 rtx (*gen) (rtx, rtx, rtx);
41064 if (d->one_operand_p)
41065 return false;
41066 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
41068 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
41070 else
41071 return false;
41073 nelt = d->nelt;
41074 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
41075 return false;
41076 for (i = 0; i < nelt; i += 2)
41077 if (d->perm[i] != d->perm[0] + i / 2
41078 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
41079 return false;
41081 if (d->testing_p)
41082 return true;
41084 switch (d->vmode)
41086 case V32QImode:
41087 if (d->perm[0])
41088 gen = gen_vec_interleave_highv32qi;
41089 else
41090 gen = gen_vec_interleave_lowv32qi;
41091 break;
41092 case V16HImode:
41093 if (d->perm[0])
41094 gen = gen_vec_interleave_highv16hi;
41095 else
41096 gen = gen_vec_interleave_lowv16hi;
41097 break;
41098 case V8SImode:
41099 if (d->perm[0])
41100 gen = gen_vec_interleave_highv8si;
41101 else
41102 gen = gen_vec_interleave_lowv8si;
41103 break;
41104 case V4DImode:
41105 if (d->perm[0])
41106 gen = gen_vec_interleave_highv4di;
41107 else
41108 gen = gen_vec_interleave_lowv4di;
41109 break;
41110 case V8SFmode:
41111 if (d->perm[0])
41112 gen = gen_vec_interleave_highv8sf;
41113 else
41114 gen = gen_vec_interleave_lowv8sf;
41115 break;
41116 case V4DFmode:
41117 if (d->perm[0])
41118 gen = gen_vec_interleave_highv4df;
41119 else
41120 gen = gen_vec_interleave_lowv4df;
41121 break;
41122 default:
41123 gcc_unreachable ();
41126 emit_insn (gen (d->target, d->op0, d->op1));
41127 return true;
41130 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
41131 a single vector permutation using a single intra-lane vector
41132 permutation, vperm2f128 swapping the lanes and vblend* insn blending
41133 the non-swapped and swapped vectors together. */
41135 static bool
41136 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
41138 struct expand_vec_perm_d dfirst, dsecond;
41139 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
41140 rtx seq;
41141 bool ok;
41142 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
41144 if (!TARGET_AVX
41145 || TARGET_AVX2
41146 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
41147 || !d->one_operand_p)
41148 return false;
41150 dfirst = *d;
41151 for (i = 0; i < nelt; i++)
41152 dfirst.perm[i] = 0xff;
41153 for (i = 0, msk = 0; i < nelt; i++)
41155 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
41156 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
41157 return false;
41158 dfirst.perm[j] = d->perm[i];
41159 if (j != i)
41160 msk |= (1 << i);
41162 for (i = 0; i < nelt; i++)
41163 if (dfirst.perm[i] == 0xff)
41164 dfirst.perm[i] = i;
41166 if (!d->testing_p)
41167 dfirst.target = gen_reg_rtx (dfirst.vmode);
41169 start_sequence ();
41170 ok = expand_vec_perm_1 (&dfirst);
41171 seq = get_insns ();
41172 end_sequence ();
41174 if (!ok)
41175 return false;
41177 if (d->testing_p)
41178 return true;
41180 emit_insn (seq);
41182 dsecond = *d;
41183 dsecond.op0 = dfirst.target;
41184 dsecond.op1 = dfirst.target;
41185 dsecond.one_operand_p = true;
41186 dsecond.target = gen_reg_rtx (dsecond.vmode);
41187 for (i = 0; i < nelt; i++)
41188 dsecond.perm[i] = i ^ nelt2;
41190 ok = expand_vec_perm_1 (&dsecond);
41191 gcc_assert (ok);
41193 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
41194 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
41195 return true;
41198 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
41199 permutation using two vperm2f128, followed by a vshufpd insn blending
41200 the two vectors together. */
41202 static bool
41203 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
41205 struct expand_vec_perm_d dfirst, dsecond, dthird;
41206 bool ok;
41208 if (!TARGET_AVX || (d->vmode != V4DFmode))
41209 return false;
41211 if (d->testing_p)
41212 return true;
41214 dfirst = *d;
41215 dsecond = *d;
41216 dthird = *d;
41218 dfirst.perm[0] = (d->perm[0] & ~1);
41219 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
41220 dfirst.perm[2] = (d->perm[2] & ~1);
41221 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
41222 dsecond.perm[0] = (d->perm[1] & ~1);
41223 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
41224 dsecond.perm[2] = (d->perm[3] & ~1);
41225 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
41226 dthird.perm[0] = (d->perm[0] % 2);
41227 dthird.perm[1] = (d->perm[1] % 2) + 4;
41228 dthird.perm[2] = (d->perm[2] % 2) + 2;
41229 dthird.perm[3] = (d->perm[3] % 2) + 6;
41231 dfirst.target = gen_reg_rtx (dfirst.vmode);
41232 dsecond.target = gen_reg_rtx (dsecond.vmode);
41233 dthird.op0 = dfirst.target;
41234 dthird.op1 = dsecond.target;
41235 dthird.one_operand_p = false;
41237 canonicalize_perm (&dfirst);
41238 canonicalize_perm (&dsecond);
41240 ok = expand_vec_perm_1 (&dfirst)
41241 && expand_vec_perm_1 (&dsecond)
41242 && expand_vec_perm_1 (&dthird);
41244 gcc_assert (ok);
41246 return true;
41249 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
41250 permutation with two pshufb insns and an ior. We should have already
41251 failed all two instruction sequences. */
41253 static bool
41254 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
41256 rtx rperm[2][16], vperm, l, h, op, m128;
41257 unsigned int i, nelt, eltsz;
41259 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
41260 return false;
41261 gcc_assert (!d->one_operand_p);
41263 nelt = d->nelt;
41264 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
41266 /* Generate two permutation masks. If the required element is within
41267 the given vector it is shuffled into the proper lane. If the required
41268 element is in the other vector, force a zero into the lane by setting
41269 bit 7 in the permutation mask. */
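/* E.g. for the V16QImode extract-even permutation { 0, 2, ..., 30 } the first
   mask is { 0, 2, ..., 14, -128, ..., -128 } and the second is
   { -128, ..., -128, 0, 2, ..., 14 }; or-ing the two pshufb results merges them. */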
41270 m128 = GEN_INT (-128);
41271 for (i = 0; i < nelt; ++i)
41273 unsigned j, e = d->perm[i];
41274 unsigned which = (e >= nelt);
41275 if (e >= nelt)
41276 e -= nelt;
41278 for (j = 0; j < eltsz; ++j)
41280 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
41281 rperm[1-which][i*eltsz + j] = m128;
41285 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
41286 vperm = force_reg (V16QImode, vperm);
41288 l = gen_reg_rtx (V16QImode);
41289 op = gen_lowpart (V16QImode, d->op0);
41290 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
41292 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
41293 vperm = force_reg (V16QImode, vperm);
41295 h = gen_reg_rtx (V16QImode);
41296 op = gen_lowpart (V16QImode, d->op1);
41297 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
41299 op = d->target;
41300 if (d->vmode != V16QImode)
41301 op = gen_reg_rtx (V16QImode);
41302 emit_insn (gen_iorv16qi3 (op, l, h));
41303 if (op != d->target)
41304 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
41306 return true;
41309 /* Implement an arbitrary permutation of one V32QImode or V16HImode operand
41310 with two vpshufb insns, vpermq and vpor. We should have already failed
41311 all two or three instruction sequences. */
41313 static bool
41314 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
41316 rtx rperm[2][32], vperm, l, h, hp, op, m128;
41317 unsigned int i, nelt, eltsz;
41319 if (!TARGET_AVX2
41320 || !d->one_operand_p
41321 || (d->vmode != V32QImode && d->vmode != V16HImode))
41322 return false;
41324 if (d->testing_p)
41325 return true;
41327 nelt = d->nelt;
41328 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
41330 /* Generate two permutation masks. If the required element is within
41331 the same lane, it is shuffled in. If the required element is from the
41332 other lane, force a zero by setting bit 7 in the permutation mask.
41333 The other mask has a non-negative element wherever an element is
41334 requested from the other lane, but places it in the opposite lane,
41335 so that the result of vpshufb ends up with its two V2TImode halves
41336 swapped. */
41337 m128 = GEN_INT (-128);
41338 for (i = 0; i < nelt; ++i)
41340 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
41341 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
41343 for (j = 0; j < eltsz; ++j)
41345 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
41346 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
41350 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
41351 vperm = force_reg (V32QImode, vperm);
41353 h = gen_reg_rtx (V32QImode);
41354 op = gen_lowpart (V32QImode, d->op0);
41355 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
41357 /* Swap the two 128-bit lanes of h into hp. */
41358 hp = gen_reg_rtx (V4DImode);
41359 op = gen_lowpart (V4DImode, h);
41360 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
41361 const1_rtx));
41363 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
41364 vperm = force_reg (V32QImode, vperm);
41366 l = gen_reg_rtx (V32QImode);
41367 op = gen_lowpart (V32QImode, d->op0);
41368 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
41370 op = d->target;
41371 if (d->vmode != V32QImode)
41372 op = gen_reg_rtx (V32QImode);
41373 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
41374 if (op != d->target)
41375 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
41377 return true;
41380 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
41381 and extract-odd permutations of two V32QImode or V16HImode operands
41382 with two vpshufb insns, vpor and vpermq. We should have already
41383 failed all two or three instruction sequences. */
41385 static bool
41386 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
41388 rtx rperm[2][32], vperm, l, h, ior, op, m128;
41389 unsigned int i, nelt, eltsz;
41391 if (!TARGET_AVX2
41392 || d->one_operand_p
41393 || (d->vmode != V32QImode && d->vmode != V16HImode))
41394 return false;
41396 for (i = 0; i < d->nelt; ++i)
41397 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
41398 return false;
41400 if (d->testing_p)
41401 return true;
41403 nelt = d->nelt;
41404 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
41406 /* Generate two permutation masks. In the first permutation mask
41407 the first quarter will contain indexes for the first half
41408 of the op0, the second quarter will contain bit 7 set, third quarter
41409 will contain indexes for the second half of the op0 and the
41410 last quarter bit 7 set. In the second permutation mask
41411 the first quarter will contain bit 7 set, the second quarter
41412 indexes for the first half of the op1, the third quarter bit 7 set
41413 and last quarter indexes for the second half of the op1.
41414 I.e. the first mask e.g. for V32QImode extract even will be:
41415 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
41416 (all values masked with 0xf except for -128) and second mask
41417 for extract even will be
41418 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
41419 m128 = GEN_INT (-128);
41420 for (i = 0; i < nelt; ++i)
41422 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
41423 unsigned which = d->perm[i] >= nelt;
41424 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
41426 for (j = 0; j < eltsz; ++j)
41428 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
41429 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
41433 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
41434 vperm = force_reg (V32QImode, vperm);
41436 l = gen_reg_rtx (V32QImode);
41437 op = gen_lowpart (V32QImode, d->op0);
41438 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
41440 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
41441 vperm = force_reg (V32QImode, vperm);
41443 h = gen_reg_rtx (V32QImode);
41444 op = gen_lowpart (V32QImode, d->op1);
41445 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
41447 ior = gen_reg_rtx (V32QImode);
41448 emit_insn (gen_iorv32qi3 (ior, l, h));
41450 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
41451 op = gen_reg_rtx (V4DImode);
41452 ior = gen_lowpart (V4DImode, ior);
41453 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
41454 const1_rtx, GEN_INT (3)));
41455 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
41457 return true;
41460 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
41461 and extract-odd permutations. */
41463 static bool
41464 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
41466 rtx t1, t2, t3, t4, t5;
41468 switch (d->vmode)
41470 case V4DFmode:
41471 t1 = gen_reg_rtx (V4DFmode);
41472 t2 = gen_reg_rtx (V4DFmode);
41474 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
41475 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
41476 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
41478 /* Now an unpck[lh]pd will produce the result required. */
41479 if (odd)
41480 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
41481 else
41482 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
41483 emit_insn (t3);
41484 break;
41486 case V8SFmode:
41488 int mask = odd ? 0xdd : 0x88;
41490 t1 = gen_reg_rtx (V8SFmode);
41491 t2 = gen_reg_rtx (V8SFmode);
41492 t3 = gen_reg_rtx (V8SFmode);
41494 /* Shuffle within the 128-bit lanes to produce:
41495 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
41496 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
41497 GEN_INT (mask)));
41499 /* Shuffle the lanes around to produce:
41500 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
41501 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
41502 GEN_INT (0x3)));
41504 /* Shuffle within the 128-bit lanes to produce:
41505 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
41506 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
41508 /* Shuffle within the 128-bit lanes to produce:
41509 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
41510 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
41512 /* Shuffle the lanes around to produce:
41513 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
41514 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
41515 GEN_INT (0x20)));
41517 break;
41519 case V2DFmode:
41520 case V4SFmode:
41521 case V2DImode:
41522 case V4SImode:
41523 /* These are always directly implementable by expand_vec_perm_1. */
41524 gcc_unreachable ();
41526 case V8HImode:
41527 if (TARGET_SSSE3)
41528 return expand_vec_perm_pshufb2 (d);
41529 else
41531 /* We need 2*log2(N)-1 operations to achieve odd/even
41532 with interleave. */
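/* For V8HImode that is 2*3 - 1 = 5 interleave insns. */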
41533 t1 = gen_reg_rtx (V8HImode);
41534 t2 = gen_reg_rtx (V8HImode);
41535 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
41536 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
41537 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
41538 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
41539 if (odd)
41540 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
41541 else
41542 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
41543 emit_insn (t3);
41545 break;
41547 case V16QImode:
41548 if (TARGET_SSSE3)
41549 return expand_vec_perm_pshufb2 (d);
41550 else
41552 t1 = gen_reg_rtx (V16QImode);
41553 t2 = gen_reg_rtx (V16QImode);
41554 t3 = gen_reg_rtx (V16QImode);
41555 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
41556 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
41557 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
41558 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
41559 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
41560 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
41561 if (odd)
41562 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
41563 else
41564 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
41565 emit_insn (t3);
41567 break;
41569 case V16HImode:
41570 case V32QImode:
41571 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
41573 case V4DImode:
41574 if (!TARGET_AVX2)
41576 struct expand_vec_perm_d d_copy = *d;
41577 d_copy.vmode = V4DFmode;
41578 d_copy.target = gen_reg_rtx (V4DFmode);
41579 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
41580 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
41581 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
41583 if (!d->testing_p)
41584 emit_move_insn (d->target,
41585 gen_lowpart (V4DImode, d_copy.target));
41586 return true;
41588 return false;
41591 t1 = gen_reg_rtx (V4DImode);
41592 t2 = gen_reg_rtx (V4DImode);
41594 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
41595 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
41596 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
41598 /* Now a vpunpck[lh]qdq will produce the result required. */
41599 if (odd)
41600 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
41601 else
41602 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
41603 emit_insn (t3);
41604 break;
41606 case V8SImode:
41607 if (!TARGET_AVX2)
41609 struct expand_vec_perm_d d_copy = *d;
41610 d_copy.vmode = V8SFmode;
41611 d_copy.target = gen_reg_rtx (V8SFmode);
41612 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
41613 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
41614 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
41616 if (!d->testing_p)
41617 emit_move_insn (d->target,
41618 gen_lowpart (V8SImode, d_copy.target));
41619 return true;
41621 return false;
41624 t1 = gen_reg_rtx (V8SImode);
41625 t2 = gen_reg_rtx (V8SImode);
41626 t3 = gen_reg_rtx (V4DImode);
41627 t4 = gen_reg_rtx (V4DImode);
41628 t5 = gen_reg_rtx (V4DImode);
41630 /* Shuffle the lanes around into
41631 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
41632 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
41633 gen_lowpart (V4DImode, d->op1),
41634 GEN_INT (0x20)));
41635 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
41636 gen_lowpart (V4DImode, d->op1),
41637 GEN_INT (0x31)));
41639 /* Swap the 2nd and 3rd position in each lane into
41640 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
41641 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
41642 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
41643 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
41644 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
41646 /* Now a vpunpck[lh]qdq will produce
41647 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
41648 if (odd)
41649 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
41650 gen_lowpart (V4DImode, t2));
41651 else
41652 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
41653 gen_lowpart (V4DImode, t2));
41654 emit_insn (t3);
41655 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
41656 break;
41658 default:
41659 gcc_unreachable ();
41662 return true;
41665 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
41666 extract-even and extract-odd permutations. */
41668 static bool
41669 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
41671 unsigned i, odd, nelt = d->nelt;
41673 odd = d->perm[0];
41674 if (odd != 0 && odd != 1)
41675 return false;
41677 for (i = 1; i < nelt; ++i)
41678 if (d->perm[i] != 2 * i + odd)
41679 return false;
41681 return expand_vec_perm_even_odd_1 (d, odd);
41684 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
41685 permutations. We assume that expand_vec_perm_1 has already failed. */
41687 static bool
41688 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
41690 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
41691 enum machine_mode vmode = d->vmode;
41692 unsigned char perm2[4];
41693 rtx op0 = d->op0, dest;
41694 bool ok;
41696 switch (vmode)
41698 case V4DFmode:
41699 case V8SFmode:
41700 /* These are special-cased in sse.md so that we can optionally
41701 use the vbroadcast instruction. They expand to two insns
41702 if the input happens to be in a register. */
41703 gcc_unreachable ();
41705 case V2DFmode:
41706 case V2DImode:
41707 case V4SFmode:
41708 case V4SImode:
41709 /* These are always implementable using standard shuffle patterns. */
41710 gcc_unreachable ();
41712 case V8HImode:
41713 case V16QImode:
41714 /* These can be implemented via interleave. We save one insn by
41715 stopping once we have promoted to V4SImode and then using pshufd. */
41718 rtx dest;
41719 rtx (*gen) (rtx, rtx, rtx)
41720 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
41721 : gen_vec_interleave_lowv8hi;
41723 if (elt >= nelt2)
41725 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
41726 : gen_vec_interleave_highv8hi;
41727 elt -= nelt2;
41729 nelt2 /= 2;
41731 dest = gen_reg_rtx (vmode);
41732 emit_insn (gen (dest, op0, op0));
41733 vmode = get_mode_wider_vector (vmode);
41734 op0 = gen_lowpart (vmode, dest);
41736 while (vmode != V4SImode);
41738 memset (perm2, elt, 4);
41739 dest = gen_reg_rtx (V4SImode);
41740 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
41741 gcc_assert (ok);
41742 if (!d->testing_p)
41743 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
41744 return true;
41746 case V32QImode:
41747 case V16HImode:
41748 case V8SImode:
41749 case V4DImode:
41750 /* For AVX2 broadcasts of the first element vpbroadcast* or
41751 vpermq should be used by expand_vec_perm_1. */
41752 gcc_assert (!TARGET_AVX2 || d->perm[0]);
41753 return false;
41755 default:
41756 gcc_unreachable ();
41760 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
41761 broadcast permutations. */
41763 static bool
41764 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
41766 unsigned i, elt, nelt = d->nelt;
41768 if (!d->one_operand_p)
41769 return false;
41771 elt = d->perm[0];
41772 for (i = 1; i < nelt; ++i)
41773 if (d->perm[i] != elt)
41774 return false;
41776 return expand_vec_perm_broadcast_1 (d);
41779 /* Implement an arbitrary permutation of two V32QImode or V16HImode operands
41780 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
41781 all the shorter instruction sequences. */
41783 static bool
41784 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
41786 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
41787 unsigned int i, nelt, eltsz;
41788 bool used[4];
41790 if (!TARGET_AVX2
41791 || d->one_operand_p
41792 || (d->vmode != V32QImode && d->vmode != V16HImode))
41793 return false;
41795 if (d->testing_p)
41796 return true;
41798 nelt = d->nelt;
41799 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
41801 /* Generate 4 permutation masks. If the required element is within
41802 the same lane, it is shuffled in. If the required element is from the
41803 other lane, force a zero by setting bit 7 in the permutation mask.
41804 In the other masks a non-negative element is used wherever an element is
41805 requested from the other lane, but it is placed in the opposite lane,
41806 so that the result of vpshufb can have the two V2TImode halves
41807 swapped. */
41808 m128 = GEN_INT (-128);
41809 for (i = 0; i < 32; ++i)
41811 rperm[0][i] = m128;
41812 rperm[1][i] = m128;
41813 rperm[2][i] = m128;
41814 rperm[3][i] = m128;
41816 used[0] = false;
41817 used[1] = false;
41818 used[2] = false;
41819 used[3] = false;
41820 for (i = 0; i < nelt; ++i)
41822 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
41823 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
41824 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
41826 for (j = 0; j < eltsz; ++j)
41827 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
41828 used[which] = true;
41831 for (i = 0; i < 2; ++i)
41833 if (!used[2 * i + 1])
41835 h[i] = NULL_RTX;
41836 continue;
41838 vperm = gen_rtx_CONST_VECTOR (V32QImode,
41839 gen_rtvec_v (32, rperm[2 * i + 1]));
41840 vperm = force_reg (V32QImode, vperm);
41841 h[i] = gen_reg_rtx (V32QImode);
41842 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
41843 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
41846 /* Swap the two 128-bit lanes of h[X]. */
41847 for (i = 0; i < 2; ++i)
41849 if (h[i] == NULL_RTX)
41850 continue;
41851 op = gen_reg_rtx (V4DImode);
41852 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
41853 const2_rtx, GEN_INT (3), const0_rtx,
41854 const1_rtx));
41855 h[i] = gen_lowpart (V32QImode, op);
41858 for (i = 0; i < 2; ++i)
41860 if (!used[2 * i])
41862 l[i] = NULL_RTX;
41863 continue;
41865 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
41866 vperm = force_reg (V32QImode, vperm);
41867 l[i] = gen_reg_rtx (V32QImode);
41868 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
41869 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
41872 for (i = 0; i < 2; ++i)
41874 if (h[i] && l[i])
41876 op = gen_reg_rtx (V32QImode);
41877 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
41878 l[i] = op;
41880 else if (h[i])
41881 l[i] = h[i];
41884 gcc_assert (l[0] && l[1]);
41885 op = d->target;
41886 if (d->vmode != V32QImode)
41887 op = gen_reg_rtx (V32QImode);
41888 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
41889 if (op != d->target)
41890 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
41891 return true;
41894 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
41895 With all of the interface bits taken care of, perform the expansion
41896 in D and return true on success. */
41898 static bool
41899 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
41901 /* Try a single instruction expansion. */
41902 if (expand_vec_perm_1 (d))
41903 return true;
41905 /* Try sequences of two instructions. */
41907 if (expand_vec_perm_pshuflw_pshufhw (d))
41908 return true;
41910 if (expand_vec_perm_palignr (d))
41911 return true;
41913 if (expand_vec_perm_interleave2 (d))
41914 return true;
41916 if (expand_vec_perm_broadcast (d))
41917 return true;
41919 if (expand_vec_perm_vpermq_perm_1 (d))
41920 return true;
41922 if (expand_vec_perm_vperm2f128 (d))
41923 return true;
41925 /* Try sequences of three instructions. */
41927 if (expand_vec_perm_2vperm2f128_vshuf (d))
41928 return true;
41930 if (expand_vec_perm_pshufb2 (d))
41931 return true;
41933 if (expand_vec_perm_interleave3 (d))
41934 return true;
41936 if (expand_vec_perm_vperm2f128_vblend (d))
41937 return true;
41939 /* Try sequences of four instructions. */
41941 if (expand_vec_perm_vpshufb2_vpermq (d))
41942 return true;
41944 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
41945 return true;
41947 /* ??? Look for narrow permutations whose element orderings would
41948 allow the promotion to a wider mode. */
41950 /* ??? Look for sequences of interleave or a wider permute that place
41951 the data into the correct lanes for a half-vector shuffle like
41952 pshuf[lh]w or vpermilps. */
41954 /* ??? Look for sequences of interleave that produce the desired results.
41955 The combinatorics of punpck[lh] get pretty ugly... */
41957 if (expand_vec_perm_even_odd (d))
41958 return true;
41960 /* Even longer sequences. */
41961 if (expand_vec_perm_vpshufb4_vpermq2 (d))
41962 return true;
41964 return false;
41967 /* If a permutation only uses one operand, make it clear. Returns true
41968 if the permutation references both operands. */
41970 static bool
41971 canonicalize_perm (struct expand_vec_perm_d *d)
41973 int i, which, nelt = d->nelt;
41975 for (i = which = 0; i < nelt; ++i)
41976 which |= (d->perm[i] < nelt ? 1 : 2);
41978 d->one_operand_p = true;
41979 switch (which)
41981 default:
41982 gcc_unreachable();
41984 case 3:
41985 if (!rtx_equal_p (d->op0, d->op1))
41987 d->one_operand_p = false;
41988 break;
41990 /* The elements of PERM do not suggest that only the first operand
41991 is used, but both operands are identical. Allow easier matching
41992 of the permutation by folding the permutation into the single
41993 input vector. */
41994 /* FALLTHRU */
41996 case 2:
41997 for (i = 0; i < nelt; ++i)
41998 d->perm[i] &= nelt - 1;
41999 d->op0 = d->op1;
42000 break;
42002 case 1:
42003 d->op1 = d->op0;
42004 break;
42007 return (which == 3);
42010 bool
42011 ix86_expand_vec_perm_const (rtx operands[4])
42013 struct expand_vec_perm_d d;
42014 unsigned char perm[MAX_VECT_LEN];
42015 int i, nelt;
42016 bool two_args;
42017 rtx sel;
42019 d.target = operands[0];
42020 d.op0 = operands[1];
42021 d.op1 = operands[2];
42022 sel = operands[3];
42024 d.vmode = GET_MODE (d.target);
42025 gcc_assert (VECTOR_MODE_P (d.vmode));
42026 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
42027 d.testing_p = false;
42029 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
42030 gcc_assert (XVECLEN (sel, 0) == nelt);
42031 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
42033 for (i = 0; i < nelt; ++i)
42035 rtx e = XVECEXP (sel, 0, i);
42036 int ei = INTVAL (e) & (2 * nelt - 1);
42037 d.perm[i] = ei;
42038 perm[i] = ei;
42041 two_args = canonicalize_perm (&d);
42043 if (ix86_expand_vec_perm_const_1 (&d))
42044 return true;
42046 /* If the selector says both arguments are needed, but the operands are the
42047 same, the above tried to expand with one_operand_p and flattened selector.
42048 If that didn't work, retry without one_operand_p; we succeeded with that
42049 during testing. */
42050 if (two_args && d.one_operand_p)
42052 d.one_operand_p = false;
42053 memcpy (d.perm, perm, sizeof (perm));
42054 return ix86_expand_vec_perm_const_1 (&d);
42057 return false;
42060 /* Implement targetm.vectorize.vec_perm_const_ok. */
42062 static bool
42063 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
42064 const unsigned char *sel)
42066 struct expand_vec_perm_d d;
42067 unsigned int i, nelt, which;
42068 bool ret;
42070 d.vmode = vmode;
42071 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
42072 d.testing_p = true;
42074 /* Given sufficient ISA support we can just return true here
42075 for selected vector modes. */
42076 if (GET_MODE_SIZE (d.vmode) == 16)
42078 /* All implementable with a single vpperm insn. */
42079 if (TARGET_XOP)
42080 return true;
42081 /* All implementable with 2 pshufb + 1 ior. */
42082 if (TARGET_SSSE3)
42083 return true;
42084 /* All implementable with shufpd or unpck[lh]pd. */
42085 if (d.nelt == 2)
42086 return true;
42089 /* Extract the values from the vector CST into the permutation
42090 array in D. */
42091 memcpy (d.perm, sel, nelt);
42092 for (i = which = 0; i < nelt; ++i)
42094 unsigned char e = d.perm[i];
42095 gcc_assert (e < 2 * nelt);
42096 which |= (e < nelt ? 1 : 2);
42099 /* For all elements from second vector, fold the elements to first. */
42100 if (which == 2)
42101 for (i = 0; i < nelt; ++i)
42102 d.perm[i] -= nelt;
42104 /* Check whether the mask can be applied to the vector type. */
42105 d.one_operand_p = (which != 3);
42107 /* Implementable with shufps or pshufd. */
42108 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
42109 return true;
42111 /* Otherwise we have to go through the motions and see if we can
42112 figure out how to generate the requested permutation. */
42113 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
42114 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
42115 if (!d.one_operand_p)
42116 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
42118 start_sequence ();
42119 ret = ix86_expand_vec_perm_const_1 (&d);
42120 end_sequence ();
42122 return ret;
42125 void
42126 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
42128 struct expand_vec_perm_d d;
42129 unsigned i, nelt;
42131 d.target = targ;
42132 d.op0 = op0;
42133 d.op1 = op1;
42134 d.vmode = GET_MODE (targ);
42135 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
42136 d.one_operand_p = false;
42137 d.testing_p = false;
42139 for (i = 0; i < nelt; ++i)
42140 d.perm[i] = i * 2 + odd;
42142 /* We'll either be able to implement the permutation directly... */
42143 if (expand_vec_perm_1 (&d))
42144 return;
42146 /* ... or we use the special-case patterns. */
42147 expand_vec_perm_even_odd_1 (&d, odd);
42150 static void
42151 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
42153 struct expand_vec_perm_d d;
42154 unsigned i, nelt, base;
42155 bool ok;
42157 d.target = targ;
42158 d.op0 = op0;
42159 d.op1 = op1;
42160 d.vmode = GET_MODE (targ);
42161 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
42162 d.one_operand_p = false;
42163 d.testing_p = false;
42165 base = high_p ? nelt / 2 : 0;
42166 for (i = 0; i < nelt / 2; ++i)
42168 d.perm[i * 2] = i + base;
42169 d.perm[i * 2 + 1] = i + base + nelt;
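/* E.g. for the high half of V4SImode this builds { 2, 6, 3, 7 }, the unpackhi
   element order. */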
42172 /* Note that for AVX this isn't one instruction. */
42173 ok = ix86_expand_vec_perm_const_1 (&d);
42174 gcc_assert (ok);
42178 /* Expand a vector operation CODE for a V*QImode in terms of the
42179 same operation on V*HImode. */
42181 void
42182 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
42184 enum machine_mode qimode = GET_MODE (dest);
42185 enum machine_mode himode;
42186 rtx (*gen_il) (rtx, rtx, rtx);
42187 rtx (*gen_ih) (rtx, rtx, rtx);
42188 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
42189 struct expand_vec_perm_d d;
42190 bool ok, full_interleave;
42191 bool uns_p = false;
42192 int i;
42194 switch (qimode)
42196 case V16QImode:
42197 himode = V8HImode;
42198 gen_il = gen_vec_interleave_lowv16qi;
42199 gen_ih = gen_vec_interleave_highv16qi;
42200 break;
42201 case V32QImode:
42202 himode = V16HImode;
42203 gen_il = gen_avx2_interleave_lowv32qi;
42204 gen_ih = gen_avx2_interleave_highv32qi;
42205 break;
42206 default:
42207 gcc_unreachable ();
42210 op2_l = op2_h = op2;
42211 switch (code)
42213 case MULT:
42214 /* Unpack data such that we've got a source byte in each low byte of
42215 each word. We don't care what goes into the high byte of each word.
42216 Rather than trying to get zero in there, it is most convenient to let
42217 it be a copy of the low byte. */
42218 op2_l = gen_reg_rtx (qimode);
42219 op2_h = gen_reg_rtx (qimode);
42220 emit_insn (gen_il (op2_l, op2, op2));
42221 emit_insn (gen_ih (op2_h, op2, op2));
42222 /* FALLTHRU */
42224 op1_l = gen_reg_rtx (qimode);
42225 op1_h = gen_reg_rtx (qimode);
42226 emit_insn (gen_il (op1_l, op1, op1));
42227 emit_insn (gen_ih (op1_h, op1, op1));
42228 full_interleave = qimode == V16QImode;
42229 break;
42231 case ASHIFT:
42232 case LSHIFTRT:
42233 uns_p = true;
42234 /* FALLTHRU */
42235 case ASHIFTRT:
42236 op1_l = gen_reg_rtx (himode);
42237 op1_h = gen_reg_rtx (himode);
42238 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
42239 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
42240 full_interleave = true;
42241 break;
42242 default:
42243 gcc_unreachable ();
42246 /* Perform the operation. */
42247 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
42248 1, OPTAB_DIRECT);
42249 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
42250 1, OPTAB_DIRECT);
42251 gcc_assert (res_l && res_h);
42253 /* Merge the data back into the right place. */
42254 d.target = dest;
42255 d.op0 = gen_lowpart (qimode, res_l);
42256 d.op1 = gen_lowpart (qimode, res_h);
42257 d.vmode = qimode;
42258 d.nelt = GET_MODE_NUNITS (qimode);
42259 d.one_operand_p = false;
42260 d.testing_p = false;
42262 if (full_interleave)
42264 /* For SSE2, we used a full interleave, so the desired
42265 results are in the even elements. */
42266 for (i = 0; i < 32; ++i)
42267 d.perm[i] = i * 2;
42269 else
42271 /* For AVX, the interleave used above was not cross-lane. So the
42272 extraction picks the even elements, but with the second and third quarters
42273 swapped. Happily, that is even one insn shorter than a plain even extraction. */
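/* Concretely, for V32QImode d.perm becomes { 0, 2, ..., 14, 32, 34, ..., 46,
   16, 18, ..., 30, 48, 50, ..., 62 }. */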
42274 for (i = 0; i < 32; ++i)
42275 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
42278 ok = ix86_expand_vec_perm_const_1 (&d);
42279 gcc_assert (ok);
42281 set_unique_reg_note (get_last_insn (), REG_EQUAL,
42282 gen_rtx_fmt_ee (code, qimode, op1, op2));
42285 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
42286 if op is CONST_VECTOR with all odd elements equal to their
42287 preceding element. */
42289 static bool
42290 const_vector_equal_evenodd_p (rtx op)
42292 enum machine_mode mode = GET_MODE (op);
42293 int i, nunits = GET_MODE_NUNITS (mode);
42294 if (GET_CODE (op) != CONST_VECTOR
42295 || nunits != CONST_VECTOR_NUNITS (op))
42296 return false;
42297 for (i = 0; i < nunits; i += 2)
42298 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
42299 return false;
42300 return true;
42303 void
42304 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
42305 bool uns_p, bool odd_p)
42307 enum machine_mode mode = GET_MODE (op1);
42308 enum machine_mode wmode = GET_MODE (dest);
42309 rtx x;
42310 rtx orig_op1 = op1, orig_op2 = op2;
42312 if (!nonimmediate_operand (op1, mode))
42313 op1 = force_reg (mode, op1);
42314 if (!nonimmediate_operand (op2, mode))
42315 op2 = force_reg (mode, op2);
42317 /* We only play even/odd games with vectors of SImode. */
42318 gcc_assert (mode == V4SImode || mode == V8SImode);
42320 /* If we're looking for the odd results, shift those members down to
42321 the even slots. For some cpus this is faster than a PSHUFD. */
42322 if (odd_p)
42324 /* For XOP use vpmacsdqh, but only for smult, as it is only
42325 signed. */
42326 if (TARGET_XOP && mode == V4SImode && !uns_p)
42328 x = force_reg (wmode, CONST0_RTX (wmode));
42329 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
42330 return;
42333 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
42334 if (!const_vector_equal_evenodd_p (orig_op1))
42335 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
42336 x, NULL, 1, OPTAB_DIRECT);
42337 if (!const_vector_equal_evenodd_p (orig_op2))
42338 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
42339 x, NULL, 1, OPTAB_DIRECT);
42340 op1 = gen_lowpart (mode, op1);
42341 op2 = gen_lowpart (mode, op2);
42344 if (mode == V8SImode)
42346 if (uns_p)
42347 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
42348 else
42349 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
42351 else if (uns_p)
42352 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
42353 else if (TARGET_SSE4_1)
42354 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
42355 else
42357 rtx s1, s2, t0, t1, t2;
42359 /* The easiest way to implement this without PMULDQ is to go through
42360 the motions as if we are performing a full 64-bit multiply, except
42361 that we need to do less shuffling of the elements. */
42363 /* Compute the sign-extension, aka highparts, of the two operands. */
42364 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
42365 op1, pc_rtx, pc_rtx);
42366 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
42367 op2, pc_rtx, pc_rtx);
42369 /* Multiply LO(A) * HI(B), and vice-versa. */
42370 t1 = gen_reg_rtx (wmode);
42371 t2 = gen_reg_rtx (wmode);
42372 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
42373 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
42375 /* Multiply LO(A) * LO(B). */
42376 t0 = gen_reg_rtx (wmode);
42377 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
42379 /* Combine and shift the highparts into place. */
42380 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
42381 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
42382 1, OPTAB_DIRECT);
42384 /* Combine high and low parts. */
42385 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
42386 return;
42388 emit_insn (x);
42391 void
42392 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
42393 bool uns_p, bool high_p)
42395 enum machine_mode wmode = GET_MODE (dest);
42396 enum machine_mode mode = GET_MODE (op1);
42397 rtx t1, t2, t3, t4, mask;
42399 switch (mode)
42401 case V4SImode:
42402 t1 = gen_reg_rtx (mode);
42403 t2 = gen_reg_rtx (mode);
42404 if (TARGET_XOP && !uns_p)
42406 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
42407 shuffle the elements once so that all elements are in the right
42408 place for immediate use: { A C B D }. */
42409 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
42410 const1_rtx, GEN_INT (3)));
42411 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
42412 const1_rtx, GEN_INT (3)));
42414 else
42416 /* Put the elements into place for the multiply. */
42417 ix86_expand_vec_interleave (t1, op1, op1, high_p);
42418 ix86_expand_vec_interleave (t2, op2, op2, high_p);
42419 high_p = false;
42421 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
42422 break;
42424 case V8SImode:
42425 /* Shuffle the elements between the lanes. After this we
42426 have { A B E F | C D G H } for each operand. */
42427 t1 = gen_reg_rtx (V4DImode);
42428 t2 = gen_reg_rtx (V4DImode);
42429 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
42430 const0_rtx, const2_rtx,
42431 const1_rtx, GEN_INT (3)));
42432 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
42433 const0_rtx, const2_rtx,
42434 const1_rtx, GEN_INT (3)));
42436 /* Shuffle the elements within the lanes. After this we
42437 have { A A B B | C C D D } or { E E F F | G G H H }. */
42438 t3 = gen_reg_rtx (V8SImode);
42439 t4 = gen_reg_rtx (V8SImode);
42440 mask = GEN_INT (high_p
42441 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
42442 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
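/* As a pshufd selector (two bits per destination element, applied
   within each 128-bit lane) the immediate above encodes the picks
   {2, 2, 3, 3} when high_p and {0, 0, 1, 1} otherwise. */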
42443 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
42444 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
42446 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
42447 break;
42449 case V8HImode:
42450 case V16HImode:
42451 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
42452 uns_p, OPTAB_DIRECT);
42453 t2 = expand_binop (mode,
42454 uns_p ? umul_highpart_optab : smul_highpart_optab,
42455 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
42456 gcc_assert (t1 && t2);
42458 t3 = gen_reg_rtx (mode);
42459 ix86_expand_vec_interleave (t3, t1, t2, high_p);
42460 emit_move_insn (dest, gen_lowpart (wmode, t3));
42461 break;
42463 case V16QImode:
42464 case V32QImode:
42465 t1 = gen_reg_rtx (wmode);
42466 t2 = gen_reg_rtx (wmode);
42467 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
42468 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
42470 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
42471 break;
42473 default:
42474 gcc_unreachable ();
42478 void
42479 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
42481 rtx res_1, res_2, res_3, res_4;
42483 res_1 = gen_reg_rtx (V4SImode);
42484 res_2 = gen_reg_rtx (V4SImode);
42485 res_3 = gen_reg_rtx (V2DImode);
42486 res_4 = gen_reg_rtx (V2DImode);
42487 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
42488 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
42490 /* Move the results in element 2 down to element 1; we don't care
42491 what goes in elements 2 and 3. Then we can merge the parts
42492 back together with an interleave.
42494 Note that two other sequences were tried:
42495 (1) Use interleaves at the start instead of psrldq, which allows
42496 us to use a single shufps to merge things back at the end.
42497 (2) Use shufps here to combine the two vectors, then pshufd to
42498 put the elements in the correct order.
42499 In both cases the cost of the reformatting stall was too high
42500 and the overall sequence slower. */
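/* Illustration: res_3 holds the even products {p0, p2} and res_4 the
   odd products {p1, p3} (as V2DI); the pshufds below compact their
   low halves into {p0, p2, x, x} and {p1, p3, x, x}, and the final
   interleave yields {p0, p1, p2, p3}. */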
42502 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
42503 const0_rtx, const2_rtx,
42504 const0_rtx, const0_rtx));
42505 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
42506 const0_rtx, const2_rtx,
42507 const0_rtx, const0_rtx));
42508 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
42510 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
42513 void
42514 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
42516 enum machine_mode mode = GET_MODE (op0);
42517 rtx t1, t2, t3, t4, t5, t6;
42519 if (TARGET_XOP && mode == V2DImode)
42521 /* op1: A,B,C,D, op2: E,F,G,H */
42522 op1 = gen_lowpart (V4SImode, op1);
42523 op2 = gen_lowpart (V4SImode, op2);
42525 t1 = gen_reg_rtx (V4SImode);
42526 t2 = gen_reg_rtx (V4SImode);
42527 t3 = gen_reg_rtx (V2DImode);
42528 t4 = gen_reg_rtx (V2DImode);
42530 /* t1: B,A,D,C */
42531 emit_insn (gen_sse2_pshufd_1 (t1, op1,
42532 GEN_INT (1),
42533 GEN_INT (0),
42534 GEN_INT (3),
42535 GEN_INT (2)));
42537 /* t2: (B*E),(A*F),(D*G),(C*H) */
42538 emit_insn (gen_mulv4si3 (t2, t1, op2));
42540 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
42541 emit_insn (gen_xop_phadddq (t3, t2));
42543 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
42544 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
42546 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
42547 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
42549 else
42551 enum machine_mode nmode;
42552 rtx (*umul) (rtx, rtx, rtx);
42554 if (mode == V2DImode)
42556 umul = gen_vec_widen_umult_even_v4si;
42557 nmode = V4SImode;
42559 else if (mode == V4DImode)
42561 umul = gen_vec_widen_umult_even_v8si;
42562 nmode = V8SImode;
42564 else
42565 gcc_unreachable ();
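/* Sketch of the decomposition used below: splitting each 64-bit lane
   as A = Ah*2^32 + Al and B = Bh*2^32 + Bl, the low 64 bits of A*B
   are Al*Bl + ((Ah*Bl + Bh*Al) << 32). */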
42568 /* Multiply low parts. */
42569 t1 = gen_reg_rtx (mode);
42570 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
42572 /* Shift input vectors right 32 bits so we can multiply high parts. */
42573 t6 = GEN_INT (32);
42574 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
42575 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
42577 /* Multiply high parts by low parts. */
42578 t4 = gen_reg_rtx (mode);
42579 t5 = gen_reg_rtx (mode);
42580 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
42581 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
42583 /* Combine and shift the highparts back. */
42584 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
42585 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
42587 /* Combine high and low parts. */
42588 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
42591 set_unique_reg_note (get_last_insn (), REG_EQUAL,
42592 gen_rtx_MULT (mode, op1, op2));
42595 /* Calculate integer abs() using only SSE2 instructions. */
42597 void
42598 ix86_expand_sse2_abs (rtx target, rtx input)
42600 enum machine_mode mode = GET_MODE (target);
42601 rtx tmp0, tmp1, x;
42603 switch (mode)
42605 /* For 32-bit signed integer X, the best way to calculate the absolute
42606 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
42607 case V4SImode:
42608 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
42609 GEN_INT (GET_MODE_BITSIZE
42610 (GET_MODE_INNER (mode)) - 1),
42611 NULL, 0, OPTAB_DIRECT);
42612 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
42613 NULL, 0, OPTAB_DIRECT);
42614 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
42615 target, 0, OPTAB_DIRECT);
42616 break;
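/* For example, with X = -5: X >> 31 is -1 (all ones), (-1 ^ -5) is 4,
   and 4 - (-1) is 5, the absolute value. */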
42618 /* For 16-bit signed integer X, the best way to calculate the absolute
42619 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
42620 case V8HImode:
42621 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
42623 x = expand_simple_binop (mode, SMAX, tmp0, input,
42624 target, 0, OPTAB_DIRECT);
42625 break;
42627 /* For 8-bit signed integer X, the best way to calculate the absolute
42628 value of X is min ((unsigned char) X, (unsigned char) (-X)),
42629 as SSE2 provides the PMINUB insn. */
42630 case V16QImode:
42631 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
42633 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
42634 target, 0, OPTAB_DIRECT);
42635 break;
42637 default:
42638 gcc_unreachable ();
42641 if (x != target)
42642 emit_move_insn (target, x);
42645 /* Expand an insert into a vector register through pinsr insn.
42646 Return true if successful. */
42648 bool
42649 ix86_expand_pinsr (rtx *operands)
42651 rtx dst = operands[0];
42652 rtx src = operands[3];
42654 unsigned int size = INTVAL (operands[1]);
42655 unsigned int pos = INTVAL (operands[2]);
42657 if (GET_CODE (dst) == SUBREG)
42659 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
42660 dst = SUBREG_REG (dst);
42663 if (GET_CODE (src) == SUBREG)
42664 src = SUBREG_REG (src);
42666 switch (GET_MODE (dst))
42668 case V16QImode:
42669 case V8HImode:
42670 case V4SImode:
42671 case V2DImode:
42673 enum machine_mode srcmode, dstmode;
42674 rtx (*pinsr)(rtx, rtx, rtx, rtx);
42676 srcmode = mode_for_size (size, MODE_INT, 0);
42678 switch (srcmode)
42680 case QImode:
42681 if (!TARGET_SSE4_1)
42682 return false;
42683 dstmode = V16QImode;
42684 pinsr = gen_sse4_1_pinsrb;
42685 break;
42687 case HImode:
42688 if (!TARGET_SSE2)
42689 return false;
42690 dstmode = V8HImode;
42691 pinsr = gen_sse2_pinsrw;
42692 break;
42694 case SImode:
42695 if (!TARGET_SSE4_1)
42696 return false;
42697 dstmode = V4SImode;
42698 pinsr = gen_sse4_1_pinsrd;
42699 break;
42701 case DImode:
42702 gcc_assert (TARGET_64BIT);
42703 if (!TARGET_SSE4_1)
42704 return false;
42705 dstmode = V2DImode;
42706 pinsr = gen_sse4_1_pinsrq;
42707 break;
42709 default:
42710 return false;
42713 rtx d = dst;
42714 if (GET_MODE (dst) != dstmode)
42715 d = gen_reg_rtx (dstmode);
42716 src = gen_lowpart (srcmode, src);
42718 pos /= size;
42720 emit_insn (pinsr (d, gen_lowpart (dstmode, dst), src,
42721 GEN_INT (1 << pos)));
42722 if (d != dst)
42723 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
42724 return true;
42727 default:
42728 return false;
42732 /* This function returns the calling abi specific va_list type node.
42733 It returns the FNDECL specific va_list type. */
42735 static tree
42736 ix86_fn_abi_va_list (tree fndecl)
42738 if (!TARGET_64BIT)
42739 return va_list_type_node;
42740 gcc_assert (fndecl != NULL_TREE);
42742 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
42743 return ms_va_list_type_node;
42744 else
42745 return sysv_va_list_type_node;
42748 /* Returns the canonical va_list type specified by TYPE. If there
42749 is no valid TYPE provided, it returns NULL_TREE. */
42751 static tree
42752 ix86_canonical_va_list_type (tree type)
42754 tree wtype, htype;
42756 /* Resolve references and pointers to va_list type. */
42757 if (TREE_CODE (type) == MEM_REF)
42758 type = TREE_TYPE (type);
42759 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
42760 type = TREE_TYPE (type);
42761 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
42762 type = TREE_TYPE (type);
42764 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
42766 wtype = va_list_type_node;
42767 gcc_assert (wtype != NULL_TREE);
42768 htype = type;
42769 if (TREE_CODE (wtype) == ARRAY_TYPE)
42771 /* If va_list is an array type, the argument may have decayed
42772 to a pointer type, e.g. by being passed to another function.
42773 In that case, unwrap both types so that we can compare the
42774 underlying records. */
42775 if (TREE_CODE (htype) == ARRAY_TYPE
42776 || POINTER_TYPE_P (htype))
42778 wtype = TREE_TYPE (wtype);
42779 htype = TREE_TYPE (htype);
42782 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
42783 return va_list_type_node;
42784 wtype = sysv_va_list_type_node;
42785 gcc_assert (wtype != NULL_TREE);
42786 htype = type;
42787 if (TREE_CODE (wtype) == ARRAY_TYPE)
42789 /* If va_list is an array type, the argument may have decayed
42790 to a pointer type, e.g. by being passed to another function.
42791 In that case, unwrap both types so that we can compare the
42792 underlying records. */
42793 if (TREE_CODE (htype) == ARRAY_TYPE
42794 || POINTER_TYPE_P (htype))
42796 wtype = TREE_TYPE (wtype);
42797 htype = TREE_TYPE (htype);
42800 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
42801 return sysv_va_list_type_node;
42802 wtype = ms_va_list_type_node;
42803 gcc_assert (wtype != NULL_TREE);
42804 htype = type;
42805 if (TREE_CODE (wtype) == ARRAY_TYPE)
42807 /* If va_list is an array type, the argument may have decayed
42808 to a pointer type, e.g. by being passed to another function.
42809 In that case, unwrap both types so that we can compare the
42810 underlying records. */
42811 if (TREE_CODE (htype) == ARRAY_TYPE
42812 || POINTER_TYPE_P (htype))
42814 wtype = TREE_TYPE (wtype);
42815 htype = TREE_TYPE (htype);
42818 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
42819 return ms_va_list_type_node;
42820 return NULL_TREE;
42822 return std_canonical_va_list_type (type);
42825 /* Iterate through the target-specific builtin types for va_list.
42826 IDX denotes the iterator, *PTREE is set to the result type of
42827 the va_list builtin, and *PNAME to its internal type.
42828 Returns zero if there is no element for this index, otherwise
42829 IDX should be increased upon the next call.
42830 Note, do not iterate a base builtin's name like __builtin_va_list.
42831 Used from c_common_nodes_and_builtins. */
42833 static int
42834 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
42836 if (TARGET_64BIT)
42838 switch (idx)
42840 default:
42841 break;
42843 case 0:
42844 *ptree = ms_va_list_type_node;
42845 *pname = "__builtin_ms_va_list";
42846 return 1;
42848 case 1:
42849 *ptree = sysv_va_list_type_node;
42850 *pname = "__builtin_sysv_va_list";
42851 return 1;
42855 return 0;
42858 #undef TARGET_SCHED_DISPATCH
42859 #define TARGET_SCHED_DISPATCH has_dispatch
42860 #undef TARGET_SCHED_DISPATCH_DO
42861 #define TARGET_SCHED_DISPATCH_DO do_dispatch
42862 #undef TARGET_SCHED_REASSOCIATION_WIDTH
42863 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
42864 #undef TARGET_SCHED_REORDER
42865 #define TARGET_SCHED_REORDER ix86_sched_reorder
42866 #undef TARGET_SCHED_ADJUST_PRIORITY
42867 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
42868 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
42869 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
42870 ix86_dependencies_evaluation_hook
42872 /* The size of the dispatch window is the total number of bytes of
42873 object code allowed in a window. */
42874 #define DISPATCH_WINDOW_SIZE 16
42876 /* Number of dispatch windows considered for scheduling. */
42877 #define MAX_DISPATCH_WINDOWS 3
42879 /* Maximum number of instructions in a window. */
42880 #define MAX_INSN 4
42882 /* Maximum number of immediate operands in a window. */
42883 #define MAX_IMM 4
42885 /* Maximum number of immediate bits allowed in a window. */
42886 #define MAX_IMM_SIZE 128
42888 /* Maximum number of 32 bit immediates allowed in a window. */
42889 #define MAX_IMM_32 4
42891 /* Maximum number of 64 bit immediates allowed in a window. */
42892 #define MAX_IMM_64 2
42894 /* Maximum total of loads or prefetches allowed in a window. */
42895 #define MAX_LOAD 2
42897 /* Maximum total of stores allowed in a window. */
42898 #define MAX_STORE 1
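/* Taken together, a single window may thus hold at most MAX_INSN insns,
   MAX_IMM immediate operands (no more than MAX_IMM_64 of them 64-bit),
   MAX_LOAD loads or prefetches and MAX_STORE stores. */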
42900 #undef BIG
42901 #define BIG 100
42904 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
42905 enum dispatch_group {
42906 disp_no_group = 0,
42907 disp_load,
42908 disp_store,
42909 disp_load_store,
42910 disp_prefetch,
42911 disp_imm,
42912 disp_imm_32,
42913 disp_imm_64,
42914 disp_branch,
42915 disp_cmp,
42916 disp_jcc,
42917 disp_last
42920 /* Number of allowable groups in a dispatch window. It is an array
42921 indexed by dispatch_group enum. 100 is used as a big number,
42922 because the number of these kinds of operations does not have any
42923 effect in a dispatch window, but we need them for other reasons in
42924 the table. */
42925 static unsigned int num_allowable_groups[disp_last] = {
42926 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
42929 char group_name[disp_last + 1][16] = {
42930 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
42931 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
42932 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
42935 /* Instruction path. */
42936 enum insn_path {
42937 no_path = 0,
42938 path_single, /* Single micro op. */
42939 path_double, /* Double micro op. */
42940 path_multi, /* Instructions with more than 2 micro ops. */
42941 last_path
42944 /* sched_insn_info defines a window to the instructions scheduled in
42945 the basic block. It contains a pointer to the insn_info table and
42946 the instruction scheduled.
42948 Windows are allocated for each basic block and are linked
42949 together. */
42950 typedef struct sched_insn_info_s {
42951 rtx insn;
42952 enum dispatch_group group;
42953 enum insn_path path;
42954 int byte_len;
42955 int imm_bytes;
42956 } sched_insn_info;
42958 /* Linked list of dispatch windows. This is a two way list of
42959 dispatch windows of a basic block. It contains information about
42960 the number of uops in the window and the total number of
42961 instructions and of bytes in the object code for this dispatch
42962 window. */
42963 typedef struct dispatch_windows_s {
42964 int num_insn; /* Number of insn in the window. */
42965 int num_uops; /* Number of uops in the window. */
42966 int window_size; /* Number of bytes in the window. */
42967 int window_num; /* Window number, either 0 or 1. */
42968 int num_imm; /* Number of immediates in the window. */
42969 int num_imm_32; /* Number of 32 bit immediates in the window. */
42970 int num_imm_64; /* Number of 64 bit immediates in the window. */
42971 int imm_size; /* Total size of immediates in the window. */
42972 int num_loads; /* Total memory loads in the window. */
42973 int num_stores; /* Total memory stores in the window. */
42974 int violation; /* Violation exists in window. */
42975 sched_insn_info *window; /* Pointer to the window. */
42976 struct dispatch_windows_s *next;
42977 struct dispatch_windows_s *prev;
42978 } dispatch_windows;
42980 /* Immediate values used in an insn. */
42981 typedef struct imm_info_s
42983 int imm;
42984 int imm32;
42985 int imm64;
42986 } imm_info;
42988 static dispatch_windows *dispatch_window_list;
42989 static dispatch_windows *dispatch_window_list1;
42991 /* Get dispatch group of insn. */
42993 static enum dispatch_group
42994 get_mem_group (rtx insn)
42996 enum attr_memory memory;
42998 if (INSN_CODE (insn) < 0)
42999 return disp_no_group;
43000 memory = get_attr_memory (insn);
43001 if (memory == MEMORY_STORE)
43002 return disp_store;
43004 if (memory == MEMORY_LOAD)
43005 return disp_load;
43007 if (memory == MEMORY_BOTH)
43008 return disp_load_store;
43010 return disp_no_group;
43013 /* Return true if insn is a compare instruction. */
43015 static bool
43016 is_cmp (rtx insn)
43018 enum attr_type type;
43020 type = get_attr_type (insn);
43021 return (type == TYPE_TEST
43022 || type == TYPE_ICMP
43023 || type == TYPE_FCMP
43024 || GET_CODE (PATTERN (insn)) == COMPARE);
43027 /* Return true if a dispatch violation was encountered. */
43029 static bool
43030 dispatch_violation (void)
43032 if (dispatch_window_list->next)
43033 return dispatch_window_list->next->violation;
43034 return dispatch_window_list->violation;
43037 /* Return true if insn is a branch instruction. */
43039 static bool
43040 is_branch (rtx insn)
43042 return (CALL_P (insn) || JUMP_P (insn));
43045 /* Return true if insn is a prefetch instruction. */
43047 static bool
43048 is_prefetch (rtx insn)
43050 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
43053 /* This function initializes a dispatch window and the list container holding a
43054 pointer to the window. */
43056 static void
43057 init_window (int window_num)
43059 int i;
43060 dispatch_windows *new_list;
43062 if (window_num == 0)
43063 new_list = dispatch_window_list;
43064 else
43065 new_list = dispatch_window_list1;
43067 new_list->num_insn = 0;
43068 new_list->num_uops = 0;
43069 new_list->window_size = 0;
43070 new_list->next = NULL;
43071 new_list->prev = NULL;
43072 new_list->window_num = window_num;
43073 new_list->num_imm = 0;
43074 new_list->num_imm_32 = 0;
43075 new_list->num_imm_64 = 0;
43076 new_list->imm_size = 0;
43077 new_list->num_loads = 0;
43078 new_list->num_stores = 0;
43079 new_list->violation = false;
43081 for (i = 0; i < MAX_INSN; i++)
43083 new_list->window[i].insn = NULL;
43084 new_list->window[i].group = disp_no_group;
43085 new_list->window[i].path = no_path;
43086 new_list->window[i].byte_len = 0;
43087 new_list->window[i].imm_bytes = 0;
43089 return;
43092 /* This function allocates and initializes a dispatch window and the
43093 list container holding a pointer to the window. */
43095 static dispatch_windows *
43096 allocate_window (void)
43098 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
43099 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
43101 return new_list;
43104 /* This routine initializes the dispatch scheduling information. It
43105 initiates building dispatch scheduler tables and constructs the
43106 first dispatch window. */
43108 static void
43109 init_dispatch_sched (void)
43111 /* Allocate a dispatch list and a window. */
43112 dispatch_window_list = allocate_window ();
43113 dispatch_window_list1 = allocate_window ();
43114 init_window (0);
43115 init_window (1);
43118 /* This function returns true if a branch is detected. End of a basic block
43119 does not have to be a branch, but here we assume only branches end a
43120 window. */
43122 static bool
43123 is_end_basic_block (enum dispatch_group group)
43125 return group == disp_branch;
43128 /* This function is called when the end of a window processing is reached. */
43130 static void
43131 process_end_window (void)
43133 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
43134 if (dispatch_window_list->next)
43136 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
43137 gcc_assert (dispatch_window_list->window_size
43138 + dispatch_window_list1->window_size <= 48);
43139 init_window (1);
43141 init_window (0);
43144 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
43145 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
43146 for 48 bytes of instructions. Note that these windows are not dispatch
43147 windows whose sizes are DISPATCH_WINDOW_SIZE. */
43149 static dispatch_windows *
43150 allocate_next_window (int window_num)
43152 if (window_num == 0)
43154 if (dispatch_window_list->next)
43155 init_window (1);
43156 init_window (0);
43157 return dispatch_window_list;
43160 dispatch_window_list->next = dispatch_window_list1;
43161 dispatch_window_list1->prev = dispatch_window_list;
43163 return dispatch_window_list1;
43166 /* Increment the number of immediate operands of an instruction. */
43168 static int
43169 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
43171 if (*in_rtx == 0)
43172 return 0;
43174 switch ( GET_CODE (*in_rtx))
43176 case CONST:
43177 case SYMBOL_REF:
43178 case CONST_INT:
43179 (imm_values->imm)++;
43180 if (x86_64_immediate_operand (*in_rtx, SImode))
43181 (imm_values->imm32)++;
43182 else
43183 (imm_values->imm64)++;
43184 break;
43186 case CONST_DOUBLE:
43187 (imm_values->imm)++;
43188 (imm_values->imm64)++;
43189 break;
43191 case CODE_LABEL:
43192 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
43194 (imm_values->imm)++;
43195 (imm_values->imm32)++;
43197 break;
43199 default:
43200 break;
43203 return 0;
43206 /* Compute number of immediate operands of an instruction. */
43208 static void
43209 find_constant (rtx in_rtx, imm_info *imm_values)
43211 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
43212 (rtx_function) find_constant_1, (void *) imm_values);
43215 /* Return total size of immediate operands of an instruction along with number
43216 of corresponding immediate-operands. It initializes its parameters to zero
43217 before calling FIND_CONSTANT.
43218 INSN is the input instruction. IMM is the total of immediates.
43219 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
43220 bit immediates. */
43222 static int
43223 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
43225 imm_info imm_values = {0, 0, 0};
43227 find_constant (insn, &imm_values);
43228 *imm = imm_values.imm;
43229 *imm32 = imm_values.imm32;
43230 *imm64 = imm_values.imm64;
43231 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
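/* For example, an insn with one 32-bit and one 64-bit immediate yields
   *imm = 2, *imm32 = 1, *imm64 = 1 and a return value of 1*4 + 1*8 = 12. */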
43234 /* This function indicates if an operand of an instruction is an
43235 immediate. */
43237 static bool
43238 has_immediate (rtx insn)
43240 int num_imm_operand;
43241 int num_imm32_operand;
43242 int num_imm64_operand;
43244 if (insn)
43245 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
43246 &num_imm64_operand);
43247 return false;
43250 /* Return the decode path (single, double or multi) of an instruction. */
43252 static enum insn_path
43253 get_insn_path (rtx insn)
43255 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
43257 if ((int)path == 0)
43258 return path_single;
43260 if ((int)path == 1)
43261 return path_double;
43263 return path_multi;
43266 /* Return insn dispatch group. */
43268 static enum dispatch_group
43269 get_insn_group (rtx insn)
43271 enum dispatch_group group = get_mem_group (insn);
43272 if (group)
43273 return group;
43275 if (is_branch (insn))
43276 return disp_branch;
43278 if (is_cmp (insn))
43279 return disp_cmp;
43281 if (has_immediate (insn))
43282 return disp_imm;
43284 if (is_prefetch (insn))
43285 return disp_prefetch;
43287 return disp_no_group;
43290 /* Count number of GROUP restricted instructions in a dispatch
43291 window WINDOW_LIST. */
43293 static int
43294 count_num_restricted (rtx insn, dispatch_windows *window_list)
43296 enum dispatch_group group = get_insn_group (insn);
43297 int imm_size;
43298 int num_imm_operand;
43299 int num_imm32_operand;
43300 int num_imm64_operand;
43302 if (group == disp_no_group)
43303 return 0;
43305 if (group == disp_imm)
43307 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
43308 &num_imm64_operand);
43309 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
43310 || num_imm_operand + window_list->num_imm > MAX_IMM
43311 || (num_imm32_operand > 0
43312 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
43313 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
43314 || (num_imm64_operand > 0
43315 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
43316 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
43317 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
43318 && num_imm64_operand > 0
43319 && ((window_list->num_imm_64 > 0
43320 && window_list->num_insn >= 2)
43321 || window_list->num_insn >= 3)))
43322 return BIG;
43324 return 1;
43327 if ((group == disp_load_store
43328 && (window_list->num_loads >= MAX_LOAD
43329 || window_list->num_stores >= MAX_STORE))
43330 || ((group == disp_load
43331 || group == disp_prefetch)
43332 && window_list->num_loads >= MAX_LOAD)
43333 || (group == disp_store
43334 && window_list->num_stores >= MAX_STORE))
43335 return BIG;
43337 return 1;
43340 /* This function returns true if insn satisfies dispatch rules on the
43341 last window scheduled. */
43343 static bool
43344 fits_dispatch_window (rtx insn)
43346 dispatch_windows *window_list = dispatch_window_list;
43347 dispatch_windows *window_list_next = dispatch_window_list->next;
43348 unsigned int num_restrict;
43349 enum dispatch_group group = get_insn_group (insn);
43350 enum insn_path path = get_insn_path (insn);
43351 int sum;
43353 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
43354 instructions should be given the lowest priority in the
43355 scheduling process in Haifa scheduler to make sure they will be
43356 scheduled in the same dispatch window as the reference to them. */
43357 if (group == disp_jcc || group == disp_cmp)
43358 return false;
43360 /* Check nonrestricted. */
43361 if (group == disp_no_group || group == disp_branch)
43362 return true;
43364 /* Get last dispatch window. */
43365 if (window_list_next)
43366 window_list = window_list_next;
43368 if (window_list->window_num == 1)
43370 sum = window_list->prev->window_size + window_list->window_size;
43372 if (sum == 32
43373 || (min_insn_size (insn) + sum) >= 48)
43374 /* Window 1 is full. Go for next window. */
43375 return true;
43378 num_restrict = count_num_restricted (insn, window_list);
43380 if (num_restrict > num_allowable_groups[group])
43381 return false;
43383 /* See if it fits in the first window. */
43384 if (window_list->window_num == 0)
43386 /* The first window should have only single and double path
43387 uops. */
43388 if (path == path_double
43389 && (window_list->num_uops + 2) > MAX_INSN)
43390 return false;
43391 else if (path != path_single)
43392 return false;
43394 return true;
43397 /* Add an instruction INSN with NUM_UOPS micro-operations to the
43398 dispatch window WINDOW_LIST. */
43400 static void
43401 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
43403 int byte_len = min_insn_size (insn);
43404 int num_insn = window_list->num_insn;
43405 int imm_size;
43406 sched_insn_info *window = window_list->window;
43407 enum dispatch_group group = get_insn_group (insn);
43408 enum insn_path path = get_insn_path (insn);
43409 int num_imm_operand;
43410 int num_imm32_operand;
43411 int num_imm64_operand;
43413 if (!window_list->violation && group != disp_cmp
43414 && !fits_dispatch_window (insn))
43415 window_list->violation = true;
43417 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
43418 &num_imm64_operand);
43420 /* Initialize window with new instruction. */
43421 window[num_insn].insn = insn;
43422 window[num_insn].byte_len = byte_len;
43423 window[num_insn].group = group;
43424 window[num_insn].path = path;
43425 window[num_insn].imm_bytes = imm_size;
43427 window_list->window_size += byte_len;
43428 window_list->num_insn = num_insn + 1;
43429 window_list->num_uops = window_list->num_uops + num_uops;
43430 window_list->imm_size += imm_size;
43431 window_list->num_imm += num_imm_operand;
43432 window_list->num_imm_32 += num_imm32_operand;
43433 window_list->num_imm_64 += num_imm64_operand;
43435 if (group == disp_store)
43436 window_list->num_stores += 1;
43437 else if (group == disp_load
43438 || group == disp_prefetch)
43439 window_list->num_loads += 1;
43440 else if (group == disp_load_store)
43442 window_list->num_stores += 1;
43443 window_list->num_loads += 1;
43447 /* Adds a scheduled instruction, INSN, to the current dispatch window.
43448 If the total bytes of instructions or the number of instructions in
43449 the window exceed the allowable limits, it allocates a new window. */
43451 static void
43452 add_to_dispatch_window (rtx insn)
43454 int byte_len;
43455 dispatch_windows *window_list;
43456 dispatch_windows *next_list;
43457 dispatch_windows *window0_list;
43458 enum insn_path path;
43459 enum dispatch_group insn_group;
43460 bool insn_fits;
43461 int num_insn;
43462 int num_uops;
43463 int window_num;
43464 int insn_num_uops;
43465 int sum;
43467 if (INSN_CODE (insn) < 0)
43468 return;
43470 byte_len = min_insn_size (insn);
43471 window_list = dispatch_window_list;
43472 next_list = window_list->next;
43473 path = get_insn_path (insn);
43474 insn_group = get_insn_group (insn);
43476 /* Get the last dispatch window. */
43477 if (next_list)
43478 window_list = dispatch_window_list->next;
43480 if (path == path_single)
43481 insn_num_uops = 1;
43482 else if (path == path_double)
43483 insn_num_uops = 2;
43484 else
43485 insn_num_uops = (int) path;
43487 /* If current window is full, get a new window.
43488 Window number zero is full if MAX_INSN uops are scheduled in it.
43489 Window number one is full if window zero's bytes plus window
43490 one's bytes is 32, or if the bytes of the new instruction added
43491 to the total make it greater than 48, or if it already has MAX_INSN
43492 instructions in it. */
43493 num_insn = window_list->num_insn;
43494 num_uops = window_list->num_uops;
43495 window_num = window_list->window_num;
43496 insn_fits = fits_dispatch_window (insn);
43498 if (num_insn >= MAX_INSN
43499 || num_uops + insn_num_uops > MAX_INSN
43500 || !(insn_fits))
43502 window_num = ~window_num & 1;
43503 window_list = allocate_next_window (window_num);
43506 if (window_num == 0)
43508 add_insn_window (insn, window_list, insn_num_uops);
43509 if (window_list->num_insn >= MAX_INSN
43510 && insn_group == disp_branch)
43512 process_end_window ();
43513 return;
43516 else if (window_num == 1)
43518 window0_list = window_list->prev;
43519 sum = window0_list->window_size + window_list->window_size;
43520 if (sum == 32
43521 || (byte_len + sum) >= 48)
43523 process_end_window ();
43524 window_list = dispatch_window_list;
43527 add_insn_window (insn, window_list, insn_num_uops);
43529 else
43530 gcc_unreachable ();
43532 if (is_end_basic_block (insn_group))
43534 /* End of basic block is reached; do end-basic-block processing. */
43535 process_end_window ();
43536 return;
43540 /* Print the dispatch window, WINDOW_NUM, to FILE. */
43542 DEBUG_FUNCTION static void
43543 debug_dispatch_window_file (FILE *file, int window_num)
43545 dispatch_windows *list;
43546 int i;
43548 if (window_num == 0)
43549 list = dispatch_window_list;
43550 else
43551 list = dispatch_window_list1;
43553 fprintf (file, "Window #%d:\n", list->window_num);
43554 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
43555 list->num_insn, list->num_uops, list->window_size);
43556 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
43557 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
43559 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
43560 list->num_stores);
43561 fprintf (file, " insn info:\n");
43563 for (i = 0; i < MAX_INSN; i++)
43565 if (!list->window[i].insn)
43566 break;
43567 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
43568 i, group_name[list->window[i].group],
43569 i, (void *)list->window[i].insn,
43570 i, list->window[i].path,
43571 i, list->window[i].byte_len,
43572 i, list->window[i].imm_bytes);
43576 /* Print to stdout a dispatch window. */
43578 DEBUG_FUNCTION void
43579 debug_dispatch_window (int window_num)
43581 debug_dispatch_window_file (stdout, window_num);
43584 /* Print INSN dispatch information to FILE. */
43586 DEBUG_FUNCTION static void
43587 debug_insn_dispatch_info_file (FILE *file, rtx insn)
43589 int byte_len;
43590 enum insn_path path;
43591 enum dispatch_group group;
43592 int imm_size;
43593 int num_imm_operand;
43594 int num_imm32_operand;
43595 int num_imm64_operand;
43597 if (INSN_CODE (insn) < 0)
43598 return;
43600 byte_len = min_insn_size (insn);
43601 path = get_insn_path (insn);
43602 group = get_insn_group (insn);
43603 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
43604 &num_imm64_operand);
43606 fprintf (file, " insn info:\n");
43607 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
43608 group_name[group], path, byte_len);
43609 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
43610 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
43613 /* Print to STDERR the status of the ready list with respect to
43614 dispatch windows. */
43616 DEBUG_FUNCTION void
43617 debug_ready_dispatch (void)
43619 int i;
43620 int no_ready = number_in_ready ();
43622 fprintf (stdout, "Number of ready: %d\n", no_ready);
43624 for (i = 0; i < no_ready; i++)
43625 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
43628 /* This routine is the driver of the dispatch scheduler. */
43630 static void
43631 do_dispatch (rtx insn, int mode)
43633 if (mode == DISPATCH_INIT)
43634 init_dispatch_sched ();
43635 else if (mode == ADD_TO_DISPATCH_WINDOW)
43636 add_to_dispatch_window (insn);
43639 /* Return TRUE if Dispatch Scheduling is supported. */
43641 static bool
43642 has_dispatch (rtx insn, int action)
43644 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3 || TARGET_BDVER4)
43645 && flag_dispatch_scheduler)
43646 switch (action)
43648 default:
43649 return false;
43651 case IS_DISPATCH_ON:
43652 return true;
43653 break;
43655 case IS_CMP:
43656 return is_cmp (insn);
43658 case DISPATCH_VIOLATION:
43659 return dispatch_violation ();
43661 case FITS_DISPATCH_WINDOW:
43662 return fits_dispatch_window (insn);
43665 return false;
43668 /* Implementation of reassociation_width target hook used by
43669 reassoc phase to identify parallelism level in reassociated
43670 tree. The statement's tree_code is passed in OPC. The argument's type
43671 is passed in MODE.
43673 Currently parallel reassociation is enabled for Atom
43674 processors only and we set reassociation width to be 2
43675 because Atom may issue up to 2 instructions per cycle.
43677 Return value should be fixed if parallel reassociation is
43678 enabled for other processors. */
43680 static int
43681 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
43682 enum machine_mode mode)
43684 int res = 1;
43686 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
43687 res = 2;
43688 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
43689 res = 2;
43691 return res;
43694 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
43695 place emms and femms instructions. */
43697 static enum machine_mode
43698 ix86_preferred_simd_mode (enum machine_mode mode)
43700 if (!TARGET_SSE)
43701 return word_mode;
43703 switch (mode)
43705 case QImode:
43706 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
43707 case HImode:
43708 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
43709 case SImode:
43710 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
43711 case DImode:
43712 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
43714 case SFmode:
43715 if (TARGET_AVX && !TARGET_PREFER_AVX128)
43716 return V8SFmode;
43717 else
43718 return V4SFmode;
43720 case DFmode:
43721 if (!TARGET_VECTORIZE_DOUBLE)
43722 return word_mode;
43723 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
43724 return V4DFmode;
43725 else if (TARGET_SSE2)
43726 return V2DFmode;
43727 /* FALLTHRU */
43729 default:
43730 return word_mode;
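/* For instance, SImode yields V8SImode when AVX is enabled and 256-bit
   vectors are not discouraged, and V4SImode otherwise; DFmode yields
   word_mode (no vectorization) when TARGET_VECTORIZE_DOUBLE is not set. */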
43734 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
43735 vectors. */
43737 static unsigned int
43738 ix86_autovectorize_vector_sizes (void)
43740 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
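/* The returned value is a bitmask of vector sizes in bytes, so 32 | 16
   above advertises both 32-byte and 16-byte vectors, while 0 leaves only
   the preferred SIMD mode. */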
43745 /* Return class of registers which could be used for pseudo of MODE
43746 and of class RCLASS for spilling instead of memory. Return NO_REGS
43747 if it is not possible or non-profitable. */
43748 static reg_class_t
43749 ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
43751 if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
43752 && (mode == SImode || (TARGET_64BIT && mode == DImode))
43753 && INTEGER_CLASS_P (rclass))
43754 return ALL_SSE_REGS;
43755 return NO_REGS;
43758 /* Implement targetm.vectorize.init_cost. */
43760 static void *
43761 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
43763 unsigned *cost = XNEWVEC (unsigned, 3);
43764 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
43765 return cost;
43768 /* Implement targetm.vectorize.add_stmt_cost. */
43770 static unsigned
43771 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
43772 struct _stmt_vec_info *stmt_info, int misalign,
43773 enum vect_cost_model_location where)
43775 unsigned *cost = (unsigned *) data;
43776 unsigned retval = 0;
43778 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
43779 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
43781 /* Statements in an inner loop relative to the loop being
43782 vectorized are weighted more heavily. The value here is
43783 arbitrary and could potentially be improved with analysis. */
43784 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
43785 count *= 50; /* FIXME. */
43787 retval = (unsigned) (count * stmt_cost);
43788 cost[where] += retval;
43790 return retval;
43793 /* Implement targetm.vectorize.finish_cost. */
43795 static void
43796 ix86_finish_cost (void *data, unsigned *prologue_cost,
43797 unsigned *body_cost, unsigned *epilogue_cost)
43799 unsigned *cost = (unsigned *) data;
43800 *prologue_cost = cost[vect_prologue];
43801 *body_cost = cost[vect_body];
43802 *epilogue_cost = cost[vect_epilogue];
43805 /* Implement targetm.vectorize.destroy_cost_data. */
43807 static void
43808 ix86_destroy_cost_data (void *data)
43810 free (data);
43813 /* Validate target specific memory model bits in VAL. */
43815 static unsigned HOST_WIDE_INT
43816 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
43818 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
43819 bool strong;
43821 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
43822 |MEMMODEL_MASK)
43823 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
43825 warning (OPT_Winvalid_memory_model,
43826 "Unknown architecture specific memory model");
43827 return MEMMODEL_SEQ_CST;
43829 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
43830 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
43832 warning (OPT_Winvalid_memory_model,
43833 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
43834 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
43836 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
43838 warning (OPT_Winvalid_memory_model,
43839 "HLE_RELEASE not used with RELEASE or stronger memory model");
43840 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
43842 return val;
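/* For instance, MEMMODEL_ACQUIRE | IX86_HLE_ACQUIRE passes through
   unchanged, while MEMMODEL_RELAXED | IX86_HLE_ACQUIRE triggers the
   warning above and is rewritten as MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE. */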
43845 /* Set CLONEI->vecsize_mangle, CLONEI->vecsize_int,
43846 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
43847 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
43848 or number of vecsize_mangle variants that should be emitted. */
43850 static int
43851 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
43852 struct cgraph_simd_clone *clonei,
43853 tree base_type, int num)
43855 int ret = 1;
43857 if (clonei->simdlen
43858 && (clonei->simdlen < 2
43859 || clonei->simdlen > 16
43860 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
43862 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
43863 "unsupported simdlen %d", clonei->simdlen);
43864 return 0;
43867 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
43868 if (TREE_CODE (ret_type) != VOID_TYPE)
43869 switch (TYPE_MODE (ret_type))
43871 case QImode:
43872 case HImode:
43873 case SImode:
43874 case DImode:
43875 case SFmode:
43876 case DFmode:
43877 /* case SCmode: */
43878 /* case DCmode: */
43879 break;
43880 default:
43881 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
43882 "unsupported return type %qT for simd\n", ret_type);
43883 return 0;
43886 tree t;
43887 int i;
43889 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
43890 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
43891 switch (TYPE_MODE (TREE_TYPE (t)))
43893 case QImode:
43894 case HImode:
43895 case SImode:
43896 case DImode:
43897 case SFmode:
43898 case DFmode:
43899 /* case SCmode: */
43900 /* case DCmode: */
43901 break;
43902 default:
43903 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
43904 "unsupported argument type %qT for simd\n", TREE_TYPE (t));
43905 return 0;
43908 if (clonei->cilk_elemental)
43910 /* Parse the processor clause here. If not present, default to 'b'. */
43911 clonei->vecsize_mangle = 'b';
43913 else if (!TREE_PUBLIC (node->decl))
43915 /* If the function isn't exported, we can pick up just one ISA
43916 for the clones. */
43917 if (TARGET_AVX2)
43918 clonei->vecsize_mangle = 'd';
43919 else if (TARGET_AVX)
43920 clonei->vecsize_mangle = 'c';
43921 else
43922 clonei->vecsize_mangle = 'b';
43923 ret = 1;
43925 else
43927 clonei->vecsize_mangle = "bcd"[num];
43928 ret = 3;
43930 switch (clonei->vecsize_mangle)
43932 case 'b':
43933 clonei->vecsize_int = 128;
43934 clonei->vecsize_float = 128;
43935 break;
43936 case 'c':
43937 clonei->vecsize_int = 128;
43938 clonei->vecsize_float = 256;
43939 break;
43940 case 'd':
43941 clonei->vecsize_int = 256;
43942 clonei->vecsize_float = 256;
43943 break;
43945 if (clonei->simdlen == 0)
43947 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
43948 clonei->simdlen = clonei->vecsize_int;
43949 else
43950 clonei->simdlen = clonei->vecsize_float;
43951 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
43952 if (clonei->simdlen > 16)
43953 clonei->simdlen = 16;
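/* As a worked example, for a clone with base type double and
   vecsize_mangle 'c', vecsize_float is 256, so simdlen becomes
   256 / 64 = 4. */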
43955 return ret;
43958 /* Add target attribute to SIMD clone NODE if needed. */
43960 static void
43961 ix86_simd_clone_adjust (struct cgraph_node *node)
43963 const char *str = NULL;
43964 gcc_assert (node->decl == cfun->decl);
43965 switch (node->simdclone->vecsize_mangle)
43967 case 'b':
43968 if (!TARGET_SSE2)
43969 str = "sse2";
43970 break;
43971 case 'c':
43972 if (!TARGET_AVX)
43973 str = "avx";
43974 break;
43975 case 'd':
43976 if (!TARGET_AVX2)
43977 str = "avx2";
43978 break;
43979 default:
43980 gcc_unreachable ();
43982 if (str == NULL)
43983 return;
43984 push_cfun (NULL);
43985 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
43986 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
43987 gcc_assert (ok);
43988 pop_cfun ();
43989 ix86_previous_fndecl = NULL_TREE;
43990 ix86_set_current_function (node->decl);
43993 /* If SIMD clone NODE can't be used in a vectorized loop
43994 in the current function, return -1, otherwise return a badness of using it
43995 (0 if it is most desirable from vecsize_mangle point of view, 1
43996 slightly less desirable, etc.). */
43998 static int
43999 ix86_simd_clone_usable (struct cgraph_node *node)
44001 switch (node->simdclone->vecsize_mangle)
44003 case 'b':
44004 if (!TARGET_SSE2)
44005 return -1;
44006 if (!TARGET_AVX)
44007 return 0;
44008 return TARGET_AVX2 ? 2 : 1;
44009 case 'c':
44010 if (!TARGET_AVX)
44011 return -1;
44012 return TARGET_AVX2 ? 1 : 0;
44013 break;
44014 case 'd':
44015 if (!TARGET_AVX2)
44016 return -1;
44017 return 0;
44018 default:
44019 gcc_unreachable ();
44023 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
44025 static bool
44026 ix86_float_exceptions_rounding_supported_p (void)
44028 /* For x87 floating point with standard excess precision handling,
44029 there is no adddf3 pattern (since x87 floating point only has
44030 XFmode operations) so the default hook implementation gets this
44031 wrong. */
44032 return TARGET_80387 || TARGET_SSE_MATH;
44035 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
44037 static void
44038 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
44040 if (!TARGET_80387 && !TARGET_SSE_MATH)
44041 return;
44042 tree exceptions_var = create_tmp_var (integer_type_node, NULL);
44043 if (TARGET_80387)
44045 tree fenv_index_type = build_index_type (size_int (6));
44046 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
44047 tree fenv_var = create_tmp_var (fenv_type, NULL);
44048 mark_addressable (fenv_var);
44049 tree fenv_ptr = build_pointer_type (fenv_type);
44050 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
44051 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
44052 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
44053 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
44054 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
44055 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
44056 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
44057 tree hold_fnclex = build_call_expr (fnclex, 0);
44058 *hold = build2 (COMPOUND_EXPR, void_type_node, hold_fnstenv,
44059 hold_fnclex);
44060 *clear = build_call_expr (fnclex, 0);
44061 tree sw_var = create_tmp_var (short_unsigned_type_node, NULL);
44062 mark_addressable (sw_var);
44063 tree su_ptr = build_pointer_type (short_unsigned_type_node);
44064 tree sw_addr = build1 (ADDR_EXPR, su_ptr, sw_var);
44065 tree fnstsw_call = build_call_expr (fnstsw, 1, sw_addr);
44066 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
44067 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
44068 exceptions_var, exceptions_x87);
44069 *update = build2 (COMPOUND_EXPR, integer_type_node,
44070 fnstsw_call, update_mod);
44071 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
44072 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
44074 if (TARGET_SSE_MATH)
44076 tree mxcsr_orig_var = create_tmp_var (unsigned_type_node, NULL);
44077 tree mxcsr_mod_var = create_tmp_var (unsigned_type_node, NULL);
44078 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
44079 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
44080 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
44081 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
44082 mxcsr_orig_var, stmxcsr_hold_call);
44083 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
44084 mxcsr_orig_var,
44085 build_int_cst (unsigned_type_node, 0x1f80));
44086 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
44087 build_int_cst (unsigned_type_node, 0xffffffc0));
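/* In MXCSR, 0x1f80 covers the six exception mask bits (bits 7-12) and
   the AND with 0xffffffc0 clears the six exception status flags
   (bits 0-5), so the modified value masks all exceptions and starts
   with the flags cleared. */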
44088 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
44089 mxcsr_mod_var, hold_mod_val);
44090 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
44091 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
44092 hold_assign_orig, hold_assign_mod);
44093 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
44094 ldmxcsr_hold_call);
44095 if (*hold)
44096 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
44097 else
44098 *hold = hold_all;
44099 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
44100 if (*clear)
44101 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
44102 ldmxcsr_clear_call);
44103 else
44104 *clear = ldmxcsr_clear_call;
44105 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
44106 tree exceptions_sse = fold_convert (integer_type_node,
44107 stxmcsr_update_call);
44108 if (*update)
44110 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
44111 exceptions_var, exceptions_sse);
44112 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
44113 exceptions_var, exceptions_mod);
44114 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
44115 exceptions_assign);
44117 else
44118 *update = build2 (MODIFY_EXPR, integer_type_node,
44119 exceptions_var, exceptions_sse);
44120 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
44121 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
44122 ldmxcsr_update_call);
44124 tree atomic_feraiseexcept
44125 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
44126 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
44127 1, exceptions_var);
44128 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
44129 atomic_feraiseexcept_call);
44132 /* Initialize the GCC target structure. */
44133 #undef TARGET_RETURN_IN_MEMORY
44134 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
44136 #undef TARGET_LEGITIMIZE_ADDRESS
44137 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
44139 #undef TARGET_ATTRIBUTE_TABLE
44140 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
44141 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
44142 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
44143 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
44144 # undef TARGET_MERGE_DECL_ATTRIBUTES
44145 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
44146 #endif
44148 #undef TARGET_COMP_TYPE_ATTRIBUTES
44149 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
44151 #undef TARGET_INIT_BUILTINS
44152 #define TARGET_INIT_BUILTINS ix86_init_builtins
44153 #undef TARGET_BUILTIN_DECL
44154 #define TARGET_BUILTIN_DECL ix86_builtin_decl
44155 #undef TARGET_EXPAND_BUILTIN
44156 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
44158 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
44159 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
44160 ix86_builtin_vectorized_function
44162 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
44163 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
44165 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
44166 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
44168 #undef TARGET_VECTORIZE_BUILTIN_GATHER
44169 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
44171 #undef TARGET_BUILTIN_RECIPROCAL
44172 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
44174 #undef TARGET_ASM_FUNCTION_EPILOGUE
44175 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
44177 #undef TARGET_ENCODE_SECTION_INFO
44178 #ifndef SUBTARGET_ENCODE_SECTION_INFO
44179 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
44180 #else
44181 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
44182 #endif
44184 #undef TARGET_ASM_OPEN_PAREN
44185 #define TARGET_ASM_OPEN_PAREN ""
44186 #undef TARGET_ASM_CLOSE_PAREN
44187 #define TARGET_ASM_CLOSE_PAREN ""
44189 #undef TARGET_ASM_BYTE_OP
44190 #define TARGET_ASM_BYTE_OP ASM_BYTE
44192 #undef TARGET_ASM_ALIGNED_HI_OP
44193 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
44194 #undef TARGET_ASM_ALIGNED_SI_OP
44195 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
44196 #ifdef ASM_QUAD
44197 #undef TARGET_ASM_ALIGNED_DI_OP
44198 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
44199 #endif
44201 #undef TARGET_PROFILE_BEFORE_PROLOGUE
44202 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
44204 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
44205 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
44207 #undef TARGET_ASM_UNALIGNED_HI_OP
44208 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
44209 #undef TARGET_ASM_UNALIGNED_SI_OP
44210 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
44211 #undef TARGET_ASM_UNALIGNED_DI_OP
44212 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
44214 #undef TARGET_PRINT_OPERAND
44215 #define TARGET_PRINT_OPERAND ix86_print_operand
44216 #undef TARGET_PRINT_OPERAND_ADDRESS
44217 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
44218 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
44219 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
44220 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
44221 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
#undef TARGET_SCHED_INIT_GLOBAL
#define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
#undef TARGET_SCHED_ADJUST_COST
#define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  ia32_multipass_dfa_lookahead
#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall

#undef TARGET_MEMMODEL_CHECK
#define TARGET_MEMMODEL_CHECK ix86_memmodel_check

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv

#ifdef HAVE_AS_TLS
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS true
#endif
#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true

#undef TARGET_DELEGITIMIZE_ADDRESS
#define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address

#undef TARGET_MS_BITFIELD_LAYOUT_P
#define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p

#if TARGET_MACHO
#undef TARGET_BINDS_LOCAL_P
#define TARGET_BINDS_LOCAL_P darwin_binds_local_p
#endif
#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
#undef TARGET_BINDS_LOCAL_P
#define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
#endif

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START x86_file_start

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE ix86_option_override

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS ix86_rtx_costs
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST ix86_address_cost

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
#undef TARGET_CC_MODES_COMPATIBLE
#define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg

#undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
#define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN ix86_fold_builtin

#undef TARGET_COMPARE_VERSION_PRIORITY
#define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority

#undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
#define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
  ix86_generate_version_dispatcher_body

#undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
#define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
  ix86_get_function_versions_dispatcher

#undef TARGET_ENUM_VA_LIST_P
#define TARGET_ENUM_VA_LIST_P ix86_enum_va_list

#undef TARGET_FN_ABI_VA_LIST
#define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list

#undef TARGET_CANONICAL_VA_LIST_TYPE
#define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start

#undef TARGET_MD_ASM_CLOBBERS
#define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
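
/* Argument passing and calling convention hooks.  */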
#undef TARGET_PROMOTE_PROTOTYPES
#define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG ix86_function_arg
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
#undef TARGET_INTERNAL_ARG_POINTER
#define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
#undef TARGET_UPDATE_STACK_BOUNDARY
#define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
#undef TARGET_STATIC_CHAIN
#define TARGET_STATIC_CHAIN ix86_static_chain
#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
#undef TARGET_RETURN_POPS_ARGS
#define TARGET_RETURN_POPS_ARGS ix86_return_pops_args

#undef TARGET_LEGITIMATE_COMBINED_INSN
#define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix

#ifdef HAVE_AS_TLS
#undef TARGET_ASM_OUTPUT_DWARF_DTPREL
#define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
#endif

#ifdef SUBTARGET_INSERT_ATTRIBUTES
#undef TARGET_INSERT_ATTRIBUTES
#define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
#endif

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE ix86_mangle_type

#if !TARGET_MACHO
#undef TARGET_STACK_PROTECT_FAIL
#define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
#endif

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE ix86_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode

#undef TARGET_MEMBER_TYPE_FORCES_BLK
#define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk

#undef TARGET_INSTANTIATE_DECLS
#define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
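
/* Register class and reload hooks.  */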
#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD ix86_secondary_reload

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
#undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
#define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
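
/* Vectorizer hooks (cost model, constant permutations, SIMD modes).  */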
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  ix86_builtin_vectorization_cost
#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
  ix86_vectorize_vec_perm_const_ok
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
  ix86_preferred_simd_mode
#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  ix86_autovectorize_vector_sizes
#undef TARGET_VECTORIZE_INIT_COST
#define TARGET_VECTORIZE_INIT_COST ix86_init_cost
#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
#undef TARGET_VECTORIZE_FINISH_COST
#define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
#undef TARGET_VECTORIZE_DESTROY_COST_DATA
#define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
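
/* Per-function target option and function multiversioning hooks.  */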
#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE ix86_function_specific_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE ix86_function_specific_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT ix86_function_specific_print

#undef TARGET_OPTION_FUNCTION_VERSIONS
#define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P ix86_can_inline_p

#undef TARGET_EXPAND_TO_RTL_HOOK
#define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p

#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_true

#undef TARGET_REGISTER_PRIORITY
#define TARGET_REGISTER_PRIORITY ix86_register_priority

#undef TARGET_REGISTER_USAGE_LEVELING_P
#define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE ix86_can_eliminate

#undef TARGET_EXTRA_LIVE_ON_ENTRY
#define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry

#undef TARGET_ASM_CODE_END
#define TARGET_ASM_CODE_END ix86_code_end

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage

#if TARGET_MACHO
#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS darwin_rename_builtins
#endif

#undef TARGET_SPILL_CLASS
#define TARGET_SPILL_CLASS ix86_spill_class
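
/* SIMD clone support hooks (SIMD-enabled/elemental functions).  */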
#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  ix86_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST \
  ix86_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE \
  ix86_simd_clone_usable

#undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
#define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
  ix86_float_exceptions_rounding_supported_p
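
/* The x86 target hook vector, built from the TARGET_* macros above.  */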
struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-i386.h"